[Lsb-messages] /var/www/bzr/lsb/devel/appbat r1010: add a 'new' version of entitychecker, for python >= 2.6
Mats Wichmann
mats at linuxfoundation.org
Sat Dec 31 02:06:39 UTC 2016
------------------------------------------------------------
revno: 1010
committer: Mats Wichmann <mats at linuxfoundation.org>
branch nick: appbat
timestamp: Fri 2016-12-30 19:06:39 -0700
message:
add a 'new' version of entitychecker, for python >= 2.6
added:
extras/entitycheck_new.py
-------------- next part --------------
=== added file 'extras/entitycheck_new.py'
--- a/extras/entitycheck_new.py 1970-01-01 00:00:00 +0000
+++ b/extras/entitycheck_new.py 2016-12-31 02:06:39 +0000
@@ -0,0 +1,660 @@
+#!/usr/bin/env python
+#
+# entitycheck.py
+#
+# Check for entities, and optionally retrieve them if missing/out of date.
+# Retrieval is done using a combination of ftp and http, depending
+# on how the files are made available by their providers.
+# If you need to use a proxy, first set 'http_proxy' and 'ftp_proxy'
+# in the environment, using the form: xxx_proxy=host:port
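+# (for example, a hypothetical proxy: http_proxy=proxy.example.com:3128)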
+# Calling an external command (wget) is also possible
+# Note: the internal fetcher may go away if wget works out
+#
+"""Entity check
+
+Checks a nALFS-style build area for required packages and patches
+
+Command line options:
+
+-q, --quiet -- operate silently
+-e FILE, --entityfile=FILE -- use entity file FILE [{entity_file}]
+-p PATH, --packagepath=PATH -- use PATH for packages [{package_path}]
+-d PATH, --patchpath=PATH -- use PATH for patches [{patch_path}]
+-g, --gensum -- generate crypto hashes for found entities
+-c, --checksum -- check crypto hashes against file
+--delete-bad -- delete files with bad checksums
+-s FILE, --sumfile=FILE -- use crypto hash file FILE [{md5sum_file}]
+-f, --fetch -- fetch missing pkgs
+--dryrun -- test what pkgs would be retrieved
+--show-extras -- report on unused pkgs/patches
+-u FILE, --updatefile=FILE -- use FILE for pkg locations [{update_file}]
+-z URL, --fallback=URL -- use URL as fallback for pkg retrieval
+ [{fallback_url}]
+--prefer-fallback -- prefer the fallback URL over upstream
+-w, --wget -- use wget to fetch files
+-h, --help -- print this text and exit
+"""
+
+import getopt
+import os
+import re
+import sys
+import time
+import traceback
+import urllib
+
+# Handle hashing module (md5 is the old way, hashlib is the new)
+try:
+ import hashlib
+except ImportError:
+ import md5 as hashlib
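+# (the old md5 module also provides an md5() constructor, so the
+# hashlib.md5 reference below resolves under either import)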
+
+# PATH DEFAULTS: the only stuff that should need tuning.
+# These are used as globals throughout
+sys.path.append('./extras')
+try:
+ from entitypath import epaths
+except ImportError:
+ print "Cannot find configuration file <extras/entitypath.py>"
+ print "Probably need to run ./Configure or ./configure"
+ sys.exit(1)
+
+# Behavior defaults: generally should not change, since
+# the command-line options are not set up to be inverses
+# (i.e. if default is changed to true, no way to set back to false)
+generate_sums = None
+check_sums = None
+fetch_files = None
+use_wget = False
+dry_run = None
+noisy = True
+show_extras = None
+delete_bad = False
+prefer_fallback = False
+
+
+def usage(code, msg=""):
+ """Print usage, possibly with an error message preceding"""
+ if msg:
+        print "ERROR: {0}\n".format(msg)  # explicit index needed for python 2.6
+ print __doc__.format(**epaths)
+ sys.exit(code)
+
+
+class Entity(object):
+ """Entity class instantiated for each entity read from the entity file."""
+
+ hashfunc = hashlib.md5
+ BLOCKSIZE = 1024 * 1024
+
+ def __init__(self, name, fname):
+ self.name = name
+ self.fname = fname
+ self.fullpath = ''
+ self.hash = None
+
+ @classmethod
+ def from_re_match(cls, match):
+ """Instantiate an Entity instance using a match object.
+
+ When reading from the entities file, regular expression matching
+ is done to filter. This method takes the match object and unpacks
+ it to pass on to the class constructor.
+ """
+ name, fname = match.groups()
+ return cls(name, fname)
+
+ def __repr__(self):
+        return "Entity('{0}', '{1}')".format(self.name, self.fname)
+
+    def __str__(self):
+        r"""Entity "string" form is "\tfilename", as that's what some things want to print"""
+        return '\t%s' % self.fname
+
+    def dohash(self):
+ """Generate and store hash for this entity's filename"""
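+        # hash in BLOCKSIZE chunks so large tarballs are never read
+        # into memory all at once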
+ with open(self.fullpath, "rb") as f:
+ cksum = self.hashfunc()
+            while True:
+ block = f.read(Entity.BLOCKSIZE)
+ if not block:
+ break
+ cksum.update(block)
+ self.hash = cksum.hexdigest()
+
+ def delete_file(self):
+ """Delete this entity's file, if it exists"""
+ if self.fullpath and os.path.exists(self.fullpath):
+ os.unlink(self.fullpath)
+
+ def running_output(self):
+        r"""Print running output of (front, message) using \b and \r tricks."""
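+        # the backspaces step back over the already-printed front text;
+        # the trailing "\r" plus print's trailing comma keep the cursor
+        # on the same line, so each update overwrites the previous one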
+ if noisy:
+ output = "%s %s" % (self.front, self.message)
+ if len(output) > 79:
+ print "%s%.*s\r" % ("\b" * (len(self.front) + 1), 79, output),
+ else:
+ padding = " " * (79 - len(output))
+ print "%s%s%s\r" % ("\b" * (len(self.front) + 1), output,
+ padding),
+ sys.stdout.flush() # force text to be displayed
+
+ def feedback(self, block_count, block_size, total):
+ """Callback function for urllib.urlretrieve()"""
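+        # urllib calls this hook once per block; so_far / elapsed gives
+        # the average transfer rate, from which time remaining is estimated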
+        so_far = block_count * block_size
+        if not so_far:
+            return
+        if total <= 0:
+            return  # server did not report a size; nothing to estimate
+        pct = so_far * 100L / total
+ # Work out remaining time: need transfer rate first
+ now = time.time()
+ elapsed = now - self.start
+ rate = so_far / elapsed
+ left = (total - so_far) / rate
+ tmp = time.gmtime(left)
+ if tmp[3]:
+ timestr = "%2d:" % tmp[3]
+ else:
+ timestr = "" # skip hours if zero
+ timestr = "%s%02d:%02d remaining" % (timestr, tmp[4], tmp[5])
+ self.message = "%d of %d bytes (%d%%) %s" % (so_far, total, pct,
+ timestr)
+ self.running_output()
+
+ def fetch(self, locations, fallback, destination):
+        """Retrieve the file for an entity using urllib.
+
+        Calls urllib.urlretrieve() on up to three URLs to retrieve the
+        package. Retrieval locations are looked up in 'locations': an
+        entry whose name matches ours supplies a primary and an optional
+        alternate URL. The 'fallback' URL is tried if those fail.
+        Returns 1 on success, 0 on failure.
+        """
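+        # a locations entry is a (name, url, alternate) tuple, e.g. the
+        # hypothetical ('foo', 'http://example.org/pkgs', None)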
+ self.message = "not found"
+ self.front = "%s:" % self.fname
+ for name, path, alternate in locations:
+ if self.name != name:
+ continue
+ pkgpath = "%s/%s" % (destination, self.fname)
+ print self.front,
+ sys.stdout.flush() # force text to be displayed
+
+ if dry_run: # just print a message and bail
+ self.message = "fetch %s from (%s, ...) to %s" % \
+ (self.fname, path, destination)
+ self.running_output()
+ print # line break so previous message isn't overwritten
+ self.message = "skipped"
+ break
+
+            # try each of up to three URLs to fetch the file
+ numtries = 1
+ if prefer_fallback:
+ urls = [fallback, path, alternate]
+ else:
+ urls = [path, alternate, fallback]
+ for url in urls:
+ if not url:
+ continue # skip alternate if not defined
+ to_get = "%s/%s" % (url, self.fname)
+
+ try:
+ self.start = time.time()
+ filename, mime = urllib.urlretrieve(to_get, pkgpath,
+ self.feedback)
+ if mime and mime.gettype() == 'text/html':
+ raise IOError
+ self.message = "completed"
+ break
+ except IOError:
+ sys.stdout.write("\n")
+ traceback.print_exc()
+ sys.stderr.write("try %d failed: %s\n" % (numtries, url))
+ numtries += 1
+ # if it died or was interrupted, remove the partial file
+ if os.path.exists(pkgpath):
+ os.remove(pkgpath)
+ # failed? just try next url in loop
+ # if we didn't "break" out of inner loop, the fetch failed
+ else:
+ self.message = "retrieval failed"
+ break
+
+ self.running_output()
+ print
+ if self.message == "completed":
+ return 1
+ return 0
+
+ def wget(self, locations, fallback, destination):
+        """Retrieve the file for an entity using wget.
+
+        Calls the external wget command on up to three URLs to retrieve
+        the package. Retrieval locations are looked up in 'locations': an
+        entry whose name matches ours supplies a primary and an optional
+        alternate URL. The 'fallback' URL is tried if those fail.
+        Returns 1 on success, 0 on failure.
+        """
+        self.message = "not found"
+        # remember the starting directory up front, so the chdir back at
+        # the end is safe on every exit path (including dry runs)
+        dir_ = os.getcwd()
+ for name, path, alternate in locations:
+ if self.name != name:
+ continue
+ pkgpath = "%s/%s" % (destination, self.fname)
+
+ if dry_run: # just print a message and bail
+ print "fetch %s from (%s, ...) to %s" % \
+ (self.fname, path, destination)
+ self.message = "skipped"
+ break
+
+            # try each of up to three URLs to fetch the file
+            os.chdir(destination)
+ if prefer_fallback:
+ urls = [fallback, path, alternate]
+ else:
+ urls = [path, alternate, fallback]
+ for url in urls:
+ if not url:
+ continue # skip alternate if not defined
+                to_get = "%s/%s" % (url, self.fname)  # URLs always use '/'
+ try:
+ handle = os.popen("wget -c " + to_get)
+ if not handle.close():
+ self.message = "completed"
+ os.chdir(dir_)
+ return 1
+ except (IOError, KeyboardInterrupt):
+                    # if it died or was interrupted, remove the partial
+                    # file. wget's -c (continue) doesn't help: the tool
+                    # can't tell the local copy is partial. Try to fix this ...
+ if os.path.exists(pkgpath):
+ os.remove(pkgpath)
+ # failed? just try next url in loop
+ else:
+ self.message = "retrieval failed"
+ os.chdir(dir_)
+ return 0
+
+
+def parse_packages():
+ """Look for package files in the entities file.
+
+ Desired lines look like the following, but need to skip XML comments:
+ <!ENTITY foo-package "foo-1.2.tar.bz2">
+    Yields a match object (or None) for each non-comment line.
+ """
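+    # e.g. '<!ENTITY foo-package "foo-1.2.tar.bz2">' yields the groups
+    # ('foo', 'foo-1.2.tar.bz2')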
+ with open(epaths['entity_file']) as entities:
+        for line in entities:
+ if not re.search('<!--', line):
+ yield re.search(r'(\S+)-package\s+"([^"]+)"', line)
+
+
+def parse_patches():
+ """Look for patch files in the entities file.
+
+ Desired lines look like the following, but need to skip XML comments:
+ <!ENTITY foo-patch "foo-1.2.patch">
+    Yields a match object (or None) for each non-comment line.
+ """
+ with open(epaths['entity_file']) as entities:
+        for line in entities:
+ if not re.search('<!--', line):
+ yield re.search(r'(\S+)-patch\s+"([^"]+)"', line)
+
+
+def parse_entities():
+ """Extract package and patch entities from the entity files.
+
+ Returns a tuple containing a list of each.
+ """
+ # Hack: we're not parsing the xml, just having the parse functions
+ # apply a regular expression match, which we hope we can write adequately.
+ # Cue stock regular expressions joke ("now you have two problems").
+
+ packages = [
+ Entity.from_re_match(match) for match in parse_packages() if match
+ ]
+ patches = [
+ Entity.from_re_match(match) for match in parse_patches() if match
+ ]
+
+ return (packages, patches)
+
+
+def parse_locations():
+ """Parse the locations file.
+
+ The locations file contains lines of the form:
+ pkgname url alternate_url // comment
+ Returns a list of location tuples after some adjustment.
+ """
+ # alternate_url is used in cases where the package may not stay
+ # in one upstream place over time (e.g., if it moves to an "old"
+ # directory when a new version is released) - if we guess the pattern!
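+    # a typical line might look like (hypothetical values):
+    #   foo http://example.org/releases http://example.org/old // moved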
+
+ with open(epaths['update_file']) as package_file:
+ locations = []
+        for line in package_file:
+            if line.startswith('#'):
+                continue  # '#' starts a comment line
+ bits = line.split()
+ if len(bits) < 3 or bits[1] == "none" or bits[1] == "None":
+ # Skip malformed lines:
+ # not enough fields, or with no retrieve location
+ continue
+ if bits[2] == "none" or bits[2] == "None":
+ bits[2] = None
+ locations.append((bits[0], bits[1], bits[2]))
+ return locations
+
+
+def check_missing(path, collection):
+ """Scan a collection of entities for missing files.
+
+ Generate checksums for found entities, if requested.
+ Returns a tuple with lists of (found, missing) entities.
+ """
+ found = []
+ missing = []
+ for item in collection:
+ # save fullpath in the entity instance since we'll use it again
+ item.fullpath = os.path.join(path, item.fname)
+ if not os.path.exists(item.fullpath):
+ missing.append(item)
+ else:
+ found.append(item)
+ if check_sums or generate_sums:
+ item.dohash()
+ return found, missing
+
+
+def check_extra(path, collection):
+ """Check for files in a path that are not described by entities.
+
+    Creates an Entity instance for each extra file and returns them as a
+    list; this lets us reuse the common print routine (only the filenames
+    matter).
+    """
+    paths = dict((item.fname, item) for item in collection)  # 2.6-safe (no dict comprehensions)
+ notfound = [
+ Entity(None, filename) for filename in os.listdir(path)
+ if filename not in paths
+ ]
+ return notfound
+
+
+def check_checksums(collection, checksums):
+ """Check checksums on entities in collection.
+
+ Checked against 'checksums' dictionary.
+ Returns a tuple of entities with (bad, missing) checksums.
+ """
+ badsums = [
+ entity for entity in collection
+ if entity.fname in checksums and entity.hash != checksums[entity.fname]
+ ]
+ nosums = [entity for entity in collection if entity.fname not in checksums]
+ return badsums, nosums
+
+
+def dump_coll(collection, msg):
+ """Print a collection.
+
+ Use msg and item count for header, then print the file member of each item.
+ """
+ if collection:
+ print msg, len(collection)
+ for item in collection:
+ print item
+
+
+def report(fnd_pkg, fnd_pat, miss_pkg, miss_pat, extras):
+ """Generate package/patch report.
+
+    Global "noisy" controls whether there's any output.
+    Return non-zero on fatal error (missing files).
+ """
+ rv = 0
+ if noisy:
+ print 'Checked entities from {entity_file}'.format(**epaths)
+ print 'Checked packages from {package_path}'.format(**epaths)
+ print 'Checked patches from {patch_path}'.format(**epaths)
+        print "Package entities found:", len(fnd_pkg)
+ dump_coll(miss_pkg, "Missing packages:")
+ print "Patch entities found:", len(fnd_pat)
+ dump_coll(miss_pat, "Missing patches:")
+ if show_extras:
+ dump_coll(extras, "Files not in use:")
+ if miss_pkg or miss_pat:
+ rv = 1
+ return rv
+
+
+def sum_report(bad_pkg, bad_pat, no_pkg, no_pat):
+    """Generate checksum report.
+
+    Global "noisy" controls whether there's any output.
+    Return non-zero on fatal error (bad sums).
+    """
+ rv = 0
+ if noisy:
+ print "Checked checksums against {md5sum_file}".format(**epaths)
+ dump_coll(bad_pkg, "Packages with bad checksums:")
+ dump_coll(bad_pat, "Patches with bad checksums:")
+ dump_coll(no_pkg, "Packages without checksums:")
+ dump_coll(no_pat, "Patches without checksums:")
+ if bad_pkg or bad_pat:
+ rv = 1
+ return rv
+
+
+def fetch_report(retrieved, failed, missing):
+    """Generate file fetch report.
+
+    Global "noisy" controls whether there's any output.
+    Return non-zero on fatal error (failed retrievals).
+    """
+ # "fetch" report split from regular report because we want report
+ # to show, then the fetch attempt, then the fetch report.
+ if noisy:
+ print "Updated %s packages" % retrieved
+ dump_coll(failed, "Packages which failed to retrieve:")
+ dump_coll(missing, "Entities missing:")
+
+ if failed or missing:
+ return 1
+ return 0
+
+
+def readhash():
+    """Read and parse a checksum file.
+
+    Returns a dictionary of sums indexed by filename.
+    """
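+    # the file is in writehash()'s "<hash> <name>" format, so filenames
+    # must not contain whitespace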
+ with open(epaths['md5sum_file']) as sums:
+ checksums = {}
+        for line in sums:
+            try:
+                (cksum, name) = line.split()
+            except ValueError:
+                continue  # skip malformed lines
+            checksums[name] = cksum
+ return checksums
+
+
+def writehash(collection):
+    """Generate a new checksum file from checksums saved in entities."""
+ with open(epaths['md5sum_file'], 'w') as sums:
+ if noisy:
+ print "writing checksums to {md5sum_file}".format(**epaths)
+ for entity in collection:
+ sums.write("%s %s\n" % (entity.hash, entity.fname))
+
+
+def retrieve_packages(missing_packages):
+ """Retrieve packages identified as missing."""
+ locations = parse_locations()
+ retrieved = 0
+ fails, missing = [], []
+ if os.access(epaths['package_path'], os.W_OK):
+ print "Retrieving..."
+        for pkg in missing_packages:
+            fetcher = pkg.wget if use_wget else pkg.fetch
+            if fetcher(locations, epaths['fallback_url'],
+                       epaths['package_path']):
+                retrieved += 1
+            elif pkg.message == 'retrieval failed':
+                fails.append(pkg)
+            elif pkg.message == 'not found':
+                # Should not happen, but we track it so we can catch an
+                # entity that is defined but not listed in package_locations.
+                missing.append(pkg)
+ else:
+ print 'Error: cannot write to package directory ({package_path})'.format(
+ **epaths)
+ return (retrieved, fails, missing)
+
+
+def delete_bad_checksums(collection):
+    """Delete any file with a bad checksum."""
+    for item in collection:
+        print "Deleting " + item.fname
+        item.delete_file()
+
+
+# Main
+# 1. Process command-line arguments
+shortopts = 'qe:p:d:gcs:fu:z:wh'
+longopts = [
+ 'quiet', 'entityfile=', 'packagepath=', 'patchpath=', 'gensum', 'checksum',
+ 'sumfile=', 'fetch', 'updatefile=', 'dryrun', 'fallback=', 'wget', 'help',
+ 'show-extras', 'delete-bad', 'prefer-fallback'
+]
+try:
+ opts, args = getopt.getopt(sys.argv[1:], shortopts, longopts)
+except getopt.error, msg:
+ usage(2, msg)
+
+for opt, arg in opts:
+    if opt in ('--help', '-h'):
+        usage(0)
+    elif opt in ('--entityfile', '-e'):
+        epaths['entity_file'] = arg
+    elif opt in ('--packagepath', '-p'):
+        epaths['package_path'] = arg
+    elif opt in ('--patchpath', '-d'):
+        epaths['patch_path'] = arg
+    elif opt in ('--gensum', '-g'):
+        if check_sums:
+            usage(2, "check-sums and generate-sums are mutually exclusive")
+        generate_sums = 'yes'
+    elif opt in ('--checksum', '-c'):
+        if generate_sums:
+            usage(2, "check-sums and generate-sums are mutually exclusive")
+        check_sums = 'yes'
+    elif opt in ('--sumfile', '-s'):
+        epaths['md5sum_file'] = arg
+    elif opt in ('--fetch', '-f'):
+        fetch_files = 'yes'
+    elif opt in ('--updatefile', '-u'):
+        epaths['update_file'] = arg
+    elif opt == '--dryrun':
+        dry_run = 'yes'
+        fetch_files = 'yes'
+    elif opt in ('--fallback', '-z'):
+        epaths['fallback_url'] = arg
+    elif opt in ('--wget', '-w'):
+        use_wget = True
+    elif opt in ('--quiet', '-q'):
+        noisy = False
+    elif opt == '--show-extras':
+        show_extras = 'yes'
+    elif opt == '--delete-bad':
+        delete_bad = True
+    elif opt == '--prefer-fallback':
+        prefer_fallback = True
+
+# 2. Check directories are okay up front
+# it also saves time to verify the checksum file exists before doing any work
+if not os.path.isdir(epaths['package_path']):
+ usage(1, "Path to packages <{package_path}> is invalid".format(**epaths))
+if not os.path.isdir(epaths['patch_path']):
+ usage(1, "Path to patches <{patch_path}> is invalid".format(**epaths))
+if check_sums and not os.path.isfile(epaths['md5sum_file']):
+ usage(1, "Checksum file <{md5sum_file}> does not exist".format(**epaths))
+
+# 3. Parse the entity file and see if those entities exist
+packages, patches = parse_entities()
+found_packages, missing_packages = check_missing(epaths['package_path'],
+ packages)
+found_patches, missing_patches = check_missing(epaths['patch_path'], patches)
+
+# 4. Scan the package and patch directories for extra files
+if epaths['package_path'] == epaths['patch_path']:
+ extras = check_extra(epaths['package_path'],
+ found_packages + found_patches)
+else: # packages and patches in separate directories
+ extras = check_extra(epaths['package_path'], found_packages)
+ extras += check_extra(epaths['patch_path'], found_patches)
+
+# 5. check checksums, if requested
+# Whether doing sums or not, generate a report of the work to date
+# so do that work first - it makes the ordering a little more sane
+
+exitcode = report(found_packages, found_patches, missing_packages,
+ missing_patches, extras)
+
+if check_sums:
+ checksums = readhash()
+ bad_packages, no_packages = check_checksums(found_packages, checksums)
+ bad_patches, no_patches = check_checksums(found_patches, checksums)
+ exitcode += sum_report(bad_packages, bad_patches, no_packages, no_patches)
+
+# 6. Go fetch missing files if requested, and do another report
+if fetch_files:
+ retrieved = 0
+ fails, missing = [], []
+ if missing_packages:
+ retrieved, fails, missing = retrieve_packages(missing_packages)
+
+ # if we checked checksums, there may also be pkgs with bad cksums:
+ if check_sums and bad_packages:
+ r, f, m = retrieve_packages(bad_packages)
+ retrieved += r
+ fails += f
+ missing += m
+
+ # tell us what happened on the fetch
+ # drop the previous exitcode, otherwise we always exit with an error
+ exitcode = fetch_report(retrieved, fails, missing)
+
+# 7. Delete files with bad checksums if requested.
+# TODO: this looks dubious, what if we found bad files, fetched them,
+# and now the original list isn't all bad?
+if check_sums and delete_bad:
+ delete_bad_checksums(bad_packages + bad_patches)
+
+# 8. Quit non-zero if anything failed - missing entities, or
+# failed retrievals, etc.
+if exitcode:
+ if generate_sums:
+ print "Not writing checksum file, errors found"
+ sys.exit(exitcode)
+
+# 9. Generate a new checksum file, if requested (only if nothing fatal
+# happened above)
+if generate_sums:
+ writehash(found_packages + found_patches)
+
+sys.exit(exitcode)