[Lsb-messages] /var/www/bzr/lsb/devel/appbat r1010: add a 'new' version of entitychecker, for python >= 2.6

Mats Wichmann mats at linuxfoundation.org
Sat Dec 31 02:06:39 UTC 2016


------------------------------------------------------------
revno: 1010
committer: Mats Wichmann <mats at linuxfoundation.org>
branch nick: appbat
timestamp: Fri 2016-12-30 19:06:39 -0700
message:
  add a 'new' version of entitychecker, for python >= 2.6
added:
  extras/entitycheck_new.py
-------------- next part --------------
=== added file 'extras/entitycheck_new.py'
--- a/extras/entitycheck_new.py	1970-01-01 00:00:00 +0000
+++ b/extras/entitycheck_new.py	2016-12-31 02:06:39 +0000
@@ -0,0 +1,660 @@
+#!/usr/bin/env python
+#
+# entitycheck_new.py
+#
+# Check for entities, and optionally retrieve them if missing/out of date.
+# Retrieval is done using a combination of ftp and http, depending
+# on how the files are made available by their providers.
+# If you need to use a proxy, first set 'http_proxy' and 'ftp_proxy'
+# in the environment, using the form: xxx_proxy=host:port
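+# (e.g. http_proxy=proxy.example.com:8080 -- an illustrative host/port)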
+# Calling an external command (wget) is also possible
+# Note: the internal fetcher may go away if wget works out
+#
+"""Entity check
+
+Checks a nALFS-style build area for required packages and patches
+
+Command line options:
+
+-q, --quiet                   -- operate silently
+-e FILE, --entityfile=FILE    -- use entity file FILE [{entity_file}]
+-p PATH, --packagepath=PATH   -- use PATH for packages [{package_path}]
+-d PATH, --patchpath=PATH     -- use PATH for patches [{patch_path}]
+-g, --gensum                  -- generate crypto hashes for found entities
+-c, --checksum                -- check crypto hashes against file
+--delete-bad                  -- delete files with bad checksums
+-s FILE, --sumfile=FILE       -- use crypto hash file FILE [{md5sum_file}]
+-f, --fetch                   -- fetch missing pkgs
+--dryrun                      -- test what pkgs would be retrieved
+--show-extras                 -- report on unused pkgs/patches
+-u FILE, --updatefile=FILE    -- use FILE for pkg locations [{update_file}]
+-z URL, --fallback=URL        -- use URL as fallback for pkg retrieval
+                                 [{fallback_url}]
+--prefer-fallback             -- prefer the fallback URL over upstream
+-w, --wget                    -- use wget to fetch files
+-h, --help                    -- print this text and exit
+"""
+
+import getopt
+import os
+import re
+import sys
+import time
+import traceback
+import urllib
+
+# Handle hashing module (md5 is the old way, hashlib is the new)
+try:
+    import hashlib
+except ImportError:
+    import md5 as hashlib
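+# Both modules expose an md5 constructor, so Entity.hashfunc works either way.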
+
+# PATH DEFAULTS: the only stuff that should need tuning.
+# These are used as globals throughout
+sys.path.append('./extras')
+try:
+    from entitypath import epaths
+except ImportError:
+    print "Cannot find configuration file <extras/entitypath.py>"
+    print "Probably need to run ./Configure or ./configure"
+    sys.exit(1)
+
+# Behavior defaults: generally should not change, since
+# the command-line options are not set up to be inverses
+# (i.e. if default is changed to true, no way to set back to false)
+generate_sums = None
+check_sums = None
+fetch_files = None
+use_wget = False
+dry_run = None
+noisy = True
+show_extras = None
+delete_bad = False
+prefer_fallback = False
+
+
+def usage(code, msg=""):
+    """Print usage, possibly with an error message preceding"""
+    if msg:
+        print "ERROR: {}\n".format(msg)
+    print __doc__.format(**epaths)
+    sys.exit(code)
+
+
+class Entity(object):
+    """Entity class instantiated for each entity read from the entity file."""
+
+    hashfunc = hashlib.md5
+    BLOCKSIZE = 1024 * 1024  # hash in 1 MiB chunks to bound memory use
+
+    def __init__(self, name, fname):
+        self.name = name
+        self.fname = fname
+        self.fullpath = ''
+        self.hash = None
+
+    @classmethod
+    def from_re_match(cls, match):
+        """Instantiate an Entity instance using a match object.
+
+        When reading from the entities file, regular expression matching
+        is done to filter. This method takes the match object and unpacks
+        it to pass on to the class constructor.
+        """
+        name, fname = match.groups()
+        return cls(name, fname)
+
+    def __repr__(self):
+        return "Entity('{}', '{}')".format(self.name, self.fname)
+
+    def __str__(self):
+        """Entity class "string" is "\tfilename", as that's what some things want to print"""
+        return '\t%s' % self.fname
+
+
+    def dohash(self):
+        """Generate and store hash for this entity's filename"""
+        with open(self.fullpath, "rb") as f:
+            cksum = self.hashfunc()
+            while True:
+                block = f.read(Entity.BLOCKSIZE)
+                if not block:
+                    break
+                cksum.update(block)
+        self.hash = cksum.hexdigest()
+
+    def delete_file(self):
+        """Delete this entity's file, if it exists"""
+        if self.fullpath and os.path.exists(self.fullpath):
+            os.unlink(self.fullpath)
+
+    def running_output(self):
+        """Print running output of (front, message) using \b and \r tricks."""
+        if noisy:
+            output = "%s %s" % (self.front, self.message)
+            if len(output) > 79:
+                print "%s%.*s\r" % ("\b" * (len(self.front) + 1), 79, output),
+            else:
+                padding = " " * (79 - len(output))
+                print "%s%s%s\r" % ("\b" * (len(self.front) + 1), output,
+                                    padding),
+            sys.stdout.flush()  # force text to be displayed
+
+    def feedback(self, block_count, block_size, total):
+        """Callback function for urllib.urlretrieve()"""
+        so_far = block_count * block_size
+        if not so_far:
+            return
+        pct = so_far * 100L / total
+        # Work out remaining time: need transfer rate first
+        now = time.time()
+        elapsed = now - self.start
+        rate = so_far / elapsed
+        left = (total - so_far) / rate
+        tmp = time.gmtime(left)
+        if tmp[3]:
+            timestr = "%2d:" % tmp[3]
+        else:
+            timestr = ""  # skip hours if zero
+        timestr = "%s%02d:%02d remaining" % (timestr, tmp[4], tmp[5])
+        self.message = "%d of %d bytes (%d%%) %s" % (so_far, total, pct,
+                                                     timestr)
+        self.running_output()
+
+    def fetch(self, locations, fallback, destination):
+        """Retrieve the file for an entity using urllib.
+
+        Calls urllib.urlretrieve() up to three times to retrieve the
+        package. Retrieval locations are looked up in 'locations':
+        an entry's name must match this entity's name, and the entry
+        carries a primary and an optional secondary URL for the
+        package. The 'fallback' URL is tried if those fail.
+        """
+        self.message = "not found"
+        self.front = "%s:" % self.fname
+        for name, path, alternate in locations:
+            if self.name != name:
+                continue
+            pkgpath = "%s/%s" % (destination, self.fname)
+            print self.front,
+            sys.stdout.flush()  # force text to be displayed
+
+            if dry_run:  # just print a message and bail
+                self.message = "fetch %s from (%s, ...) to %s" % \
+                               (self.fname, path, destination)
+                self.running_output()
+                print  # line break so previous message isn't overwritten
+                self.message = "skipped"
+                break
+
+            # try up to three times to fetch the file
+            numtries = 1
+            if prefer_fallback:
+                urls = [fallback, path, alternate]
+            else:
+                urls = [path, alternate, fallback]
+            for url in urls:
+                if not url:
+                    continue  # skip alternate if not defined
+                to_get = "%s/%s" % (url, self.fname)
+
+                try:
+                    self.start = time.time()
+                    filename, mime = urllib.urlretrieve(to_get, pkgpath,
+                                                        self.feedback)
+                    if mime and mime.gettype() == 'text/html':
+                        raise IOError
+                    self.message = "completed"
+                    break
+                except IOError:
+                    sys.stdout.write("\n")
+                    traceback.print_exc()
+                    sys.stderr.write("try %d failed: %s\n" % (numtries, url))
+                    numtries += 1
+                    # if it died or was interrupted, remove the partial file
+                    if os.path.exists(pkgpath):
+                        os.remove(pkgpath)
+                # failed? just try next url in loop
+                # if we didn't "break" out of inner loop, the fetch failed
+            else:
+                self.message = "retrieval failed"
+            break
+
+        self.running_output()
+        print
+        if self.message == "completed":
+            return 1
+        return 0
+
+    def wget(self, locations, fallback, destination):
+        """Retrieve the file for an entity using wget.
+
+        Calls external command wget up to three times to retrieve the
+        package. Retrieval locations are looked up in 'locations':
+        an entry's name must match this entity's name, and the entry
+        carries a primary and an optional secondary URL for the
+        package. The 'fallback' URL is tried if those fail.
+        """
+        self.message = "not found"
+        for name, path, alternate in locations:
+            if self.name != name:
+                continue
+            pkgpath = "%s/%s" % (destination, self.fname)
+
+            if dry_run:  # just print a message and bail
+                print "fetch %s from (%s, ...) to %s" % \
+                               (self.fname, path, destination)
+                self.message = "skipped"
+                break
+
+            # try up to three times to fetch the file
+            dir_ = os.getcwd()
+            os.chdir(destination)
+            if prefer_fallback:
+                urls = [fallback, path, alternate]
+            else:
+                urls = [path, alternate, fallback]
+            for url in urls:
+                if not url:
+                    continue  # skip alternate if not defined
+                to_get = "%s/%s" % (url, self.fname)  # URLs use '/', never os.sep
+                try:
+                    handle = os.popen("wget -c " + to_get)
+                    if not handle.close():
+                        self.message = "completed"
+                        os.chdir(dir_)
+                        return 1
+                except (IOError, KeyboardInterrupt):
+                    # if it died or was interrupted, remove the partial file
+                    # wget's restartability doesn't help, tool won't detect
+                    # we're a partial file. Try to fix this ...
+                    if os.path.exists(pkgpath):
+                        os.remove(pkgpath)
+                # failed? just try next url in loop
+            else:
+                self.message = "retrieval failed"
+                os.chdir(dir_)
+                return 0
+        return 0  # no matching location entry, or dry run: nothing retrieved
+
+
+def parse_packages():
+    """Look for package files in the entities file.
+
+    Desired lines look like the following, but need to skip XML comments:
+    <!ENTITY foo-package "foo-1.2.tar.bz2">
+    Yields a match object (or None) for each non-comment line.
+    """
+    with open(epaths['entity_file']) as entities:
+        for line in entities.readlines():
+            if not re.search('<!--', line):
+                yield re.search(r'(\S+)-package\s+"([^"]+)"', line)
+
+
+def parse_patches():
+    """Look for patch files in the entities file.
+
+    Desired lines look like the following, but need to skip XML comments:
+    <!ENTITY foo-patch "foo-1.2.patch">
+    Yields a match object (or None) for each non-comment line.
+    """
+    with open(epaths['entity_file']) as entities:
+        for line in entities.readlines():
+            if not re.search('<!--', line):
+                yield re.search(r'(\S+)-patch\s+"([^"]+)"', line)
+
+
+def parse_entities():
+    """Extract package and patch entities from the entity files.
+
+    Returns a tuple containing a list of each.
+    """
+    # Hack: we're not parsing the xml, just having the parse functions
+    # apply a regular expression match, which we hope we can write adequately.
+    # Cue stock regular expressions joke ("now you have two problems").
+
+    packages = [
+        Entity.from_re_match(match) for match in parse_packages() if match
+    ]
+    patches = [
+        Entity.from_re_match(match) for match in parse_patches() if match
+    ]
+
+    return (packages, patches)
+
+
+def parse_locations():
+    """Parse the locations file.
+
+    The locations file contains lines of the form:
+       pkgname url alternate_url // comment
+    Returns a list of location tuples after some adjustment.
+    """
+    # alternate_url is used in cases where the package may not stay
+    # in one upstream place over time (e.g., if it moves to an "old"
+    # directory when a new version is released) - if we guess the pattern!
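+    # An illustrative (made-up) entry:
+    #   foo http://example.com/pub http://example.com/pub/old // foo library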
+
+    with open(epaths['update_file']) as package_file:
+        locations = []
+        for line in package_file.readlines():
+            if line[0] == '#':
+                continue  # '#' is used for an initial comment
+            bits = line.split()
+            if len(bits) < 3 or bits[1] == "none" or bits[1] == "None":
+                # Skip malformed lines:
+                # not enough fields, or with no retrieve location
+                continue
+            if bits[2] == "none" or bits[2] == "None":
+                bits[2] = None
+            locations.append((bits[0], bits[1], bits[2]))
+    return locations
+
+
+def check_missing(path, collection):
+    """Scan a collection of entities for missing files.
+
+    Generate checksums for found entities, if requested.
+    Returns a tuple with lists of (found, missing) entities.
+    """
+    found = []
+    missing = []
+    for item in collection:
+        # save fullpath in the entity instance since we'll use it again
+        item.fullpath = os.path.join(path, item.fname)
+        if not os.path.exists(item.fullpath):
+            missing.append(item)
+        else:
+            found.append(item)
+            if check_sums or generate_sums:
+                item.dohash()
+    return found, missing
+
+
+def check_extra(path, collection):
+    """Check for files in a path that are not described by entities.
+
+    Creates an entity instance for each and returns a list (this
+    is to be able to use a common print routine, only the names matter)
+    """
+    paths = dict((item.fname, item) for item in collection)  # dict comprehensions need Python 2.7
+    notfound = [
+        Entity(None, filename) for filename in os.listdir(path)
+        if filename not in paths
+    ]
+    return notfound
+
+
+def check_checksums(collection, checksums):
+    """Check checksums on entities in collection.
+
+    Checked against 'checksums' dictionary.
+    Returns a tuple of entities with (bad, missing) checksums.
+    """
+    badsums = [
+        entity for entity in collection
+        if entity.fname in checksums and entity.hash != checksums[entity.fname]
+    ]
+    nosums = [entity for entity in collection if entity.fname not in checksums]
+    return badsums, nosums
+
+
+def dump_coll(collection, msg):
+    """Print a collection.
+
+    Use msg and item count for header, then print the file member of each item.
+    """
+    if collection:
+        print msg, len(collection)
+        for item in collection:
+            print item
+
+
+def report(fnd_pkg, fnd_pat, miss_pkg, miss_pat, extras):
+    """Generate package/patch report.
+
+    Global "noisy" controls whether there's any output
+    Return non-zero on fatal error (missing files)
+    """
+    rv = 0
+    if noisy:
+        print 'Checked entities from {entity_file}'.format(**epaths)
+        print 'Checked packages from {package_path}'.format(**epaths)
+        print 'Checked patches from {patch_path}'.format(**epaths)
+        print "Package entities found: ", len(fnd_pkg)
+        dump_coll(miss_pkg, "Missing packages:")
+        print "Patch entities found:", len(fnd_pat)
+        dump_coll(miss_pat, "Missing patches:")
+        if show_extras:
+            dump_coll(extras, "Files not in use:")
+    if miss_pkg or miss_pat:
+        rv = 1
+    return rv
+
+
+def sum_report(bad_pkg, bad_pat, no_pkg, no_pat):
+    """Generate checksum report
+    Global "noisy" controls whether there's any output
+    Return non-zero on fatal error (bad sums)
+    """
+    rv = 0
+    if noisy:
+        print "Checked checksums against {md5sum_file}".format(**epaths)
+        dump_coll(bad_pkg, "Packages with bad checksums:")
+        dump_coll(bad_pat, "Patches with bad checksums:")
+        dump_coll(no_pkg, "Packages without checksums:")
+        dump_coll(no_pat, "Patches without checksums:")
+    if bad_pkg or bad_pat:
+        rv = 1
+    return rv
+
+
+def fetch_report(retrieved, failed, missing):
+    """Generate file fetch report.
+    Global "noisy" controls whether there's any output
+    Return non-zero on fatal error (failed retrievals).
+    """
+    # "fetch" report split from regular report because we want report
+    # to show, then the fetch attempt, then the fetch report.
+    if noisy:
+        print "Updated %s packages" % retrieved
+        dump_coll(failed, "Packages which failed to retrieve:")
+        dump_coll(missing, "Entities missing:")
+
+    if failed or missing:
+        return 1
+    return 0
+
+
+def readhash():
+    """Read and parse a checksum file.
+    Returns a dictionary of sums indexed by filename.
+    """
+    with open(epaths['md5sum_file']) as sums:
+        checksums = {}
+        for line in sums.readlines():
+            (cksum, name) = line.split()
+            checksums[name] = cksum
+    return checksums
+
+
+def writehash(collection):
+    """Generate a new checksum file from checksums saved in entities. """
+    with open(epaths['md5sum_file'], 'w') as sums:
+        if noisy:
+            print "writing checksums to {md5sum_file}".format(**epaths)
+        for entity in collection:
+            sums.write("%s  %s\n" % (entity.hash, entity.fname))
+
+
+def retrieve_packages(missing_packages):
+    """Retrieve packages identified as missing."""
+    locations = parse_locations()
+    retrieved = 0
+    fails, missing = [], []
+    if os.access(epaths['package_path'], os.W_OK):
+        print "Retrieving..."
+        for pkg in missing_packages:
+            fetcher = pkg.wget if use_wget else pkg.fetch
+            if fetcher(locations, epaths['fallback_url'],
+                       epaths['package_path']):
+                retrieved += 1
+            else:
+                if pkg.message == 'retrieval failed':
+                    fails.append(pkg)
+                if pkg.message == 'not found':
+                    # Should not happen, but we track it so we can
+                    # catch an entity that is defined but not listed
+                    # in package_locations.
+                    missing.append(pkg)
+    else:
+        print 'Error: cannot write to package directory ({package_path})'.format(
+            **epaths)
+    return (retrieved, fails, missing)
+
+
+def delete_bad_checksums(collection):
+    """Delete any file with a bad checksum."""
+    for item in collection:
+        print "Deleting " + item.fname
+        item.delete_file()
+
+
+# Main
+# 1. Process command-line arguments
+shortopts = 'qe:p:d:gcs:fu:z:wh'
+longopts = [
+    'quiet', 'entityfile=', 'packagepath=', 'patchpath=', 'gensum', 'checksum',
+    'sumfile=', 'fetch', 'updatefile=', 'dryrun', 'fallback=', 'wget', 'help',
+    'show-extras', 'delete-bad', 'prefer-fallback'
+]
+try:
+    opts, args = getopt.getopt(sys.argv[1:], shortopts, longopts)
+except getopt.error, msg:
+    usage(2, msg)
+
+if opts:
+    for opt, arg in opts:
+        if opt in ('--help', '-h'):
+            usage(0)
+        if opt in ('--entityfile', '-e'):
+            epaths['entity_file'] = arg
+        if opt in ('--packagepath', '-p'):
+            epaths['package_path'] = arg
+        if opt in ('--patchpath', '-d'):
+            epaths['patch_path'] = arg
+        if opt in ('--gensum', '-g'):
+            if check_sums:
+                usage(2, "check-sums and generate-sums are mutually exclusive")
+            generate_sums = 'yes'
+        if opt in ('--checksum', '-c'):
+            if generate_sums:
+                usage(2, "check-sums and generate-sums are mutually exclusive")
+            check_sums = 'yes'
+        if opt in ('--sumfile', '-s'):
+            epaths['md5sum_file'] = arg
+        if opt in ('--fetch', '-f'):
+            fetch_files = 'yes'
+        if opt in ('--updatefile', '-u'):
+            epaths['update_file'] = arg
+        if opt == '--dryrun':
+            dry_run = 'yes'
+            fetch_files = 'yes'
+        if opt in ('--fallback', '-z'):
+            epaths['fallback_url'] = arg
+        if opt in ('--wget', '-w'):
+            use_wget = True
+        if opt in ('--quiet', '-q'):
+            noisy = False
+        if opt == '--show-extras':
+            show_extras = 'yes'
+        if opt == '--delete-bad':
+            delete_bad = True
+        if opt == '--prefer-fallback':
+            prefer_fallback = True
+
+# 2. Check that the directories are okay up front;
+# verifying the checksum file exists now also saves wasted work later
+if not os.path.isdir(epaths['package_path']):
+    usage(1, "Path to packages <{package_path}> is invalid".format(**epaths))
+if not os.path.isdir(epaths['patch_path']):
+    usage(1, "Path to patches <{patch_path}> is invalid".format(**epaths))
+if check_sums and not os.path.isfile(epaths['md5sum_file']):
+    usage(1, "Checksum file <{md5sum_file}> does not exist".format(**epaths))
+
+# 3. Parse the entity file and see if those entities exist
+packages, patches = parse_entities()
+found_packages, missing_packages = check_missing(epaths['package_path'],
+                                                 packages)
+found_patches, missing_patches = check_missing(epaths['patch_path'], patches)
+
+# 4. Scan the package and patch directories for extra files
+if epaths['package_path'] == epaths['patch_path']:
+    extras = check_extra(epaths['package_path'],
+                         found_packages + found_patches)
+else:  # packages and patches in separate directories
+    extras = check_extra(epaths['package_path'], found_packages)
+    extras += check_extra(epaths['patch_path'], found_patches)
+
+# 5. check checksums, if requested
+# Whether doing sums or not, generate a report of the work to date
+# so do that work first - it makes the ordering a little more sane
+
+exitcode = report(found_packages, found_patches, missing_packages,
+                  missing_patches, extras)
+
+if check_sums:
+    checksums = readhash()
+    bad_packages, no_packages = check_checksums(found_packages, checksums)
+    bad_patches, no_patches = check_checksums(found_patches, checksums)
+    exitcode += sum_report(bad_packages, bad_patches, no_packages, no_patches)
+
+# 6. Go fetch missing files if requested, and do another report
+if fetch_files:
+    retrieved = 0
+    fails, missing = [], []
+    if missing_packages:
+        retrieved, fails, missing = retrieve_packages(missing_packages)
+
+    # if we checked checksums, there may also be pkgs with bad cksums:
+    if check_sums and bad_packages:
+        r, f, m = retrieve_packages(bad_packages)
+        retrieved += r
+        fails += f
+        missing += m
+
+    # tell us what happened on the fetch
+    # drop the previous exitcode, otherwise we always exit with an error
+    exitcode = fetch_report(retrieved, fails, missing)
+
+# 7. Delete files with bad checksums if requested.
+# TODO: this looks dubious, what if we found bad files, fetched them,
+# and now the original list isn't all bad?
+if check_sums and delete_bad:
+    delete_bad_checksums(bad_packages + bad_patches)
+
+# 8. Quit non-zero if anything failed - missing entities, or
+# failed retrievals, etc.
+if exitcode:
+    if generate_sums:
+        print "Not writing checksum file, errors found"
+    sys.exit(exitcode)
+
+# 9. Generate a new checksum file, if requested (only if nothing fatal
+# happened above)
+if generate_sums:
+    writehash(found_packages + found_patches)
+
+sys.exit(exitcode)


