Mercurial > hgrepos > Python > apps > py-cutils
view cutils/treesum.py @ 266:0add8276e6b8
treesum: Handle errors like broken symlinks properly
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Tue, 18 Feb 2025 12:39:04 +0100 |
| parents | c3d6599c1b5e |
| children | b9aa65a30b4c |
line wrap: on
line source
# -*- coding: utf-8 -*- # :- # :Copyright: (c) 2020-2025 Franz Glasner # :License: BSD-3-Clause # :- r"""Generate and verify checksums for directory trees. """ from __future__ import print_function, absolute_import __all__ = [] import argparse import base64 import binascii import collections import datetime import errno import logging import os import re import stat import sys import time from . import (__version__, __revision__) from . import util from .util import cm from .util import digest from .util import walk from .util.crc32 import crc32 def main(argv=None): def _populate_generate_arguments(gp): """Use to populate command aliases. This is because :class:`argparse.ArgumentParser` does not support them for all supported Python versions. """ gp.add_argument( "--algorithm", "-a", action="store", type=util.argv2algo, help="1 (aka sha1), 224, 256 (aka sha256), 384, 512 (aka sha512), " "3 (alias for sha3-512), 3-224, 3-256, 3-384, 3-512, " "blake2b, blake2b-256, blake2s, " "blake2 (alias for blake2b), " "blake2-256 (alias for blake2b-256), " "md5. " "The default depends on availability in hashlib: " "blake2b-256, sha256 or sha1.") gp.add_argument( "--append-output", action="store_true", dest="append_output", help="Append to the output file instead of overwriting it.") gp.add_argument( "--base64", action="store_true", help="Output checksums in base64 notation, not hexadecimal " "(OpenBSD).") gp.add_argument( "--comment", action="append", default=[], help="Put given comment COMMENT into the output as \"COMMENT\". " "Can be given more than once.") gp.add_argument( "--follow-directory-symlinks", "-l", action=SymlinkAction, const="follow-directory-symlinks", default=FollowSymlinkConfig(False, False, True), dest="follow_symlinks", help="""Follow symbolic links to directories when walking a directory tree. Augments --physical and -p.""") gp.add_argument( "--follow-file-symlinks", action=SymlinkAction, const="follow-file-symlinks", default=FollowSymlinkConfig(False, False, True), dest="follow_symlinks", help="""Follow symbolic links to files when walking a directory tree. Augments --physical.""") gp.add_argument( "--full-mode", action="store_true", dest="metadata_full_mode", help="Consider all mode bits as returned from stat(2) when " "computing directory digests. " "Note that mode bits on symbolic links itself are not " "considered.") gp.add_argument( "--generator", choices=("normal", "full", "none"), default="normal", help="""Put a `GENERATOR' line into the output. `full' prints full Python and OS/platform version information, `normal' prints just whether Python 2 or Python 3 is used, and `none' suppresses the output completely. The default is `normal'.""") gp.add_argument( "--logical", "-L", action=SymlinkAction, dest="follow_symlinks", const=FollowSymlinkConfig(True, True, True), help="""Follow symbolic links everywhere: on command line arguments and -- while walking -- directory and file symbolic links. Overwrites any other symlink related options (--physical,-p, no-follow-directory-symlinks, no-follow-file-symlinks, et al.). """) gp.add_argument( "--minimal", nargs="?", const="", default=None, metavar="TAG", help="Produce minimal output only. If a TAG is given and not " "empty use it as the leading \"ROOT (<TAG>)\" output.") gp.add_argument( "--mmap", action="store_true", dest="mmap", default=None, help="Use mmap if available. Default is to determine " "automatically from the filesize.") gp.add_argument( "--mode", action="store_true", dest="metadata_mode", help="Consider the permission bits of stat(2) using S_IMODE (i.e. " "all bits without the filetype bits) when " "computing directory digests. Note that mode bits on " "symbolic links itself are not considered.") gp.add_argument( "--mtime", action="store_true", dest="metadata_mtime", help="Consider the mtime of files (non-directories) when " "generating digests for directories. Digests for files are " "not affected.") gp.add_argument( "--no-follow-directory-symlinks", action=SymlinkAction, const="no-follow-directory-symlinks", dest="follow_symlinks", help="""Do not follow symbolic links to directories when walking a directory tree. Augments --logical.""") gp.add_argument( "--no-follow-file-symlinks", action=SymlinkAction, const="no-follow-file-symlinks", dest="follow_symlinks", help="""Dont follow symbolic links to files when walking a directory tree. Augments --logical and -p.""") gp.add_argument( "--no-mmap", action="store_false", dest="mmap", default=None, help="Dont use mmap.") gp.add_argument( "--output", "-o", action="store", metavar="OUTPUT", help="Put the checksum into given file. " "If not given or if it is given as `-' then stdout is used.") gp.add_argument( "--physical", "-P", action=SymlinkAction, dest="follow_symlinks", const=FollowSymlinkConfig(False, False, False), help="""Do not follow any symbolic links whether they are given on the command line or when walking the directory tree. Overwrites any other symlink related options (--logical, -p, follow-directory-symlinks, follow-file-symlinks, et al.). This is the default.""") gp.add_argument( "-p", action=SymlinkAction, dest="follow_symlinks", const=FollowSymlinkConfig(False, False, True), help="""Do not follow any symbolic links to directories, whether they are given on the command line or when walking the directory tree, but follow symbolic links to files. Overwrites any other symlink related options (--logical, --physical, follow-directory-symlinks, no-follow-file-symlinks, et al.). This is the default.""") gp.add_argument( "--print-size", action="store_true", help="""Print the size of a file or the accumulated sizes of directory content into the output also. The size is not considered when computing digests. For symbolic links the size is not printed also.""") gp.add_argument( "--size-only", action="store_true", help="""Print only the size of files and for each directory its accumulated directory size. Digests are not computed.""") gp.add_argument( "--utf8", "--utf-8", action="store_true", help="""Encode all file paths using UTF-8 instead of the filesystem encoding. Add some error tag into the path if it cannot representated in Unicode cleanly.""") gp.add_argument( "directories", nargs="*", metavar="DIRECTORY") def _populate_info_arguments(ip): ip.add_argument( "--last", action="store_true", dest="print_only_last_block", help="Print only the last block of every given input file") ip.add_argument( "digest_files", nargs="+", metavar="TREESUM-DIGEST-FILE") parser = argparse.ArgumentParser( description="Generate and verify checksums for directory trees.", fromfile_prefix_chars='@', add_help=False) # # Global options for all sub-commands. # In a group because this allows a customized title. # gparser = parser.add_argument_group(title="Global Options") gparser.add_argument( "--debug", action="store_true", help="Activate debug logging to stderr") gparser.add_argument( "-v", "--version", action="version", version="%s (rv:%s)" % (__version__, __revision__), help="Show program's version number and exit") gparser.add_argument( "-h", "--help", action="help", help="Show this help message and exit") # # Subcommands # subparsers = parser.add_subparsers( dest="subcommand", title="Commands", description="This tool uses subcommands. " "To see detailed help for a specific subcommand use " "the -h/--help option after the subcommand name. " "A list of valid commands and their short descriptions " "is listed below:", metavar="COMMAND") genparser = subparsers.add_parser( "generate", help="Generate checksums for directory trees", description="Generate checksums for directory trees.") _populate_generate_arguments(genparser) # And an alias for "generate" genparser2 = subparsers.add_parser( "gen", help="Alias for \"generate\"", description="Generate checksums for directory trees. " "This is an alias to \"generate\".") _populate_generate_arguments(genparser2) infoparser = subparsers.add_parser( "info", help="Print some information from given treesum digest file", description="""Print some informations from given treesum digest files to stdout.""" ) _populate_info_arguments(infoparser) hparser = subparsers.add_parser( "help", help="Show this help message or a subcommand's help and exit", description="Show this help message or a subcommand's help and exit.") hparser.add_argument("help_command", nargs='?', metavar="COMMAND") vparser = subparsers.add_parser( "version", help="Show the program's version number and exit", description="Show the program's version number and exit.") # Parse leniently to just check for "version" and/or help opts, _dummy = parser.parse_known_args(args=argv) if opts.subcommand == "version": print("%s (rv:%s)" % (__version__, __revision__), file=sys.stdout) return 0 if opts.subcommand == "help": if not opts.help_command: parser.print_help() else: if opts.help_command == "generate": genparser.print_help() elif opts.help_command == "gen": genparser2.print_help() elif opts.help_command == "info": infoparser.print_help() elif opts.help_command == "version": vparser.print_help() elif opts.help_command == "help": hparser.print_help() else: parser.print_help() return 0 # Reparse strictly opts = parser.parse_args(args=argv) # Minimal logging -- just for debugging - not for more "normal" use logging.basicConfig( level=logging.DEBUG if opts.debug else logging.WARNING, stream=sys.stderr, format="[%(asctime)s][%(levelname)s][%(process)d:%(name)s] %(message)s" ) logging.captureWarnings(True) return treesum(opts) FollowSymlinkConfig = collections.namedtuple("FollowSymlinkConfig", ["command_line", "directory", "file"]) class SymlinkAction(argparse.Action): """`type' is fixed here. `dest' is a tuple with three items: 1. follow symlinks on the command line 2. follow directory symlinks while walking 3. follow file symlinks while walking (not yet implemented) """ def __init__(self, *args, **kwargs): if "nargs" in kwargs: raise ValueError("`nargs' not allowed") if "type" in kwargs: raise ValueError("`type' not allowed") c = kwargs.get("const", None) if c is None: raise ValueError("a const value is needed") if (not isinstance(c, FollowSymlinkConfig) and c not in ("follow-directory-symlinks", "no-follow-directory-symlinks", "follow-file-symlinks", "no-follow-file-symlinks")): raise ValueError( "invalid value for the `const' configuration value") default = kwargs.get("default", None) if (default is not None and not isinstance(default, FollowSymlinkConfig)): raise TypeError("invalid type for `default'") kwargs["nargs"] = 0 super(SymlinkAction, self).__init__(*args, **kwargs) def __call__(self, parser, namespace, values, option_string=None): curval = getattr(namespace, self.dest, None) if curval is None: curval = FollowSymlinkConfig(False, False, True) if isinstance(self.const, FollowSymlinkConfig): curval = self.const else: if self.const == "follow-directory-symlinks": curval = FollowSymlinkConfig( curval.command_line, True, curval.file) elif self.const == "no-follow-directory-symlinks": curval = FollowSymlinkConfig( curval.command_line, False, curval.file) elif self.const == "follow-file-symlinks": curval = FollowSymlinkConfig( curval.command_line, curval.directory, True) elif self.const == "no-follow-file-symlinks": curval = FollowSymlinkConfig( curval.command_line, curval.directory, False) else: assert False, "Implementation error: not yet implemented" # Not following symlinks to files is not yet supported: reset to True # if not curval.file: # curval = FollowSymlinkConfig( # curval.command_line, curval.directory, True) # logging.warning("Coercing options to `follow-file-symlinks'") setattr(namespace, self.dest, curval) def gen_generate_opts(directories=[], algorithm=util.default_algotag(), append_output=False, base64=False, comment=[], follow_symlinks=FollowSymlinkConfig(False, False, False), full_mode=False, generator="normal", logical=None, minimal=None, mode=False, mmap=None, mtime=False, output=None, print_size=False, size_only=False, utf8=False): if not isinstance(follow_symlinks, FollowSymlinkConfig): raise TypeError("`follow_symlinks' must be a FollowSymlinkConfig") # Not following symlinks to files is not yet supported: reset to True # if not follow_symlinks.file: # follow_symlinks = follow_symlinks._make([follow_symlinks.command_line, # follow_symlinks.directory, # True]) # logging.warning("Coercing to follow-symlinks-file") opts = argparse.Namespace( directories=directories, algorithm=util.argv2algo(algorithm), append_output=append_output, base64=base64, comment=comment, follow_symlinks=follow_symlinks, generator=generator, logical=logical, minimal=minimal, mmap=mmap, metadata_full_mode=full_mode, metadata_mode=mode, metadata_mtime=mtime, output=output, print_size=print_size, size_only=size_only, utf8=utf8) return opts def gen_info_opts(digest_files=[], last=False): opts = argparse.Namespace( digest_files=digest_files, print_only_last_block=last) return opts def treesum(opts): # XXX TBD: opts.check and opts.checklist (as in shasum.py) if opts.subcommand in ("generate", "gen"): return generate_treesum(opts) elif opts.subcommand == "info": return print_treesum_digestfile_infos(opts) else: raise RuntimeError( "command `{}' not yet handled".format(opts.subcommand)) def generate_treesum(opts): # Provide defaults if not opts.algorithm: opts.algorithm = util.argv2algo(util.default_algotag()) if not opts.directories: opts.directories.append(".") if opts.output is None or opts.output == "-": if hasattr(sys.stdout, "buffer"): out_cm = cm.nullcontext(sys.stdout.buffer) else: out_cm = cm.nullcontext(sys.stdout) else: if opts.append_output: out_cm = open(opts.output, "ab") else: out_cm = open(opts.output, "wb") out_cm = CRC32Output(out_cm) with out_cm as outfp: writer = TreesumWriter(outfp) for d in opts.directories: V1DirectoryTreesumGenerator( opts.algorithm, opts.mmap, opts.base64, opts.follow_symlinks, opts.generator, opts.metadata_mode, opts.metadata_full_mode, opts.metadata_mtime, opts.size_only, opts.print_size, opts.utf8, minimal=opts.minimal).generate( writer, d, comment=opts.comment) class V1DirectoryTreesumGenerator(object): def __init__(self, algorithm, use_mmap, use_base64, follow_symlinks, with_generator, with_metadata_mode, with_metadata_full_mode, with_metadata_mtime, size_only, print_size, utf8_mode, minimal=None,): super(V1DirectoryTreesumGenerator, self).__init__() self._algorithm = algorithm self._use_mmap = use_mmap self._use_base64 = use_base64 self._follow_symlinks = follow_symlinks self._with_generator = with_generator self._with_metadata_mode = with_metadata_mode self._with_metadata_full_mode = with_metadata_full_mode self._with_metadata_mtime = with_metadata_mtime self._size_only = size_only self._print_size = print_size self._utf8_mode = utf8_mode self._minimal = minimal def generate(self, writer, root, comment=None): """ :param outfp: a *binary* file with a "write()" and a "flush()" method """ self._writer = writer self._writer.start("1") self._writer.write_fsencoding(util.n(walk.getfsencoding().upper())) self._writer.flush() if self._with_generator == "none": pass # do nothing elif self._with_generator == "normal": self._writer.write_generator("PY2" if util.PY2 else "PY3") elif self._with_generator == "full": import platform info = "%s %s, %s" % (platform.python_implementation(), platform.python_version(), platform.platform()) self._writer.write_generator(info) else: raise NotImplementedError( "not implemented: %s" % (self._with_generator,)) # # Note: Given non-default flags that are relevant for # directory traversal. # flags = [] if self._with_metadata_full_mode: flags.append("with-metadata-fullmode") elif self._with_metadata_mode: flags.append("with-metadata-mode") if self._with_metadata_mtime: flags.append("with-metadata-mtime") flags.append("follow-symlinks-commandline" if self._follow_symlinks.command_line else "no-follow-symlinks-commandline") flags.append("follow-symlinks-directory" if self._follow_symlinks.directory else "no-follow-symlinks-directory") flags.append("follow-symlinks-file" if self._follow_symlinks.file else "no-follow-symlinks-file") if self._size_only: flags.append("size-only") flags.append("utf8-encoding" if self._utf8_mode else "fs-encoding") if self._print_size: flags.append("print-size") self._writer.write_flags(flags) if self._minimal is None: # Write execution timestamps in POSIX epoch and ISO format ts = int(time.time()) self._writer.write_timestamp(ts) ts = (datetime.datetime.utcfromtimestamp(ts)).isoformat("T") self._writer.write_isotimestamp(ts) if comment: for line in comment: self._writer.write_comment(line) if self._minimal is not None: self._writer.write_root( (walk.WalkDirEntry.alt_u8(self._minimal) if self._minimal else b"")) else: self._writer.write_root(walk.WalkDirEntry.alt_u8(root)) self._writer.flush() if not self._follow_symlinks.command_line and os.path.islink(root): linktgt = walk.WalkDirEntry.from_readlink(os.readlink(root)) linkdgst = self._algorithm[0]() linkdgst.update( util.interpolate_bytes( b"%d:%s,", len(linktgt.fspath), linktgt.fspath)) dir_dgst = self._algorithm[0]() dir_dgst.update(b"1:L,") dir_dgst.update( util.interpolate_bytes( b"%d:%s,", len(linkdgst.digest()), linkdgst.digest())) if self._size_only: self._writer.write_size(b"./@/", 0) else: self._writer.write_file_digest( self._algorithm[1], b"./@/", dir_dgst.digest(), self._use_base64) self._writer.flush() self._writer.finish() return self._generate(os.path.normpath(root), tuple()) self._writer.finish() def _generate(self, root, top): logging.debug("Handling %s/%r", root, top) path = os.path.join(root, *top) if top else root try: with walk.ScanDir(path) as dirscan: fsobjects = list(dirscan) except OSError as e: if self._utf8_mode: opath = walk.WalkDirEntry.alt_u8(path) else: opath = walk.WalkDirEntry.alt_fs(path) if e.errno == errno.ENOTDIR: # object exists but is not a directory errmsg = b"not a directory" elif e.errno in (errno.EACCES, errno.EPERM, getattr(errno, "ENOTCAPABLE", errno.EACCES)): # no permissions errmsg = ( b"access denied / no permissions / missing capabilities") elif e.errno == errno.ENOENT: # given object does not exist errmsg = b"no such file or directory" else: raise self._writer.write_error(util.interpolate_bytes( b"`%s': %s", opath, errmsg)) opath = join_output_path(top, None) if opath: if self._utf8_mode: opath = walk.WalkDirEntry.alt_u8(opath) else: opath = walk.WalkDirEntry.alt_fs(opath) if self._size_only: self._writer.write_size(opath, None) else: self._writer.write_file_digest(self._algorithm[1], opath, None) self._writer.flush() return (None, None) if self._utf8_mode: fsobjects.sort(key=walk.WalkDirEntry.sort_key_u8) else: fsobjects.sort(key=walk.WalkDirEntry.sort_key_fs) dir_dgst = self._algorithm[0]() dir_size = 0 dir_tainted = False for fso in fsobjects: if fso.is_dir: if fso.is_symlink and not self._follow_symlinks.directory: linktgt = walk.WalkDirEntry.from_readlink( os.readlink(fso.path)) # linktgt = util.fsencode(os.readlink(fso.path))) linkdgst = self._algorithm[0]() if self._utf8_mode: if linktgt.u8path is None: dir_tainted = True linkdgst.update(util.interpolate_bytes( b"%d:%s,", len(linktgt.alt_u8path), linktgt.alt_u8path)) else: linkdgst.update(util.interpolate_bytes( b"%d:%s,", len(linktgt.u8path), linktgt.u8path)) if fso.u8name is None: dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"1:S,%d:%s,", len(fso.alt_u8name), fso.alt_u8name)) else: dir_dgst.update(util.interpolate_bytes( b"1:S,%d:%s,", len(fso.u8name), fso.u8name)) else: if linktgt.fspath is None: dir_tainted = True linkdgst.update(util.interpolate_bytes( b"%d:%s,", len(linktgt.alt_fspath), linktgt.alt_fspath)) else: linkdgst.update(util.interpolate_bytes( b"%d:%s,", len(linktgt.fspath), linktgt.fspath)) if fso.fsname is None: dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"1:S,%d:%s,", len(fso.alt_fsname), fso.alt_fsname)) else: dir_dgst.update(util.interpolate_bytes( b"1:S,%d:%s,", len(fso.fsname), fso.fsname)) # # - no mtime and no mode for symlinks # - also does not count for dir_size # dir_dgst.update(util.interpolate_bytes( b"%d:%s,", len(linkdgst.digest()), linkdgst.digest())) opath = join_output_path(top, fso.name) if self._utf8_mode: opath = walk.WalkDirEntry.alt_u8(opath) else: opath = walk.WalkDirEntry.alt_fs(opath) if self._size_only: self._writer.write_size( util.interpolate_bytes(b"%s/./@/", opath), 0) else: self._writer.write_file_digest( self._algorithm[1], util.interpolate_bytes(b"%s/./@/", opath), linkdgst.digest(), self._use_base64) self._writer.flush() else: # # Follow the symlink to dir or handle a "real" directory # # Get subdir data from recursing into it sub_dir_dgst, sub_dir_size = self._generate( root, top + (fso.name, )) if sub_dir_dgst is None or sub_dir_size is None: # # This should not happen: # - top-level directories are handled above # - other filesystem objects should also have been # handled already # assert False dir_size += sub_dir_size if self._utf8_mode: if fso.u8name is None: dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"1:d,%d:%s,", len(fso.alt_u8name), fso.alt_u8name)) else: dir_dgst.update(util.interpolate_bytes( b"1:d,%d:%s,", len(fso.u8name), fso.u8name)) else: if fso.fsname is None: dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"1:d,%d:%s,", len(fso.alt_fsname), fso.alt_fsname)) else: dir_dgst.update(util.interpolate_bytes( b"1:d,%d:%s,", len(fso.fsname), fso.fsname)) dir_dgst.update(util.interpolate_bytes( b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst)) if self._with_metadata_full_mode: modestr = util.b(normalized_mode_str(fso.stat.st_mode)) dir_dgst.update(util.interpolate_bytes( b"8:fullmode,%d:%s,", len(modestr), modestr)) elif self._with_metadata_mode: modestr = util.b(normalized_compatible_mode_str( fso.stat.st_mode)) dir_dgst.update(util.interpolate_bytes( b"4:mode,%d:%s,", len(modestr), modestr)) else: if fso.is_symlink and not self._follow_symlinks.file: linktgt = walk.WalkDirEntry.from_readlink( os.readlink(fso.path)) # linktgt = util.fsencode(os.readlink(fso.path))) linkdgst = self._algorithm[0]() if self._utf8_mode: if linktgt.u8path is None: dir_tainted = True linkdgst.update(util.interpolate_bytes( b"%d:%s,", len(linktgt.alt_u8path), linktgt.alt_u8path)) else: linkdgst.update(util.interpolate_bytes( b"%d:%s,", len(linktgt.u8path), linktgt.u8path)) if fso.u8name is None: dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"1:F,%d:%s,", len(fso.alt_u8name), fso.alt_u8name)) else: dir_dgst.update(util.interpolate_bytes( b"1:F,%d:%s,", len(fso.u8name), fso.u8name)) else: if linktgt.fspath is None: dir_tainted = True linkdgst.update(util.interpolate_bytes( b"%d:%s,", len(linktgt.alt_fspath), linktgt.alt_fspath)) else: linkdgst.update(util.interpolate_bytes( b"%d:%s,", len(linktgt.fspath), linktgt.fspath)) if fso.fsname is None: dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"1:F,%d:%s,", len(fso.alt_fsname), fso.alt_fsname)) else: dir_dgst.update(util.interpolate_bytes( b"1:F,%d:%s,", len(fso.fsname), fso.fsname)) # # - no mtime and no mode for symlinks # - also does not count for dir_size # dir_dgst.update(util.interpolate_bytes( b"%d:%s,", len(linkdgst.digest()), linkdgst.digest())) opath = join_output_path(top, fso.name) if self._utf8_mode: opath = walk.WalkDirEntry.alt_u8(opath) else: opath = walk.WalkDirEntry.alt_fs(opath) if self._size_only: self._writer.write_size( util.interpolate_bytes(b"%s/./@", opath), 0) else: self._writer.write_file_digest( self._algorithm[1], util.interpolate_bytes(b"%s/./@", opath), linkdgst.digest(), self._use_base64) self._writer.flush() else: # # Follow the symlink to file or handle a "real" file # if self._utf8_mode: if fso.u8name is None: dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"1:f,%d:%s,", len(fso.alt_u8name), fso.alt_u8name)) else: dir_dgst.update(util.interpolate_bytes( b"1:f,%d:%s,", len(fso.u8name), fso.u8name)) else: if fso.fsname is None: dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"1:f,%d:%s,", len(fso.alt_fsname), fso.alt_fsname)) else: dir_dgst.update(util.interpolate_bytes( b"1:f,%d:%s,", len(fso.fsname), fso.fsname)) opath = join_output_path(top, fso.name) if self._utf8_mode: opath = walk.WalkDirEntry.alt_u8(opath) else: opath = walk.WalkDirEntry.alt_fs(opath) if fso.stat is None: # # Error: most likely a broken symlink here # dir_tainted = True dir_dgst.update(util.interpolate_bytes( b"5:errno,%d:%s,", len(str(fso.stat_errno)), util.b(str(fso.stat_errno)))) self._writer.write_error(util.interpolate_bytes( b"errno %d: %s", fso.stat_errno, util.b(fso.stat_errstr, "utf-8"))) logging.error( "Directory entry has symlink problems: %r", opath) if self._size_only: self._writer.write_size(opath, None) else: self._writer.write_file_digest( self._algorithm[1], opath, None) else: # # Ok: File has normal stat info # # XXX FIXME: Handle special files (fifo, socket, # block or char devices, ...). # dir_size += fso.stat.st_size if self._with_metadata_mtime: mtime = datetime.datetime.utcfromtimestamp( int(fso.stat.st_mtime)) mtime = util.b(mtime.isoformat("T") + "Z") dir_dgst.update(util.interpolate_bytes( b"5:mtime,%d:%s,", len(mtime), mtime)) if self._with_metadata_full_mode: modestr = util.b( normalized_mode_str(fso.stat.st_mode)) dir_dgst.update(util.interpolate_bytes( b"8:fullmode,%d:%s,", len(modestr), modestr)) elif self._with_metadata_mode: modestr = util.b(normalized_compatible_mode_str( fso.stat.st_mode)) dir_dgst.update(util.interpolate_bytes( b"4:mode,%d:%s,", len(modestr), modestr)) if not self._size_only: dgst = digest.compute_digest_file( self._algorithm[0], fso.path, use_mmap=self._use_mmap) dir_dgst.update(util.interpolate_bytes( b"%d:%s,", len(dgst), dgst)) if self._size_only: self._writer.write_size(opath, fso.stat.st_size) else: sz = fso.stat.st_size if self._print_size else None self._writer.write_file_digest( self._algorithm[1], opath, dgst, use_base64=self._use_base64, size=sz) self._writer.flush() opath = join_output_path(top, None) if opath: if self._utf8_mode: opath = walk.WalkDirEntry.alt_u8(opath) else: opath = walk.WalkDirEntry.alt_fs(opath) if dir_tainted: # # IMPORTANT: Print errors BEFORE the associated digest or size # line. Otherwise the "info" command has a problem. # self._writer.write_error(b"directory is tainted") logging.error("Directory has filename and/or symlink problems: %r", opath) if self._size_only: self._writer.write_size(opath, dir_size) else: sz = dir_size if self._print_size else None self._writer.write_file_digest( self._algorithm[1], opath, dir_dgst.digest(), use_base64=self._use_base64, size=sz) self._writer.flush() return (dir_dgst.digest(), dir_size) def join_output_path(top, name): if name is None: # a path for a directory is to be computed if top: if isinstance(top[0], bytes): return b"/".join(top) + b"/" else: return u"/".join(top) + u"/" else: return b"" else: # a path for a normal file is to be computed if top: if isinstance(name, bytes): return b"/".join(top) + b"/" + name else: return u"/".join(top) + u"/" + name else: return name class CRC32Output(object): """Wrapper for a minimal binary file contextmanager that calculates the CRC32 of the written bytes on the fly. Also acts as context manager proxy for the given context manager. """ __slots__ = ("_fp_cm", "_fp", "_crc32") def __init__(self, fp_cm): self._fp_cm = fp_cm self._fp = None self.resetdigest() def __enter__(self): assert self._fp is None self._fp = self._fp_cm.__enter__() return self def __exit__(self, *args): rv = self._fp_cm.__exit__(*args) self._fp = None return rv def write(self, what): self._fp.write(what) self._crc32.update(what) def flush(self): self._fp.flush() def resetdigest(self): """Reset the current CRC digest""" self._crc32 = crc32() def hexcrcdigest(self): """ :rtype: str """ return self._crc32.hexdigest() def normalized_compatible_mode_str(mode): # XXX FIXME: Windows and "executable" modebits = stat.S_IMODE(mode) modestr = "%o" % (modebits,) if not modestr.startswith("0"): modestr = "0" + modestr return modestr def normalized_mode_str(mode): modestr = "%o" % (mode,) if not modestr.startswith("0"): modestr = "0" + modestr return modestr class TreesumWriter(object): """Writer to write treesum digest files in a format similar to BSD digest files. Wraps an output file pointer for a binary file. Provides high-level methods to write data lines. Also holds the current CRC for a block. """ LS = util.b(os.linesep) def __init__(self, outfp): self._outfp = outfp self._reset_crc() def _reset_crc(self): self._crc = crc32() def start(self, version): """Begin a new block, reset the current CRC and write the VERSION tag. """ self._reset_crc() self.write(b"VERSION = ") self.writeln(util.b(version)) def write_comment(self, comment): self.write(b"COMMENT (") self.write(util.b(comment, "utf-8")) self.writeln(b")") def write_generator(self, generator): self.write(b"GENERATOR (") self.write(util.b(generator, "utf-8")) self.writeln(b")") def write_error(self, error): self.write(b"ERROR (") self.write(util.b(error, "utf-8")) self.writeln(b")") def write_fsencoding(self, encoding): self.write(b"FSENCODING = ") self.writeln(util.b(encoding)) def write_flags(self, flags): self.write(b"FLAGS = ") if isinstance(flags, (str, bytes)): self.writeln(util.b(flags)) else: flags.sort() self.writeln(util.b(",".join(flags))) def write_timestamp(self, ts): self.write(b"TIMESTAMP = ") self.writeln(util.b(str(ts))) def write_isotimestamp(self, ts): self.write(b"ISOTIMESTAMP = ") self.writeln(util.b(ts)) def write_root(self, root): assert isinstance(root, bytes) self.write(b"ROOT (") self.write(root) self.writeln(b")") def write_size(self, filename, sz): assert isinstance(filename, bytes) self.write(b"SIZE (") self.write(filename) self.write(b")") if sz is not None: self.write(b" = ") self.write(util.b(str(sz))) self.writeln(b"") def write_file_digest(self, algorithm, filename, digest, use_base64=False, size=None): if digest is not None: digest = (base64.b64encode(digest) if use_base64 else binascii.hexlify(digest)) if filename != b"./@/": filename = util.normalize_filename(filename, True) self.write(util.b(algorithm)) self.write(b" (") self.write(filename) self.write(b")") if digest is not None or size is not None: self.write(b" = ") if digest is not None: self.write(digest) if size is not None: self.write(b",") self.write(util.b(str(size))) self.writeln(b"") def finish(self): """Finish a block and write the current CRC""" crc = self._crc.hexdigest() self.write(b"CRC32 = ") self.writeln(util.b(crc)) def writeln(self, line): """Write the bytes `line` into the output file and update the CRC accordingly. :param bytes line: The line to write to (without line ending) """ self.write(line) self.write(self.LS) def write(self, data): """Write `data` into the output file and update the CRC accordingly. :param bytes data: The data to write to and to update the CRC with """ if data: self._outfp.write(data) self._crc.update(data) def flush(self): self._outfp.flush() class TreesumReader(object): """Reader to read and/or verify treesum digest files. Supports the iterator and context manager protocol. """ PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z") # empty lines PATTERN1 = re.compile(br"\A(VERSION|FSENCODING|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z") # noqa: E501 line too long PATTERN2 = re.compile(br"\A(ROOT|COMMENT|ERROR|GENERATOR)[ \t]*\((.*)\)[ \t]*\r?\n\Z") # noqa: E501 line too long PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)([ \t]*=[ \t]*(\d+))?[ \t]*\r?\n\Z") # noqa: E501 line too long PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)([ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?)?[ \t]*\r?\n\Z") # noqa: E501 line too long def __init__(self, _fp, _filename, _own_fp): self._fp = _fp self._own_fp = _own_fp self._filename = _filename self._line_no = 0 self._reset_crc() self._expect_crc = None # NOTE: tristate: None is different from False self._current_algo_name = self._current_algo_digest_size = None @classmethod def from_path(cls_, path): """Open file at `path` and return a reader that owns the file object""" return cls_(open(path, "rb"), path, True) @classmethod def from_binary_buffer(cls_, binary_fp, filename): return cls_(binary_fp, filename, False) def __enter__(self): return self def __exit__(self, *args): self.close() def close(self): if self._fp is not None: try: if self._own_fp: self._fp.close() finally: self._fp = None def __iter__(self): return self def __next__(self): rec = self.read_record() if rec is None: raise StopIteration() return rec if util.PY2: next = __next__ def all_records(self): """Iterator over all remaining records""" while True: rec = self.read_record() if rec is None: return yield rec def read_record(self): """Read and parse the "next" line. :returns: `None` at EOF or the parsed contents of the line :rtype: tuple or None """ # Loop to skip empty lines while True: line = self._get_next_line() if not line: # # Skip for empty files at the very beginning. # Check only after the first VERSION line. # if self._expect_crc is not None: if self._expect_crc: logging.warning("CRC32 is missing at EOF") return None if not self.PATTERN0.search(line): break self._update_crc(line) # # At the beginning transparently skip an eventually embedded signify # signature # if self._line_no == 1: if line.startswith(b"untrusted comment: "): line = self._get_next_line() if not line.endswith(b"\n"): raise binascii.Error("No valid signify signature value") # Try to decode for an early error check base64.b64decode(line[:-1]) mo = self.PATTERN1.search(line) if mo: if mo.group(1) == b"VERSION": if self._expect_crc: logging.warning("CRC32 missing before line %d", self._line_no) self._reset_crc() self._expect_crc = True self._update_crc(line) return ("VERSION", util.n(mo.group(2))) if mo.group(1) == b"CRC32": # TODO: check if self._expect_crc is None: logging.warning("Lone CRC32 before VERSION in line %d", self._line_no) else: if self._expect_crc: if (self._hex_crc() != mo.group(2).decode("latin1").upper()): logging.warning( "CRC32 mismatch in line %d:" " expected: %s, given: %s", self._line_no, self._hex_crc(), mo.group(2).decode("latin1").upper()) else: logging.warning("CRC32 before VERSION in line %d", self._line_no) # Do not update the CRC here but reset the state self._expect_crc = False return ("CRC32", util.n(mo.group(2))) else: self._update_crc(line) return (util.n(mo.group(1)), util.n(mo.group(2))) else: mo = self.PATTERN2.search(line) if mo: self._update_crc(line) if mo.group(1) in (b"COMMENT", b"ERROR", b"GENERATOR"): return (util.u(mo.group(1)), util.u(mo.group(2), "utf-8")) elif mo.group(1) == b"ROOT": return ("ROOT", mo.group(2)) assert False, line else: mo = self.PATTERN3.search(line) if mo: self._update_crc(line) if mo.group(2): return ("SIZE", mo.group(1), int(util.n(mo.group(3)), 10)) else: return ("SIZE", mo.group(1), None) else: mo = self.PATTERN4.search(line) if mo: self._update_crc(line) algo_name = util.n(mo.group(1)) if mo.group(3): if (len(mo.group(4)) == 2 * self._get_digest_size(algo_name)): # hex digest = binascii.unhexlify(mo.group(4)) else: # base64 digest = base64.b64decode(mo.group(4)) if mo.group(5): size = int(util.n(mo.group(6)), 10) else: size = None return (algo_name, mo.group(2), digest, size) else: return (algo_name, mo.group(2), None, None) else: assert False, line return line def _get_next_line(self): line = self._fp.readline(4096) # along PATH_MAX on Linux if line: self._line_no += 1 return line def _reset_crc(self): self._crc32 = crc32() def _update_crc(self, data): self._crc32.update(data) def _hex_crc(self): return self._crc32.hexdigest() def _get_digest_size(self, algo_name): if self._current_algo_name == algo_name: return self._current_algo_digest_size h = util.algotag2algotype(algo_name)() self._current_algo_name = algo_name self._current_algo_digest_size = h.digest_size return self._current_algo_digest_size def print_treesum_digestfile_infos(opts): print_infos_for_digestfile(opts.digest_files, opts.print_only_last_block) def print_infos_for_digestfile(digest_files, print_only_last_block=True): for fn in digest_files: if fn == "-": if util.PY2: reader = TreesumReader.from_binary_buffer(sys.stdin) else: reader = TreesumReader.from_binary_buffer(sys.stdin.buffer) else: reader = TreesumReader.from_path(fn) with reader: root = generator = flags = fsencoding = algorithm = digest \ = size = None errors = set() comments = [] in_block = False block_no = 0 for record in reader: if record[0] == "VERSION": assert record[1] == "1" # start a new block in_block = True block_no += 1 root = flags = algorithm = digest = size = None comments = [] elif record[0] == "GENERATOR": generator = record[1] elif record[0] == "FSENCODING": fsencoding = record[1] elif record[0] == "FLAGS": flags = record[1] elif record[0] == "ROOT": root = record[1] elif record[0] == "COMMENT": comments.append(record[1]) elif record[0] == "ERROR": errors.add(record[1]) elif record[0] in ("TIMESTAMP", "ISOTIMESTAMP"): pass elif record[0] == "CRC32": pass # in_block = False else: if not in_block: continue # digest line or size line if not record[1] or record[1] == b"./@/": if record[0] == "SIZE": algorithm = "SIZE" digest = None size = record[2] else: algorithm = record[0] digest = record[2] size = record[3] if not print_only_last_block: print_block_data( block_no, root, generator, fsencoding, flags, comments, errors, algorithm, digest, size) root = generator = flags = fsencoding = algorithm \ = digest = size = None errors = set() comments = [] in_block = False if print_only_last_block: if not in_block: if digest is not None or size is not None: print_block_data( block_no, root, generator, fsencoding, flags, comments, errors, algorithm, digest, size) else: logging.warning("missing block end") def print_block_data(block_no, tag, generator, fsencoding, flags, comments, errors, algorithm, digest, size): digeststr = util.n(binascii.hexlify(digest)) if digest else "<no digest>" sizestr = str(size) if size is not None else "<no size>" print("BLOCK No %d:" % (block_no,)) print(" Tag:", tag) print(" FS-Encoding:", fsencoding) if generator: print(" Generator:", generator) print(" Flags:", flags if flags else "<none>") if comments: print(" Comments:", comments) print(" Algorithm:", algorithm) if algorithm != "SIZE": print(" Digest:", digeststr) print(" Size:", sizestr) print(" Errors:", errors if errors else "<none>") if __name__ == "__main__": sys.exit(main())
