view cutils/treesum.py @ 238:ff13b2a863ba

Make selection from a character repertoire more evenly distributed
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 08 Feb 2025 08:54:02 +0100
parents 61e5b1c2685c
children 07a0bc723139
line wrap: on
line source

# -*- coding: utf-8 -*-
# :-
# :Copyright: (c) 2020-2025 Franz Glasner
# :License:   BSD-3-Clause
# :-
r"""Generate and verify checksums for directory trees.

"""

from __future__ import print_function, absolute_import


__all__ = []


import argparse
import base64
import binascii
import collections
import datetime
import logging
import os
import re
import stat
import sys
import time
import zlib

from . import (__version__, __revision__)
from . import util
from .util import cm
from .util import digest
from .util import walk


def main(argv=None):
    """Run the command line interface.

    The command line is parsed twice: a first lenient pass just detects
    the "version" and "help" commands, which are answered immediately.
    A second strict pass then validates all arguments before the
    selected command is executed by :func:`treesum`.

    :param argv: the command line arguments; passed unchanged to
                 :mod:`argparse` as `args`
    :returns: 0 for the "version" and "help" commands, otherwise the
              return value of :func:`treesum`

    A missing command is reported as a usage error by means of
    :meth:`argparse.ArgumentParser.error`.

    """

    def _populate_generate_arguments(gp):
        """Use to populate command aliases.

        This is because :class:`argparse.ArgumentParser` does not
        support them for all supported Python versions.

        """
        gp.add_argument(
            "--algorithm", "-a", action="store", type=util.argv2algo,
            help="1 (aka sha1), 224, 256 (aka sha256), 384, 512 (aka sha512), "
                 "3 (alias for sha3-512), 3-224, 3-256, 3-384, 3-512, "
                 "blake2b, blake2b-256, blake2s, "
                 "blake2 (alias for blake2b), "
                 "blake2-256 (alias for blake2b-256), "
                 "md5. "
                 "The default depends on availability in hashlib: "
                 "blake2b-256, sha256 or sha1.")
        gp.add_argument(
            "--append-output", action="store_true", dest="append_output",
            help="Append to the output file instead of overwriting it.")
        gp.add_argument(
            "--base64", action="store_true",
            help="Output checksums in base64 notation, not hexadecimal "
                 "(OpenBSD).")
        gp.add_argument(
            "--comment", action="append", default=[],
            help="Put given comment COMMENT into the output as \"COMMENT\". "
                 "Can be given more than once.")
        gp.add_argument(
            "--follow-directory-symlinks", "-l", action=SymlinkAction,
            const="follow-directory-symlinks",
            default=FollowSymlinkConfig(False, False, True),
            dest="follow_symlinks",
            help="""Follow symbolic links to directories when walking a
directory tree. Augments --physical and -p.""")
        gp.add_argument(
            "--follow-file-symlinks", action=SymlinkAction,
            const="follow-file-symlinks",
            default=FollowSymlinkConfig(False, False, True),
            dest="follow_symlinks",
            help="""Follow symbolic links to files when walking a
directory tree. Augments --physical.""")
        gp.add_argument(
            "--full-mode", action="store_true", dest="metadata_full_mode",
            help="Consider all mode bits as returned from stat(2) when "
                 "computing directory digests. "
                 "Note that mode bits on symbolic links itself are not "
                 "considered.")
        gp.add_argument(
            "--generator", choices=("normal", "full", "none"),
            default="normal",
            help="""Put a `GENERATOR' line into the output.
`full' prints full Python and OS/platform version information,
`normal' prints just whether Python 2 or Python 3 is used, and `none'
suppresses the output completely. The default is `normal'.""")
        gp.add_argument(
            "--logical", "-L", action=SymlinkAction, dest="follow_symlinks",
            const=FollowSymlinkConfig(True, True, True),
            help="""Follow symbolic links everywhere: on command line
arguments and -- while walking -- directory and file symbolic links.
Overwrites any other symlink related options
(--physical,-p,  no-follow-directory-symlinks, no-follow-file-symlinks,
et al.).
""")
        gp.add_argument(
            "--minimal", nargs="?", const="", default=None, metavar="TAG",
            help="Produce minimal output only. If a TAG is given and not "
                 "empty use it as the leading \"ROOT (<TAG>)\" output.")
        gp.add_argument(
            "--mmap", action="store_true", dest="mmap", default=None,
            help="Use mmap if available. Default is to determine "
                 "automatically from the filesize.")
        gp.add_argument(
            "--mode", action="store_true", dest="metadata_mode",
            help="Consider the permission bits of stat(2) using S_IMODE (i.e. "
                 "all bits without the filetype bits) when "
                 "computing directory digests. Note that mode bits on "
                 "symbolic links itself are not considered.")
        gp.add_argument(
            "--mtime", action="store_true", dest="metadata_mtime",
            help="Consider the mtime of files (non-directories) when "
                 "generating digests for directories. Digests for files are "
                 "not affected.")
        gp.add_argument(
            "--no-follow-directory-symlinks", action=SymlinkAction,
            const="no-follow-directory-symlinks",
            dest="follow_symlinks",
            help="""Do not follow symbolic links to directories when walking a
directory tree. Augments --logical.""")
        gp.add_argument(
            "--no-follow-file-symlinks", action=SymlinkAction,
            const="no-follow-file-symlinks",
            dest="follow_symlinks",
            help="""Dont follow symbolic links to files when walking a
directory tree. Augments --logical and -p.""")
        gp.add_argument(
            "--no-mmap", action="store_false", dest="mmap", default=None,
            help="Dont use mmap.")
        gp.add_argument(
            "--output", "-o", action="store", metavar="OUTPUT",
            help="Put the checksum into given file. "
                 "If not given or if it is given as `-' then stdout is used.")
        gp.add_argument(
            "--physical", "-P", action=SymlinkAction, dest="follow_symlinks",
            const=FollowSymlinkConfig(False, False, False),
            help="""Do not follow any symbolic links whether they are given
on the command line or when walking the directory tree.
Overwrites any other symlink related options
(--logical, -p, follow-directory-symlinks, follow-file-symlinks, et al.).
This is the default.""")
        gp.add_argument(
            "-p", action=SymlinkAction, dest="follow_symlinks",
            const=FollowSymlinkConfig(False, False, True),
            help="""Do not follow any symbolic links to directories,
whether they are given on the command line or when walking the directory tree,
but follow symbolic links to files.
Overwrites any other symlink related options
(--logical, --physical, follow-directory-symlinks, no-follow-file-symlinks,
et al.).
This is the default.""")
        gp.add_argument(
            "--print-size", action="store_true",
            help="""Print the size of a file or the accumulated sizes of
directory content into the output also.
The size is not considered when computing digests. For symbolic links
the size is not printed also.""")
        gp.add_argument(
            "--size-only", action="store_true",
            help="""Print only the size of files and for each directory its
accumulated directory size. Digests are not computed.""")
        gp.add_argument(
            "--utf8", "--utf-8", action="store_true",
            help="""Encode all file paths using UTF-8 instead of
the filesystem encoding. Add some error tag into the path if it cannot
representated in Unicode cleanly.""")
        gp.add_argument(
            "directories", nargs="*", metavar="DIRECTORY")

    def _populate_info_arguments(ip):
        """Add the options and arguments of the "info" command to `ip`."""
        ip.add_argument(
            "--last", action="store_true", dest="print_only_last_block",
            help="Print only the last block of every given input file")
        ip.add_argument(
            "digest_files", nargs="+", metavar="TREESUM-DIGEST-FILE")

    parser = argparse.ArgumentParser(
        description="Generate and verify checksums for directory trees.",
        fromfile_prefix_chars='@',
        add_help=False)

    #
    # Global options for all sub-commands.
    # In a group because this allows a customized title.
    #
    gparser = parser.add_argument_group(title="Global Options")
    gparser.add_argument(
        "--debug", action="store_true",
        help="Activate debug logging to stderr")
    gparser.add_argument(
        "-v", "--version", action="version",
        version="%s (rv:%s)" % (__version__, __revision__),
        help="Show program's version number and exit")
    gparser.add_argument(
        "-h", "--help", action="help",
        help="Show this help message and exit")

    #
    # Subcommands
    #
    subparsers = parser.add_subparsers(
        dest="subcommand",
        title="Commands",
        description="This tool uses subcommands. "
                    "To see detailed help for a specific subcommand use "
                    "the -h/--help option after the subcommand name. "
                    "A list of valid commands and their short descriptions "
                    "is listed below:",
        metavar="COMMAND")

    genparser = subparsers.add_parser(
        "generate",
        help="Generate checksums for directory trees",
        description="Generate checksums for directory trees.")
    _populate_generate_arguments(genparser)
    # And an alias for "generate"
    genparser2 = subparsers.add_parser(
        "gen",
        help="Alias for \"generate\"",
        description="Generate checksums for directory trees. "
                    "This is an alias to \"generate\".")
    _populate_generate_arguments(genparser2)

    infoparser = subparsers.add_parser(
        "info",
        help="Print some information from given treesum digest file",
        description="""Print some informations from given treesum digest files
to stdout."""
    )
    _populate_info_arguments(infoparser)

    hparser = subparsers.add_parser(
        "help",
        help="Show this help message or a subcommand's help and exit",
        description="Show this help message or a subcommand's help and exit.")
    hparser.add_argument("help_command", nargs='?', metavar="COMMAND")

    vparser = subparsers.add_parser(
        "version",
        help="Show the program's version number and exit",
        description="Show the program's version number and exit.")

    # Parse leniently to just check for "version" and/or help
    opts, _dummy = parser.parse_known_args(args=argv)

    if opts.subcommand == "version":
        print("%s (rv:%s)" % (__version__, __revision__),
              file=sys.stdout)
        return 0
    if opts.subcommand == "help":
        # A missing or unknown command name yields the global help
        help_parsers = {
            "generate": genparser,
            "gen": genparser2,
            "info": infoparser,
            "version": vparser,
            "help": hparser,
        }
        help_parsers.get(opts.help_command, parser).print_help()
        return 0

    # Reparse strictly
    opts = parser.parse_args(args=argv)

    #
    # Python 3 does not require a subcommand by default: report its absence
    # as a usage error instead of failing later with a traceback.
    #
    if opts.subcommand is None:
        parser.error("a COMMAND is required")

    # Minimal logging -- just for debugging - not for more "normal" use
    logging.basicConfig(
        level=logging.DEBUG if opts.debug else logging.WARNING,
        stream=sys.stderr,
        format="[%(asctime)s][%(levelname)s][%(process)d:%(name)s] %(message)s"
    )
    logging.captureWarnings(True)

    return treesum(opts)


#
# Symlink policy as a triple of flags:
#
#   command_line -- follow symlinks that are given on the command line
#   directory    -- follow symlinks to directories while walking
#   file         -- follow symlinks to files while walking
#
FollowSymlinkConfig = collections.namedtuple(
    "FollowSymlinkConfig", "command_line directory file")


class SymlinkAction(argparse.Action):

    """Option action that maintains a :class:`FollowSymlinkConfig`.

    The option takes no value (`nargs' is forced to 0) and `type' must
    not be given. The mandatory `const' is either

    - a complete :class:`FollowSymlinkConfig` that replaces the current
      setting as a whole, or
    - one of the keywords listed in `_SINGLE_SWITCHES' that changes just
      one item of the current setting.

    The value stored at `dest' is a :class:`FollowSymlinkConfig` with
    three items:

    1. follow symlinks on the command line
    2. follow directory symlinks while walking
    3. follow file symlinks while walking

    """

    # (keyword, position within FollowSymlinkConfig, value to set)
    _SINGLE_SWITCHES = (
        ("follow-directory-symlinks", 1, True),
        ("no-follow-directory-symlinks", 1, False),
        ("follow-file-symlinks", 2, True),
        ("no-follow-file-symlinks", 2, False),
    )

    def __init__(self, *args, **kwargs):
        """Validate `const' and `default' and force ``nargs=0``.

        :raises ValueError: if `nargs' or `type' is given or if `const'
                            is missing or not an accepted value
        :raises TypeError: if a `default' is given that is not a
                           :class:`FollowSymlinkConfig`

        """
        if "nargs" in kwargs:
            raise ValueError("`nargs' not allowed")
        if "type" in kwargs:
            raise ValueError("`type' not allowed")

        const = kwargs.get("const", None)
        if const is None:
            raise ValueError("a const value is needed")
        if not isinstance(const, FollowSymlinkConfig):
            keywords = [kw for kw, _pos, _flag in self._SINGLE_SWITCHES]
            if const not in keywords:
                raise ValueError(
                    "invalid value for the `const' configuration value")

        preset = kwargs.get("default", None)
        if not (preset is None or isinstance(preset, FollowSymlinkConfig)):
            raise TypeError("invalid type for `default'")

        kwargs["nargs"] = 0
        super(SymlinkAction, self).__init__(*args, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        """Compute the new configuration and store it at `dest'."""
        config = getattr(namespace, self.dest, None)
        if config is None:
            # Nothing set yet: start from this baseline
            config = FollowSymlinkConfig(False, False, True)

        if isinstance(self.const, FollowSymlinkConfig):
            # A complete configuration replaces whatever is there
            config = self.const
        else:
            for keyword, position, flag in self._SINGLE_SWITCHES:
                if self.const == keyword:
                    items = [config.command_line,
                             config.directory,
                             config.file]
                    items[position] = flag
                    config = FollowSymlinkConfig(*items)
                    break
            else:
                assert False, "Implementation error: not yet implemented"

        setattr(namespace, self.dest, config)


def gen_generate_opts(directories=None,
                      algorithm=util.default_algotag(),
                      append_output=False,
                      base64=False,
                      comment=None,
                      follow_symlinks=FollowSymlinkConfig(False, False, False),
                      full_mode=False,
                      generator="normal",
                      logical=None,
                      minimal=None,
                      mode=False,
                      mmap=None,
                      mtime=False,
                      output=None,
                      print_size=False,
                      size_only=False,
                      utf8=False):
    """Build an options namespace for :func:`generate_treesum` without
    going through the command line parser.

    The keyword names mostly equal the attribute names of the returned
    namespace. The exceptions are `full_mode`, `mode` and `mtime`, which
    are stored as `metadata_full_mode`, `metadata_mode` and
    `metadata_mtime`. `algorithm` is converted with
    :func:`util.argv2algo`.

    :param directories: list of directories to handle; `None` (the
                        default) yields a new empty list
    :param comment: list of comment lines; `None` (the default) yields
                    a new empty list
    :param follow_symlinks: the symlink policy
    :type follow_symlinks: FollowSymlinkConfig
    :returns: the populated options
    :rtype: argparse.Namespace
    :raises TypeError: if `follow_symlinks` is not a
                       :class:`FollowSymlinkConfig`

    """
    if not isinstance(follow_symlinks, FollowSymlinkConfig):
        raise TypeError("`follow_symlinks' must be a FollowSymlinkConfig")
    #
    # Use fresh lists for every call: generate_treesum() appends to an empty
    # `directories` and a shared default list would leak this change into
    # all later calls.
    #
    if directories is None:
        directories = []
    if comment is None:
        comment = []
    opts = argparse.Namespace(
        directories=directories,
        algorithm=util.argv2algo(algorithm),
        append_output=append_output,
        base64=base64,
        comment=comment,
        follow_symlinks=follow_symlinks,
        generator=generator,
        logical=logical,
        minimal=minimal,
        mmap=mmap,
        metadata_full_mode=full_mode,
        metadata_mode=mode,
        metadata_mtime=mtime,
        output=output,
        print_size=print_size,
        size_only=size_only,
        utf8=utf8)
    return opts


def gen_info_opts(digest_files=None, last=False):
    """Build an options namespace for the "info" command without going
    through the command line parser.

    :param digest_files: list of treesum digest files; `None` (the
                         default) yields a new empty list
    :param last: stored as `print_only_last_block` in the result
    :returns: the populated options
    :rtype: argparse.Namespace

    """
    # A fresh list for every call: never hand out a shared default object
    if digest_files is None:
        digest_files = []
    opts = argparse.Namespace(
        digest_files=digest_files,
        print_only_last_block=last)
    return opts


def treesum(opts):
    """Run the command that is selected by `opts.subcommand`.

    :param opts: the parsed options
    :returns: the return value of the command implementation
    :raises RuntimeError: if there is no implementation for the command

    """
    # XXX TBD: opts.check and opts.checklist (as in shasum.py)
    command = opts.subcommand
    if command in ("generate", "gen"):
        return generate_treesum(opts)
    if command == "info":
        return print_treesum_digestfile_infos(opts)
    raise RuntimeError("command `{}' not yet handled".format(command))


def generate_treesum(opts):
    """Generate the treesum output for all directories in `opts`.

    A missing algorithm is replaced by the default algorithm and an
    empty list of directories gets "." appended; both changes are made
    in `opts` itself.

    The output goes to stdout if `opts.output` is `None` or "-".
    Otherwise the given file is opened in binary mode, appending if
    `opts.append_output` is set and overwriting if not.

    :param opts: the options, e.g. from :func:`gen_generate_opts`

    """
    # Provide defaults
    if not opts.algorithm:
        opts.algorithm = util.argv2algo(util.default_algotag())
    if not opts.directories:
        opts.directories.append(".")

    use_stdout = opts.output is None or opts.output == "-"
    if use_stdout:
        # Take the `buffer' attribute of stdout if there is one
        stream = getattr(sys.stdout, "buffer", sys.stdout)
        sink = cm.nullcontext(stream)
    else:
        sink = open(opts.output, "ab" if opts.append_output else "wb")

    with CRC32Output(sink) as outfp:
        for directory in opts.directories:
            generator = V1DirectoryTreesumGenerator(
                opts.algorithm, opts.mmap, opts.base64,
                opts.follow_symlinks,
                opts.generator,
                opts.metadata_mode,
                opts.metadata_full_mode,
                opts.metadata_mtime,
                opts.size_only,
                opts.print_size,
                opts.utf8,
                minimal=opts.minimal)
            generator.generate(outfp, directory, comment=opts.comment)


class V1DirectoryTreesumGenerator(object):

    def __init__(self, algorithm, use_mmap, use_base64,
                 follow_symlinks,
                 with_generator,
                 with_metadata_mode, with_metadata_full_mode,
                 with_metadata_mtime, size_only, print_size, utf8_mode,
                 minimal=None,):
        super(V1DirectoryTreesumGenerator, self).__init__()
        self._algorithm = algorithm
        self._use_mmap = use_mmap
        self._use_base64 = use_base64
        self._follow_symlinks = follow_symlinks
        self._with_generator = with_generator
        self._with_metadata_mode = with_metadata_mode
        self._with_metadata_full_mode = with_metadata_full_mode
        self._with_metadata_mtime = with_metadata_mtime
        self._size_only = size_only
        self._print_size = print_size
        self._utf8_mode = utf8_mode
        self._minimal = minimal

    def generate(self, outfp, root, comment=None):
        """Write a complete treesum block for `root` into `outfp`.

        The block starts with the header lines VERSION, FSENCODING,
        GENERATOR (if not suppressed) and FLAGS. Unless minimal output is
        selected the TIMESTAMP, ISOTIMESTAMP and COMMENT lines follow.
        Then come the ROOT line, the digest or size lines for the tree
        and a closing CRC32 line.

        If `root` is a symbolic link and symlinks on the command line
        are not to be followed just a single line for the link itself
        is written instead of walking the tree.

        :param outfp: a *binary* file with a "write()" and a "flush()" method;
                      "resetdigest()" and "hexcrcdigest()" are called on it
                      also
        :param root: the root directory of the tree
        :param comment: an iterable with comment lines or `None`
        :raises NotImplementedError: if the generator setting is unknown

        """
        self._outfp = outfp
        self._outfp.resetdigest()
        self._outfp.write(format_bsd_line("VERSION", "1", None, False))
        self._outfp.write(format_bsd_line(
            "FSENCODING", util.n(walk.getfsencoding().upper()), None, False))
        self._outfp.flush()

        if self._with_generator == "none":
            pass    # do nothing
        elif self._with_generator == "normal":
            self._outfp.write(format_bsd_line(
                "GENERATOR", None, b"PY2" if util.PY2 else b"PY3", False))
        elif self._with_generator == "full":
            import platform
            info = "%s %s, %s" % (platform.python_implementation(),
                                  platform.python_version(),
                                  platform.platform())
            self._outfp.write(format_bsd_line(
                "GENERATOR", None, info.encode("utf-8"), False))
        else:
            raise NotImplementedError(
                "not implemented: %s" % (self._with_generator,))

        #
        # Note: Given non-default flags that are relevant for
        #       directory traversal.
        #
        flags = []
        if self._with_metadata_full_mode:
            flags.append("with-metadata-fullmode")
        elif self._with_metadata_mode:
            flags.append("with-metadata-mode")
        if self._with_metadata_mtime:
            flags.append("with-metadata-mtime")
        flags.append("follow-symlinks-commandline"
                     if self._follow_symlinks.command_line
                     else "no-follow-symlinks-commandline")
        flags.append("follow-symlinks-directory"
                     if self._follow_symlinks.directory
                     else "no-follow-symlinks-directory")
        flags.append("follow-symlinks-file"
                     if self._follow_symlinks.file
                     else "no-follow-symlinks-file")
        if self._size_only:
            flags.append("size-only")
        flags.append("utf8-encoding" if self._utf8_mode else "fs-encoding")
        if self._print_size:
            flags.append("print-size")
        flags.sort()
        self._outfp.write(
            format_bsd_line("FLAGS", ",".join(flags), None, False))

        if self._minimal is None:
            # Write execution timestamps in POSIX epoch and ISO format
            ts = int(time.time())
            self._outfp.write(format_bsd_line("TIMESTAMP", ts, None, False))
            #
            # Naive UTC datetime by epoch arithmetic: gives the same result
            # as datetime.utcfromtimestamp() which is deprecated since
            # Python 3.12.
            #
            isots = (datetime.datetime(1970, 1, 1)
                     + datetime.timedelta(seconds=ts)).isoformat("T")
            self._outfp.write(
                format_bsd_line("ISOTIMESTAMP", isots, None, False))

            if comment:
                for line in comment:
                    self._outfp.write(
                        format_bsd_line("COMMENT", None, line, False))

            self._outfp.write(format_bsd_line(
                "ROOT", None, walk.WalkDirEntry.alt_u8(root), False))
        else:
            # Minimal output: just the (possibly empty) tag as ROOT
            self._outfp.write(format_bsd_line(
                "ROOT",
                None,
                (walk.WalkDirEntry.alt_u8(self._minimal)
                 if self._minimal else b""),
                False))
        self._outfp.flush()

        if not self._follow_symlinks.command_line and os.path.islink(root):
            #
            # The root itself is a symlink that must not be followed:
            # handle the link instead of the tree it points to.
            #
            linktgt = walk.WalkDirEntry.from_readlink(os.readlink(root))
            #
            # `fspath' is None if the link target has no clean
            # representation: use the alternative one as it is done
            # when walking the tree.
            #
            # NOTE(review): while walking the tree `u8path' is used in UTF-8
            #               mode; here always the filesystem encoded path
            #               is taken -- confirm that this is intended.
            #
            linkpath = linktgt.fspath
            if linkpath is None:
                linkpath = linktgt.alt_fspath
            linkdgst = self._algorithm[0]()
            linkdgst.update(
                util.interpolate_bytes(b"%d:%s,", len(linkpath), linkpath))
            link_digest = linkdgst.digest()
            dir_dgst = self._algorithm[0]()
            dir_dgst.update(b"1:L,")
            dir_dgst.update(
                util.interpolate_bytes(
                    b"%d:%s,", len(link_digest), link_digest))
            if self._size_only:
                self._outfp.write(
                    format_bsd_line(
                        "SIZE",
                        None,
                        "./@/",
                        False,
                        0))
            else:
                self._outfp.write(
                    format_bsd_line(
                        self._algorithm[1],
                        dir_dgst.digest(),
                        "./@/",
                        self._use_base64))
            self._outfp.flush()
        else:
            self._generate(os.path.normpath(root), tuple())

        # The trailer: the CRC of all what has been written for this block
        self._outfp.write(format_bsd_line(
            "CRC32", self._outfp.hexcrcdigest(), None, False))

    def _generate(self, root, top):
        logging.debug("Handling %s/%r", root, top)
        path = os.path.join(root, *top) if top else root
        with walk.ScanDir(path) as dirscan:
            fsobjects = list(dirscan)
        if self._utf8_mode:
            fsobjects.sort(key=walk.WalkDirEntry.sort_key_u8)
        else:
            fsobjects.sort(key=walk.WalkDirEntry.sort_key_fs)
        dir_dgst = self._algorithm[0]()
        dir_size = 0
        dir_tainted = False
        for fso in fsobjects:
            if fso.is_dir:
                if fso.is_symlink and not self._follow_symlinks.directory:
                    linktgt = walk.WalkDirEntry.from_readlink(
                        os.readlink(fso.path))
                    # linktgt = util.fsencode(os.readlink(fso.path)))
                    linkdgst = self._algorithm[0]()
                    if self._utf8_mode:
                        if linktgt.u8path is None:
                            dir_tainted = True
                            linkdgst.update(util.interpolate_bytes(
                                b"%d:%s,",
                                len(linktgt.alt_u8path),
                                linktgt.alt_u8path))
                        else:
                            linkdgst.update(util.interpolate_bytes(
                                b"%d:%s,",
                                len(linktgt.u8path),
                                linktgt.u8path))
                        if fso.u8name is None:
                            dir_tainted = True
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:S,%d:%s,",
                                len(fso.alt_u8name),
                                fso.alt_u8name))
                        else:
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:S,%d:%s,", len(fso.u8name), fso.u8name))
                    else:
                        if linktgt.fspath is None:
                            dir_tainted = True
                            linkdgst.update(util.interpolate_bytes(
                                b"%d:%s,",
                                len(linktgt.alt_fspath),
                                linktgt.alt_fspath))
                        else:
                            linkdgst.update(util.interpolate_bytes(
                                b"%d:%s,",
                                len(linktgt.fspath),
                                linktgt.fspath))
                        if fso.fsname is None:
                            dir_tainted = True
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:S,%d:%s,",
                                len(fso.alt_fsname),
                                fso.alt_fsname))
                        else:
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:S,%d:%s,", len(fso.fsname), fso.fsname))
                    #
                    # - no mtime and no mode for symlinks
                    # - also does not count for dir_size
                    #
                    dir_dgst.update(util.interpolate_bytes(
                        b"%d:%s,",
                        len(linkdgst.digest()), linkdgst.digest()))
                    opath = join_output_path(top, fso.name)
                    if self._utf8_mode:
                        opath = walk.WalkDirEntry.alt_u8(opath)
                    else:
                        opath = walk.WalkDirEntry.alt_fs(opath)
                    if self._size_only:
                        self._outfp.write(format_bsd_line(
                            "SIZE", None,
                            util.interpolate_bytes(b"%s/./@/", opath),
                            False, 0))
                    else:
                        self._outfp.write(format_bsd_line(
                            self._algorithm[1],
                            linkdgst.digest(),
                            util.interpolate_bytes(b"%s/./@/", opath),
                            self._use_base64))
                    self._outfp.flush()
                else:
                    #
                    # Follow the symlink to dir or handle a "real" directory
                    #

                    # Get subdir data from recursing into it
                    sub_dir_dgst, sub_dir_size = self._generate(
                        root, top + (fso.name, ))

                    dir_size += sub_dir_size
                    if self._utf8_mode:
                        if fso.u8name is None:
                            dir_tainted = True
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:d,%d:%s,",
                                len(fso.alt_u8name),
                                fso.alt_u8name))
                        else:
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:d,%d:%s,", len(fso.u8name), fso.u8name))
                    else:
                        if fso.fsname is None:
                            dir_tainted = True
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:d,%d:%s,",
                                len(fso.alt_fsname),
                                fso.alt_fsname))
                        else:
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:d,%d:%s,", len(fso.fsname), fso.fsname))
                    dir_dgst.update(util.interpolate_bytes(
                        b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst))
                    if self._with_metadata_full_mode:
                        modestr = util.b(normalized_mode_str(fso.stat.st_mode))
                        dir_dgst.update(util.interpolate_bytes(
                            b"8:fullmode,%d:%s,", len(modestr), modestr))
                    elif self._with_metadata_mode:
                        modestr = util.b(normalized_compatible_mode_str(
                            fso.stat.st_mode))
                        dir_dgst.update(util.interpolate_bytes(
                            b"4:mode,%d:%s,", len(modestr), modestr))
            else:
                if fso.is_symlink and not self._follow_symlinks.file:
                    linktgt = walk.WalkDirEntry.from_readlink(
                        os.readlink(fso.path))
                    # linktgt = util.fsencode(os.readlink(fso.path)))
                    linkdgst = self._algorithm[0]()
                    if self._utf8_mode:
                        if linktgt.u8path is None:
                            dir_tainted = True
                            linkdgst.update(util.interpolate_bytes(
                                b"%d:%s,",
                                len(linktgt.alt_u8path),
                                linktgt.alt_u8path))
                        else:
                            linkdgst.update(util.interpolate_bytes(
                                b"%d:%s,",
                                len(linktgt.u8path),
                                linktgt.u8path))
                        if fso.u8name is None:
                            dir_tainted = True
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:F,%d:%s,",
                                len(fso.alt_u8name),
                                fso.alt_u8name))
                        else:
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:F,%d:%s,", len(fso.u8name), fso.u8name))
                    else:
                        if linktgt.fspath is None:
                            dir_tainted = True
                            linkdgst.update(util.interpolate_bytes(
                                b"%d:%s,",
                                len(linktgt.alt_fspath),
                                linktgt.alt_fspath))
                        else:
                            linkdgst.update(util.interpolate_bytes(
                                b"%d:%s,",
                                len(linktgt.fspath),
                                linktgt.fspath))
                        if fso.fsname is None:
                            dir_tainted = True
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:F,%d:%s,",
                                len(fso.alt_fsname),
                                fso.alt_fsname))
                        else:
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:F,%d:%s,", len(fso.fsname), fso.fsname))
                    #
                    # - no mtime and no mode for symlinks
                    # - also does not count for dir_size
                    #
                    dir_dgst.update(util.interpolate_bytes(
                        b"%d:%s,",
                        len(linkdgst.digest()), linkdgst.digest()))
                    opath = join_output_path(top, fso.name)
                    if self._utf8_mode:
                        opath = walk.WalkDirEntry.alt_u8(opath)
                    else:
                        opath = walk.WalkDirEntry.alt_fs(opath)
                    if self._size_only:
                        self._outfp.write(format_bsd_line(
                            "SIZE", None,
                            util.interpolate_bytes(b"%s/./@", opath),
                            False, 0))
                    else:
                        self._outfp.write(format_bsd_line(
                            self._algorithm[1],
                            linkdgst.digest(),
                            util.interpolate_bytes(b"%s/./@", opath),
                            self._use_base64))
                    self._outfp.flush()
                else:
                    #
                    # Follow the symlink to file or handle a "real" file
                    #

                    if self._utf8_mode:
                        if fso.u8name is None:
                            dir_tainted = True
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:f,%d:%s,",
                                len(fso.alt_u8name),
                                fso.alt_u8name))
                        else:
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:f,%d:%s,", len(fso.u8name), fso.u8name))
                    else:
                        if fso.fsname is None:
                            dir_tainted = True
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:f,%d:%s,",
                                len(fso.alt_fsname),
                                fso.alt_fsname))
                        else:
                            dir_dgst.update(util.interpolate_bytes(
                                b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
                    dir_size += fso.stat.st_size
                    if self._with_metadata_mtime:
                        mtime = datetime.datetime.utcfromtimestamp(
                            int(fso.stat.st_mtime))
                        mtime = util.b(mtime.isoformat("T") + "Z")
                        dir_dgst.update(util.interpolate_bytes(
                            b"5:mtime,%d:%s,", len(mtime), mtime))
                    if self._with_metadata_full_mode:
                        modestr = util.b(normalized_mode_str(fso.stat.st_mode))
                        dir_dgst.update(util.interpolate_bytes(
                            b"8:fullmode,%d:%s,", len(modestr), modestr))
                    elif self._with_metadata_mode:
                        modestr = util.b(normalized_compatible_mode_str(
                            fso.stat.st_mode))
                        dir_dgst.update(util.interpolate_bytes(
                            b"4:mode,%d:%s,", len(modestr), modestr))
                    if not self._size_only:
                        dgst = digest.compute_digest_file(
                            self._algorithm[0],
                            fso.path,
                            use_mmap=self._use_mmap)
                        dir_dgst.update(util.interpolate_bytes(
                            b"%d:%s,", len(dgst), dgst))
                    opath = join_output_path(top, fso.name)
                    if self._utf8_mode:
                        opath = walk.WalkDirEntry.alt_u8(opath)
                    else:
                        opath = walk.WalkDirEntry.alt_fs(opath)
                    if self._size_only:
                        self._outfp.write(format_bsd_line(
                            "SIZE", None, opath, False, fso.stat.st_size))
                    else:
                        if self._print_size:
                            self._outfp.write(format_bsd_line(
                                self._algorithm[1],
                                dgst, opath,
                                self._use_base64,
                                fso.stat.st_size))
                        else:
                            self._outfp.write(format_bsd_line(
                                self._algorithm[1], dgst, opath,
                                self._use_base64))
                self._outfp.flush()
        opath = join_output_path(top, None)
        if opath:
            if self._utf8_mode:
                opath = walk.WalkDirEntry.alt_u8(opath)
            else:
                opath = walk.WalkDirEntry.alt_fs(opath)
        if self._size_only:
            self._outfp.write(format_bsd_line(
                    "SIZE", None, opath, False, dir_size))
        else:
            if dir_tainted:
                #
                # IMPORTANT: Print errors BEFORE the associated digest line.
                #            Otherwise the "info" command has a problem.
                #
                self._outfp.write(format_bsd_line(
                    b"ERROR", None, b"directory is tainted", False, None))
                logging.error("Directory has filename problems: %r", opath)
            if self._print_size:
                self._outfp.write(format_bsd_line(
                    self._algorithm[1], dir_dgst.digest(), opath,
                    self._use_base64, dir_size))
            else:
                self._outfp.write(format_bsd_line(
                    self._algorithm[1], dir_dgst.digest(), opath,
                    self._use_base64))
        self._outfp.flush()
        return (dir_dgst.digest(), dir_size)


def join_output_path(top, name):
    """Build the "/"-separated output path below the components in `top`.

    :param top: sequence of path components (all bytes or all text);
                may be empty or `None`
    :param name: the name of a file within `top` or `None` if the path
                 of the directory `top` itself is to be computed
    :returns: for a directory (`name` is `None`) the joined components
              with a trailing slash or ``b""`` if `top` is empty;
              for a file the joined components followed by a slash and
              `name` or just `name` if `top` is empty

    """
    if not top:
        # Nothing to prepend
        return b"" if name is None else name
    if name is None:
        # Directory: the type of the first component selects the separator
        sep = b"/" if isinstance(top[0], bytes) else u"/"
        return sep.join(top) + sep
    # Normal file: the type of the name selects the separator
    sep = b"/" if isinstance(name, bytes) else u"/"
    return sep.join(top) + sep + name


class CRC32Output(object):

    """Wrapper for a minimal binary file contextmanager that calculates
    the CRC32 of the written bytes on the fly.

    Also acts as context manager proxy for the given context manager:
    entering enters the wrapped context manager and keeps the object it
    yields as write target; exiting delegates to the wrapped context
    manager and forgets the write target.

    """

    __slots__ = ("_fp_cm", "_fp", "_crc32")

    def __init__(self, fp_cm):
        """
        :param fp_cm: a context manager that yields an object with
                      `write()` and `flush()` methods when entered

        """
        self._fp_cm = fp_cm
        self._fp = None
        self.resetdigest()

    def __enter__(self):
        assert self._fp is None
        self._fp = self._fp_cm.__enter__()
        return self

    def __exit__(self, *args):
        try:
            return self._fp_cm.__exit__(*args)
        finally:
            # Forget the target even if the wrapped __exit__ raises
            self._fp = None

    def write(self, what):
        """Write `what` to the underlying file and update the CRC with it"""
        self._fp.write(what)
        self._crc32 = zlib.crc32(what, self._crc32)

    def flush(self):
        """Flush the underlying file"""
        self._fp.flush()

    def resetdigest(self):
        """Reset the current CRC digest"""
        self._crc32 = zlib.crc32(b"")

    def hexcrcdigest(self):
        """Return the current CRC as uppercase hexadecimal string.

        The string has no prefix and no leading zeros.

        :rtype: str

        """
        #
        # Do not use `hex()` here: on Python2 it appends an "L" to
        # values of type `long`.
        #
        return "%X" % (self.crcdigest(),)

    def crcdigest(self):
        """Return the current CRC as unsigned 32-bit number.

        :rtype: int

        """
        #
        # Python2 yields signed values: masking gives the same bit pattern
        # as unsigned number on all Python versions.
        #
        return self._crc32 & 0xFFFFFFFF


def normalized_compatible_mode_str(mode):
    """Return the permission bits of `mode` as octal string that always
    starts with a "0".

    Only the bits selected by :func:`stat.S_IMODE` are considered.

    """
    # XXX FIXME: Windows and "executable"
    octal = "%o" % (stat.S_IMODE(mode),)
    return octal if octal.startswith("0") else "0" + octal


def normalized_mode_str(mode):
    """Return the complete `mode` value as octal string that always
    starts with a "0".

    """
    octal = "%o" % (mode,)
    return octal if octal.startswith("0") else "0" + octal


def format_bsd_line(what, value, filename, use_base64, size=None):
    """Format a single line of a digest file, terminated with
    :data:`os.linesep`.

    The produced layout depends on the tag `what`:

    - ``TIMESTAMP``: ``TIMESTAMP = <value as number>``
    - ``FSENCODING``, ``ISOTIMESTAMP``, ``FLAGS``, ``VERSION``, ``CRC32``:
      ``<what> = <value>``
    - ``COMMENT``, ``ERROR``, ``GENERATOR``: ``<what> (<filename>)``
    - ``SIZE``: ``SIZE (<filename>) = <size>``
    - all others with a `value` of `None`: ``<what> (<filename>)``
    - all others: ``<what> (<filename>) = <encoded value>`` with an
      additional ``,<size>`` if `size` is given

    :param what: the tag of the line; text is encoded as ASCII
    :param value: the value of the line; a digest is expected as bytes
    :param filename: the name to put into the parentheses; must be `None`
                     for the ``<what> = <value>`` layouts and must not be
                     `None` for all others
    :param bool use_base64: encode a digest value as base64 instead of
                            hexadecimal
    :param size: optional size
    :rtype: bytes

    """
    eol = util.b(os.linesep)
    tag = what if isinstance(what, bytes) else what.encode("ascii")

    # Records without a filename part
    if tag == b"TIMESTAMP":
        assert filename is None
        return util.interpolate_bytes(b"TIMESTAMP = %d%s", value, eol)
    if tag in (b"FSENCODING", b"ISOTIMESTAMP", b"FLAGS", b"VERSION",
               b"CRC32"):
        assert filename is None
        return util.interpolate_bytes(b"%s = %s%s", tag, util.b(value), eol)

    # All other records carry their payload within parentheses
    assert filename is not None
    if tag in (b"COMMENT", b"ERROR", b"GENERATOR"):
        return util.interpolate_bytes(
            b"%s (%s)%s", tag, util.b(filename, "utf-8"), eol)

    fname = filename if isinstance(filename, bytes) \
        else util.fsencode(filename)
    if tag == b"SIZE":
        return util.interpolate_bytes(b"SIZE (%s) = %d%s", fname, size, eol)
    if value is None:
        return util.interpolate_bytes(b"%s (%s)%s", tag, fname, eol)

    # A real digest line
    encode = base64.b64encode if use_base64 else binascii.hexlify
    encoded = encode(value)
    if fname != b"./@/":
        fname = util.normalize_filename(fname, True)
    if size is not None:
        return util.interpolate_bytes(
            b"%s (%s) = %s,%d%s", tag, fname, encoded, size, eol)
    return util.interpolate_bytes(
        b"%s (%s) = %s%s", tag, fname, encoded, eol)


class TreesumReader(object):

    """Reader to read and/or verify treesum digest files.

    Supports the iterator and context manager protocol.

    While reading a CRC32 is computed over all lines of a block (starting
    with its `VERSION` line) and compared with the value of the block's
    `CRC32` line. Problems with the CRC are reported as warnings using
    :mod:`logging`.

    """

    PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z")   # empty lines
    PATTERN1 = re.compile(br"\A(VERSION|FSENCODING|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z")      # noqa: E501  line too long
    PATTERN2 = re.compile(br"\A(ROOT|COMMENT|ERROR|GENERATOR)[ \t]*\((.*)\)[ \t]*\r?\n\Z")                                       # noqa: E501  line too long
    PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z")                                               # noqa: E501  line too long
    PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z")              # noqa: E501  line too long

    def __init__(self, _fp, _filename, _own_fp):
        """
        :param _fp: the binary file object to read from
        :param _filename: the name that belongs to `_fp`
        :param bool _own_fp: whether :meth:`close` should close `_fp`

        Use :meth:`from_path` or :meth:`from_binary_buffer` instead of
        calling this directly.

        """
        self._fp = _fp
        self._own_fp = _own_fp
        self._filename = _filename
        self._line_no = 0
        self._reset_crc()
        self._expect_crc = None  # NOTE: tristate: None is different from False
        self._current_algo_name = self._current_algo_digest_size = None

    @classmethod
    def from_path(cls_, path):
        """Open file at `path` and return a reader that owns the file object"""
        return cls_(open(path, "rb"), path, True)

    @classmethod
    def from_binary_buffer(cls_, binary_fp, filename):
        """Return a reader for the already opened binary file `binary_fp`.

        The reader does not own `binary_fp`: it is not closed by
        :meth:`close`.

        """
        return cls_(binary_fp, filename, False)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        """Release the file object and close it if it is owned"""
        if self._fp is not None:
            try:
                if self._own_fp:
                    self._fp.close()
            finally:
                self._fp = None

    def __iter__(self):
        return self

    def __next__(self):
        rec = self.read_record()
        if rec is None:
            raise StopIteration()
        return rec

    # Python2 iterator protocol; a harmless alias on Python3
    next = __next__

    def all_records(self):
        """Iterator over all remaining records"""
        while True:
            rec = self.read_record()
            if rec is None:
                return
            yield rec

    def read_record(self):
        """Read and parse the "next" line.

        Empty lines are skipped. An embedded signify signature at the
        very beginning of the file (an "untrusted comment: " line followed
        by the base64 encoded signature line) is skipped transparently.

        The returned tuples are:

        - ``(name, value)`` for `VERSION`, `FSENCODING`, `FLAGS`,
          `TIMESTAMP`, `ISOTIMESTAMP` and `CRC32`
        - ``(name, text)`` for `COMMENT`, `ERROR` and `GENERATOR`
        - ``("ROOT", bytes)``
        - ``("SIZE", filename_bytes, size)``
        - ``(algorithm_name, filename_bytes, digest_bytes, size_or_None)``
          for digest lines

        :returns: `None` at EOF or the parsed contents of the line
        :rtype: tuple or None
        :raises binascii.Error: if a signify comment line is not followed
                                by a valid signature line
        :raises AssertionError: if a line is not recognized

        """
        # Loop to skip empty lines
        while True:
            line = self._get_next_line()
            if not line:
                #
                # Skip for empty files at the very beginning.
                # Check only after the first VERSION line.
                #
                if self._expect_crc is not None:
                    if self._expect_crc:
                        logging.warning("CRC32 is missing at EOF")
                return None
            if not self.PATTERN0.search(line):
                break
            self._update_crc(line)
        #
        # At the beginning transparently skip an eventually embedded signify
        # signature
        #
        if self._line_no == 1:
            if line.startswith(b"untrusted comment: "):
                line = self._get_next_line()
                if not line.endswith(b"\n"):
                    raise binascii.Error("No valid signify signature value")
                # Try to decode for an early error check
                base64.b64decode(line[:-1])
                #
                # The signature line is not a record: continue with the
                # first real record. This cannot recurse again because
                # the line number is > 1 now.
                #
                return self.read_record()

        mo = self.PATTERN1.search(line)
        if mo:
            if mo.group(1) == b"VERSION":
                # A VERSION line starts a new block with a fresh CRC
                if self._expect_crc:
                    logging.warning("CRC32 missing before line %d",
                                    self._line_no)
                self._reset_crc()
                self._expect_crc = True
                self._update_crc(line)
                return ("VERSION", util.n(mo.group(2)))
            if mo.group(1) == b"CRC32":
                # TODO: check
                if self._expect_crc is None:
                    logging.warning("Lone CRC32 before VERSION in line %d",
                                    self._line_no)
                elif self._expect_crc:
                    given = mo.group(2).decode("latin1").upper()
                    expected = self._hex_crc()
                    #
                    # `hex()` on Python2 appends an "L" to `long` values:
                    # accept CRC values that have been formatted this way.
                    #
                    comparable = given[:-1] if given.endswith("L") else given
                    if expected != comparable:
                        logging.warning(
                            "CRC32 mismatch in line %d:"
                            " expected: %s, given: %s",
                            self._line_no,
                            expected,
                            given)
                else:
                    logging.warning("CRC32 before VERSION in line %d",
                                    self._line_no)
                # Do not update the CRC here but reset the state
                self._expect_crc = False
                return ("CRC32", util.n(mo.group(2)))
            self._update_crc(line)
            return (util.n(mo.group(1)), util.n(mo.group(2)))

        mo = self.PATTERN2.search(line)
        if mo:
            self._update_crc(line)
            if mo.group(1) in (b"COMMENT", b"ERROR", b"GENERATOR"):
                return (util.u(mo.group(1)), util.u(mo.group(2), "utf-8"))
            elif mo.group(1) == b"ROOT":
                return ("ROOT", mo.group(2))
            assert False, line
            return line

        mo = self.PATTERN3.search(line)
        if mo:
            self._update_crc(line)
            return ("SIZE", mo.group(1), int(util.n(mo.group(2)), 10))

        mo = self.PATTERN4.search(line)
        if mo:
            self._update_crc(line)
            algo_name = util.n(mo.group(1))
            #
            # A value with exactly twice the digest size is taken as
            # hexadecimal, everything else as base64.
            #
            if len(mo.group(3)) == 2 * self._get_digest_size(algo_name):
                dgst = binascii.unhexlify(mo.group(3))
            else:
                dgst = base64.b64decode(mo.group(3))
            if mo.group(4):
                size = int(util.n(mo.group(5)), 10)
            else:
                size = None
            return (algo_name, mo.group(2), dgst, size)

        # NOTE: with disabled assertions the raw line is returned
        assert False, line
        return line

    def _get_next_line(self):
        """Read at most 4096 bytes of the next line and count it.

        :returns: the data read; empty at EOF

        """
        line = self._fp.readline(4096)      # along PATH_MAX on Linux
        if line:
            self._line_no += 1
        return line

    def _reset_crc(self):
        self._crc32 = zlib.crc32(b"")

    def _update_crc(self, data):
        self._crc32 = zlib.crc32(data, self._crc32)

    def _hex_crc(self):
        """Return the current CRC as uppercase hexadecimal string without
        prefix and without leading zeros.

        """
        #
        # Do not use `hex()` here: on Python2 it appends an "L" to
        # values of type `long`.
        #
        return "%X" % (self._get_crc(),)

    def _get_crc(self):
        """Get the current CRC always as positive number with the same bit
        pattern because Python2 yields negative numbers also.

        :return: The current CRC value as positive number on all Python
                 versions
        :rtype: int

        """
        return self._crc32 & 0xFFFFFFFF

    def _get_digest_size(self, algo_name):
        """Return the digest size in bytes of the algorithm `algo_name`.

        The result for the most recently requested algorithm is remembered.

        """
        if self._current_algo_name == algo_name:
            return self._current_algo_digest_size
        h = util.algotag2algotype(algo_name)()
        self._current_algo_name = algo_name
        self._current_algo_digest_size = h.digest_size
        return self._current_algo_digest_size


def print_treesum_digestfile_infos(opts):
    """Print the block infos for the digest files given in `opts`.

    :param opts: an object with the attributes `digest_files` and
                 `print_only_last_block`

    """
    print_infos_for_digestfile(
        opts.digest_files,
        print_only_last_block=opts.print_only_last_block)


def print_infos_for_digestfile(digest_files, print_only_last_block=True):
    """Print summarizing infos about the blocks in treesum digest files.

    A block starts with a `VERSION` record and is complete when the
    digest or size record of its root (an empty name or ``./@/``) has
    been read.

    :param digest_files: the names of the digest files to read;
                         ``-`` means reading from `stdin`
    :param bool print_only_last_block: print only the last block of every
                                       file instead of all blocks

    """
    for fn in digest_files:
        if fn == "-":
            # `from_binary_buffer()` needs a filename also
            if util.PY2:
                reader = TreesumReader.from_binary_buffer(
                    sys.stdin, "<stdin>")
            else:
                reader = TreesumReader.from_binary_buffer(
                    sys.stdin.buffer, "<stdin>")
        else:
            reader = TreesumReader.from_path(fn)

        with reader:
            root = generator = flags = fsencoding = algorithm = digest \
                = size = None
            errors = set()
            comments = []
            in_block = False
            block_no = 0
            for record in reader:
                if record[0] == "VERSION":
                    assert record[1] == "1"
                    # start a new block
                    in_block = True
                    block_no += 1
                    #
                    # NOTE(review): `errors`, `generator` and `fsencoding`
                    #               are not reset here -- confirm that
                    #               this is intended.
                    #
                    root = flags = algorithm = digest = size = None
                    comments = []
                elif record[0] == "GENERATOR":
                    generator = record[1]
                elif record[0] == "FSENCODING":
                    fsencoding = record[1]
                elif record[0] == "FLAGS":
                    flags = record[1]
                elif record[0] == "ROOT":
                    root = record[1]
                elif record[0] == "COMMENT":
                    comments.append(record[1])
                elif record[0] == "ERROR":
                    errors.add(record[1])
                elif record[0] in ("TIMESTAMP", "ISOTIMESTAMP"):
                    pass
                elif record[0] == "CRC32":
                    pass
                    # in_block = False
                else:
                    if not in_block:
                        continue
                    # digest line or size line
                    if not record[1] or record[1] == b"./@/":
                        # The record of the root: the block is complete
                        if record[0] == "SIZE":
                            algorithm = "SIZE"
                            digest = None
                            size = record[2]
                        else:
                            algorithm = record[0]
                            digest = record[2]
                            size = record[3]
                        if not print_only_last_block:
                            print_block_data(
                                block_no,
                                root, generator, fsencoding, flags, comments,
                                errors, algorithm, digest, size)
                            root = generator = flags = fsencoding = algorithm \
                                = digest = size = None
                            errors = set()
                            comments = []
                        in_block = False
        if print_only_last_block:
            if not in_block:
                # Print only if a complete block has been seen
                if digest is not None or size is not None:
                    print_block_data(
                        block_no,
                        root, generator, fsencoding, flags, comments, errors,
                        algorithm, digest, size)
            else:
                logging.warning("missing block end")


def print_block_data(block_no, tag, generator, fsencoding, flags, comments,
                     errors, algorithm, digest, size):
    """Print the collected data of a single block to `stdout`.

    The generator and the comments are printed only if they are not
    empty. The digest is printed in hexadecimal notation and is left out
    if `algorithm` is ``SIZE``.

    """
    if digest:
        digeststr = util.n(binascii.hexlify(digest))
    else:
        digeststr = "<no digest>"
    if size is None:
        sizestr = "<no size>"
    else:
        sizestr = str(size)

    print("BLOCK No %d:" % (block_no,))
    print("    Tag:", tag)
    print("    FS-Encoding:", fsencoding)
    if generator:
        print("    Generator:", generator)
    print("    Flags:", flags or "<none>")
    if comments:
        print("    Comments:", comments)
    print("    Algorithm:", algorithm)
    if algorithm != "SIZE":
        print("    Digest:", digeststr)
    print("    Size:", sizestr)
    print("    Errors:", errors or "<none>")


# Script entry point: run main() and hand its return value to sys.exit()
if __name__ == "__main__":
    sys.exit(main())