# HG changeset patch
# User Franz Glasner
# Date 1735820960 -3600
# Node ID 3bd3f32b5e6035aaa5b36ccc30cb824cc5486f4c
# Parent  4a0c3c9eead744836076e911075df10deef6286a
A first version of "treesum" is working

diff -r 4a0c3c9eead7 -r 3bd3f32b5e60 README.txt
--- a/README.txt	Thu Jan 02 12:43:40 2025 +0100
+++ b/README.txt	Thu Jan 02 13:29:20 2025 +0100
@@ -7,11 +7,15 @@
 :Date: 2022-05-30
 :Copyright: (c) 2020-2022 Franz Glasner
 :License: BSD 3-Clause "New" or "Revised" License
-:ID: @(#) $HGid$
+:ID: @(#) $HGid$
 
-Pure Python implementations of coreutils.
+Pure Python implementations of coreutils and some additional utilities.
 
 Currently implemented:
 
 - `dos2unix` as :command:`py-dos2unix`
 - `shasum` as :command:`py-shasum`
+
+Additional utilities:
+
+- :command:`py-treesum` to compute hash trees for directories.
diff -r 4a0c3c9eead7 -r 3bd3f32b5e60 cutils/treesum.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cutils/treesum.py	Thu Jan 02 13:29:20 2025 +0100
@@ -0,0 +1,247 @@
+# -*- coding: utf-8 -*-
+# :-
+# :Copyright: (c) 2020-2025 Franz Glasner
+# :License: BSD-3-Clause
+# :-
+r"""Generate and verify checksums for directory trees.
+
+"""
+
+from __future__ import print_function, absolute_import
+
+
+__all__ = []
+
+
+import argparse
+import base64
+import binascii
+import os
+import sys
+
+from . import (__version__, __revision__)
+from . import util
+from .util import cm
+from .util import digest
+from .util import walk
+
+
+def main(argv=None):
+    """Parse the command line and generate the requested checksums.
+
+    :param argv: the arguments to parse; if `None` then :mod:`argparse`
+                 takes them from ``sys.argv``
+    :return: the return value of :func:`treesum`
+
+    """
+    aparser = argparse.ArgumentParser(
+        description="Generate checksums for directory trees",
+        fromfile_prefix_chars='@')
+    aparser.add_argument(
+        "--algorithm", "-a", action="store", type=util.argv2algo,
+        help="1 (aka sha1), 224, 256, 384, 512, "
+             "3 (alias for sha3-512), 3-224, 3-256, 3-384, 3-512, "
+             "blake2b, blake2b-256 (default), blake2s, "
+             "blake2 (alias for blake2b), blake2-256 (alias for blake2b-256), "
+             "md5")
+    aparser.add_argument(
+        "--append-output", action="store_true", dest="append_output",
+        help="Append to the output file instead of overwriting it.")
+    aparser.add_argument(
+        "--base64", action="store_true",
+        help="Output checksums in base64 notation, not hexadecimal (OpenBSD).")
+    aparser.add_argument(
+        "--mmap", action="store_true", dest="mmap", default=None,
+        help="Use mmap if available. Default is to determine automatically "
+             "from the filesize.")
+    aparser.add_argument(
+        "--no-mmap", action="store_false", dest="mmap", default=None,
+        help="Don't use mmap.")
+    aparser.add_argument(
+        "--output", "-o", action="store", metavar="OUTPUT",
+        help="Put the checksum into given file. If not given or if it is given"
+             " as `-' then stdout is used.")
+    aparser.add_argument(
+        "--version", "-v", action="version",
+        version="%s (rv:%s)" % (__version__, __revision__))
+    aparser.add_argument(
+        "directories", nargs="*", metavar="DIRECTORY")
+
+    opts = aparser.parse_args(args=argv)
+
+    if not opts.algorithm:
+        # No algorithm given: use the documented default
+        opts.algorithm = util.argv2algo("blake2b-256")
+
+    return treesum(opts)
+
+
+def gen_opts(directories=None,
+             algorithm="BLAKE2b-256",
+             append_output=False,
+             base64=False,
+             mmap=None,
+             output=None):
+    """Create an options object for :func:`treesum` programmatically.
+
+    The parameters correspond to the command line options of :func:`main`.
+
+    :param directories: the directories to process; `None` is handled
+                        like an empty list
+    :param str algorithm: an algorithm tag that is resolved by
+                          :func:`util.algotag2algotype`
+    :return: the options as :class:`argparse.Namespace`
+
+    """
+    # Always store a fresh list: a mutable default would be shared by all
+    # calls and the sequence of the caller is to be left alone.
+    directories = [] if directories is None else list(directories)
+    opts = argparse.Namespace(directories=directories,
+                              algorithm=(util.algotag2algotype(algorithm),
+                                         algorithm),
+                              append_output=append_output,
+                              base64=base64,
+                              mmap=mmap,
+                              output=output)
+    return opts
+
+
+def treesum(opts):
+    """Run the operation that is selected by `opts`.
+
+    Currently checksums are generated unconditionally.
+
+    """
+    # XXX TBD: opts.check and opts.checklist (as in shasum.py)
+    return generate_treesum(opts)
+
+
+def generate_treesum(opts):
+    """Write the checksums for all the directories in `opts.directories`.
+
+    Without any given directory the current directory is processed.
+    The output goes into the file `opts.output`; if this is `None` or
+    ``-`` then stdout is used.
+
+    """
+    # Leave `opts.directories` alone: the list may belong to the caller
+    directories = opts.directories or ["."]
+
+    if opts.output is None or opts.output == "-":
+        # Binary data is written: prefer the buffer of stdout if available
+        if hasattr(sys.stdout, "buffer"):
+            out_cm = cm.nullcontext(sys.stdout.buffer)
+        else:
+            out_cm = cm.nullcontext(sys.stdout)
+    else:
+        if opts.append_output:
+            out_cm = open(opts.output, "ab")
+        else:
+            out_cm = open(opts.output, "wb")
+
+    with out_cm as outfp:
+        for d in directories:
+            generate_treesum_for_directory(
+                outfp, d, opts.algorithm, opts.mmap, opts.base64)
+
+
+def generate_treesum_for_directory(
+        outfp, root, algorithm, use_mmap, use_base64):
+    """Write the checksums of the directory tree `root` into `outfp`.
+
+    A ``ROOT`` line comes first. Every visited directory yields a line
+    for each of its symbolic links to directories, a line for each of
+    its non-directory entries and finally a line for the directory
+    itself.
+
+    :param outfp: a *binary* file with a "write()" and a "flush()" method
+    :param root: the directory to start with
+    :param algorithm: a pair: its first item is called to create new
+                      digest objects and its second item is the
+                      algorithm tag for the output
+    :param use_mmap: handed over to :func:`digest.compute_digest_file`
+    :param bool use_base64: print digests as base64 instead of hexadecimal
+
+    """
+    outfp.write(format_bsd_line("ROOT", None, root, False))
+    outfp.flush()
+    # Digests of the directories done so far (keys: tuples of path parts)
+    dir_digests = {}
+
+    for top, dirs, nondirs in walk.walk(root, follow_symlinks=False):
+        dir_dgst = algorithm[0]()
+        for dn in dirs:
+            if dn.is_symlink:
+                # Symlinks are not followed: the link target is hashed
+                linktgt = util.fsencode(os.readlink(dn.path))
+                linkdgst = algorithm[0]()
+                linkdgst.update(linktgt)
+                dir_dgst.update(b"1:S,%d:%s," % (len(dn.fsname), dn.fsname))
+                dir_dgst.update(linkdgst.digest())
+                if top:
+                    opath = "/".join(top) + "/" + dn.name
+                else:
+                    opath = dn.name
+                outfp.write(
+                    format_bsd_line(
+                        algorithm[1],
+                        linkdgst.digest(),
+                        "%s/./@" % (opath,),
+                        use_base64))
+                outfp.flush()
+            else:
+                # fetch from dir_digests
+                # NOTE: presumably walk.walk() yields subdirectories before
+                # their parents -- otherwise this lookup fails
+                dgst = dir_digests[top + (dn.name,)]
+                dir_dgst.update(b"1:d,%d:%s," % (len(dn.fsname), dn.fsname))
+                dir_dgst.update(dgst)
+        for fn in nondirs:
+            dir_dgst.update(b"1:f,%d:%s," % (len(fn.fsname), fn.fsname))
+            dgst = digest.compute_digest_file(
+                algorithm[0], fn.path, use_mmap=use_mmap)
+            dir_dgst.update(dgst)
+            if top:
+                opath = "/".join(top) + "/" + fn.name
+            else:
+                opath = fn.name
+            outfp.write(format_bsd_line(
+                algorithm[1], dgst, opath, use_base64))
+            outfp.flush()
+        outfp.write(format_bsd_line(
+            algorithm[1], dir_dgst.digest(), "/".join(top) + "/", use_base64))
+        outfp.flush()
+        dir_digests[top] = dir_dgst.digest()
+
+
+def format_bsd_line(digestname, digest, filename, use_base64):
+    """Format a single output line with a BSD-style layout.
+
+    :param digestname: the tag at the start of the line
+    :param digest: the binary digest; if `None` the line consists of
+                   the tag and the filename only
+    :param filename: the filename to print
+    :param bool use_base64: encode `digest` as base64 instead of
+                            hexadecimal
+    :return: the complete line with a trailing line separator
+    :rtype: bytes
+
+    """
+    ls = os.linesep if isinstance(os.linesep, bytes) \
+        else os.linesep.encode("utf-8")
+    if not isinstance(digestname, bytes):
+        digestname = digestname.encode("ascii")
+    if not isinstance(filename, bytes):
+        filename = util.fsencode(filename)
+    if digest is None:
+        return b"%s (%s)%s" % (digestname, filename, ls)
+    if use_base64:
+        digest = base64.b64encode(digest)
+    else:
+        digest = binascii.hexlify(digest)
+    return b"%s (%s) = %s%s" \
+        % (digestname, util.normalize_filename(filename, True), digest, ls)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff -r 4a0c3c9eead7 -r 3bd3f32b5e60 cutils/util/__init__.py
--- a/cutils/util/__init__.py	Thu Jan 02 12:43:40 2025 +0100
+++ b/cutils/util/__init__.py	Thu Jan 02 13:29:20 2025 +0100
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 # :-
-# :Copyright: (c) 2020-2024 Franz Glasner
+# :Copyright: (c) 2020-2025 Franz Glasner
 # :License: BSD-3-Clause
 # :-
 r"""Utility package.
@@ -18,6 +18,7 @@
 
 import argparse
 import hashlib
+import os
 
 
 def get_blake2b():
@@ -143,8 +144,31 @@
 
 
 def normalize_filename(filename, strip_leading_dot_slash=False):
-    filename = filename.replace("\\", "/")
-    if strip_leading_dot_slash:
-        while filename.startswith("./"):
-            filename = filename[2:]
+    """Replace backslashes by slashes and optionally strip leading "./".
+
+    Both :class:`bytes` and text filenames are handled; the result has
+    the same type as `filename`.
+
+    """
+    if isinstance(filename, bytes):
+        filename = filename.replace(b"\\", b"/")
+        if strip_leading_dot_slash:
+            while filename.startswith(b"./"):
+                filename = filename[2:]
+    else:
+        filename = filename.replace("\\", "/")
+        if strip_leading_dot_slash:
+            while filename.startswith("./"):
+                filename = filename[2:]
     return filename
+
+
+def fsencode(what):
+    """A somewhat compatibility function for :func:`os.fsencode`.
+
+    If `what` is of type :class:`bytes` no :func:`os.fsencode` is required.
+
+    """
+    if isinstance(what, bytes):
+        return what
+    return os.fsencode(what)
diff -r 4a0c3c9eead7 -r 3bd3f32b5e60 setup.py
--- a/setup.py	Thu Jan 02 12:43:40 2025 +0100
+++ b/setup.py	Thu Jan 02 13:29:20 2025 +0100
@@ -55,6 +55,7 @@
         "console_scripts": [
             "py-dos2unix=cutils.dos2unix:main",
             "py-shasum=cutils.shasum:main",
+            "py-treesum=cutils.treesum:main",
         ]
     }
 )
diff -r 4a0c3c9eead7 -r 3bd3f32b5e60 treesum.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/treesum.py	Thu Jan 02 13:29:20 2025 +0100
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+# :-
+# :Copyright: (c) 2020-2024 Franz Glasner
+# :License: BSD-3-Clause
+# :-
+r"""Pure Python implementation of a directory tree checksum.
+
+"""
+
+from __future__ import absolute_import
+
+import sys
+
+import cutils.treesum
+
+
+sys.exit(cutils.treesum.main())