changeset 177:089c40240061

Add an alternate implementation for generating directory tree digests:
- Do not use something like os.walk() but use os.scandir() directly.
- Recursively generate the subdirectory digests only when needed and in the right order.
This fixes the problem that the order of subdirectories in the output did not match the order in which their directory digests are applied.
The new implementation should also make filtering (to be implemented later) easier.
NOTE: The tree digests of the old and the new implementation are identical.
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Jan 2025 17:41:28 +0100
parents 7f5d05a625fd
children dac26a2d9de5
files cutils/treesum.py cutils/util/walk.py
diffstat 2 files changed, 242 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/cutils/treesum.py	Sat Jan 11 13:20:14 2025 +0100
+++ b/cutils/treesum.py	Sat Jan 11 17:41:28 2025 +0100
@@ -281,6 +281,18 @@
 
     with out_cm as outfp:
         for d in opts.directories:
+
+            V1DirectoryTreesumGenerator(
+                opts.algorithm, opts.mmap, opts.base64, opts.logical,
+                opts.follow_directory_symlinks,
+                opts.metadata_mode,
+                opts.metadata_full_mode,
+                opts.metadata_mtime,
+                opts.size_only,
+                opts.print_size,
+                minimal=opts.minimal).generate(
+                    outfp, d, comment=opts.comment)
+
             generate_treesum_for_directory(
                 outfp, d, opts.algorithm, opts.mmap, opts.base64, opts.logical,
                 opts.follow_directory_symlinks,
@@ -293,6 +305,234 @@
                 comment=opts.comment)
 
 
+class V1DirectoryTreesumGenerator(object):
+
+    def __init__(self, algorithm, use_mmap, use_base64,
+                 handle_root_logical, follow_directory_symlinks,
+                 with_metadata_mode, with_metadata_full_mode,
+                 with_metadata_mtime, size_only, print_size,
+                 minimal=None,):
+        super(V1DirectoryTreesumGenerator, self).__init__()
+        self._algorithm = algorithm
+        self._use_mmap = use_mmap
+        self._use_base64 = use_base64
+        self._handle_root_logical = handle_root_logical
+        self._follow_directory_symlinks = follow_directory_symlinks
+        self._with_metadata_mode = with_metadata_mode
+        self._with_metadata_full_mode = with_metadata_full_mode
+        self._with_metadata_mtime = with_metadata_mtime
+        self._size_only = size_only
+        self._print_size = print_size
+        self._minimal = minimal
+
+    def generate(self, outfp, root, comment=None):
+        """
+
+        :param outfp: a *binary* file with a "write()" and a "flush()" method
+
+        """
+        self._outfp = outfp
+        self._outfp.write(format_bsd_line("VERSION", "1", None, False))
+        self._outfp.flush()
+
+        #
+        # Note: Given non-default flags that are relevant for
+        #       directory traversal.
+        #
+        flags = []
+        if self._with_metadata_full_mode:
+            flags.append("with-metadata-fullmode")
+        elif self._with_metadata_mode:
+            flags.append("with-metadata-mode")
+        if self._with_metadata_mtime:
+            flags.append("with-metadata-mtime")
+        if self._handle_root_logical:
+            flags.append("logical")
+        if self._follow_directory_symlinks:
+            flags.append("follow-directory-symlinks")
+        if self._size_only:
+            flags.append("size-only")
+        else:
+            if self._print_size:
+                flags.append("print-size")
+        if flags:
+            flags.sort()
+            self._outfp.write(
+                format_bsd_line("FLAGS", ",".join(flags), None, False))
+
+        if self._minimal is None:
+            # Write execution timestamps in POSIX epoch and ISO format
+            ts = int(time.time())
+            self._outfp.write(format_bsd_line("TIMESTAMP", ts, None, False))
+            ts = (datetime.datetime.utcfromtimestamp(ts)).isoformat("T")
+            self._outfp.write(format_bsd_line("ISOTIMESTAMP", ts, None, False))
+
+            if comment:
+                for line in comment:
+                    self._outfp.write(
+                        format_bsd_line("COMMENT", None, line, False))
+
+        if self._minimal is not None:
+            self._outfp.write(format_bsd_line(
+                "ROOT", None, self._minimal if self._minimal else "", False))
+        else:
+            self._outfp.write(format_bsd_line("ROOT", None, root, False))
+        self._outfp.flush()
+
+        if not self._handle_root_logical and os.path.islink(root):
+            linktgt = util.fsencode(os.readlink(root))
+            linkdgst = self._algorithm[0]()
+            linkdgst.update(
+                util.interpolate_bytes(b"%d:%s,", len(linktgt), linktgt))
+            dir_dgst = self._algorithm[0]()
+            dir_dgst.update(b"1:L,")
+            dir_dgst.update(
+                util.interpolate_bytes(
+                    b"%d:%s,", len(linkdgst.digest()), linkdgst.digest()))
+            if self._size_only:
+                self._outfp.write(
+                    format_bsd_line(
+                        "SIZE",
+                        None,
+                        "./@",
+                        False,
+                        0))
+            else:
+                self._outfp.write(
+                    format_bsd_line(
+                        self._algorithm[1],
+                        dir_dgst.digest(),
+                        "./@",
+                        self._use_base64))
+            self._outfp.flush()
+            return
+
+        self._generate(os.path.normpath(root), tuple())
+
+    def _generate(self, root, top):
+        logging.debug("Handling %s/%r", root, top)
+        path = os.path.join(root, *top) if top else root
+        with walk.ScanDir(path) as dirscan:
+            fsobjects = list(dirscan)
+        fsobjects.sort(key=walk.WalkDirEntry.sort_key)
+        dir_dgst = self._algorithm[0]()
+        dir_size = 0
+        for fso in fsobjects:
+            if fso.is_dir:
+                if fso.is_symlink and not self._follow_directory_symlinks:
+                    linktgt = util.fsencode(os.readlink(fso.path))
+                    linkdgst = self._algorithm[0]()
+                    linkdgst.update(
+                        util.interpolate_bytes(
+                            b"%d:%s,", len(linktgt), linktgt))
+                    dir_dgst.update(util.interpolate_bytes(
+                        b"1:S,%d:%s,", len(fso.fsname), fso.fsname))
+                    #
+                    # - no mtime and no mode for symlinks
+                    # - also does not count for dir_size
+                    #
+                    dir_dgst.update(util.interpolate_bytes(
+                        b"%d:%s,",
+                        len(linkdgst.digest()), linkdgst.digest()))
+                    opath = "/".join(top) + "/" + fso.name if top else fso.name
+                    if self._size_only:
+                        self._outfp.write(format_bsd_line(
+                            "SIZE", None, "%s/./@" % (opath,), False, 0))
+                    else:
+                        self._outfp.write(format_bsd_line(
+                            self._algorithm[1],
+                            linkdgst.digest(),
+                            "%s/./@" % (opath,),
+                            self._use_base64))
+                    self._outfp.flush()
+                else:
+                    #
+                    # Follow the symlink to dir or handle a "real" directory
+                    #
+
+                    # Get subdir data from recursing into it
+                    sub_dir_dgst, sub_dir_size = self._generate(
+                        root, top + (fso.name, ))
+
+                    dir_size += sub_dir_size
+                    dir_dgst.update(util.interpolate_bytes(
+                        b"1:d,%d:%s,", len(fso.fsname), fso.fsname))
+                    dir_dgst.update(util.interpolate_bytes(
+                        b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst))
+                    if self._with_metadata_full_mode:
+                        modestr = normalized_mode_str(fso.stat.st_mode)
+                        if not isinstance(modestr, bytes):
+                            modestr = modestr.encode("ascii")
+                        dir_dgst.update(util.interpolate_bytes(
+                            b"8:fullmode,%d:%s,", len(modestr), modestr))
+                    elif self._with_metadata_mode:
+                        modestr = normalized_compatible_mode_str(
+                            fso.stat.st_mode)
+                        if not isinstance(modestr, bytes):
+                            modestr = modestr.encode("ascii")
+                        dir_dgst.update(util.interpolate_bytes(
+                            b"4:mode,%d:%s,", len(modestr), modestr))
+            else:
+                dir_dgst.update(util.interpolate_bytes(
+                    b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
+                dir_size += fso.stat.st_size
+                if self._with_metadata_mtime:
+                    mtime = datetime.datetime.utcfromtimestamp(
+                        int(fso.stat.st_mtime))
+                    mtime = mtime.isoformat("T") + "Z"
+                    if not isinstance(mtime, bytes):
+                        mtime = mtime.encode("ascii")
+                    dir_dgst.update(util.interpolate_bytes(
+                        b"5:mtime,%d:%s,", len(mtime), mtime))
+                if self._with_metadata_full_mode:
+                    modestr = normalized_mode_str(fso.stat.st_mode)
+                    if not isinstance(modestr, bytes):
+                        modestr = modestr.encode("ascii")
+                    dir_dgst.update(util.interpolate_bytes(
+                        b"8:fullmode,%d:%s,", len(modestr), modestr))
+                elif self._with_metadata_mode:
+                    modestr = normalized_compatible_mode_str(fso.stat.st_mode)
+                    if not isinstance(modestr, bytes):
+                        modestr = modestr.encode("ascii")
+                    dir_dgst.update(util.interpolate_bytes(
+                        b"4:mode,%d:%s,", len(modestr), modestr))
+                if not self._size_only:
+                    dgst = digest.compute_digest_file(
+                        self._algorithm[0], fso.path, use_mmap=self._use_mmap)
+                    dir_dgst.update(util.interpolate_bytes(
+                        b"%d:%s,", len(dgst), dgst))
+                opath = "/".join(top) + "/" + fso.name if top else fso.name
+                if self._size_only:
+                    self._outfp.write(format_bsd_line(
+                        "SIZE", None, opath, False, fso.stat.st_size))
+                else:
+                    if self._print_size:
+                        self._outfp.write(format_bsd_line(
+                            self._algorithm[1], dgst, opath, self._use_base64,
+                            fso.stat.st_size))
+                    else:
+                        self._outfp.write(format_bsd_line(
+                            self._algorithm[1], dgst, opath,
+                            self._use_base64))
+                self._outfp.flush()
+
+        opath = "/".join(top) + "/" if top else ""
+        if self._size_only:
+            self._outfp.write(format_bsd_line(
+                    "SIZE", None, opath, False, dir_size))
+        else:
+            if self._print_size:
+                self._outfp.write(format_bsd_line(
+                    self._algorithm[1], dir_dgst.digest(), opath,
+                    self._use_base64, dir_size))
+            else:
+                self._outfp.write(format_bsd_line(
+                    self._algorithm[1], dir_dgst.digest(), opath,
+                    self._use_base64))
+        self._outfp.flush()
+        return (dir_dgst.digest(), dir_size)
+
+
 def generate_treesum_for_directory(
         outfp, root, algorithm, use_mmap, use_base64, handle_root_logical,
         follow_directory_symlinks, with_metadata_mode, with_metadata_full_mode,
--- a/cutils/util/walk.py	Sat Jan 11 13:20:14 2025 +0100
+++ b/cutils/util/walk.py	Sat Jan 11 17:41:28 2025 +0100
@@ -7,7 +7,8 @@
 
 """
 
-__all__ = ["walk"]
+__all__ = ["walk",
+           "ScanDir"]
 
 
 import os