changeset 266:0add8276e6b8

treesum: Handle errors like broken symlinks properly
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 18 Feb 2025 12:39:04 +0100
parents 188f448ab5e9
children b9aa65a30b4c
files cutils/treesum.py cutils/util/walk.py
diffstat 2 files changed, 176 insertions(+), 70 deletions(-) [+]
line wrap: on
line diff
--- a/cutils/treesum.py	Mon Feb 17 00:12:33 2025 +0100
+++ b/cutils/treesum.py	Tue Feb 18 12:39:04 2025 +0100
@@ -18,6 +18,7 @@
 import binascii
 import collections
 import datetime
+import errno
 import logging
 import os
 import re
@@ -576,8 +577,41 @@
     def _generate(self, root, top):
         logging.debug("Handling %s/%r", root, top)
         path = os.path.join(root, *top) if top else root
-        with walk.ScanDir(path) as dirscan:
-            fsobjects = list(dirscan)
+        try:
+            with walk.ScanDir(path) as dirscan:
+                fsobjects = list(dirscan)
+        except OSError as e:
+            if self._utf8_mode:
+                opath = walk.WalkDirEntry.alt_u8(path)
+            else:
+                opath = walk.WalkDirEntry.alt_fs(path)
+            if e.errno == errno.ENOTDIR:
+                # object exists but is not a directory
+                errmsg = b"not a directory"
+            elif e.errno in (errno.EACCES, errno.EPERM,
+                             getattr(errno, "ENOTCAPABLE", errno.EACCES)):
+                # no permissions
+                errmsg = (
+                    b"access denied / no permissions / missing capabilities")
+            elif e.errno == errno.ENOENT:
+                # given object does not exist
+                errmsg = b"no such file or directory"
+            else:
+                raise
+            self._writer.write_error(util.interpolate_bytes(
+                b"`%s': %s", opath, errmsg))
+            opath = join_output_path(top, None)
+            if opath:
+                if self._utf8_mode:
+                    opath = walk.WalkDirEntry.alt_u8(opath)
+                else:
+                    opath = walk.WalkDirEntry.alt_fs(opath)
+            if self._size_only:
+                self._writer.write_size(opath, None)
+            else:
+                self._writer.write_file_digest(self._algorithm[1], opath, None)
+            self._writer.flush()
+            return (None, None)
         if self._utf8_mode:
             fsobjects.sort(key=walk.WalkDirEntry.sort_key_u8)
         else:
@@ -666,6 +700,15 @@
                     sub_dir_dgst, sub_dir_size = self._generate(
                         root, top + (fso.name, ))
 
+                    if sub_dir_dgst is None or sub_dir_size is None:
+                        #
+                        # This should not happen:
+                        # - top-level directories are handled above
+                        # - other filesystem objects should also have been
+                        #   handled already
+                        #
+                        assert False
+
                     dir_size += sub_dir_size
                     if self._utf8_mode:
                         if fso.u8name is None:
@@ -794,42 +837,71 @@
                         else:
                             dir_dgst.update(util.interpolate_bytes(
                                 b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
-                    dir_size += fso.stat.st_size
-                    if self._with_metadata_mtime:
-                        mtime = datetime.datetime.utcfromtimestamp(
-                            int(fso.stat.st_mtime))
-                        mtime = util.b(mtime.isoformat("T") + "Z")
-                        dir_dgst.update(util.interpolate_bytes(
-                            b"5:mtime,%d:%s,", len(mtime), mtime))
-                    if self._with_metadata_full_mode:
-                        modestr = util.b(normalized_mode_str(fso.stat.st_mode))
-                        dir_dgst.update(util.interpolate_bytes(
-                            b"8:fullmode,%d:%s,", len(modestr), modestr))
-                    elif self._with_metadata_mode:
-                        modestr = util.b(normalized_compatible_mode_str(
-                            fso.stat.st_mode))
-                        dir_dgst.update(util.interpolate_bytes(
-                            b"4:mode,%d:%s,", len(modestr), modestr))
-                    if not self._size_only:
-                        dgst = digest.compute_digest_file(
-                            self._algorithm[0],
-                            fso.path,
-                            use_mmap=self._use_mmap)
-                        dir_dgst.update(util.interpolate_bytes(
-                            b"%d:%s,", len(dgst), dgst))
                     opath = join_output_path(top, fso.name)
                     if self._utf8_mode:
                         opath = walk.WalkDirEntry.alt_u8(opath)
                     else:
                         opath = walk.WalkDirEntry.alt_fs(opath)
-                    if self._size_only:
-                        self._writer.write_size(opath, fso.stat.st_size)
+                    if fso.stat is None:
+                        #
+                        # Error: most likely a broken symlink here
+                        #
+                        dir_tainted = True
+                        dir_dgst.update(util.interpolate_bytes(
+                            b"5:errno,%d:%s,",
+                            len(str(fso.stat_errno)),
+                            util.b(str(fso.stat_errno))))
+                        self._writer.write_error(util.interpolate_bytes(
+                            b"errno %d: %s",
+                            fso.stat_errno,
+                            util.b(fso.stat_errstr, "utf-8")))
+                        logging.error(
+                            "Directory entry has symlink problems: %r",
+                            opath)
+                        if self._size_only:
+                            self._writer.write_size(opath, None)
+                        else:
+                            self._writer.write_file_digest(
+                                self._algorithm[1], opath, None)
                     else:
-                        sz = fso.stat.st_size if self._print_size else None
-                        self._writer.write_file_digest(
-                            self._algorithm[1], opath, dgst,
-                            use_base64=self._use_base64,
-                            size=sz)
+                        #
+                        # Ok: File has normal stat info
+                        #
+                        # XXX FIXME: Handle special files (fifo, socket,
+                        #            block or char devices, ...).
+                        #
+                        dir_size += fso.stat.st_size
+                        if self._with_metadata_mtime:
+                            mtime = datetime.datetime.utcfromtimestamp(
+                                int(fso.stat.st_mtime))
+                            mtime = util.b(mtime.isoformat("T") + "Z")
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"5:mtime,%d:%s,", len(mtime), mtime))
+                        if self._with_metadata_full_mode:
+                            modestr = util.b(
+                                normalized_mode_str(fso.stat.st_mode))
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"8:fullmode,%d:%s,", len(modestr), modestr))
+                        elif self._with_metadata_mode:
+                            modestr = util.b(normalized_compatible_mode_str(
+                                fso.stat.st_mode))
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"4:mode,%d:%s,", len(modestr), modestr))
+                        if not self._size_only:
+                            dgst = digest.compute_digest_file(
+                                self._algorithm[0],
+                                fso.path,
+                                use_mmap=self._use_mmap)
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"%d:%s,", len(dgst), dgst))
+                        if self._size_only:
+                            self._writer.write_size(opath, fso.stat.st_size)
+                        else:
+                            sz = fso.stat.st_size if self._print_size else None
+                            self._writer.write_file_digest(
+                                self._algorithm[1], opath, dgst,
+                                use_base64=self._use_base64,
+                                size=sz)
                 self._writer.flush()
         opath = join_output_path(top, None)
         if opath:
@@ -837,16 +909,17 @@
                 opath = walk.WalkDirEntry.alt_u8(opath)
             else:
                 opath = walk.WalkDirEntry.alt_fs(opath)
+        if dir_tainted:
+            #
+            # IMPORTANT: Print errors BEFORE the associated digest or size
+            #            line. Otherwise the "info" command has a problem.
+            #
+            self._writer.write_error(b"directory is tainted")
+            logging.error("Directory has filename and/or symlink problems: %r",
+                          opath)
         if self._size_only:
             self._writer.write_size(opath, dir_size)
         else:
-            if dir_tainted:
-                #
-                # IMPORTANT: Print errors BEFORE the associated digest line.
-                #            Otherwise the "info" command has a problem.
-                #
-                self._writer.write_error(b"directory is tainted")
-                logging.error("Directory has filename problems: %r", opath)
             sz = dir_size if self._print_size else None
             self._writer.write_file_digest(
                 self._algorithm[1], opath, dir_dgst.digest(),
@@ -1014,26 +1087,32 @@
         assert isinstance(filename, bytes)
         self.write(b"SIZE (")
         self.write(filename)
-        self.write(b") = ")
-        self.writeln(util.b(str(sz)))
+        self.write(b")")
+        if sz is not None:
+            self.write(b" = ")
+            self.write(util.b(str(sz)))
+        self.writeln(b"")
 
     def write_file_digest(self, algorithm, filename, digest,
                           use_base64=False, size=None):
-        digest = (base64.b64encode(digest)
-                  if use_base64
-                  else binascii.hexlify(digest))
+        if digest is not None:
+            digest = (base64.b64encode(digest)
+                      if use_base64
+                      else binascii.hexlify(digest))
         if filename != b"./@/":
             filename = util.normalize_filename(filename, True)
         self.write(util.b(algorithm))
         self.write(b" (")
         self.write(filename)
-        self.write(b") = ")
-        self.write(digest)
-        if size is not None:
-            self.write(b",")
-            self.writeln(util.b(str(size)))
-        else:
-            self.writeln(b"")
+        self.write(b")")
+        if digest is not None or size is not None:
+            self.write(b" = ")
+            if digest is not None:
+                self.write(digest)
+            if size is not None:
+                self.write(b",")
+                self.write(util.b(str(size)))
+        self.writeln(b"")
 
     def finish(self):
         """Finish a block and write the current CRC"""
@@ -1076,8 +1155,8 @@
     PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z")   # empty lines
     PATTERN1 = re.compile(br"\A(VERSION|FSENCODING|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z")      # noqa: E501  line too long
     PATTERN2 = re.compile(br"\A(ROOT|COMMENT|ERROR|GENERATOR)[ \t]*\((.*)\)[ \t]*\r?\n\Z")                                       # noqa: E501  line too long
-    PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z")                                               # noqa: E501  line too long
-    PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z")              # noqa: E501  line too long
+    PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)([ \t]*=[ \t]*(\d+))?[ \t]*\r?\n\Z")                                               # noqa: E501  line too long
+    PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)([ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?)?[ \t]*\r?\n\Z")              # noqa: E501  line too long
 
     def __init__(self, _fp, _filename, _own_fp):
         self._fp = _fp
@@ -1211,24 +1290,31 @@
                 mo = self.PATTERN3.search(line)
                 if mo:
                     self._update_crc(line)
-                    return ("SIZE", mo.group(1), int(util.n(mo.group(2)), 10))
+                    if mo.group(2):
+                        return ("SIZE", mo.group(1),
+                                int(util.n(mo.group(3)), 10))
+                    else:
+                        return ("SIZE", mo.group(1), None)
                 else:
                     mo = self.PATTERN4.search(line)
                     if mo:
                         self._update_crc(line)
                         algo_name = util.n(mo.group(1))
-                        if (len(mo.group(3)) ==
-                                2 * self._get_digest_size(algo_name)):
-                            # hex
-                            digest = binascii.unhexlify(mo.group(3))
+                        if mo.group(3):
+                            if (len(mo.group(4)) ==
+                                    2 * self._get_digest_size(algo_name)):
+                                # hex
+                                digest = binascii.unhexlify(mo.group(4))
+                            else:
+                                # base64
+                                digest = base64.b64decode(mo.group(4))
+                            if mo.group(5):
+                                size = int(util.n(mo.group(6)), 10)
+                            else:
+                                size = None
+                            return (algo_name, mo.group(2), digest, size)
                         else:
-                            # base64
-                            digest = base64.b64decode(mo.group(3))
-                        if mo.group(4):
-                            size = int(util.n(mo.group(5)), 10)
-                        else:
-                            size = None
-                        return (algo_name, mo.group(2), digest, size)
+                            return (algo_name, mo.group(2), None, None)
                     else:
                         assert False, line
         return line
--- a/cutils/util/walk.py	Mon Feb 17 00:12:33 2025 +0100
+++ b/cutils/util/walk.py	Tue Feb 18 12:39:04 2025 +0100
@@ -53,7 +53,8 @@
     """
 
     __slots__ = ("_name", "_path",     # encoded as given in the ctor
-                 "_is_symlink", "_is_dir", "_stat_result",
+                 "_is_symlink", "_is_dir", "_stat_result", "_stat_errno",
+                 "_stat_errstr",
                  "_alt_fsname", "_alt_u8name")
 
     def __init__(self, name, path):
@@ -61,7 +62,8 @@
         """The name exactly as given in the ctor"""
         self._path = _unix_path(path)
         """The path as given in the ctor -- but normalized to have slashes"""
-        self._is_symlink = self._is_dir = self._stat_result = None
+        self._is_symlink = self._is_dir = self._stat_result = \
+            self._stat_errno = self._stat_errstr = None
         self._alt_fsname = self._alt_u8name = _notset
 
     @property
@@ -232,6 +234,14 @@
     def stat(self):
         return self._stat_result
 
+    @property
+    def stat_errno(self):
+        return self._stat_errno
+
+    @property
+    def stat_errstr(self):
+        return self._stat_errstr
+
     def __repr__(self):
         tag = ""
         if self._is_symlink:
@@ -261,8 +271,13 @@
             # is not a symbolic link, same behaviour than os.path.islink().
             #
             w._is_symlink = False
-        # Do not supress errors here and (consistently) follow symlinks
-        w._stat_result = entry.stat(follow_symlinks=True)
+        # Consistently follow symlinks
+        try:
+            w._stat_result = entry.stat(follow_symlinks=True)
+        except OSError as e:
+            w._stat_result = None
+            w._stat_errno = e.errno
+            w._stat_errstr = e.strerror
         return w
 
     @classmethod
@@ -286,7 +301,12 @@
             #
             w._is_symlink = False
         if _do_stat:
-            w._stat_result = os.stat(w._path)
+            try:
+                w._stat_result = os.stat(w._path)
+            except OSError as e:
+                w._stat_result = None
+                w._stat_errno = e.errno
+                w._stat_errstr = e.strerror
         return w
 
     @classmethod