changeset 204:07f1d79e6674

Fully implemented UTF-8 mode for treeview. While doing this refactored "normal" mode (using the filesystem encoding).
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 21 Jan 2025 20:31:48 +0100
parents 3a85f7bbe0b1
children 63088d3675bb
files cutils/treesum.py docs/notes.rst
diffstat 2 files changed, 144 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/cutils/treesum.py	Tue Jan 21 18:57:02 2025 +0100
+++ b/cutils/treesum.py	Tue Jan 21 20:31:48 2025 +0100
@@ -406,9 +406,14 @@
 
         if self._minimal is not None:
             self._outfp.write(format_bsd_line(
-                "ROOT", None, self._minimal if self._minimal else "", False))
+                "ROOT",
+                None,
+                (walk.WalkDirEntry.alt_u8(self._minimal)
+                 if self._minimal else b""),
+                False))
         else:
-            self._outfp.write(format_bsd_line("ROOT", None, root, False))
+            self._outfp.write(format_bsd_line(
+                "ROOT", None, walk.WalkDirEntry.alt_u8(root), False))
         self._outfp.flush()
 
         if not self._handle_root_logical and os.path.islink(root):
@@ -457,6 +462,7 @@
             fsobjects.sort(key=walk.WalkDirEntry.sort_key)
         dir_dgst = self._algorithm[0]()
         dir_size = 0
+        dir_tainted = False
         for fso in fsobjects:
             if fso.is_dir:
                 if fso.is_symlink and not self._follow_directory_symlinks:
@@ -464,11 +470,48 @@
                         os.readlink(fso.path))
                     # linktgt = util.fsencode(os.readlink(fso.path)))
                     linkdgst = self._algorithm[0]()
-                    linkdgst.update(
-                        util.interpolate_bytes(
-                            b"%d:%s,", len(linktgt.fspath), linktgt.fspath))
-                    dir_dgst.update(util.interpolate_bytes(
-                        b"1:S,%d:%s,", len(fso.fsname), fso.fsname))
+                    if self._utf8_mode:
+                        if linktgt.u8path is None:
+                            dir_tainted = True
+                            linkdgst.update(util.interpolate_bytes(
+                                b"%d:%s,",
+                                len(linktgt.alt_u8path),
+                                linktgt.alt_u8path))
+                        else:
+                            linkdgst.update(util.interpolate_bytes(
+                                b"%d:%s,",
+                                len(linktgt.u8path),
+                                linktgt.u8path))
+                        if fso.u8name is None:
+                            dir_tainted = True
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"1:S,%d:%s,",
+                                len(fso.alt_u8name),
+                                fso.alt_u8name))
+                        else:
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"1:S,%d:%s,", len(fso.u8name), fso.u8name))
+                    else:
+                        if linktgt.fspath is None:
+                            dir_tainted = True
+                            linkdgst.update(util.interpolate_bytes(
+                                b"%d:%s,",
+                                len(linktgt.alt_fspath),
+                                linktgt.alt_fspath))
+                        else:
+                            linkdgst.update(util.interpolate_bytes(
+                                b"%d:%s,",
+                                len(linktgt.fspath),
+                                linktgt.fspath))
+                        if fso.fsname is None:
+                            dir_tainted = True
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"1:S,%d:%s,",
+                                len(fso.alt_fsname),
+                                fso.alt_fsname))
+                        else:
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"1:S,%d:%s,", len(fso.fsname), fso.fsname))
                     #
                     # - no mtime and no mode for symlinks
                     # - also does not count for dir_size
@@ -476,7 +519,11 @@
                     dir_dgst.update(util.interpolate_bytes(
                         b"%d:%s,",
                         len(linkdgst.digest()), linkdgst.digest()))
-                    opath = "/".join(top) + "/" + fso.name if top else fso.name
+                    opath = join_output_path(top, fso.name)
+                    if self._utf8_mode:
+                        opath = walk.WalkDirEntry.alt_u8(opath)
+                    else:
+                        opath = walk.WalkDirEntry.alt_fs(opath)
                     if self._size_only:
                         self._outfp.write(format_bsd_line(
                             "SIZE", None, "%s/./@/" % (opath,), False, 0))
@@ -497,8 +544,26 @@
                         root, top + (fso.name, ))
 
                     dir_size += sub_dir_size
-                    dir_dgst.update(util.interpolate_bytes(
-                        b"1:d,%d:%s,", len(fso.fsname), fso.fsname))
+                    if self._utf8_mode:
+                        if fso.u8name is None:
+                            dir_tainted = True
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"1:d,%d:%s,",
+                                len(fso.alt_u8name),
+                                fso.alt_u8name))
+                        else:
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"1:d,%d:%s,", len(fso.u8name), fso.u8name))
+                    else:
+                        if fso.fsname is None:
+                            dir_tainted = True
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"1:d,%d:%s,",
+                                len(fso.alt_fsname),
+                                fso.alt_fsname))
+                        else:
+                            dir_dgst.update(util.interpolate_bytes(
+                                b"1:d,%d:%s,", len(fso.fsname), fso.fsname))
                     dir_dgst.update(util.interpolate_bytes(
                         b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst))
                     if self._with_metadata_full_mode:
@@ -511,8 +576,26 @@
                         dir_dgst.update(util.interpolate_bytes(
                             b"4:mode,%d:%s,", len(modestr), modestr))
             else:
-                dir_dgst.update(util.interpolate_bytes(
-                    b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
+                if self._utf8_mode:
+                    if fso.u8name is None:
+                        dir_tainted = True
+                        dir_dgst.update(util.interpolate_bytes(
+                            b"1:f,%d:%s,",
+                            len(fso.alt_u8name),
+                            fso.alt_u8name))
+                    else:
+                        dir_dgst.update(util.interpolate_bytes(
+                            b"1:f,%d:%s,", len(fso.u8name), fso.u8name))
+                else:
+                    if fso.fsname is None:
+                        dir_tainted = True
+                        dir_dgst.update(util.interpolate_bytes(
+                            b"1:f,%d:%s,",
+                            len(fso.alt_fsname),
+                            fso.alt_fsname))
+                    else:
+                        dir_dgst.update(util.interpolate_bytes(
+                            b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
                 dir_size += fso.stat.st_size
                 if self._with_metadata_mtime:
                     mtime = datetime.datetime.utcfromtimestamp(
@@ -534,7 +617,11 @@
                         self._algorithm[0], fso.path, use_mmap=self._use_mmap)
                     dir_dgst.update(util.interpolate_bytes(
                         b"%d:%s,", len(dgst), dgst))
-                opath = "/".join(top) + "/" + fso.name if top else fso.name
+                opath = join_output_path(top, fso.name)
+                if self._utf8_mode:
+                    opath = walk.WalkDirEntry.alt_u8(opath)
+                else:
+                    opath = walk.WalkDirEntry.alt_fs(opath)
                 if self._size_only:
                     self._outfp.write(format_bsd_line(
                         "SIZE", None, opath, False, fso.stat.st_size))
@@ -548,12 +635,19 @@
                             self._algorithm[1], dgst, opath,
                             self._use_base64))
                 self._outfp.flush()
-
-        opath = "/".join(top) + "/" if top else ""
+        opath = join_output_path(top, None)
+        if opath:
+            if self._utf8_mode:
+                opath = walk.WalkDirEntry.alt_u8(opath)
+            else:
+                opath = walk.WalkDirEntry.alt_fs(opath)
         if self._size_only:
             self._outfp.write(format_bsd_line(
                     "SIZE", None, opath, False, dir_size))
         else:
+            if dir_tainted:
+                self._outfp.write(format_bsd_line(
+                    b"ERROR", None, b"directory is tainted", False, None))
             if self._print_size:
                 self._outfp.write(format_bsd_line(
                     self._algorithm[1], dir_dgst.digest(), opath,
@@ -566,6 +660,27 @@
         return (dir_dgst.digest(), dir_size)
 
 
+def join_output_path(top, name):
+    if name is None:
+        # directory path: items of `top` joined by "/", with a trailing "/"
+        if top:
+            if isinstance(top[0], bytes):   # separator type follows top[0]
+                return b"/".join(top) + b"/"
+            else:
+                return u"/".join(top) + u"/"
+        else:
+            return b""   # empty `top`: always an empty bytes object
+    else:
+        # file path: items of `top` joined by "/", then "/" and `name`
+        if top:
+            if isinstance(name, bytes):   # separator type follows `name`
+                return b"/".join(top) + b"/" + name
+            else:
+                return u"/".join(top) + u"/" + name
+        else:
+            return name   # empty `top`: `name` is returned unchanged
+
+
 class CRC32Output(object):
 
     """Wrapper for a minimal binary file contextmanager that calculates
@@ -652,9 +767,9 @@
         assert filename is None
         return util.interpolate_bytes(b"%s = %s%s", what, util.b(value), ls)
     assert filename is not None
-    if what == b"COMMENT":
+    if what in (b"COMMENT", b"ERROR"):
         return util.interpolate_bytes(
-            b"COMMENT (%s)%s", util.b(filename, "utf-8"), ls)
+            b"%s (%s)%s", what, util.b(filename, "utf-8"), ls)
     if not isinstance(filename, bytes):
         filename = util.fsencode(filename)
     if what == b"SIZE":
@@ -683,11 +798,11 @@
 
     """
 
-    PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z")     # empty lines
+    PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z")   # empty lines
     PATTERN1 = re.compile(br"\A(VERSION|FSENCODING|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z")      # noqa: E501  line too long
-    PATTERN2 = re.compile(br"\A(ROOT|COMMENT)[ \t]*\((.*)\)[ \t]*\r?\n\Z")
-    PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z")                                    # noqa: E501  line too long
-    PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z")   # noqa: E501  line too long
+    PATTERN2 = re.compile(br"\A(ROOT|COMMENT|ERROR)[ \t]*\((.*)\)[ \t]*\r?\n\Z")                                                 # noqa: E501  line too long
+    PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z")                                               # noqa: E501  line too long
+    PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z")              # noqa: E501  line too long
 
     def __init__(self, _fp, _filename, _own_fp):
         self._fp = _fp
@@ -812,8 +927,8 @@
             mo = self.PATTERN2.search(line)
             if mo:
                 self._update_crc(line)
-                if mo.group(1) == b"COMMENT":
-                    return ("COMMENT", util.u(mo.group(2), "utf-8"))
+                if mo.group(1) in (b"COMMENT", b"ERROR"):
+                    return (util.u(mo.group(1)), util.u(mo.group(2), "utf-8"))
                 elif mo.group(1) == b"ROOT":
                     return ("ROOT", mo.group(2))
                 assert False, line
--- a/docs/notes.rst	Tue Jan 21 18:57:02 2025 +0100
+++ b/docs/notes.rst	Tue Jan 21 20:31:48 2025 +0100
@@ -29,3 +29,9 @@
 
       The result of :func:`os.getfilesystemencoding` that is in effect when
       generating treesum digests
+
+   ``ERROR``
+
+      Errors are reported:
+
+      - For directories if one of the filenames has a filename problem