diff cutils/util/walk.py @ 201:58d93453c307

Many more encoding-related methods for WalkDirEntry and some unit tests
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 21 Jan 2025 14:30:06 +0100
parents b2aba84ca426
children 3a85f7bbe0b1
line wrap: on
line diff
--- a/cutils/util/walk.py	Fri Jan 17 20:12:58 2025 +0100
+++ b/cutils/util/walk.py	Tue Jan 21 14:30:06 2025 +0100
@@ -10,7 +10,7 @@
 from __future__ import print_function, absolute_import
 
 
-__all__ = ["ScanDir", "getfsencoding"]
+__all__ = ["WalkDirEntry", "ScanDir", "getfsencoding"]
 
 
 import os
@@ -26,9 +26,25 @@
 from . import PY2
 
 
+_notset = object()
+
+
 _FSENCODING = sys.getfilesystemencoding()
 
 
+if PY2:
+
+    def _unix_path(s):
+        if isinstance(s, bytes):
+            return s.replace(b"\\", b"/")
+        return s.replace(u"\\", u"/")
+
+else:
+
+    def _unix_path(s):
+        return s.replace("\\", "/")
+
+
 class WalkDirEntry(object):
 
     """A :class:`os.DirEntry` alike to be used in :func:`walk` and for
@@ -36,47 +52,188 @@
 
     """
 
-    __slots__ = ("_name", "_fsname", "_path", "_fspath", "_is_symlink",
-                 "_is_dir", "_stat_result")
+    __slots__ = ("_name", "_path",     # encoded as given in the ctor
+                 "_is_symlink", "_is_dir", "_stat_result",
+                 "_alt_fsname", "_alt_u8name")
 
-    def __init__(self, name):
-        self._name = name
-        if PY2:
-            assert isinstance(name, bytes)
-            self._fsname = name
-        else:
-            self._name = name
-            self._fsname = os.fsencode(name)
-        self._path = None
-        self._fspath = None
+    def __init__(self, name, path):
+        self._name = name    # the name as given in the constructor
+        """The name exactly as given in the ctor"""
+        self._path = _unix_path(path)
+        """The path as given in the ctor -- but normalized to have slashes"""
         self._is_symlink = self._is_dir = self._stat_result = None
+        self._alt_fsname = self._alt_u8name = _notset
 
     @property
     def name(self):
-        """The native name"""
+        """The original name exactly as given in the ctor"""
         return self._name
 
     @property
-    def fsname(self):
-        """The name as bytes"""
-        return self._fsname
-
-    @property
     def path(self):
-        """Always native"""
+        """The path as given in the ctor -- but normalized to have slashes."""
         return self._path
 
     @property
+    def fsname(self):
+        """The name as bytes for the filesystem.
+
+        :rtype: bytes or None
+
+        """
+        if PY2:
+            if isinstance(self._name, bytes):
+                return self._name
+            try:
+                return self._name.encode(_FSENCODING, "strict")
+            except UnicodeError:
+                return None
+        else:
+            return os.fsencode(self._name)
+
+    @property
+    def alt_fsname(self):
+        """Alternative and "escaped" filesystem name -- always bytes.
+
+        :rtype: bytes
+
+        """
+        if self._alt_fsname is _notset:
+            if PY2:
+                if isinstance(self._name, bytes):
+                    self._alt_fsname = self._name
+                else:
+                    self._alt_fsname = self._name.encode(
+                        _FSENCODING, "backslashreplace")
+            else:
+                self._alt_fsname = os.fsencode(self._name)
+        return self._alt_fsname
+
+    @property
     def fspath(self):
-        """Always bytes"""
-        if self._path is not None:
-            if self._fspath is None:
-                if PY2:
-                    assert isinstance(self._path, bytes)
-                    self._fspath = self._path
+        """The path as bytes for the filesystem.
+
+        :rtype: bytes or None
+
+        """
+        if PY2:
+            if isinstance(self._path, bytes):
+                return self._path
+            try:
+                return self._path.encode(_FSENCODING, "strict")
+            except UnicodeError:
+                return None
+        else:
+            return os.fsencode(self._path)
+
+    @property
+    def alt_fspath(self):
+        """Alternative and "escaped" filesystem path -- always bytes.
+
+        :rtype: bytes
+
+        """
+        if PY2:
+            if isinstance(self._path, bytes):
+                return self._path
+            return self._path.encode(_FSENCODING, "backslashreplace")
+        else:
+            return os.fsencode(self._path)
+
+    @property
+    def uname(self):
+        """Always "real", strictly encoded Unicode or `None` if this is not
+        possible.
+
+        :rtype: text or None
+
+        """
+        if PY2:
+            if isinstance(self._name, bytes):
+                try:
+                    return self._name.decode(_FSENCODING, "strict")
+                except UnicodeError:
+                    return None
+            else:
+                return self._name
+        else:
+            try:
+                self._name.encode("utf-8", "strict")
+            except UnicodeError:
+                return None
+            return self._name
+
+    @property
+    def upath(self):
+        """Always "real", strictly encoded Unicode or `None` if this is not
+        possible.
+
+        :rtype: text or None
+
+        """
+        if PY2:
+            if isinstance(self._path, bytes):
+                try:
+                    return self._path.decode(_FSENCODING, "strict")
+                except UnicodeError:
+                    return None
+            else:
+                return self._path
+        else:
+            try:
+                self._path.encode("utf-8", "strict")
+            except UnicodeError:
+                return None
+            return self._path
+
+    @property
+    def u8name(self):
+        """`.uname` as UTF-8 or `None` (as strict as `uname`)"""
+        n = self.uname
+        return n if n is None else n.encode("utf-8", "strict")
+
+    @property
+    def u8path(self):
+        """`.upath` as UTF-8 or `None` (as strict as `upath`)"""
+        p = self.upath
+        return p if p is None else p.encode("utf-8", "strict")
+
+    @property
+    def alt_u8name(self):
+        if self._alt_u8name is _notset:
+            if PY2:
+                if isinstance(self._name, bytes):
+                    try:
+                        self._alt_u8name = (
+                            self._name
+                            .decode(_FSENCODING, "strict")
+                            .encode("utf-8", "strict"))
+                    except UnicodeError:
+                        self._alt_u8name = (
+                            self.surrogate_decode(self._name)
+                            .encode("ascii", "backslashreplace"))
                 else:
-                    self._fspath = os.fsencode(self._path)
-        return self._fspath
+                    self._alt_u8name = self._name.encode(
+                        "ascii", "backslashreplace")
+            else:
+                self._alt_u8name = self._name.encode(
+                    "utf-8", "backslashreplace")
+        return self._alt_u8name
+
+    @property
+    def alt_u8path(self):
+        if PY2:
+            if isinstance(self._path, bytes):
+                try:
+                    return (self._path.decode(_FSENCODING, "strict")
+                            .encode("utf-8", "strict"))
+                except UnicodeError:
+                    return (self.surrogate_decode(self._path)
+                            .encode("ascii", "backslashreplace"))
+            else:
+                return self._path.encode("ascii", "backslashreplace")
+        else:
+            return self._path.encode("utf-8", "backslashreplace")
 
     @property
     def is_symlink(self):
@@ -102,8 +259,7 @@
 
     @classmethod
     def from_direntry(cls_, entry):
-        w = cls_(entry.name)
-        w._path = entry.path
+        w = cls_(entry.name, entry.path)
         try:
             w._is_dir = entry.is_dir(follow_symlinks=True)
         except OSError:
@@ -125,9 +281,9 @@
         return w
 
     @classmethod
-    def from_path_name(cls_, path, name):
-        w = cls_(name)
-        w._path = os.path.join(path, name)
+    def from_path_name(cls_, path, name, _do_stat=True):
+        """`_do_stat=False` is to be used only for testing purposes"""
+        w = cls_(name, os.path.join(path, name))
         try:
             w._is_dir = os.path.isdir(w._path)
         except OSError:
@@ -144,12 +300,40 @@
             # is not a symbolic link, same behaviour than os.path.islink().
             #
             w._is_symlink = False
-        w._stat_result = os.stat(w._path)
+        if _do_stat:
+            w._stat_result = os.stat(w._path)
+        return w
+
+    @classmethod
+    def from_readlink(cls_, path):
+        w = cls_(os.path.basename(path), path)
         return w
 
     @staticmethod
     def sort_key(entry):
-        return entry._fsname
+        return entry.alt_fsname     # because it should never throw
+
+    @staticmethod
+    def alt_sort_key(entry):
+        return entry.alt_u8name     # because it should never throw
+
+    if PY2:
+
+        @staticmethod
+        def surrogate_decode(what):
+            """Decode the bytes object `what` using surrogates from :pep:`383`
+            for all non-ASCII octets.
+
+            """
+            uwhat = []
+            assert isinstance(what, bytes)
+            for ch in what:
+                chcode = ord(ch)
+                if chcode <= 0x7f:
+                    uwhat.append(unichr(chcode))   # noqa: F821 unichr
+                else:
+                    uwhat.append(unichr(0xDC00 + chcode))  # noqa: F821 unichr
+            return u"".join(uwhat)
 
 
 if scandir: