changeset 122:1e5127028254

Move the real computation of digests from files and streams into dedicated submodule cutils.util.digest
author Franz Glasner <fzglas.hg@dom66.de>
date Wed, 01 Jan 2025 18:57:25 +0100
parents 2dc26a2f3d1c
children 4a0c3c9eead7
files cutils/shasum.py cutils/util/digest.py shasum.py
diffstat 3 files changed, 183 insertions(+), 164 deletions(-) [+]
line wrap: on
line diff
--- a/cutils/shasum.py	Wed Jan 01 17:52:41 2025 +0100
+++ b/cutils/shasum.py	Wed Jan 01 18:57:25 2025 +0100
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 # :-
-# :Copyright: (c) 2020-2024 Franz Glasner
+# :Copyright: (c) 2020-2025 Franz Glasner
 # :License:   BSD-3-Clause
 # :-
 r"""Pure Python implementation of `shasum`.
@@ -18,18 +18,14 @@
 import binascii
 import errno
 import io
-try:
-    import mmap
-except ImportError:
-    mmap = None
 import os
 import re
-import stat
 import sys
 
 from . import (__version__, __revision__)
 from . import util
 from .util import constants
+from .util import digest
 
 
 def main(argv=None):
@@ -170,8 +166,8 @@
                 for fn in dirfiles:
                     path = os.path.join(dirpath, fn)
                     out(opts.dest or sys.stdout,
-                        compute_digest_file(opts.algorithm[0], path,
-                                            use_mmap=opts.mmap),
+                        digest.compute_digest_file(
+                            opts.algorithm[0], path, use_mmap=opts.mmap),
                         path,
                         opts.algorithm[1],
                         True,
@@ -186,7 +182,7 @@
             else:
                 source = sys.stdin.buffer
             out(sys.stdout,
-                compute_digest_stream(opts.algorithm[0], source),
+                digest.compute_digest_stream(opts.algorithm[0], source),
                 None,
                 opts.algorithm[1],
                 True,
@@ -194,8 +190,8 @@
         else:
             for fn in opts.files:
                 out(opts.dest or sys.stdout,
-                    compute_digest_file(opts.algorithm[0], fn,
-                                        use_mmap=opts.mmap),
+                    digest.compute_digest_file(
+                        opts.algorithm[0], fn, use_mmap=opts.mmap),
                     fn,
                     opts.algorithm[1],
                     True,
@@ -260,7 +256,7 @@
             print("-: MISSING", file=dest)
         else:
             tag, algo, cl_filename, cl_digest = pl
-            computed_digest = compute_digest_stream(algo, source)
+            computed_digest = digest.compute_digest_stream(algo, source)
             if compare_digests_equal(computed_digest, cl_digest, algo):
                 res = "OK"
             else:
@@ -275,8 +271,8 @@
                 exit_code = 1
             else:
                 tag, algo, cl_filename, cl_digest = pl
-                computed_digest = compute_digest_file(algo, fn,
-                                                      use_mmap=opts.mmap)
+                computed_digest = digest.compute_digest_file(
+                    algo, fn, use_mmap=opts.mmap)
                 if compare_digests_equal(computed_digest, cl_digest, algo):
                     res = "OK"
                 else:
@@ -332,7 +328,7 @@
         assert opts.allow_distinfo
         return (None, None, tag)
     try:
-        d = compute_digest_file(algo, fn, use_mmap=opts.mmap)
+        d = util.digest.compute_digest_file(algo, fn, use_mmap=opts.mmap)
         if compare_digests_equal(d, digest, algo):
             return ("ok", fn, tag)
         else:
@@ -443,153 +439,5 @@
           file=dest)
 
 
-def compute_digest_file(hashobj, path, use_mmap=None):
-    """
-    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
-    :param path: filename within the filesystem or a file descriptor opened in
-                 binary mode (also a socket or pipe)
-    :param use_mmap: Use the :mod:`mmap` module if available.
-                     If `None` determine automatically.
-    :type use_mmap: bool or None
-    :return: the digest in binary form
-    :rtype: bytes
-
-    If a file descriptor is given is must support :func:`os.read`.
-
-    """
-    h = hashobj()
-    if isinstance(path, constants.PATH_TYPES):
-        flags = os.O_RDONLY | getattr(os, "O_BINARY", 0) \
-            | getattr(os, "O_SEQUENTIAL", 0) | getattr(os, "O_NOCTTY", 0)
-        fd = os.open(path, flags)
-        own_fd = True
-    else:
-        fd = path
-        own_fd = False
-    try:
-        try:
-            st = os.fstat(fd)
-        except TypeError:
-            #
-            # "fd" is most probably a Python socket object.
-            # (a pipe typically supports fstat)
-            #
-            use_mmap = False
-        else:
-            if stat.S_ISREG(st[stat.ST_MODE]):
-                filesize = st[stat.ST_SIZE]
-                if (use_mmap is None) \
-                        and (filesize > constants.MAX_AUTO_MAP_SIZE):
-                    #
-                    # This is borrowed from FreeBSD's cp(1) implementation:
-                    # Mmap and process if less than 8M (the limit is
-                    # so we don't totally trash memory on big files.
-                    # This is really a minor hack, but it wins some
-                    # CPU back.  Some filesystems, such as smbnetfs,
-                    # don't support mmap, so this is a best-effort
-                    # attempt.
-                    #
-                    use_mmap = False
-            else:
-                use_mmap = False
-        if use_mmap is None:
-            use_mmap = True
-        if mmap is None or not use_mmap:
-            # No mmap available or wanted -> use traditional low-level file IO
-            fadvise = getattr(os, "posix_fadvise", None)
-            if fadvise:
-                fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL)
-            if not constants.PY2:
-                fileobj = io.FileIO(fd, mode="r", closefd=False)
-                buf = bytearray(constants.READ_CHUNK_SIZE)
-                with memoryview(buf) as full_view:
-                    while True:
-                        try:
-                            n = fileobj.readinto(buf)
-                        except OSError as e:
-                            if e.errno not in (errno.EAGAIN,
-                                               errno.EWOULDBLOCK,
-                                               errno.EINTR):
-                                raise
-                        else:
-                            if n == 0:
-                                break
-                            if n == constants.READ_CHUNK_SIZE:
-                                h.update(buf)
-                            else:
-                                with full_view[:n] as partial_view:
-                                    h.update(partial_view)
-            else:
-                while True:
-                    try:
-                        buf = os.read(fd, constants.READ_CHUNK_SIZE)
-                    except OSError as e:
-                        if e.errno not in (errno.EAGAIN,
-                                           errno.EWOULDBLOCK,
-                                           errno.EINTR):
-                            raise
-                    else:
-                        if len(buf) == 0:
-                            break
-                        h.update(buf)
-        else:
-            #
-            # Use mmap
-            #
-            # NOTE: On Windows mmapped files with length 0 are not supported.
-            #       So ensure to not call mmap.mmap() if the file size is 0.
-            #
-            madvise = getattr(mmap.mmap, "madvise", None)
-            if filesize <= constants.MAP_WINDOW_SIZE:
-                mapsize = filesize
-            else:
-                mapsize = constants.MAP_WINDOW_SIZE
-            mapoffset = 0
-            rest = filesize
-            while rest > 0:
-                m = mmap.mmap(fd,
-                              mapsize,
-                              access=mmap.ACCESS_READ,
-                              offset=mapoffset)
-                if madvise:
-                    madvise(m, mmap.MADV_SEQUENTIAL)
-                try:
-                    h.update(m)
-                finally:
-                    m.close()
-                rest -= mapsize
-                mapoffset += mapsize
-                if rest < mapsize:
-                    mapsize = rest
-    finally:
-        if own_fd:
-            os.close(fd)
-    return h.digest()
-
-
-def compute_digest_stream(hashobj, instream):
-    """
-
-    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
-    :param instream: a bytes input stream to read the data to be hashed from
-    :return: the digest in binary form
-    :rtype: bytes
-
-    """
-    h = hashobj()
-    while True:
-        try:
-            buf = instream.read(constants.READ_CHUNK_SIZE)
-        except OSError as e:
-            if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR):
-                raise
-        else:
-            if buf is not None:
-                if len(buf) == 0:
-                    break
-                h.update(buf)
-    return h.digest()
-
-
 if __name__ == "__main__":
     sys.exit(main())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cutils/util/digest.py	Wed Jan 01 18:57:25 2025 +0100
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+# :-
+# :Copyright: (c) 2020-2025 Franz Glasner
+# :License:   BSD-3-Clause
+# :-
+r"""Utility sub-module implementing file and stream digest computations.
+
+"""
+
+__all__ = ["compute_digest_file", "compute_digest_stream"]
+
+
+import errno
+import io
+import os
+try:
+    import mmap
+except ImportError:
+    mmap = None
+import stat
+
+from . import constants
+
+
+def compute_digest_file(hashobj, path, use_mmap=None):
+    """Compute the digest for a file given by its filename or an open fd.
+
+    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
+    :param path: filename within the filesystem or a file descriptor opened in
+                 binary mode (also a socket or pipe)
+    :param use_mmap: Use the :mod:`mmap` module if available.
+                     If `None` determine automatically.
+    :type use_mmap: bool or None
+    :return: the digest in binary form
+    :rtype: bytes
+
+    If a file descriptor is given it must support :func:`os.read`.
+
+    """
+    h = hashobj()
+    retry_errnos = (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR)
+    if isinstance(path, constants.PATH_TYPES):
+        flags = os.O_RDONLY | getattr(os, "O_BINARY", 0) \
+            | getattr(os, "O_SEQUENTIAL", 0) | getattr(os, "O_NOCTTY", 0)
+        fd = os.open(path, flags)
+        own_fd = True
+    else:
+        fd = path
+        own_fd = False
+    try:
+        try:
+            st = os.fstat(fd)
+        except TypeError:
+            #
+            # "fd" is most probably a Python socket object.
+            # (a pipe typically supports fstat)
+            #
+            use_mmap = False
+        else:
+            if stat.S_ISREG(st[stat.ST_MODE]):
+                filesize = st[stat.ST_SIZE]
+                if (use_mmap is None) \
+                        and (filesize > constants.MAX_AUTO_MAP_SIZE):
+                    #
+                    # This is borrowed from FreeBSD's cp(1) implementation:
+                    # Mmap and process if less than 8M (the limit is
+                    # so we don't totally trash memory on big files).
+                    # This is really a minor hack, but it wins some
+                    # CPU back.  Some filesystems, such as smbnetfs,
+                    # don't support mmap, so this is a best-effort
+                    # attempt.
+                    #
+                    use_mmap = False
+            else:
+                use_mmap = False
+        if use_mmap is None:
+            use_mmap = True
+        if mmap is None or not use_mmap:
+            # No mmap available or wanted -> use traditional low-level file IO
+            fadvise = getattr(os, "posix_fadvise", None)
+            if fadvise:
+                try:
+                    fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL)
+                except OSError:
+                    pass    # just a hint: fails e.g. with ESPIPE on pipes
+            if not constants.PY2:
+                fileobj = io.FileIO(fd, mode="r", closefd=False)
+                buf = bytearray(constants.READ_CHUNK_SIZE)
+                with memoryview(buf) as full_view:
+                    while True:
+                        try:
+                            n = fileobj.readinto(buf)
+                        except OSError as e:
+                            if e.errno not in retry_errnos:
+                                raise
+                        else:
+                            if n == 0:
+                                break
+                            if n == constants.READ_CHUNK_SIZE:
+                                h.update(buf)
+                            elif n is not None:  # None: no data yet, retry
+                                with full_view[:n] as partial_view:
+                                    h.update(partial_view)
+            else:
+                while True:
+                    try:
+                        buf = os.read(fd, constants.READ_CHUNK_SIZE)
+                    except OSError as e:
+                        if e.errno not in retry_errnos:
+                            raise
+                    else:
+                        if len(buf) == 0:
+                            break
+                        h.update(buf)
+        else:
+            #
+            # Use mmap
+            #
+            # NOTE: On Windows mmapped files with length 0 are not supported.
+            #       So ensure to not call mmap.mmap() if the file size is 0.
+            #
+            madvise = getattr(mmap.mmap, "madvise", None)
+            if filesize <= constants.MAP_WINDOW_SIZE:
+                mapsize = filesize
+            else:
+                mapsize = constants.MAP_WINDOW_SIZE
+            mapoffset = 0
+            rest = filesize
+            while rest > 0:
+                m = mmap.mmap(fd,
+                              mapsize,
+                              access=mmap.ACCESS_READ,
+                              offset=mapoffset)
+                if madvise:
+                    madvise(m, mmap.MADV_SEQUENTIAL)
+                try:
+                    h.update(m)
+                finally:
+                    m.close()
+                rest -= mapsize
+                mapoffset += mapsize
+                if rest < mapsize:
+                    mapsize = rest
+    finally:
+        if own_fd:
+            os.close(fd)
+    return h.digest()
+
+
+def compute_digest_stream(hashobj, instream):
+    """Compute the digest of all data read from the byte stream `instream`.
+
+    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
+    :param instream: a bytes input stream to read the data to be hashed from
+    :return: the digest in binary form
+    :rtype: bytes
+
+    """
+    h = hashobj()
+    while True:
+        try:
+            buf = instream.read(constants.READ_CHUNK_SIZE)
+        except OSError as e:
+            if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR):
+                raise  # other errors are fatal; the listed ones are retried
+        else:
+            if buf is not None:  # None: nothing read (yet), try again
+                if len(buf) == 0:  # an empty read ends the loop
+                    break
+                h.update(buf)
+    return h.digest()
--- a/shasum.py	Wed Jan 01 17:52:41 2025 +0100
+++ b/shasum.py	Wed Jan 01 18:57:25 2025 +0100
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 # :-
-# :Copyright: (c) 2020-2024 Franz Glasner
+# :Copyright: (c) 2020-2025 Franz Glasner
 # :License:   BSD-3-Clause
 # :-
 r"""Pure Python implementation of `shasum`.