diff cutils/util/digest.py @ 122:1e5127028254

Move the real computation of digests from files and streams into dedicated submodule cutils.util.digest
author Franz Glasner <fzglas.hg@dom66.de>
date Wed, 01 Jan 2025 18:57:25 +0100
parents
children a813094ae4f5
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cutils/util/digest.py	Wed Jan 01 18:57:25 2025 +0100
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+# :-
+# :Copyright: (c) 2020-2025 Franz Glasner
+# :License:   BSD-3-Clause
+# :-
+r"""Utility sub-module implementing file and stream digest computations.
+
+"""
+
+__all__ = ["compute_digest_file", "compute_digest_stream"]
+
+
+import errno
+import io
+import os
+try:
+    import mmap
+except ImportError:
+    mmap = None
+import stat
+
+from . import constants
+
+
+def compute_digest_file(hashobj, path, use_mmap=None):
+    """Compute the digest for a file with a filename of an open fd.
+
+    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
+    :param path: filename within the filesystem or a file descriptor opened in
+                 binary mode (also a socket or pipe)
+    :param use_mmap: Use the :mod:`mmap` module if available.
+                     If `None` determine automatically.
+    :type use_mmap: bool or None
+    :return: the digest in binary form
+    :rtype: bytes
+
+    If a file descriptor is given is must support :func:`os.read`.
+
+    """
+    h = hashobj()
+    if isinstance(path, constants.PATH_TYPES):
+        flags = os.O_RDONLY | getattr(os, "O_BINARY", 0) \
+            | getattr(os, "O_SEQUENTIAL", 0) | getattr(os, "O_NOCTTY", 0)
+        fd = os.open(path, flags)
+        own_fd = True
+    else:
+        fd = path
+        own_fd = False
+    try:
+        try:
+            st = os.fstat(fd)
+        except TypeError:
+            #
+            # "fd" is most probably a Python socket object.
+            # (a pipe typically supports fstat)
+            #
+            use_mmap = False
+        else:
+            if stat.S_ISREG(st[stat.ST_MODE]):
+                filesize = st[stat.ST_SIZE]
+                if (use_mmap is None) \
+                        and (filesize > constants.MAX_AUTO_MAP_SIZE):
+                    #
+                    # This is borrowed from FreeBSD's cp(1) implementation:
+                    # Mmap and process if less than 8M (the limit is
+                    # so we don't totally trash memory on big files.
+                    # This is really a minor hack, but it wins some
+                    # CPU back.  Some filesystems, such as smbnetfs,
+                    # don't support mmap, so this is a best-effort
+                    # attempt.
+                    #
+                    use_mmap = False
+            else:
+                use_mmap = False
+        if use_mmap is None:
+            use_mmap = True
+        if mmap is None or not use_mmap:
+            # No mmap available or wanted -> use traditional low-level file IO
+            fadvise = getattr(os, "posix_fadvise", None)
+            if fadvise:
+                fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL)
+            if not constants.PY2:
+                fileobj = io.FileIO(fd, mode="r", closefd=False)
+                buf = bytearray(constants.READ_CHUNK_SIZE)
+                with memoryview(buf) as full_view:
+                    while True:
+                        try:
+                            n = fileobj.readinto(buf)
+                        except OSError as e:
+                            if e.errno not in (errno.EAGAIN,
+                                               errno.EWOULDBLOCK,
+                                               errno.EINTR):
+                                raise
+                        else:
+                            if n == 0:
+                                break
+                            if n == constants.READ_CHUNK_SIZE:
+                                h.update(buf)
+                            else:
+                                with full_view[:n] as partial_view:
+                                    h.update(partial_view)
+            else:
+                while True:
+                    try:
+                        buf = os.read(fd, constants.READ_CHUNK_SIZE)
+                    except OSError as e:
+                        if e.errno not in (errno.EAGAIN,
+                                           errno.EWOULDBLOCK,
+                                           errno.EINTR):
+                            raise
+                    else:
+                        if len(buf) == 0:
+                            break
+                        h.update(buf)
+        else:
+            #
+            # Use mmap
+            #
+            # NOTE: On Windows mmapped files with length 0 are not supported.
+            #       So ensure to not call mmap.mmap() if the file size is 0.
+            #
+            madvise = getattr(mmap.mmap, "madvise", None)
+            if filesize <= constants.MAP_WINDOW_SIZE:
+                mapsize = filesize
+            else:
+                mapsize = constants.MAP_WINDOW_SIZE
+            mapoffset = 0
+            rest = filesize
+            while rest > 0:
+                m = mmap.mmap(fd,
+                              mapsize,
+                              access=mmap.ACCESS_READ,
+                              offset=mapoffset)
+                if madvise:
+                    madvise(m, mmap.MADV_SEQUENTIAL)
+                try:
+                    h.update(m)
+                finally:
+                    m.close()
+                rest -= mapsize
+                mapoffset += mapsize
+                if rest < mapsize:
+                    mapsize = rest
+    finally:
+        if own_fd:
+            os.close(fd)
+    return h.digest()
+
+
+def compute_digest_stream(hashobj, instream):
+    """Compute the digest for a given byte string `instream`.
+
+    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
+    :param instream: a bytes input stream to read the data to be hashed from
+    :return: the digest in binary form
+    :rtype: bytes
+
+    """
+    h = hashobj()
+    while True:
+        try:
+            buf = instream.read(constants.READ_CHUNK_SIZE)
+        except OSError as e:
+            if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR):
+                raise
+        else:
+            if buf is not None:
+                if len(buf) == 0:
+                    break
+                h.update(buf)
+    return h.digest()