Mercurial > hgrepos > Python > apps > py-cutils
diff cutils/util/digest.py @ 122:1e5127028254
Move the real computation of digests from files and streams into dedicated submodule cutils.util.digest
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Wed, 01 Jan 2025 18:57:25 +0100 |
| parents | |
| children | a813094ae4f5 |
line wrap: on
line diff
# -*- coding: utf-8 -*-
# :-
# :Copyright: (c) 2020-2025 Franz Glasner
# :License: BSD-3-Clause
# :-
r"""Utility sub-module implementing digest computations for files and streams.

"""

__all__ = ["compute_digest_file", "compute_digest_stream"]


import errno
import io
import os
try:
    import mmap
except ImportError:
    mmap = None
import stat

from . import constants


# errno values that signal a transient condition: the read is just retried
_RETRY_ERRNOS = (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR)


def compute_digest_file(hashobj, path, use_mmap=None):
    """Compute the digest for a file given by its filename or an open fd.

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
                    (it is called without arguments)
    :param path: filename within the filesystem or a file descriptor opened in
                 binary mode (also a socket or pipe)
    :param use_mmap: Use the :mod:`mmap` module if available.
                     If `None` determine automatically.
    :type use_mmap: bool or None
    :return: the digest in binary form
    :rtype: bytes
    :raises OSError: if opening or reading fails with an error other than
                     `EAGAIN`, `EWOULDBLOCK` or `EINTR`

    If a file descriptor is given it must support :func:`os.read`.
    A given file descriptor is never closed here; only a descriptor that
    has been opened from a filename is closed again.

    """
    h = hashobj()
    if isinstance(path, constants.PATH_TYPES):
        flags = os.O_RDONLY | getattr(os, "O_BINARY", 0) \
            | getattr(os, "O_SEQUENTIAL", 0) | getattr(os, "O_NOCTTY", 0)
        fd = os.open(path, flags)
        own_fd = True
    else:
        fd = path
        own_fd = False
    try:
        # Only known (and only needed) for regular files
        filesize = None
        try:
            st = os.fstat(fd)
        except TypeError:
            #
            # "fd" is most probably a Python socket object.
            # (a pipe typically supports fstat)
            #
            use_mmap = False
        else:
            if stat.S_ISREG(st[stat.ST_MODE]):
                filesize = st[stat.ST_SIZE]
                if (use_mmap is None) \
                        and (filesize > constants.MAX_AUTO_MAP_SIZE):
                    #
                    # This is borrowed from FreeBSD's cp(1) implementation:
                    # Mmap and process if less than 8M (the limit is
                    # so we don't totally trash memory on big files).
                    # This is really a minor hack, but it wins some
                    # CPU back. Some filesystems, such as smbnetfs,
                    # don't support mmap, so this is a best-effort
                    # attempt.
                    #
                    use_mmap = False
            else:
                # Only regular files can be mapped
                use_mmap = False
        if use_mmap is None:
            use_mmap = True
        if mmap is None or not use_mmap:
            # No mmap available or wanted -> use traditional low-level file IO
            _advise_sequential(fd)
            if constants.PY2:
                _update_from_fd_read(h, fd)
            else:
                _update_from_fd_readinto(h, fd)
        else:
            _update_from_mmap(h, fd, filesize)
    finally:
        if own_fd:
            os.close(fd)
    return h.digest()


def compute_digest_stream(hashobj, instream):
    """Compute the digest for the data read from the byte stream `instream`.

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
                    (it is called without arguments)
    :param instream: a bytes input stream to read the data to be hashed from;
                     its `read()` method is called until it yields an empty
                     result
    :return: the digest in binary form
    :rtype: bytes
    :raises OSError: if reading fails with an error other than
                     `EAGAIN`, `EWOULDBLOCK` or `EINTR`

    """
    h = hashobj()
    while True:
        try:
            buf = instream.read(constants.READ_CHUNK_SIZE)
        except OSError as e:
            if e.errno not in _RETRY_ERRNOS:
                raise
        else:
            # `None` means "no data available yet" on a non-blocking stream
            if buf is not None:
                if len(buf) == 0:
                    break
                h.update(buf)
    return h.digest()


def _advise_sequential(fd):
    """Tell the OS that `fd` is going to be read sequentially (best effort).

    """
    fadvise = getattr(os, "posix_fadvise", None)
    if fadvise:
        try:
            fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL)
        except OSError:
            #
            # The call is purely advisory. It fails e.g. with ESPIPE for
            # pipes and FIFOs; this must not prevent hashing their content.
            #
            pass


def _update_from_fd_readinto(h, fd):
    """Feed `h` with all data from `fd` using a single reusable buffer.

    """
    fileobj = io.FileIO(fd, mode="r", closefd=False)
    buf = bytearray(constants.READ_CHUNK_SIZE)
    with memoryview(buf) as full_view:
        while True:
            try:
                n = fileobj.readinto(buf)
            except OSError as e:
                if e.errno not in _RETRY_ERRNOS:
                    raise
            else:
                if n is None:
                    #
                    # Non-blocking descriptor without available data:
                    # nothing has been read into the buffer, so there is
                    # nothing to hash -- just try again.
                    #
                    continue
                if n == 0:
                    break
                if n == constants.READ_CHUNK_SIZE:
                    h.update(buf)
                else:
                    # Hash only the filled part without copying it
                    with full_view[:n] as partial_view:
                        h.update(partial_view)


def _update_from_fd_read(h, fd):
    """Feed `h` with all data from `fd` using plain :func:`os.read`.

    """
    while True:
        try:
            buf = os.read(fd, constants.READ_CHUNK_SIZE)
        except OSError as e:
            if e.errno not in _RETRY_ERRNOS:
                raise
        else:
            if len(buf) == 0:
                break
            h.update(buf)


def _update_from_mmap(h, fd, filesize):
    """Feed `h` with `filesize` bytes of `fd` using memory-mapped windows.

    NOTE: On Windows mmapped files with length 0 are not supported.
          So ensure to not call mmap.mmap() if the file size is 0.

    """
    madvise = getattr(mmap.mmap, "madvise", None)
    # The MADV_* constants are not available on every platform
    madv_sequential = getattr(mmap, "MADV_SEQUENTIAL", None)
    if filesize <= constants.MAP_WINDOW_SIZE:
        mapsize = filesize
    else:
        mapsize = constants.MAP_WINDOW_SIZE
    mapoffset = 0
    rest = filesize
    while rest > 0:
        m = mmap.mmap(fd,
                      mapsize,
                      access=mmap.ACCESS_READ,
                      offset=mapoffset)
        try:
            if madvise and madv_sequential is not None:
                madvise(m, madv_sequential)
            h.update(m)
        finally:
            m.close()
        rest -= mapsize
        mapoffset += mapsize
        # The last window is typically smaller than the others
        if rest < mapsize:
            mapsize = rest
