# HG changeset patch # User Franz Glasner # Date 1735754245 -3600 # Node ID 1e512702825481d0fb7a04e42c3d92d2fafab7d7 # Parent 2dc26a2f3d1c0aebd5ce26c832cbb2b8c555c5b1 Move the real computation of digests from files and streams into dedicated submodule cutils.util.digest diff -r 2dc26a2f3d1c -r 1e5127028254 cutils/shasum.py --- a/cutils/shasum.py Wed Jan 01 17:52:41 2025 +0100 +++ b/cutils/shasum.py Wed Jan 01 18:57:25 2025 +0100 @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # :- -# :Copyright: (c) 2020-2024 Franz Glasner +# :Copyright: (c) 2020-2025 Franz Glasner # :License: BSD-3-Clause # :- r"""Pure Python implementation of `shasum`. @@ -18,18 +18,14 @@ import binascii import errno import io -try: - import mmap -except ImportError: - mmap = None import os import re -import stat import sys from . import (__version__, __revision__) from . import util from .util import constants +from .util import digest def main(argv=None): @@ -170,8 +166,8 @@ for fn in dirfiles: path = os.path.join(dirpath, fn) out(opts.dest or sys.stdout, - compute_digest_file(opts.algorithm[0], path, - use_mmap=opts.mmap), + digest.compute_digest_file( + opts.algorithm[0], path, use_mmap=opts.mmap), path, opts.algorithm[1], True, @@ -186,7 +182,7 @@ else: source = sys.stdin.buffer out(sys.stdout, - compute_digest_stream(opts.algorithm[0], source), + digest.compute_digest_stream(opts.algorithm[0], source), None, opts.algorithm[1], True, @@ -194,8 +190,8 @@ else: for fn in opts.files: out(opts.dest or sys.stdout, - compute_digest_file(opts.algorithm[0], fn, - use_mmap=opts.mmap), + digest.compute_digest_file( + opts.algorithm[0], fn, use_mmap=opts.mmap), fn, opts.algorithm[1], True, @@ -260,7 +256,7 @@ print("-: MISSING", file=dest) else: tag, algo, cl_filename, cl_digest = pl - computed_digest = compute_digest_stream(algo, source) + computed_digest = digest.compute_digest_stream(algo, source) if compare_digests_equal(computed_digest, cl_digest, algo): res = "OK" else: @@ -275,8 +271,8 @@ exit_code = 1 
else: tag, algo, cl_filename, cl_digest = pl - computed_digest = compute_digest_file(algo, fn, - use_mmap=opts.mmap) + computed_digest = digest.compute_digest_file( + algo, fn, use_mmap=opts.mmap) if compare_digests_equal(computed_digest, cl_digest, algo): res = "OK" else: @@ -332,7 +328,7 @@ assert opts.allow_distinfo return (None, None, tag) try: - d = compute_digest_file(algo, fn, use_mmap=opts.mmap) + d = util.digest.compute_digest_file(algo, fn, use_mmap=opts.mmap) if compare_digests_equal(d, digest, algo): return ("ok", fn, tag) else: @@ -443,153 +439,5 @@ file=dest) -def compute_digest_file(hashobj, path, use_mmap=None): - """ - :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory - :param path: filename within the filesystem or a file descriptor opened in - binary mode (also a socket or pipe) - :param use_mmap: Use the :mod:`mmap` module if available. - If `None` determine automatically. - :type use_mmap: bool or None - :return: the digest in binary form - :rtype: bytes - - If a file descriptor is given is must support :func:`os.read`. - - """ - h = hashobj() - if isinstance(path, constants.PATH_TYPES): - flags = os.O_RDONLY | getattr(os, "O_BINARY", 0) \ - | getattr(os, "O_SEQUENTIAL", 0) | getattr(os, "O_NOCTTY", 0) - fd = os.open(path, flags) - own_fd = True - else: - fd = path - own_fd = False - try: - try: - st = os.fstat(fd) - except TypeError: - # - # "fd" is most probably a Python socket object. - # (a pipe typically supports fstat) - # - use_mmap = False - else: - if stat.S_ISREG(st[stat.ST_MODE]): - filesize = st[stat.ST_SIZE] - if (use_mmap is None) \ - and (filesize > constants.MAX_AUTO_MAP_SIZE): - # - # This is borrowed from FreeBSD's cp(1) implementation: - # Mmap and process if less than 8M (the limit is - # so we don't totally trash memory on big files. - # This is really a minor hack, but it wins some - # CPU back. Some filesystems, such as smbnetfs, - # don't support mmap, so this is a best-effort - # attempt. 
- # - use_mmap = False - else: - use_mmap = False - if use_mmap is None: - use_mmap = True - if mmap is None or not use_mmap: - # No mmap available or wanted -> use traditional low-level file IO - fadvise = getattr(os, "posix_fadvise", None) - if fadvise: - fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL) - if not constants.PY2: - fileobj = io.FileIO(fd, mode="r", closefd=False) - buf = bytearray(constants.READ_CHUNK_SIZE) - with memoryview(buf) as full_view: - while True: - try: - n = fileobj.readinto(buf) - except OSError as e: - if e.errno not in (errno.EAGAIN, - errno.EWOULDBLOCK, - errno.EINTR): - raise - else: - if n == 0: - break - if n == constants.READ_CHUNK_SIZE: - h.update(buf) - else: - with full_view[:n] as partial_view: - h.update(partial_view) - else: - while True: - try: - buf = os.read(fd, constants.READ_CHUNK_SIZE) - except OSError as e: - if e.errno not in (errno.EAGAIN, - errno.EWOULDBLOCK, - errno.EINTR): - raise - else: - if len(buf) == 0: - break - h.update(buf) - else: - # - # Use mmap - # - # NOTE: On Windows mmapped files with length 0 are not supported. - # So ensure to not call mmap.mmap() if the file size is 0. 
- # - madvise = getattr(mmap.mmap, "madvise", None) - if filesize <= constants.MAP_WINDOW_SIZE: - mapsize = filesize - else: - mapsize = constants.MAP_WINDOW_SIZE - mapoffset = 0 - rest = filesize - while rest > 0: - m = mmap.mmap(fd, - mapsize, - access=mmap.ACCESS_READ, - offset=mapoffset) - if madvise: - madvise(m, mmap.MADV_SEQUENTIAL) - try: - h.update(m) - finally: - m.close() - rest -= mapsize - mapoffset += mapsize - if rest < mapsize: - mapsize = rest - finally: - if own_fd: - os.close(fd) - return h.digest() - - -def compute_digest_stream(hashobj, instream): - """ - - :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory - :param instream: a bytes input stream to read the data to be hashed from - :return: the digest in binary form - :rtype: bytes - - """ - h = hashobj() - while True: - try: - buf = instream.read(constants.READ_CHUNK_SIZE) - except OSError as e: - if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR): - raise - else: - if buf is not None: - if len(buf) == 0: - break - h.update(buf) - return h.digest() - - if __name__ == "__main__": sys.exit(main()) diff -r 2dc26a2f3d1c -r 1e5127028254 cutils/util/digest.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutils/util/digest.py Wed Jan 01 18:57:25 2025 +0100 @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- +# :- +# :Copyright: (c) 2020-2025 Franz Glasner +# :License: BSD-3-Clause +# :- +r"""Utility sub-module to implement file and stream digest computations. + +""" + +__all__ = ["compute_digest_file", "compute_digest_stream"] + + +import errno +import io +import os +try: + import mmap +except ImportError: + mmap = None +import stat + +from . import constants + + +def compute_digest_file(hashobj, path, use_mmap=None): + """Compute the digest for a file given by filename or an open fd. 
+ + :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory + :param path: filename within the filesystem or a file descriptor opened in + binary mode (also a socket or pipe) + :param use_mmap: Use the :mod:`mmap` module if available. + If `None` determine automatically. + :type use_mmap: bool or None + :return: the digest in binary form + :rtype: bytes + + If a file descriptor is given it must support :func:`os.read`. + + """ + h = hashobj() + if isinstance(path, constants.PATH_TYPES): + flags = os.O_RDONLY | getattr(os, "O_BINARY", 0) \ + | getattr(os, "O_SEQUENTIAL", 0) | getattr(os, "O_NOCTTY", 0) + fd = os.open(path, flags) + own_fd = True + else: + fd = path + own_fd = False + try: + try: + st = os.fstat(fd) + except TypeError: + # + # "fd" is most probably a Python socket object. + # (a pipe typically supports fstat) + # + use_mmap = False + else: + if stat.S_ISREG(st[stat.ST_MODE]): + filesize = st[stat.ST_SIZE] + if (use_mmap is None) \ + and (filesize > constants.MAX_AUTO_MAP_SIZE): + # + # This is borrowed from FreeBSD's cp(1) implementation: + # Mmap and process if less than 8M (the limit is + # so we don't totally trash memory on big files. + # This is really a minor hack, but it wins some + # CPU back.) Some filesystems, such as smbnetfs, + # don't support mmap, so this is a best-effort + # attempt. 
+ # + use_mmap = False + else: + use_mmap = False + if use_mmap is None: + use_mmap = True + if mmap is None or not use_mmap: + # No mmap available or wanted -> use traditional low-level file IO + fadvise = getattr(os, "posix_fadvise", None) + if fadvise: + fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL) + if not constants.PY2: + fileobj = io.FileIO(fd, mode="r", closefd=False) + buf = bytearray(constants.READ_CHUNK_SIZE) + with memoryview(buf) as full_view: + while True: + try: + n = fileobj.readinto(buf) + except OSError as e: + if e.errno not in (errno.EAGAIN, + errno.EWOULDBLOCK, + errno.EINTR): + raise + else: + if n == 0: + break + if n == constants.READ_CHUNK_SIZE: + h.update(buf) + else: + with full_view[:n] as partial_view: + h.update(partial_view) + else: + while True: + try: + buf = os.read(fd, constants.READ_CHUNK_SIZE) + except OSError as e: + if e.errno not in (errno.EAGAIN, + errno.EWOULDBLOCK, + errno.EINTR): + raise + else: + if len(buf) == 0: + break + h.update(buf) + else: + # + # Use mmap + # + # NOTE: On Windows mmapped files with length 0 are not supported. + # So ensure to not call mmap.mmap() if the file size is 0. + # + madvise = getattr(mmap.mmap, "madvise", None) + if filesize <= constants.MAP_WINDOW_SIZE: + mapsize = filesize + else: + mapsize = constants.MAP_WINDOW_SIZE + mapoffset = 0 + rest = filesize + while rest > 0: + m = mmap.mmap(fd, + mapsize, + access=mmap.ACCESS_READ, + offset=mapoffset) + if madvise: + madvise(m, mmap.MADV_SEQUENTIAL) + try: + h.update(m) + finally: + m.close() + rest -= mapsize + mapoffset += mapsize + if rest < mapsize: + mapsize = rest + finally: + if own_fd: + os.close(fd) + return h.digest() + + +def compute_digest_stream(hashobj, instream): + """Compute the digest for a given byte stream `instream`. 
+ + :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory + :param instream: a bytes input stream to read the data to be hashed from + :return: the digest in binary form + :rtype: bytes + + """ + h = hashobj() + while True: + try: + buf = instream.read(constants.READ_CHUNK_SIZE) + except OSError as e: + if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR): + raise + else: + if buf is not None: + if len(buf) == 0: + break + h.update(buf) + return h.digest() diff -r 2dc26a2f3d1c -r 1e5127028254 shasum.py --- a/shasum.py Wed Jan 01 17:52:41 2025 +0100 +++ b/shasum.py Wed Jan 01 18:57:25 2025 +0100 @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # :- -# :Copyright: (c) 2020-2024 Franz Glasner +# :Copyright: (c) 2020-2025 Franz Glasner # :License: BSD-3-Clause # :- r"""Pure Python implementation of `shasum`.