Mercurial > hgrepos > Python > apps > py-cutils
comparison cutils/util/digest.py @ 122:1e5127028254
Move the real computation of digests from files and streams into dedicated submodule cutils.util.digest
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Wed, 01 Jan 2025 18:57:25 +0100 |
| parents | |
| children | a813094ae4f5 |
comparison
equal
deleted
inserted
replaced
| 121:2dc26a2f3d1c | 122:1e5127028254 |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 # :- | |
| 3 # :Copyright: (c) 2020-2025 Franz Glasner | |
| 4 # :License: BSD-3-Clause | |
| 5 # :- | |
| 6 r"""Utility sub-module implementing digest computations for files and streams. | |
| 7 | |
| 8 """ | |
| 9 | |
| 10 __all__ = ["compute_digest_file", "compute_digest_stream"] | |
| 11 | |
| 12 | |
| 13 import errno | |
| 14 import io | |
| 15 import os | |
| 16 try: | |
| 17 import mmap | |
| 18 except ImportError: | |
| 19 mmap = None | |
| 20 import stat | |
| 21 | |
| 22 from . import constants | |
| 23 | |
| 24 | |
def compute_digest_file(hashobj, path, use_mmap=None):
    """Compute the digest of a file given by its filename or by an open fd.

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
    :param path: filename within the filesystem or a file descriptor opened in
                 binary mode (also a socket or pipe)
    :param use_mmap: Use the :mod:`mmap` module if available.
                     If `None` determine automatically.
    :type use_mmap: bool or None
    :return: the digest in binary form
    :rtype: bytes

    If a file descriptor is given it must support :func:`os.read`.
    A file descriptor handed in by the caller is never closed here; only
    a descriptor opened by this function for a given filename is closed.

    Memory mapping is used for regular files only. Everything else is
    read chunk-wise with `constants.READ_CHUNK_SIZE` sized reads.

    """
    h = hashobj()
    if isinstance(path, constants.PATH_TYPES):
        flags = (os.O_RDONLY
                 | getattr(os, "O_BINARY", 0)
                 | getattr(os, "O_SEQUENTIAL", 0)
                 | getattr(os, "O_NOCTTY", 0))
        fd = os.open(path, flags)
        own_fd = True
    else:
        fd = path
        own_fd = False
    # errno values that signal "try again" instead of a real read failure
    transient_errnos = (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR)
    try:
        is_regular = False
        try:
            st = os.fstat(fd)
        except TypeError:
            #
            # "fd" is most probably a Python socket object.
            # (a pipe typically supports fstat)
            #
            use_mmap = False
        else:
            if stat.S_ISREG(st[stat.ST_MODE]):
                is_regular = True
                filesize = st[stat.ST_SIZE]
                if (use_mmap is None) \
                        and (filesize > constants.MAX_AUTO_MAP_SIZE):
                    #
                    # This is borrowed from FreeBSD's cp(1) implementation:
                    # Mmap and process if less than 8M (the limit is
                    # so we don't totally trash memory on big files).
                    # This is really a minor hack, but it wins some
                    # CPU back. Some filesystems, such as smbnetfs,
                    # don't support mmap, so this is a best-effort
                    # attempt.
                    #
                    # NOTE: the limit effectively applied here is
                    #       constants.MAX_AUTO_MAP_SIZE.
                    #
                    use_mmap = False
            else:
                # Only regular files have a size that can be mapped
                use_mmap = False
        if use_mmap is None:
            use_mmap = True
        if mmap is None or not use_mmap:
            # No mmap available or wanted -> use traditional low-level file IO
            fadvise = getattr(os, "posix_fadvise", None)
            if fadvise and is_regular:
                #
                # The access pattern hint is given for regular files only:
                # posix_fadvise() fails with ESPIPE on pipes and FIFOs and
                # cannot handle a Python socket object at all.
                #
                fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL)
            if not constants.PY2:
                # closefd=False: the lifetime of "fd" is managed below/by
                # the caller and not by the FileIO wrapper
                fileobj = io.FileIO(fd, mode="r", closefd=False)
                buf = bytearray(constants.READ_CHUNK_SIZE)
                with memoryview(buf) as full_view:
                    while True:
                        try:
                            n = fileobj.readinto(buf)
                        except OSError as e:
                            if e.errno not in transient_errnos:
                                raise
                        else:
                            if n is None:
                                #
                                # Non-blocking descriptor without data
                                # available right now: nothing has been
                                # read into "buf", so there is nothing to
                                # hash. Just retry.
                                #
                                continue
                            if n == 0:
                                # EOF
                                break
                            if n == constants.READ_CHUNK_SIZE:
                                h.update(buf)
                            else:
                                # Short read: hash only the valid prefix
                                # without copying the data
                                with full_view[:n] as partial_view:
                                    h.update(partial_view)
            else:
                while True:
                    try:
                        buf = os.read(fd, constants.READ_CHUNK_SIZE)
                    except OSError as e:
                        if e.errno not in transient_errnos:
                            raise
                    else:
                        if len(buf) == 0:
                            # EOF
                            break
                        h.update(buf)
        else:
            #
            # Use mmap
            #
            # NOTE: On Windows mmapped files with length 0 are not supported.
            #       So ensure to not call mmap.mmap() if the file size is 0.
            #       The "rest > 0" loop condition takes care of this.
            #
            # NOTE(review): the "offset" given to mmap.mmap() must be a
            #       multiple of mmap.ALLOCATIONGRANULARITY; presumably
            #       constants.MAP_WINDOW_SIZE is chosen accordingly --
            #       confirm.
            #
            madvise = getattr(mmap.mmap, "madvise", None)
            mapsize = min(filesize, constants.MAP_WINDOW_SIZE)
            mapoffset = 0
            rest = filesize
            while rest > 0:
                m = mmap.mmap(fd,
                              mapsize,
                              access=mmap.ACCESS_READ,
                              offset=mapoffset)
                try:
                    # Within the try block: a failing madvise() must not
                    # leak the mapping
                    if madvise:
                        madvise(m, mmap.MADV_SEQUENTIAL)
                    h.update(m)
                finally:
                    m.close()
                rest -= mapsize
                mapoffset += mapsize
                if rest < mapsize:
                    # The last window is smaller than the previous ones
                    mapsize = rest
    finally:
        if own_fd:
            os.close(fd)
    return h.digest()
| 148 | |
| 149 | |
def compute_digest_stream(hashobj, instream):
    """Compute the digest of all data that can be read from `instream`.

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
    :param instream: a bytes input stream to read the data to be hashed from
    :return: the digest in binary form
    :rtype: bytes

    The stream is read in chunks of `constants.READ_CHUNK_SIZE` until a
    read yields an empty result. A read that yields `None` or that fails
    with `EAGAIN`, `EWOULDBLOCK` or `EINTR` is just retried.

    """
    digester = hashobj()
    retryable = (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR)
    while True:
        try:
            chunk = instream.read(constants.READ_CHUNK_SIZE)
        except OSError as exc:
            if exc.errno in retryable:
                # Transient condition: try the read again
                continue
            raise
        if chunk is None:
            # No data available at the moment: try again
            continue
        if len(chunk) == 0:
            # EOF
            break
        digester.update(chunk)
    return digester.digest()
