comparison cutils/util/digest.py @ 122:1e5127028254

Move the real computation of digests from files and streams into dedicated submodule cutils.util.digest
author Franz Glasner <fzglas.hg@dom66.de>
date Wed, 01 Jan 2025 18:57:25 +0100
parents
children a813094ae4f5
comparison
equal deleted inserted replaced
121:2dc26a2f3d1c 122:1e5127028254
1 # -*- coding: utf-8 -*-
2 # :-
3 # :Copyright: (c) 2020-2025 Franz Glasner
4 # :License: BSD-3-Clause
5 # :-
6 r"""Utility sub-module to implement a file and stream digest computations.
7
8 """
9
10 __all__ = ["compute_digest_file", "compute_digest_stream"]
11
12
13 import errno
14 import io
15 import os
16 try:
17 import mmap
18 except ImportError:
19 mmap = None
20 import stat
21
22 from . import constants
23
24
def compute_digest_file(hashobj, path, use_mmap=None):
    """Compute the digest for a file given by its filename or by an open fd.

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
    :param path: filename within the filesystem or a file descriptor opened in
                 binary mode (also a socket or pipe)
    :param use_mmap: Use the :mod:`mmap` module if available.
                     If `None` determine automatically.
    :type use_mmap: bool or None
    :return: the digest in binary form
    :rtype: bytes

    If a file descriptor is given it must support :func:`os.read`.

    Memory mapping is only used for regular files; for everything else
    `use_mmap` is forced to `False`.  A file descriptor handed in by the
    caller is never closed here; only a descriptor opened by this function
    from a filename is closed again.

    """
    h = hashobj()
    if isinstance(path, constants.PATH_TYPES):
        flags = os.O_RDONLY | getattr(os, "O_BINARY", 0) \
            | getattr(os, "O_SEQUENTIAL", 0) | getattr(os, "O_NOCTTY", 0)
        fd = os.open(path, flags)
        own_fd = True
    else:
        fd = path
        own_fd = False
    try:
        try:
            st = os.fstat(fd)
        except TypeError:
            #
            # "fd" is most probably a Python socket object.
            # (a pipe typically supports fstat)
            #
            use_mmap = False
        else:
            if stat.S_ISREG(st[stat.ST_MODE]):
                filesize = st[stat.ST_SIZE]
                if (use_mmap is None) \
                        and (filesize > constants.MAX_AUTO_MAP_SIZE):
                    #
                    # This is borrowed from FreeBSD's cp(1) implementation:
                    # Mmap and process if less than 8M (the limit is
                    # so we don't totally trash memory on big files).
                    # This is really a minor hack, but it wins some
                    # CPU back.  Some filesystems, such as smbnetfs,
                    # don't support mmap, so this is a best-effort
                    # attempt.
                    #
                    use_mmap = False
            else:
                # Only regular files can be memory mapped
                use_mmap = False
        if use_mmap is None:
            use_mmap = True
        if mmap is None or not use_mmap:
            # No mmap available or wanted -> use traditional low-level file IO
            fadvise = getattr(os, "posix_fadvise", None)
            if fadvise:
                try:
                    fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL)
                except OSError:
                    #
                    # The advice is only a hint.  It is rejected for pipes
                    # and FIFOs (ESPIPE) and possibly by some filesystems,
                    # but this must not prevent reading the data.
                    #
                    pass
            retriable = (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR)
            if not constants.PY2:
                fileobj = io.FileIO(fd, mode="r", closefd=False)
                buf = bytearray(constants.READ_CHUNK_SIZE)
                with memoryview(buf) as full_view:
                    while True:
                        try:
                            n = fileobj.readinto(buf)
                        except OSError as e:
                            if e.errno not in retriable:
                                raise
                        else:
                            if n is None:
                                #
                                # Non-blocking fd without data available:
                                # nothing has been read into the buffer, so
                                # nothing must be hashed.  Just retry.
                                #
                                continue
                            if n == 0:
                                break
                            if n == constants.READ_CHUNK_SIZE:
                                h.update(buf)
                            else:
                                # Hash only the bytes that were really read
                                with full_view[:n] as partial_view:
                                    h.update(partial_view)
            else:
                while True:
                    try:
                        buf = os.read(fd, constants.READ_CHUNK_SIZE)
                    except OSError as e:
                        if e.errno not in retriable:
                            raise
                    else:
                        if len(buf) == 0:
                            break
                        h.update(buf)
        else:
            #
            # Use mmap
            #
            # NOTE: On Windows mmapped files with length 0 are not supported.
            #       So ensure to not call mmap.mmap() if the file size is 0.
            #
            madvise = getattr(mmap.mmap, "madvise", None)
            madv_sequential = getattr(mmap, "MADV_SEQUENTIAL", None)
            if filesize <= constants.MAP_WINDOW_SIZE:
                mapsize = filesize
            else:
                mapsize = constants.MAP_WINDOW_SIZE
            mapoffset = 0
            rest = filesize
            # Slide a window of at most MAP_WINDOW_SIZE bytes over the file
            while rest > 0:
                m = mmap.mmap(fd,
                              mapsize,
                              access=mmap.ACCESS_READ,
                              offset=mapoffset)
                try:
                    # Within the try block: never leak the mapping
                    if madvise and madv_sequential is not None:
                        madvise(m, madv_sequential)
                    h.update(m)
                finally:
                    m.close()
                rest -= mapsize
                mapoffset += mapsize
                if rest < mapsize:
                    # The last window is shorter
                    mapsize = rest
    finally:
        if own_fd:
            os.close(fd)
    return h.digest()
148
149
def compute_digest_stream(hashobj, instream):
    """Compute the digest of all data read from the byte stream `instream`.

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
    :param instream: a bytes input stream to read the data to be hashed from
    :return: the digest in binary form
    :rtype: bytes

    The stream is read in chunks of `constants.READ_CHUNK_SIZE` until a
    read returns an empty result.  A read that returns `None` or that fails
    with `EAGAIN`, `EWOULDBLOCK` or `EINTR` is retried.

    """
    retriable = (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR)
    h = hashobj()
    while True:
        try:
            buf = instream.read(constants.READ_CHUNK_SIZE)
        except (OSError, IOError) as e:
            #
            # On Python 2 file-like objects raise IOError, which is not
            # a subclass of OSError there.  On Python 3 both names refer
            # to the very same class.
            #
            if e.errno not in retriable:
                raise
        else:
            if buf is None:
                # Non-blocking stream without data available: retry
                continue
            if len(buf) == 0:
                break
            h.update(buf)
    return h.digest()