comparison cutils/shasum.py @ 122:1e5127028254

Move the real computation of digests from files and streams into dedicated submodule cutils.util.digest
author Franz Glasner <fzglas.hg@dom66.de>
date Wed, 01 Jan 2025 18:57:25 +0100
parents a548783381b6
children a813094ae4f5
comparison
equal deleted inserted replaced
121:2dc26a2f3d1c 122:1e5127028254
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 # :- 2 # :-
3 # :Copyright: (c) 2020-2024 Franz Glasner 3 # :Copyright: (c) 2020-2025 Franz Glasner
4 # :License: BSD-3-Clause 4 # :License: BSD-3-Clause
5 # :- 5 # :-
6 r"""Pure Python implementation of `shasum`. 6 r"""Pure Python implementation of `shasum`.
7 7
8 """ 8 """
16 import argparse 16 import argparse
17 import base64 17 import base64
18 import binascii 18 import binascii
19 import errno 19 import errno
20 import io 20 import io
21 try:
22 import mmap
23 except ImportError:
24 mmap = None
25 import os 21 import os
26 import re 22 import re
27 import stat
28 import sys 23 import sys
29 24
30 from . import (__version__, __revision__) 25 from . import (__version__, __revision__)
31 from . import util 26 from . import util
32 from .util import constants 27 from .util import constants
28 from .util import digest
33 29
34 30
35 def main(argv=None): 31 def main(argv=None):
36 aparser = argparse.ArgumentParser( 32 aparser = argparse.ArgumentParser(
37 description="Python implementation of shasum", 33 description="Python implementation of shasum",
168 dirnames.sort() 164 dirnames.sort()
169 dirfiles.sort() 165 dirfiles.sort()
170 for fn in dirfiles: 166 for fn in dirfiles:
171 path = os.path.join(dirpath, fn) 167 path = os.path.join(dirpath, fn)
172 out(opts.dest or sys.stdout, 168 out(opts.dest or sys.stdout,
173 compute_digest_file(opts.algorithm[0], path, 169 digest.compute_digest_file(
174 use_mmap=opts.mmap), 170 opts.algorithm[0], path, use_mmap=opts.mmap),
175 path, 171 path,
176 opts.algorithm[1], 172 opts.algorithm[1],
177 True, 173 True,
178 opts.base64) 174 opts.base64)
179 else: 175 else:
184 msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) 180 msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
185 source = sys.stdin 181 source = sys.stdin
186 else: 182 else:
187 source = sys.stdin.buffer 183 source = sys.stdin.buffer
188 out(sys.stdout, 184 out(sys.stdout,
189 compute_digest_stream(opts.algorithm[0], source), 185 digest.compute_digest_stream(opts.algorithm[0], source),
190 None, 186 None,
191 opts.algorithm[1], 187 opts.algorithm[1],
192 True, 188 True,
193 opts.base64) 189 opts.base64)
194 else: 190 else:
195 for fn in opts.files: 191 for fn in opts.files:
196 out(opts.dest or sys.stdout, 192 out(opts.dest or sys.stdout,
197 compute_digest_file(opts.algorithm[0], fn, 193 digest.compute_digest_file(
198 use_mmap=opts.mmap), 194 opts.algorithm[0], fn, use_mmap=opts.mmap),
199 fn, 195 fn,
200 opts.algorithm[1], 196 opts.algorithm[1],
201 True, 197 True,
202 opts.base64) 198 opts.base64)
203 return 0 199 return 0
258 if pl is None: 254 if pl is None:
259 exit_code = 1 255 exit_code = 1
260 print("-: MISSING", file=dest) 256 print("-: MISSING", file=dest)
261 else: 257 else:
262 tag, algo, cl_filename, cl_digest = pl 258 tag, algo, cl_filename, cl_digest = pl
263 computed_digest = compute_digest_stream(algo, source) 259 computed_digest = digest.compute_digest_stream(algo, source)
264 if compare_digests_equal(computed_digest, cl_digest, algo): 260 if compare_digests_equal(computed_digest, cl_digest, algo):
265 res = "OK" 261 res = "OK"
266 else: 262 else:
267 res = "FAILED" 263 res = "FAILED"
268 exit_code = 1 264 exit_code = 1
273 if pl is None: 269 if pl is None:
274 print("{}: MISSING".format(fn), file=dest) 270 print("{}: MISSING".format(fn), file=dest)
275 exit_code = 1 271 exit_code = 1
276 else: 272 else:
277 tag, algo, cl_filename, cl_digest = pl 273 tag, algo, cl_filename, cl_digest = pl
278 computed_digest = compute_digest_file(algo, fn, 274 computed_digest = digest.compute_digest_file(
279 use_mmap=opts.mmap) 275 algo, fn, use_mmap=opts.mmap)
280 if compare_digests_equal(computed_digest, cl_digest, algo): 276 if compare_digests_equal(computed_digest, cl_digest, algo):
281 res = "OK" 277 res = "OK"
282 else: 278 else:
283 exit_code = 1 279 exit_code = 1
284 res = "FAILED" 280 res = "FAILED"
330 tag, algo, fn, digest = parts 326 tag, algo, fn, digest = parts
331 if tag in ("SIZE", "TIMESTAMP"): 327 if tag in ("SIZE", "TIMESTAMP"):
332 assert opts.allow_distinfo 328 assert opts.allow_distinfo
333 return (None, None, tag) 329 return (None, None, tag)
334 try: 330 try:
335 d = compute_digest_file(algo, fn, use_mmap=opts.mmap) 331 d = digest.compute_digest_file(algo, fn, use_mmap=opts.mmap)
336 if compare_digests_equal(d, digest, algo): 332 if compare_digests_equal(d, digest, algo):
337 return ("ok", fn, tag) 333 return ("ok", fn, tag)
338 else: 334 else:
339 return ("failed", fn, tag) 335 return ("failed", fn, tag)
340 except EnvironmentError: 336 except EnvironmentError:
441 '*' if binary else ' ', 437 '*' if binary else ' ',
442 '-' if filename is None else util.normalize_filename(filename)), 438 '-' if filename is None else util.normalize_filename(filename)),
443 file=dest) 439 file=dest)
444 440
445 441
def compute_digest_file(hashobj, path, use_mmap=None):
    """Compute the digest of a file given by name or by file descriptor.

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
    :param path: filename within the filesystem or a file descriptor opened in
                 binary mode (also a socket or pipe)
    :param use_mmap: Use the :mod:`mmap` module if available.
                     If `None` determine automatically.
    :type use_mmap: bool or None
    :return: the digest in binary form
    :rtype: bytes

    If a file descriptor is given it must support :func:`os.read`.
    A descriptor handed in by the caller is never closed here; only a
    descriptor that was opened from a filename is closed again.

    """
    h = hashobj()
    if isinstance(path, constants.PATH_TYPES):
        flags = os.O_RDONLY | getattr(os, "O_BINARY", 0) \
            | getattr(os, "O_SEQUENTIAL", 0) | getattr(os, "O_NOCTTY", 0)
        fd = os.open(path, flags)
        own_fd = True
    else:
        fd = path
        own_fd = False
    try:
        try:
            st = os.fstat(fd)
        except TypeError:
            #
            # "fd" is most probably a Python socket object.
            # (a pipe typically supports fstat)
            #
            use_mmap = False
        else:
            if stat.S_ISREG(st[stat.ST_MODE]):
                filesize = st[stat.ST_SIZE]
                if (use_mmap is None) \
                        and (filesize > constants.MAX_AUTO_MAP_SIZE):
                    #
                    # This is borrowed from FreeBSD's cp(1) implementation:
                    # Mmap and process if less than 8M (the limit is
                    # so we don't totally trash memory on big files).
                    # This is really a minor hack, but it wins some
                    # CPU back. Some filesystems, such as smbnetfs,
                    # don't support mmap, so this is a best-effort
                    # attempt.
                    #
                    use_mmap = False
            else:
                # Only regular files are mapped
                use_mmap = False
        if use_mmap is None:
            use_mmap = True
        if mmap is None or not use_mmap:
            # No mmap available or wanted -> use traditional low-level file IO
            fadvise = getattr(os, "posix_fadvise", None)
            if fadvise:
                try:
                    fadvise(fd, 0, 0, os.POSIX_FADV_SEQUENTIAL)
                except OSError:
                    #
                    # The advice is only a hint: pipes and FIFOs reject it
                    # (ESPIPE) but can be read and hashed nevertheless.
                    #
                    pass
            if not constants.PY2:
                fileobj = io.FileIO(fd, mode="r", closefd=False)
                buf = bytearray(constants.READ_CHUNK_SIZE)
                with memoryview(buf) as full_view:
                    while True:
                        try:
                            n = fileobj.readinto(buf)
                        except OSError as e:
                            if e.errno not in (errno.EAGAIN,
                                               errno.EWOULDBLOCK,
                                               errno.EINTR):
                                raise
                        else:
                            if n is None:
                                #
                                # Non-blocking descriptor without data
                                # available yet: nothing has been read into
                                # the buffer, so retry instead of hashing
                                # stale buffer contents.
                                #
                                continue
                            if n == 0:
                                break
                            if n == constants.READ_CHUNK_SIZE:
                                h.update(buf)
                            else:
                                # Hash only the part that has been filled
                                with full_view[:n] as partial_view:
                                    h.update(partial_view)
            else:
                while True:
                    try:
                        buf = os.read(fd, constants.READ_CHUNK_SIZE)
                    except OSError as e:
                        if e.errno not in (errno.EAGAIN,
                                           errno.EWOULDBLOCK,
                                           errno.EINTR):
                            raise
                    else:
                        if len(buf) == 0:
                            break
                        h.update(buf)
        else:
            #
            # Use mmap
            #
            # NOTE: On Windows mmapped files with length 0 are not supported.
            #       So ensure to not call mmap.mmap() if the file size is 0.
            #
            madvise = getattr(mmap.mmap, "madvise", None)
            if filesize <= constants.MAP_WINDOW_SIZE:
                mapsize = filesize
            else:
                mapsize = constants.MAP_WINDOW_SIZE
            mapoffset = 0
            rest = filesize
            # Walk over the file in windows of at most MAP_WINDOW_SIZE bytes
            while rest > 0:
                m = mmap.mmap(fd,
                              mapsize,
                              access=mmap.ACCESS_READ,
                              offset=mapoffset)
                if madvise:
                    madvise(m, mmap.MADV_SEQUENTIAL)
                try:
                    h.update(m)
                finally:
                    m.close()
                rest -= mapsize
                mapoffset += mapsize
                # The last window may be shorter than the previous ones
                if rest < mapsize:
                    mapsize = rest
    finally:
        if own_fd:
            os.close(fd)
    return h.digest()
568
569
def compute_digest_stream(hashobj, instream):
    """Compute the digest of all data that can be read from a stream.

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
    :param instream: a bytes input stream to read the data to be hashed from
    :return: the digest in binary form
    :rtype: bytes

    Reads are retried when they fail with `EAGAIN`, `EWOULDBLOCK` or
    `EINTR` and when `read()` yields `None`; every other error is
    propagated to the caller. An empty read result ends the input.

    """
    h = hashobj()
    while True:
        try:
            buf = instream.read(constants.READ_CHUNK_SIZE)
        except EnvironmentError as e:
            #
            # EnvironmentError instead of OSError: on Python 2 file objects
            # raise IOError, which is not a subclass of OSError there.
            # On Python 3 both names refer to the very same class.
            #
            if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR):
                raise
        else:
            if buf is None:
                # No data available right now -> try again
                continue
            if len(buf) == 0:
                break
            h.update(buf)
    return h.digest()
592
593
594 if __name__ == "__main__": 442 if __name__ == "__main__":
595 sys.exit(main()) 443 sys.exit(main())