comparison cutils/shasum.py @ 89:72684020f2f3

By default use mmap only for files up to 8MiB in size. This follows the FreeBSD cp(1) implementation.
author Franz Glasner <fzglas.hg@dom66.de>
date Thu, 21 Apr 2022 01:20:35 +0200
parents f69353f26937
children 42419f57eda9
comparison
equal deleted inserted replaced
88:f69353f26937 89:72684020f2f3
43 if pathlib: 43 if pathlib:
44 PATH_TYPES = (str, bytes, pathlib.Path) 44 PATH_TYPES = (str, bytes, pathlib.Path)
45 else: 45 else:
46 PATH_TYPES = (str, bytes) 46 PATH_TYPES = (str, bytes)
47 47
48 CHUNK_SIZE = 1024*1024 48 READ_CHUNK_SIZE = 2 * 1024 * 1024 # like BUFSIZE_MAX on FreeBSD
49 MAP_CHUNK_SIZE = 64*1024*1024 49 MAX_AUTO_MAP_SIZE = 8 * 1024 * 1024
50 MAP_WINDOW_SIZE = MAX_AUTO_MAP_SIZE # do not totally trash memory on big files
50 51
51 52
52 def main(argv=None): 53 def main(argv=None):
53 aparser = argparse.ArgumentParser( 54 aparser = argparse.ArgumentParser(
54 description="Python implementation of shasum", 55 description="Python implementation of shasum",
531 '*' if binary else ' ', 532 '*' if binary else ' ',
532 '-' if filename is None else normalize_filename(filename)), 533 '-' if filename is None else normalize_filename(filename)),
533 file=dest) 534 file=dest)
534 535
535 536
536 def compute_digest_file(hashobj, path, use_mmap=True): 537 def compute_digest_file(hashobj, path, use_mmap=None):
537 """ 538 """
538 :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory 539 :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
539 :param path: filename within the filesystem or a file descriptor opened in 540 :param path: filename within the filesystem or a file descriptor opened in
540 binary mode (also a socket or pipe) 541 binary mode (also a socket or pipe)
541 :param bool use_mmap: use the :mod:`mmap` module if available 542 :param use_mmap: Use the :mod:`mmap` module if available.
543 If `None`, determine automatically.
544 :type use_mmap: bool or None
542 :return: the digest in binary form 545 :return: the digest in binary form
543 :rtype: bytes 546 :rtype: bytes
544 547
545 If a file descriptor is given it must support :func:`os.read`. 548 If a file descriptor is given it must support :func:`os.read`.
546 549
564 # 567 #
565 use_mmap = False 568 use_mmap = False
566 else: 569 else:
567 if stat.S_ISREG(st[stat.ST_MODE]): 570 if stat.S_ISREG(st[stat.ST_MODE]):
568 filesize = st[stat.ST_SIZE] 571 filesize = st[stat.ST_SIZE]
572 if (use_mmap is None) \
573 and (filesize > MAX_AUTO_MAP_SIZE):
574 #
575 # This is borrowed from FreeBSD's cp(1) implementation:
576 # Mmap and process if less than 8M (the limit is
577 # so we don't totally trash memory on big files).
578 # This is really a minor hack, but it wins some
579 # CPU back. Some filesystems, such as smbnetfs,
580 # don't support mmap, so this is a best-effort
581 # attempt.
582 #
583 use_mmap = False
569 else: 584 else:
570 use_mmap = False 585 use_mmap = False
586 if use_mmap is None:
587 use_mmap = True
571 if mmap is None or not use_mmap: 588 if mmap is None or not use_mmap:
572 # No mmap available or wanted -> use traditional low-level file IO 589 # No mmap available or wanted -> use traditional low-level file IO
573 while True: 590 while True:
574 try: 591 try:
575 buf = os.read(fd, CHUNK_SIZE) 592 buf = os.read(fd, READ_CHUNK_SIZE)
576 except OSError as e: 593 except OSError as e:
577 if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, 594 if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK,
578 errno.EINTR): 595 errno.EINTR):
579 raise 596 raise
580 else: 597 else:
587 # 604 #
588 # NOTE: On Windows mmapped files with length 0 are not supported. 605 # NOTE: On Windows mmapped files with length 0 are not supported.
589 # So ensure to not call mmap.mmap() if the file size is 0. 606 # So ensure to not call mmap.mmap() if the file size is 0.
590 # 607 #
591 madvise = getattr(mmap.mmap, "madvise", None) 608 madvise = getattr(mmap.mmap, "madvise", None)
592 if filesize < MAP_CHUNK_SIZE: 609 if filesize <= MAP_WINDOW_SIZE:
593 mapsize = filesize 610 mapsize = filesize
594 else: 611 else:
595 mapsize = MAP_CHUNK_SIZE 612 mapsize = MAP_WINDOW_SIZE
596 mapoffset = 0 613 mapoffset = 0
597 rest = filesize 614 rest = filesize
598 while rest > 0: 615 while rest > 0:
599 m = mmap.mmap(fd, 616 m = mmap.mmap(fd,
600 mapsize, 617 mapsize,
626 643
627 """ 644 """
628 h = hashobj() 645 h = hashobj()
629 while True: 646 while True:
630 try: 647 try:
631 buf = instream.read(CHUNK_SIZE) 648 buf = instream.read(READ_CHUNK_SIZE)
632 except OSError as e: 649 except OSError as e:
633 if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR): 650 if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR):
634 raise 651 raise
635 else: 652 else:
636 if buf is not None: 653 if buf is not None: