changeset 89:72684020f2f3

By default use mmap only for files up to 8MiB in size. This follows the FreeBSD cp(1) implementation.
author Franz Glasner <fzglas.hg@dom66.de>
date Thu, 21 Apr 2022 01:20:35 +0200
parents f69353f26937
children 42419f57eda9
files cutils/shasum.py
diffstat 1 files changed, 25 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/cutils/shasum.py	Thu Apr 21 00:24:49 2022 +0200
+++ b/cutils/shasum.py	Thu Apr 21 01:20:35 2022 +0200
@@ -45,8 +45,9 @@
     else:
         PATH_TYPES = (str, bytes)
 
-CHUNK_SIZE = 1024*1024
-MAP_CHUNK_SIZE = 64*1024*1024
+READ_CHUNK_SIZE = 2 * 1024 * 1024    # like BUFSIZE_MAX on FreeBSD
+MAX_AUTO_MAP_SIZE = 8 * 1024 * 1024
+MAP_WINDOW_SIZE = MAX_AUTO_MAP_SIZE  # do not totally trash memory on big files
 
 
 def main(argv=None):
@@ -533,12 +534,14 @@
           file=dest)
 
 
-def compute_digest_file(hashobj, path, use_mmap=True):
+def compute_digest_file(hashobj, path, use_mmap=None):
     """
     :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
     :param path: filename within the filesystem or a file descriptor opened in
                  binary mode (also a socket or pipe)
-    :param bool use_mmap: use the :mod:`mmap` module if available
+    :param use_mmap: Use the :mod:`mmap` module if available.
+                     If `None` determine automatically.
+    :type use_mmap: bool or None
     :return: the digest in binary form
     :rtype: bytes
 
@@ -566,13 +569,27 @@
         else:
             if stat.S_ISREG(st[stat.ST_MODE]):
                 filesize = st[stat.ST_SIZE]
+                if (use_mmap is None) \
+                        and (filesize > MAX_AUTO_MAP_SIZE):
+                    #
+                    # This is borrowed from FreeBSD's cp(1) implementation:
+                    # Mmap and process if at most 8M (the limit is
+                    # so we don't totally trash memory on big files).
+                    # This is really a minor hack, but it wins some
+                    # CPU back.  Some filesystems, such as smbnetfs,
+                    # don't support mmap, so this is a best-effort
+                    # attempt.
+                    #
+                    use_mmap = False
             else:
                 use_mmap = False
+        if use_mmap is None:
+            use_mmap = True
         if mmap is None or not use_mmap:
             # No mmap available or wanted -> use traditional low-level file IO
             while True:
                 try:
-                    buf = os.read(fd, CHUNK_SIZE)
+                    buf = os.read(fd, READ_CHUNK_SIZE)
                 except OSError as e:
                     if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK,
                                        errno.EINTR):
@@ -589,10 +606,10 @@
             #       So ensure to not call mmap.mmap() if the file size is 0.
             #
             madvise = getattr(mmap.mmap, "madvise", None)
-            if filesize < MAP_CHUNK_SIZE:
+            if filesize <= MAP_WINDOW_SIZE:
                 mapsize = filesize
             else:
-                mapsize = MAP_CHUNK_SIZE
+                mapsize = MAP_WINDOW_SIZE
             mapoffset = 0
             rest = filesize
             while rest > 0:
@@ -628,7 +645,7 @@
     h = hashobj()
     while True:
         try:
-            buf = instream.read(CHUNK_SIZE)
+            buf = instream.read(READ_CHUNK_SIZE)
         except OSError as e:
             if e.errno not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINTR):
                 raise