changeset 23:232063b73e45

Optimized reading of files by using mmap. Boosts performance by 100%: computing the SHA-256 digest of a 400MB file is now twice as fast as sha256 on FreeBSD 12.0 (with Python 3.7).
author Franz Glasner <fzglas.hg@dom66.de>
date Sun, 06 Dec 2020 13:49:01 +0100
parents 6bdfc5ad4656
children 50ba05dc0eab
files shasum.py
diffstat 1 files changed, 77 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/shasum.py	Sat Dec 05 15:08:46 2020 +0100
+++ b/shasum.py	Sun Dec 06 13:49:01 2020 +0100
@@ -19,13 +19,20 @@
 import argparse
 import hashlib
 import io
+try:
+    import mmap
+except ImportError:
+    mmap = None
+import os
 import re
+import stat
 import sys
 
 
 PY2 = sys.version_info[0] < 3
 
-CHUNK_SIZE = 1024 * 1024 * 1024
+CHUNK_SIZE = 1024*1024
+MAP_CHUNK_SIZE = 64*1024*1024
 
 
 def main(argv=None):
@@ -106,18 +113,17 @@
         else:
             source = sys.stdin.buffer
         out(sys.stdout,
-            compute_digest(opts.algorithm[0], source),
+            compute_digest_stream(opts.algorithm[0], source),
             None,
             opts.algorithm[1],
             True)
     else:
         for fn in opts.files:
-            with open(fn, "rb") as source:
-                out(sys.stdout,
-                    compute_digest(opts.algorithm[0], source),
-                    fn,
-                    opts.algorithm[1],
-                    True)
+            out(sys.stdout,
+                compute_digest_file(opts.algorithm[0], fn),
+                fn,
+                opts.algorithm[1],
+                True)
     return 0
 
 
@@ -137,7 +143,7 @@
             print("-: MISSING")
         else:
             tag, algo, cl_filename, cl_digest = pl
-            computed_digest = compute_digest(algo, source)
+            computed_digest = compute_digest_stream(algo, source)
             if cl_digest.lower() == computed_digest.lower():
                 res = "OK"
             else:
@@ -152,8 +158,7 @@
                 exit_code = 1
             else:
                 tag, algo, cl_filename, cl_digest = pl
-                with open(fn, "rb") as source:
-                    computed_digest = compute_digest(algo, source)
+                computed_digest = compute_digest_file(algo, fn)
                 if cl_digest.lower() == computed_digest.lower():
                     res = "OK"
                 else:
@@ -199,12 +204,11 @@
             "improperly formatted digest line: {}".format(line))
     tag, algo, fn, digest = parts
     try:
-        with open(fn, "rb") as input:
-            d = compute_digest(algo, input)
-            if d.lower() == digest.lower():
-                return ("ok", fn, tag)
-            else:
-                return ("failed", fn, tag)
+        d = compute_digest_file(algo, fn)
+        if d.lower() == digest.lower():
+            return ("ok", fn, tag)
+        else:
+            return ("failed", fn, tag)
     except EnvironmentError:
         return ("missing", fn, tag)
 
@@ -367,7 +371,62 @@
           file=dest)
 
 
-def compute_digest(hashobj, instream):
+def compute_digest_file(hashobj, filename):
+    """
+    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
+    :param str filename: filename within the filesystem
+    :return: the digest in hex form
+    :rtype: str
+
+    """
+    h = hashobj()
+    flags = os.O_RDONLY
+    try:
+        flags |= os.O_BINARY
+    except AttributeError:
+        pass
+    fd = os.open(filename, flags)
+    try:
+        st = os.fstat(fd)
+        filesize = st[stat.ST_SIZE]
+        #
+        # On Windows mmapped file with length 0 are not supported
+        # -> use low-level IO
+        #
+        if mmap is None:
+            while True:
+                buf = os.read(fd, CHUNK_SIZE)
+                if len(buf) == 0:
+                    break
+                h.update(buf)
+        else:
+            # mmap
+            if filesize < MAP_CHUNK_SIZE:
+                mapsize = filesize
+                mapoffset = 0
+            else:
+                mapsize = MAP_CHUNK_SIZE
+                mapoffset = 0
+            rest = filesize
+            while rest > 0:
+                m = mmap.mmap(fd,
+                              mapsize,
+                              access=mmap.ACCESS_READ,
+                              offset=mapoffset)
+                try:
+                    h.update(m)
+                finally:
+                    m.close()
+                rest -= mapsize
+                mapoffset += mapsize
+                if rest < mapsize:
+                    mapsize = rest
+    finally:
+        os.close(fd)
+    return h.hexdigest()
+
+
+def compute_digest_stream(hashobj, instream):
     """
 
     :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory