Mercurial > hgrepos > Python > apps > py-cutils
changeset 23:232063b73e45
Optimized reading of files by using mmap.
Boosts performance by 100%: computing the SHA-256 digest of a 400MB file is
now twice as fast as sha256 on FreeBSD 12.0 (with Python 3.7).
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sun, 06 Dec 2020 13:49:01 +0100 |
| parents | 6bdfc5ad4656 |
| children | 50ba05dc0eab |
| files | shasum.py |
| diffstat | 1 files changed, 77 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/shasum.py Sat Dec 05 15:08:46 2020 +0100 +++ b/shasum.py Sun Dec 06 13:49:01 2020 +0100 @@ -19,13 +19,20 @@ import argparse import hashlib import io +try: + import mmap +except ImportError: + mmap = None +import os import re +import stat import sys PY2 = sys.version_info[0] < 3 -CHUNK_SIZE = 1024 * 1024 * 1024 +CHUNK_SIZE = 1024*1024 +MAP_CHUNK_SIZE = 64*1024*1024 def main(argv=None): @@ -106,18 +113,17 @@ else: source = sys.stdin.buffer out(sys.stdout, - compute_digest(opts.algorithm[0], source), + compute_digest_stream(opts.algorithm[0], source), None, opts.algorithm[1], True) else: for fn in opts.files: - with open(fn, "rb") as source: - out(sys.stdout, - compute_digest(opts.algorithm[0], source), - fn, - opts.algorithm[1], - True) + out(sys.stdout, + compute_digest_file(opts.algorithm[0], fn), + fn, + opts.algorithm[1], + True) return 0 @@ -137,7 +143,7 @@ print("-: MISSING") else: tag, algo, cl_filename, cl_digest = pl - computed_digest = compute_digest(algo, source) + computed_digest = compute_digest_stream(algo, source) if cl_digest.lower() == computed_digest.lower(): res = "OK" else: @@ -152,8 +158,7 @@ exit_code = 1 else: tag, algo, cl_filename, cl_digest = pl - with open(fn, "rb") as source: - computed_digest = compute_digest(algo, source) + computed_digest = compute_digest_file(algo, fn) if cl_digest.lower() == computed_digest.lower(): res = "OK" else: @@ -199,12 +204,11 @@ "improperly formatted digest line: {}".format(line)) tag, algo, fn, digest = parts try: - with open(fn, "rb") as input: - d = compute_digest(algo, input) - if d.lower() == digest.lower(): - return ("ok", fn, tag) - else: - return ("failed", fn, tag) + d = compute_digest_file(algo, fn) + if d.lower() == digest.lower(): + return ("ok", fn, tag) + else: + return ("failed", fn, tag) except EnvironmentError: return ("missing", fn, tag) @@ -367,7 +371,62 @@ file=dest) -def compute_digest(hashobj, instream): +def compute_digest_file(hashobj, filename): + """ + :param hashobj: a 
:mod:`hashlib` compatible hash algorithm type or factory + :param str filename: filename within the filesystem + :return: the digest in hex form + :rtype: str + + """ + h = hashobj() + flags = os.O_RDONLY + try: + flags |= os.O_BINARY + except AttributeError: + pass + fd = os.open(filename, flags) + try: + st = os.fstat(fd) + filesize = st[stat.ST_SIZE] + # + # On Windows mmapped file with length 0 are not supported + # -> use low-level IO + # + if mmap is None: + while True: + buf = os.read(fd, CHUNK_SIZE) + if len(buf) == 0: + break + h.update(buf) + else: + # mmap + if filesize < MAP_CHUNK_SIZE: + mapsize = filesize + mapoffset = 0 + else: + mapsize = MAP_CHUNK_SIZE + mapoffset = 0 + rest = filesize + while rest > 0: + m = mmap.mmap(fd, + mapsize, + access=mmap.ACCESS_READ, + offset=mapoffset) + try: + h.update(m) + finally: + m.close() + rest -= mapsize + mapoffset += mapsize + if rest < mapsize: + mapsize = rest + finally: + os.close(fd) + return h.hexdigest() + + +def compute_digest_stream(hashobj, instream): """ :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
