Mercurial > hgrepos > Python > apps > py-cutils
comparison shasum.py @ 23:232063b73e45
Optimized reading of files by using mmap.
Boosts performance by 100%: computing the SHA-256 digest of a 400MB file is
now twice as fast as sha256 on FreeBSD 12.0 (with Python 3.7).
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sun, 06 Dec 2020 13:49:01 +0100 |
| parents | 6bdfc5ad4656 |
| children | ea4bb192c437 |
comparison
equal
deleted
inserted
replaced
| 22:6bdfc5ad4656 | 23:232063b73e45 |
|---|---|
| 17 | 17 |
| 18 | 18 |
| 19 import argparse | 19 import argparse |
| 20 import hashlib | 20 import hashlib |
| 21 import io | 21 import io |
| 22 try: | |
| 23 import mmap | |
| 24 except ImportError: | |
| 25 mmap = None | |
| 26 import os | |
| 22 import re | 27 import re |
| 28 import stat | |
| 23 import sys | 29 import sys |
| 24 | 30 |
| 25 | 31 |
| 26 PY2 = sys.version_info[0] < 3 | 32 PY2 = sys.version_info[0] < 3 |
| 27 | 33 |
| 28 CHUNK_SIZE = 1024 * 1024 * 1024 | 34 CHUNK_SIZE = 1024*1024 |
| 35 MAP_CHUNK_SIZE = 64*1024*1024 | |
| 29 | 36 |
| 30 | 37 |
| 31 def main(argv=None): | 38 def main(argv=None): |
| 32 aparser = argparse.ArgumentParser( | 39 aparser = argparse.ArgumentParser( |
| 33 description="Python implementation of shasum", | 40 description="Python implementation of shasum", |
| 104 msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) | 111 msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) |
| 105 source = sys.stdin | 112 source = sys.stdin |
| 106 else: | 113 else: |
| 107 source = sys.stdin.buffer | 114 source = sys.stdin.buffer |
| 108 out(sys.stdout, | 115 out(sys.stdout, |
| 109 compute_digest(opts.algorithm[0], source), | 116 compute_digest_stream(opts.algorithm[0], source), |
| 110 None, | 117 None, |
| 111 opts.algorithm[1], | 118 opts.algorithm[1], |
| 112 True) | 119 True) |
| 113 else: | 120 else: |
| 114 for fn in opts.files: | 121 for fn in opts.files: |
| 115 with open(fn, "rb") as source: | 122 out(sys.stdout, |
| 116 out(sys.stdout, | 123 compute_digest_file(opts.algorithm[0], fn), |
| 117 compute_digest(opts.algorithm[0], source), | 124 fn, |
| 118 fn, | 125 opts.algorithm[1], |
| 119 opts.algorithm[1], | 126 True) |
| 120 True) | |
| 121 return 0 | 127 return 0 |
| 122 | 128 |
| 123 | 129 |
| 124 def verify_digests_with_checklist(opts): | 130 def verify_digests_with_checklist(opts): |
| 125 exit_code = 0 | 131 exit_code = 0 |
| 135 if pl is None: | 141 if pl is None: |
| 136 exit_code = 1 | 142 exit_code = 1 |
| 137 print("-: MISSING") | 143 print("-: MISSING") |
| 138 else: | 144 else: |
| 139 tag, algo, cl_filename, cl_digest = pl | 145 tag, algo, cl_filename, cl_digest = pl |
| 140 computed_digest = compute_digest(algo, source) | 146 computed_digest = compute_digest_stream(algo, source) |
| 141 if cl_digest.lower() == computed_digest.lower(): | 147 if cl_digest.lower() == computed_digest.lower(): |
| 142 res = "OK" | 148 res = "OK" |
| 143 else: | 149 else: |
| 144 res = "FAILED" | 150 res = "FAILED" |
| 145 exit_code = 1 | 151 exit_code = 1 |
| 150 if pl is None: | 156 if pl is None: |
| 151 print("{}: MISSING".format(fn)) | 157 print("{}: MISSING".format(fn)) |
| 152 exit_code = 1 | 158 exit_code = 1 |
| 153 else: | 159 else: |
| 154 tag, algo, cl_filename, cl_digest = pl | 160 tag, algo, cl_filename, cl_digest = pl |
| 155 with open(fn, "rb") as source: | 161 computed_digest = compute_digest_file(algo, fn) |
| 156 computed_digest = compute_digest(algo, source) | |
| 157 if cl_digest.lower() == computed_digest.lower(): | 162 if cl_digest.lower() == computed_digest.lower(): |
| 158 res = "OK" | 163 res = "OK" |
| 159 else: | 164 else: |
| 160 exit_code = 1 | 165 exit_code = 1 |
| 161 res = "FAILED" | 166 res = "FAILED" |
| 197 if not parts: | 202 if not parts: |
| 198 raise ValueError( | 203 raise ValueError( |
| 199 "improperly formatted digest line: {}".format(line)) | 204 "improperly formatted digest line: {}".format(line)) |
| 200 tag, algo, fn, digest = parts | 205 tag, algo, fn, digest = parts |
| 201 try: | 206 try: |
| 202 with open(fn, "rb") as input: | 207 d = compute_digest_file(algo, fn) |
| 203 d = compute_digest(algo, input) | 208 if d.lower() == digest.lower(): |
| 204 if d.lower() == digest.lower(): | 209 return ("ok", fn, tag) |
| 205 return ("ok", fn, tag) | 210 else: |
| 206 else: | 211 return ("failed", fn, tag) |
| 207 return ("failed", fn, tag) | |
| 208 except EnvironmentError: | 212 except EnvironmentError: |
| 209 return ("missing", fn, tag) | 213 return ("missing", fn, tag) |
| 210 | 214 |
| 211 | 215 |
| 212 def get_parsed_digest_line_from_checklist(checklist, opts, filename): | 216 def get_parsed_digest_line_from_checklist(checklist, opts, filename): |
| 365 '*' if binary else ' ', | 369 '*' if binary else ' ', |
| 366 '-' if filename is None else normalize_filename(filename)), | 370 '-' if filename is None else normalize_filename(filename)), |
| 367 file=dest) | 371 file=dest) |
| 368 | 372 |
| 369 | 373 |
| 370 def compute_digest(hashobj, instream): | 374 def compute_digest_file(hashobj, filename): |
| 375 """ | |
| 376 :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory | |
| 377 :param str filename: filename within the filesystem | |
| 378 :return: the digest in hex form | |
| 379 :rtype: str | |
| 380 | |
| 381 """ | |
| 382 h = hashobj() | |
| 383 flags = os.O_RDONLY | |
| 384 try: | |
| 385 flags |= os.O_BINARY | |
| 386 except AttributeError: | |
| 387 pass | |
| 388 fd = os.open(filename, flags) | |
| 389 try: | |
| 390 st = os.fstat(fd) | |
| 391 filesize = st[stat.ST_SIZE] | |
| 392 # | |
| 393 # On Windows, mmapped files with length 0 are not supported | |
| 394 # -> use low-level IO | |
| 395 # | |
| 396 if mmap is None: | |
| 397 while True: | |
| 398 buf = os.read(fd, CHUNK_SIZE) | |
| 399 if len(buf) == 0: | |
| 400 break | |
| 401 h.update(buf) | |
| 402 else: | |
| 403 # mmap | |
| 404 if filesize < MAP_CHUNK_SIZE: | |
| 405 mapsize = filesize | |
| 406 mapoffset = 0 | |
| 407 else: | |
| 408 mapsize = MAP_CHUNK_SIZE | |
| 409 mapoffset = 0 | |
| 410 rest = filesize | |
| 411 while rest > 0: | |
| 412 m = mmap.mmap(fd, | |
| 413 mapsize, | |
| 414 access=mmap.ACCESS_READ, | |
| 415 offset=mapoffset) | |
| 416 try: | |
| 417 h.update(m) | |
| 418 finally: | |
| 419 m.close() | |
| 420 rest -= mapsize | |
| 421 mapoffset += mapsize | |
| 422 if rest < mapsize: | |
| 423 mapsize = rest | |
| 424 finally: | |
| 425 os.close(fd) | |
| 426 return h.hexdigest() | |
| 427 | |
| 428 | |
| 429 def compute_digest_stream(hashobj, instream): | |
| 371 """ | 430 """ |
| 372 | 431 |
| 373 :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory | 432 :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory |
| 374 :param instream: a bytes input stream to read the data to be hashed from | 433 :param instream: a bytes input stream to read the data to be hashed from |
| 375 :return: the digest in hex form | 434 :return: the digest in hex form |
