comparison shasum.py @ 23:232063b73e45

Optimized reading of files by using mmap. Boosts performance by 100%: computing the SHA-256 digest of a 400MB file is now twice as fast as sha256 on FreeBSD 12.0 (with Python 3.7).
author Franz Glasner <fzglas.hg@dom66.de>
date Sun, 06 Dec 2020 13:49:01 +0100
parents 6bdfc5ad4656
children ea4bb192c437
comparison
equal deleted inserted replaced
22:6bdfc5ad4656 23:232063b73e45
17 17
18 18
19 import argparse 19 import argparse
20 import hashlib 20 import hashlib
21 import io 21 import io
22 try:
23 import mmap
24 except ImportError:
25 mmap = None
26 import os
22 import re 27 import re
28 import stat
23 import sys 29 import sys
24 30
25 31
26 PY2 = sys.version_info[0] < 3 32 PY2 = sys.version_info[0] < 3
27 33
28 CHUNK_SIZE = 1024 * 1024 * 1024 34 CHUNK_SIZE = 1024*1024
35 MAP_CHUNK_SIZE = 64*1024*1024
29 36
30 37
31 def main(argv=None): 38 def main(argv=None):
32 aparser = argparse.ArgumentParser( 39 aparser = argparse.ArgumentParser(
33 description="Python implementation of shasum", 40 description="Python implementation of shasum",
104 msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) 111 msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
105 source = sys.stdin 112 source = sys.stdin
106 else: 113 else:
107 source = sys.stdin.buffer 114 source = sys.stdin.buffer
108 out(sys.stdout, 115 out(sys.stdout,
109 compute_digest(opts.algorithm[0], source), 116 compute_digest_stream(opts.algorithm[0], source),
110 None, 117 None,
111 opts.algorithm[1], 118 opts.algorithm[1],
112 True) 119 True)
113 else: 120 else:
114 for fn in opts.files: 121 for fn in opts.files:
115 with open(fn, "rb") as source: 122 out(sys.stdout,
116 out(sys.stdout, 123 compute_digest_file(opts.algorithm[0], fn),
117 compute_digest(opts.algorithm[0], source), 124 fn,
118 fn, 125 opts.algorithm[1],
119 opts.algorithm[1], 126 True)
120 True)
121 return 0 127 return 0
122 128
123 129
124 def verify_digests_with_checklist(opts): 130 def verify_digests_with_checklist(opts):
125 exit_code = 0 131 exit_code = 0
135 if pl is None: 141 if pl is None:
136 exit_code = 1 142 exit_code = 1
137 print("-: MISSING") 143 print("-: MISSING")
138 else: 144 else:
139 tag, algo, cl_filename, cl_digest = pl 145 tag, algo, cl_filename, cl_digest = pl
140 computed_digest = compute_digest(algo, source) 146 computed_digest = compute_digest_stream(algo, source)
141 if cl_digest.lower() == computed_digest.lower(): 147 if cl_digest.lower() == computed_digest.lower():
142 res = "OK" 148 res = "OK"
143 else: 149 else:
144 res = "FAILED" 150 res = "FAILED"
145 exit_code = 1 151 exit_code = 1
150 if pl is None: 156 if pl is None:
151 print("{}: MISSING".format(fn)) 157 print("{}: MISSING".format(fn))
152 exit_code = 1 158 exit_code = 1
153 else: 159 else:
154 tag, algo, cl_filename, cl_digest = pl 160 tag, algo, cl_filename, cl_digest = pl
155 with open(fn, "rb") as source: 161 computed_digest = compute_digest_file(algo, fn)
156 computed_digest = compute_digest(algo, source)
157 if cl_digest.lower() == computed_digest.lower(): 162 if cl_digest.lower() == computed_digest.lower():
158 res = "OK" 163 res = "OK"
159 else: 164 else:
160 exit_code = 1 165 exit_code = 1
161 res = "FAILED" 166 res = "FAILED"
197 if not parts: 202 if not parts:
198 raise ValueError( 203 raise ValueError(
199 "improperly formatted digest line: {}".format(line)) 204 "improperly formatted digest line: {}".format(line))
200 tag, algo, fn, digest = parts 205 tag, algo, fn, digest = parts
201 try: 206 try:
202 with open(fn, "rb") as input: 207 d = compute_digest_file(algo, fn)
203 d = compute_digest(algo, input) 208 if d.lower() == digest.lower():
204 if d.lower() == digest.lower(): 209 return ("ok", fn, tag)
205 return ("ok", fn, tag) 210 else:
206 else: 211 return ("failed", fn, tag)
207 return ("failed", fn, tag)
208 except EnvironmentError: 212 except EnvironmentError:
209 return ("missing", fn, tag) 213 return ("missing", fn, tag)
210 214
211 215
212 def get_parsed_digest_line_from_checklist(checklist, opts, filename): 216 def get_parsed_digest_line_from_checklist(checklist, opts, filename):
365 '*' if binary else ' ', 369 '*' if binary else ' ',
366 '-' if filename is None else normalize_filename(filename)), 370 '-' if filename is None else normalize_filename(filename)),
367 file=dest) 371 file=dest)
368 372
369 373
def compute_digest_file(hashobj, filename):
    """Compute the hex digest of the contents of a file.

    Regular files with a non-zero size are hashed through read-only
    memory maps of at most `MAP_CHUNK_SIZE` bytes each when :mod:`mmap`
    is available.  In every other case the file is read with
    :func:`os.read` in pieces of `CHUNK_SIZE` bytes:

    - :mod:`mmap` could not be imported,
    - the reported file size is 0 (empty mappings are not supported on
      Windows, and some special files report a size of 0 although they
      deliver data when read),
    - the file is not a regular file (e.g. a FIFO or a character device).

    :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
    :param str filename: filename within the filesystem
    :return: the digest in hex form
    :rtype: str
    :raises EnvironmentError: if the file cannot be opened, read or mapped

    """
    h = hashobj()
    # os.O_BINARY exists only on some platforms (e.g. Windows); add it
    # where it is available so the data is read unmodified.
    flags = os.O_RDONLY | getattr(os, "O_BINARY", 0)
    fd = os.open(filename, flags)
    try:
        st = os.fstat(fd)
        filesize = st.st_size
        # The size reported by fstat() is only trustworthy for regular
        # files; a size of 0 cannot be mapped portably and may hide real
        # content, so such files go through plain low-level reads.
        use_mmap = (mmap is not None
                    and filesize > 0
                    and stat.S_ISREG(st.st_mode))
        if use_mmap:
            # Map the file window by window to bound the address space
            # used.  NOTE: mmap offsets must be multiples of
            # mmap.ALLOCATIONGRANULARITY, so MAP_CHUNK_SIZE has to be one.
            mapoffset = 0
            while mapoffset < filesize:
                mapsize = min(MAP_CHUNK_SIZE, filesize - mapoffset)
                m = mmap.mmap(fd,
                              mapsize,
                              access=mmap.ACCESS_READ,
                              offset=mapoffset)
                try:
                    h.update(m)
                finally:
                    m.close()
                mapoffset += mapsize
        else:
            # Low-level IO: read until EOF (signalled by an empty buffer).
            while True:
                buf = os.read(fd, CHUNK_SIZE)
                if not buf:
                    break
                h.update(buf)
    finally:
        os.close(fd)
    return h.hexdigest()
427
428
429 def compute_digest_stream(hashobj, instream):
371 """ 430 """
372 431
373 :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory 432 :param hashobj: a :mod:`hashlib` compatible hash algorithm type or factory
374 :param instream: a bytes input stream to read the data to be hashed from 433 :param instream: a bytes input stream to read the data to be hashed from
375 :return: the digest in hex form 434 :return: the digest in hex form