comparison cutils/treesum.py @ 177:089c40240061

Add an alternate implementation for generating directory tree digests: - Do not use something like os.walk(); use os.scandir() directly. - Recursively generate the subdirectory digests only when needed and in the right order. This fixes a mismatch between the order in which subdirectories appear in the output and the order in which their digests are applied to the parent directory digest. The new implementation should also make filtering (to be implemented later) easier. NOTE: The tree digests of the old and the new implementation are identical.
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Jan 2025 17:41:28 +0100
parents 7f5d05a625fd
children dac26a2d9de5
comparison
equal deleted inserted replaced
176:7f5d05a625fd 177:089c40240061
279 else: 279 else:
280 out_cm = open(opts.output, "wb") 280 out_cm = open(opts.output, "wb")
281 281
282 with out_cm as outfp: 282 with out_cm as outfp:
283 for d in opts.directories: 283 for d in opts.directories:
284
285 V1DirectoryTreesumGenerator(
286 opts.algorithm, opts.mmap, opts.base64, opts.logical,
287 opts.follow_directory_symlinks,
288 opts.metadata_mode,
289 opts.metadata_full_mode,
290 opts.metadata_mtime,
291 opts.size_only,
292 opts.print_size,
293 minimal=opts.minimal).generate(
294 outfp, d, comment=opts.comment)
295
284 generate_treesum_for_directory( 296 generate_treesum_for_directory(
285 outfp, d, opts.algorithm, opts.mmap, opts.base64, opts.logical, 297 outfp, d, opts.algorithm, opts.mmap, opts.base64, opts.logical,
286 opts.follow_directory_symlinks, 298 opts.follow_directory_symlinks,
287 opts.metadata_mode, 299 opts.metadata_mode,
288 opts.metadata_full_mode, 300 opts.metadata_full_mode,
289 opts.metadata_mtime, 301 opts.metadata_mtime,
290 opts.size_only, 302 opts.size_only,
291 opts.print_size, 303 opts.print_size,
292 minimal=opts.minimal, 304 minimal=opts.minimal,
293 comment=opts.comment) 305 comment=opts.comment)
306
307
class V1DirectoryTreesumGenerator(object):
    """Write a version "1" tree digest listing for a directory tree.

    The generator scans directories itself and recurses into a
    subdirectory at the moment its digest is needed. The output line of a
    subdirectory is therefore written before the line of its parent, and
    subdirectory lines appear in the same (sorted) order in which their
    digests are fed into the parent digest.

    """

    def __init__(self, algorithm, use_mmap, use_base64,
                 handle_root_logical, follow_directory_symlinks,
                 with_metadata_mode, with_metadata_full_mode,
                 with_metadata_mtime, size_only, print_size,
                 minimal=None,):
        """

        :param algorithm: a 2-item sequence: item 0 is called without
                          arguments to create a new digest object, item 1
                          is the tag used for digest lines in the output
        :param use_mmap: passed on as `use_mmap` when digesting files
        :param use_base64: passed on to the line formatter for digest lines
        :param handle_root_logical: if true a symlinked `root` is traversed
                                    instead of being recorded as a symlink
        :param follow_directory_symlinks: if true symlinks to directories
                                          below the root are traversed
        :param with_metadata_mode: put the "compatible" mode string into
                                   the digests (ignored if
                                   `with_metadata_full_mode` is set)
        :param with_metadata_full_mode: put the full mode string into the
                                        digests
        :param with_metadata_mtime: put the mtime of files into the digests
        :param size_only: write "SIZE" lines only; file contents are not
                          digested
        :param print_size: also write sizes on digest lines
        :param minimal: if not `None` no timestamps are written and the
                        "ROOT" line contains this value instead of the
                        real root

        """
        super(V1DirectoryTreesumGenerator, self).__init__()
        self._algorithm = algorithm
        self._use_mmap = use_mmap
        self._use_base64 = use_base64
        self._handle_root_logical = handle_root_logical
        self._follow_directory_symlinks = follow_directory_symlinks
        self._with_metadata_mode = with_metadata_mode
        self._with_metadata_full_mode = with_metadata_full_mode
        self._with_metadata_mtime = with_metadata_mtime
        self._size_only = size_only
        self._print_size = print_size
        self._minimal = minimal
        # Set by generate(); all helpers write to this file object
        self._outfp = None

    def generate(self, outfp, root, comment=None):
        """Write the complete listing for `root` into `outfp`.

        The header ("VERSION", optional "FLAGS", timestamps, comments and
        "ROOT") is written first, followed by one line per filesystem
        object.

        :param outfp: a *binary* file with a "write()" and a "flush()" method
        :param root: the directory to generate the listing for
        :param comment: an optional iterable of comment lines; one
                        "COMMENT" line is written per item

        """
        self._outfp = outfp
        self._outfp.write(format_bsd_line("VERSION", "1", None, False))
        self._outfp.flush()

        flags = self._traversal_flags()
        if flags:
            self._outfp.write(
                format_bsd_line("FLAGS", ",".join(flags), None, False))

        if self._minimal is None:
            # Write execution timestamps in POSIX epoch and ISO format
            ts = int(time.time())
            self._outfp.write(format_bsd_line("TIMESTAMP", ts, None, False))
            ts = (datetime.datetime.utcfromtimestamp(ts)).isoformat("T")
            self._outfp.write(format_bsd_line("ISOTIMESTAMP", ts, None, False))

        if comment:
            for line in comment:
                self._outfp.write(
                    format_bsd_line("COMMENT", None, line, False))

        if self._minimal is not None:
            self._outfp.write(format_bsd_line(
                "ROOT", None, self._minimal if self._minimal else "", False))
        else:
            self._outfp.write(format_bsd_line("ROOT", None, root, False))
        self._outfp.flush()

        if not self._handle_root_logical and os.path.islink(root):
            #
            # The root is a symlink that must not be followed: the whole
            # listing consists of one "./@" line.
            # Note that the digest of the wrapping "1:L," record is
            # written here and not the digest of the link target alone.
            #
            linkdgst = self._symlink_target_digest(root)
            dir_dgst = self._algorithm[0]()
            dir_dgst.update(b"1:L,")
            dir_dgst.update(
                util.interpolate_bytes(b"%d:%s,", len(linkdgst), linkdgst))
            self._write_symlink_line(dir_dgst.digest(), "./@")
            return

        self._generate(os.path.normpath(root), tuple())

    def _generate(self, root, top):
        """Handle the directory `top` below `root` and write its lines.

        :param root: the normalized root directory
        :param top: a tuple with the path components of the current
                    directory relative to `root` (empty for the root)
        :return: a tuple with the digest of the directory and the
                 accumulated size of all the files below it

        """
        logging.debug("Handling %s/%r", root, top)
        path = os.path.join(root, *top) if top else root
        with walk.ScanDir(path) as dirscan:
            fsobjects = list(dirscan)
        fsobjects.sort(key=walk.WalkDirEntry.sort_key)
        dir_dgst = self._algorithm[0]()
        dir_size = 0
        for fso in fsobjects:
            if not fso.is_dir:
                dir_size += self._handle_file(dir_dgst, top, fso)
            elif fso.is_symlink and not self._follow_directory_symlinks:
                linkdgst = self._symlink_target_digest(fso.path)
                dir_dgst.update(util.interpolate_bytes(
                    b"1:S,%d:%s,", len(fso.fsname), fso.fsname))
                #
                # - no mtime and no mode for symlinks
                # - also does not count for dir_size
                #
                dir_dgst.update(util.interpolate_bytes(
                    b"%d:%s,", len(linkdgst), linkdgst))
                self._write_symlink_line(
                    linkdgst,
                    "%s/./@" % (self._output_path(top, fso.name),))
            else:
                #
                # Follow the symlink to dir or handle a "real" directory
                #

                # Get subdir data from recursing into it
                sub_dir_dgst, sub_dir_size = self._generate(
                    root, top + (fso.name, ))

                dir_size += sub_dir_size
                dir_dgst.update(util.interpolate_bytes(
                    b"1:d,%d:%s,", len(fso.fsname), fso.fsname))
                dir_dgst.update(util.interpolate_bytes(
                    b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst))
                self._update_mode_metadata(dir_dgst, fso)

        result = dir_dgst.digest()
        self._write_line(result, self._output_path(top), dir_size)
        return (result, dir_size)

    def _handle_file(self, dir_dgst, top, fso):
        """Feed the file entry `fso` into `dir_dgst` and write its line.

        :return: the size of the file as reported by its stat result

        """
        dir_dgst.update(util.interpolate_bytes(
            b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
        size = fso.stat.st_size
        if self._with_metadata_mtime:
            mtime = datetime.datetime.utcfromtimestamp(
                int(fso.stat.st_mtime))
            mtime = mtime.isoformat("T") + "Z"
            if not isinstance(mtime, bytes):
                mtime = mtime.encode("ascii")
            dir_dgst.update(util.interpolate_bytes(
                b"5:mtime,%d:%s,", len(mtime), mtime))
        self._update_mode_metadata(dir_dgst, fso)
        if self._size_only:
            # File contents are not read at all in this mode
            dgst = None
        else:
            dgst = digest.compute_digest_file(
                self._algorithm[0], fso.path, use_mmap=self._use_mmap)
            dir_dgst.update(util.interpolate_bytes(
                b"%d:%s,", len(dgst), dgst))
        self._write_line(dgst, self._output_path(top, fso.name), size)
        return size

    def _traversal_flags(self):
        """Return the sorted names of all the given non-default flags that
        are relevant for directory traversal.

        """
        flags = []
        if self._with_metadata_full_mode:
            flags.append("with-metadata-fullmode")
        elif self._with_metadata_mode:
            flags.append("with-metadata-mode")
        if self._with_metadata_mtime:
            flags.append("with-metadata-mtime")
        if self._handle_root_logical:
            flags.append("logical")
        if self._follow_directory_symlinks:
            flags.append("follow-directory-symlinks")
        if self._size_only:
            flags.append("size-only")
        elif self._print_size:
            flags.append("print-size")
        flags.sort()
        return flags

    @staticmethod
    def _output_path(top, name=""):
        """Return the "/"-separated output path of `name` within `top`.

        With an empty `name` this yields the path of the directory `top`
        itself: with a trailing slash, or the empty string for the root.

        """
        if not top:
            return name
        return "/".join(top) + "/" + name

    def _symlink_target_digest(self, path):
        """Return the digest of the length-prefixed target of the
        symlink `path`.

        """
        linktgt = util.fsencode(os.readlink(path))
        linkdgst = self._algorithm[0]()
        linkdgst.update(
            util.interpolate_bytes(b"%d:%s,", len(linktgt), linktgt))
        return linkdgst.digest()

    def _update_mode_metadata(self, dgst, fso):
        """Feed the mode of `fso` into `dgst` if mode metadata is enabled.

        The full mode takes precedence over the "compatible" mode.
        The stat result of `fso` is accessed only if one of them is enabled.

        """
        if self._with_metadata_full_mode:
            template = b"8:fullmode,%d:%s,"
            modestr = normalized_mode_str(fso.stat.st_mode)
        elif self._with_metadata_mode:
            template = b"4:mode,%d:%s,"
            modestr = normalized_compatible_mode_str(fso.stat.st_mode)
        else:
            return
        if not isinstance(modestr, bytes):
            modestr = modestr.encode("ascii")
        dgst.update(util.interpolate_bytes(template, len(modestr), modestr))

    def _write_line(self, dgst, opath, size):
        """Write and flush the output line of a file or a directory.

        `dgst` is not used in size-only mode; `size` is written in
        size-only mode and when printing sizes is requested.

        """
        if self._size_only:
            line = format_bsd_line("SIZE", None, opath, False, size)
        elif self._print_size:
            line = format_bsd_line(
                self._algorithm[1], dgst, opath, self._use_base64, size)
        else:
            line = format_bsd_line(
                self._algorithm[1], dgst, opath, self._use_base64)
        self._outfp.write(line)
        self._outfp.flush()

    def _write_symlink_line(self, dgst, opath):
        """Write and flush the output line of a symlink that is not followed.

        Such lines carry a size of 0 in size-only mode and never carry a
        size otherwise (also not when printing sizes is requested).

        """
        if self._size_only:
            line = format_bsd_line("SIZE", None, opath, False, 0)
        else:
            line = format_bsd_line(
                self._algorithm[1], dgst, opath, self._use_base64)
        self._outfp.write(line)
        self._outfp.flush()
294 534
295 535
296 def generate_treesum_for_directory( 536 def generate_treesum_for_directory(
297 outfp, root, algorithm, use_mmap, use_base64, handle_root_logical, 537 outfp, root, algorithm, use_mmap, use_base64, handle_root_logical,
298 follow_directory_symlinks, with_metadata_mode, with_metadata_full_mode, 538 follow_directory_symlinks, with_metadata_mode, with_metadata_full_mode,