comparison cutils/treesum.py @ 204:07f1d79e6674

Fully implemented UTF-8 mode for treeview. While doing this refactored "normal" mode (using the filesystem encoding).
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 21 Jan 2025 20:31:48 +0100
parents b9b38584919b
children 63088d3675bb
comparison
equal deleted inserted replaced
203:3a85f7bbe0b1 204:07f1d79e6674
404 self._outfp.write( 404 self._outfp.write(
405 format_bsd_line("COMMENT", None, line, False)) 405 format_bsd_line("COMMENT", None, line, False))
406 406
407 if self._minimal is not None: 407 if self._minimal is not None:
408 self._outfp.write(format_bsd_line( 408 self._outfp.write(format_bsd_line(
409 "ROOT", None, self._minimal if self._minimal else "", False)) 409 "ROOT",
410 else: 410 None,
411 self._outfp.write(format_bsd_line("ROOT", None, root, False)) 411 (walk.WalkDirEntry.alt_u8(self._minimal)
412 if self._minimal else b""),
413 False))
414 else:
415 self._outfp.write(format_bsd_line(
416 "ROOT", None, walk.WalkDirEntry.alt_u8(root), False))
412 self._outfp.flush() 417 self._outfp.flush()
413 418
414 if not self._handle_root_logical and os.path.islink(root): 419 if not self._handle_root_logical and os.path.islink(root):
415 linktgt = walk.WalkDirEntry.from_readlink(os.readlink(root)) 420 linktgt = walk.WalkDirEntry.from_readlink(os.readlink(root))
416 linkdgst = self._algorithm[0]() 421 linkdgst = self._algorithm[0]()
455 fsobjects.sort(key=walk.WalkDirEntry.alt_sort_key) 460 fsobjects.sort(key=walk.WalkDirEntry.alt_sort_key)
456 else: 461 else:
457 fsobjects.sort(key=walk.WalkDirEntry.sort_key) 462 fsobjects.sort(key=walk.WalkDirEntry.sort_key)
458 dir_dgst = self._algorithm[0]() 463 dir_dgst = self._algorithm[0]()
459 dir_size = 0 464 dir_size = 0
465 dir_tainted = False
460 for fso in fsobjects: 466 for fso in fsobjects:
461 if fso.is_dir: 467 if fso.is_dir:
462 if fso.is_symlink and not self._follow_directory_symlinks: 468 if fso.is_symlink and not self._follow_directory_symlinks:
463 linktgt = walk.WalkDirEntry.from_readlink( 469 linktgt = walk.WalkDirEntry.from_readlink(
464 os.readlink(fso.path)) 470 os.readlink(fso.path))
465 # linktgt = util.fsencode(os.readlink(fso.path))) 471 # linktgt = util.fsencode(os.readlink(fso.path)))
466 linkdgst = self._algorithm[0]() 472 linkdgst = self._algorithm[0]()
467 linkdgst.update( 473 if self._utf8_mode:
468 util.interpolate_bytes( 474 if linktgt.u8path is None:
469 b"%d:%s,", len(linktgt.fspath), linktgt.fspath)) 475 dir_tainted = True
470 dir_dgst.update(util.interpolate_bytes( 476 linkdgst.update(util.interpolate_bytes(
471 b"1:S,%d:%s,", len(fso.fsname), fso.fsname)) 477 b"%d:%s,",
478 len(linktgt.alt_u8path),
479 linktgt.alt_u8path))
480 else:
481 linkdgst.update(util.interpolate_bytes(
482 b"%d:%s,",
483 len(linktgt.u8path),
484 linktgt.u8path))
485 if fso.u8name is None:
486 dir_tainted = True
487 dir_dgst.update(util.interpolate_bytes(
488 b"1:S,%d:%s,",
489 len(fso.alt_u8name),
490 fso.alt_u8name))
491 else:
492 dir_dgst.update(util.interpolate_bytes(
493 b"1:S,%d:%s,", len(fso.u8name), fso.u8name))
494 else:
495 if linktgt.fspath is None:
496 dir_tainted = True
497 linkdgst.update(util.interpolate_bytes(
498 b"%d:%s,",
499 len(linktgt.alt_fspath),
500 linktgt.alt_fspath))
501 else:
502 linkdgst.update(util.interpolate_bytes(
503 b"%d:%s,",
504 len(linktgt.fspath),
505 linktgt.fspath))
506 if fso.fsname is None:
507 dir_tainted = True
508 dir_dgst.update(util.interpolate_bytes(
509 b"1:S,%d:%s,",
510 len(fso.alt_fsname),
511 fso.alt_fsname))
512 else:
513 dir_dgst.update(util.interpolate_bytes(
514 b"1:S,%d:%s,", len(fso.fsname), fso.fsname))
472 # 515 #
473 # - no mtime and no mode for symlinks 516 # - no mtime and no mode for symlinks
474 # - also does not count for dir_size 517 # - also does not count for dir_size
475 # 518 #
476 dir_dgst.update(util.interpolate_bytes( 519 dir_dgst.update(util.interpolate_bytes(
477 b"%d:%s,", 520 b"%d:%s,",
478 len(linkdgst.digest()), linkdgst.digest())) 521 len(linkdgst.digest()), linkdgst.digest()))
479 opath = "/".join(top) + "/" + fso.name if top else fso.name 522 opath = join_output_path(top, fso.name)
523 if self._utf8_mode:
524 opath = walk.WalkDirEntry.alt_u8(opath)
525 else:
526 opath = walk.WalkDirEntry.alt_fs(opath)
480 if self._size_only: 527 if self._size_only:
481 self._outfp.write(format_bsd_line( 528 self._outfp.write(format_bsd_line(
482 "SIZE", None, "%s/./@/" % (opath,), False, 0)) 529 "SIZE", None, "%s/./@/" % (opath,), False, 0))
483 else: 530 else:
484 self._outfp.write(format_bsd_line( 531 self._outfp.write(format_bsd_line(
495 # Get subdir data from recursing into it 542 # Get subdir data from recursing into it
496 sub_dir_dgst, sub_dir_size = self._generate( 543 sub_dir_dgst, sub_dir_size = self._generate(
497 root, top + (fso.name, )) 544 root, top + (fso.name, ))
498 545
499 dir_size += sub_dir_size 546 dir_size += sub_dir_size
500 dir_dgst.update(util.interpolate_bytes( 547 if self._utf8_mode:
501 b"1:d,%d:%s,", len(fso.fsname), fso.fsname)) 548 if fso.u8name is None:
549 dir_tainted = True
550 dir_dgst.update(util.interpolate_bytes(
551 b"1:d,%d:%s,",
552 len(fso.alt_u8name),
553 fso.alt_u8name))
554 else:
555 dir_dgst.update(util.interpolate_bytes(
556 b"1:d,%d:%s,", len(fso.u8name), fso.u8name))
557 else:
558 if fso.fsname is None:
559 dir_tainted = True
560 dir_dgst.update(util.interpolate_bytes(
561 b"1:d,%d:%s,",
562 len(fso.alt_fsname),
563 fso.alt_fsname))
564 else:
565 dir_dgst.update(util.interpolate_bytes(
566 b"1:d,%d:%s,", len(fso.fsname), fso.fsname))
502 dir_dgst.update(util.interpolate_bytes( 567 dir_dgst.update(util.interpolate_bytes(
503 b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst)) 568 b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst))
504 if self._with_metadata_full_mode: 569 if self._with_metadata_full_mode:
505 modestr = util.b(normalized_mode_str(fso.stat.st_mode)) 570 modestr = util.b(normalized_mode_str(fso.stat.st_mode))
506 dir_dgst.update(util.interpolate_bytes( 571 dir_dgst.update(util.interpolate_bytes(
509 modestr = util.b(normalized_compatible_mode_str( 574 modestr = util.b(normalized_compatible_mode_str(
510 fso.stat.st_mode)) 575 fso.stat.st_mode))
511 dir_dgst.update(util.interpolate_bytes( 576 dir_dgst.update(util.interpolate_bytes(
512 b"4:mode,%d:%s,", len(modestr), modestr)) 577 b"4:mode,%d:%s,", len(modestr), modestr))
513 else: 578 else:
514 dir_dgst.update(util.interpolate_bytes( 579 if self._utf8_mode:
515 b"1:f,%d:%s,", len(fso.fsname), fso.fsname)) 580 if fso.u8name is None:
581 dir_tainted = True
582 dir_dgst.update(util.interpolate_bytes(
583 b"1:f,%d:%s,",
584 len(fso.alt_u8name),
585 fso.alt_u8name))
586 else:
587 dir_dgst.update(util.interpolate_bytes(
588 b"1:f,%d:%s,", len(fso.u8name), fso.u8name))
589 else:
590 if fso.fsname is None:
591 dir_tainted = True
592 dir_dgst.update(util.interpolate_bytes(
593 b"1:f,%d:%s,",
594 len(fso.alt_fsname),
595 fso.alt_fsname))
596 else:
597 dir_dgst.update(util.interpolate_bytes(
598 b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
516 dir_size += fso.stat.st_size 599 dir_size += fso.stat.st_size
517 if self._with_metadata_mtime: 600 if self._with_metadata_mtime:
518 mtime = datetime.datetime.utcfromtimestamp( 601 mtime = datetime.datetime.utcfromtimestamp(
519 int(fso.stat.st_mtime)) 602 int(fso.stat.st_mtime))
520 mtime = util.b(mtime.isoformat("T") + "Z") 603 mtime = util.b(mtime.isoformat("T") + "Z")
532 if not self._size_only: 615 if not self._size_only:
533 dgst = digest.compute_digest_file( 616 dgst = digest.compute_digest_file(
534 self._algorithm[0], fso.path, use_mmap=self._use_mmap) 617 self._algorithm[0], fso.path, use_mmap=self._use_mmap)
535 dir_dgst.update(util.interpolate_bytes( 618 dir_dgst.update(util.interpolate_bytes(
536 b"%d:%s,", len(dgst), dgst)) 619 b"%d:%s,", len(dgst), dgst))
537 opath = "/".join(top) + "/" + fso.name if top else fso.name 620 opath = join_output_path(top, fso.name)
621 if self._utf8_mode:
622 opath = walk.WalkDirEntry.alt_u8(opath)
623 else:
624 opath = walk.WalkDirEntry.alt_fs(opath)
538 if self._size_only: 625 if self._size_only:
539 self._outfp.write(format_bsd_line( 626 self._outfp.write(format_bsd_line(
540 "SIZE", None, opath, False, fso.stat.st_size)) 627 "SIZE", None, opath, False, fso.stat.st_size))
541 else: 628 else:
542 if self._print_size: 629 if self._print_size:
546 else: 633 else:
547 self._outfp.write(format_bsd_line( 634 self._outfp.write(format_bsd_line(
548 self._algorithm[1], dgst, opath, 635 self._algorithm[1], dgst, opath,
549 self._use_base64)) 636 self._use_base64))
550 self._outfp.flush() 637 self._outfp.flush()
551 638 opath = join_output_path(top, None)
552 opath = "/".join(top) + "/" if top else "" 639 if opath:
640 if self._utf8_mode:
641 opath = walk.WalkDirEntry.alt_u8(opath)
642 else:
643 opath = walk.WalkDirEntry.alt_fs(opath)
553 if self._size_only: 644 if self._size_only:
554 self._outfp.write(format_bsd_line( 645 self._outfp.write(format_bsd_line(
555 "SIZE", None, opath, False, dir_size)) 646 "SIZE", None, opath, False, dir_size))
556 else: 647 else:
648 if dir_tainted:
649 self._outfp.write(format_bsd_line(
650 b"ERROR", None, b"directory is tainted", False, None))
557 if self._print_size: 651 if self._print_size:
558 self._outfp.write(format_bsd_line( 652 self._outfp.write(format_bsd_line(
559 self._algorithm[1], dir_dgst.digest(), opath, 653 self._algorithm[1], dir_dgst.digest(), opath,
560 self._use_base64, dir_size)) 654 self._use_base64, dir_size))
561 else: 655 else:
562 self._outfp.write(format_bsd_line( 656 self._outfp.write(format_bsd_line(
563 self._algorithm[1], dir_dgst.digest(), opath, 657 self._algorithm[1], dir_dgst.digest(), opath,
564 self._use_base64)) 658 self._use_base64))
565 self._outfp.flush() 659 self._outfp.flush()
566 return (dir_dgst.digest(), dir_size) 660 return (dir_dgst.digest(), dir_size)
661
662
663 def join_output_path(top, name):
664 if name is None:
665 # a path for a directory is to be computed
666 if top:
667 if isinstance(top[0], bytes):
668 return b"/".join(top) + b"/"
669 else:
670 return u"/".join(top) + u"/"
671 else:
672 return b""
673 else:
674 # a path for a normal file is to be computed
675 if top:
676 if isinstance(name, bytes):
677 return b"/".join(top) + b"/" + name
678 else:
679 return u"/".join(top) + u"/" + name
680 else:
681 return name
567 682
568 683
569 class CRC32Output(object): 684 class CRC32Output(object):
570 685
571 """Wrapper for a minimal binary file contextmanager that calculates 686 """Wrapper for a minimal binary file contextmanager that calculates
650 if what in (b"FSENCODING", b"ISOTIMESTAMP", b"FLAGS", b"VERSION", 765 if what in (b"FSENCODING", b"ISOTIMESTAMP", b"FLAGS", b"VERSION",
651 b"CRC32"): 766 b"CRC32"):
652 assert filename is None 767 assert filename is None
653 return util.interpolate_bytes(b"%s = %s%s", what, util.b(value), ls) 768 return util.interpolate_bytes(b"%s = %s%s", what, util.b(value), ls)
654 assert filename is not None 769 assert filename is not None
655 if what == b"COMMENT": 770 if what in (b"COMMENT", b"ERROR"):
656 return util.interpolate_bytes( 771 return util.interpolate_bytes(
657 b"COMMENT (%s)%s", util.b(filename, "utf-8"), ls) 772 b"%s (%s)%s", what, util.b(filename, "utf-8"), ls)
658 if not isinstance(filename, bytes): 773 if not isinstance(filename, bytes):
659 filename = util.fsencode(filename) 774 filename = util.fsencode(filename)
660 if what == b"SIZE": 775 if what == b"SIZE":
661 return util.interpolate_bytes(b"SIZE (%s) = %d%s", filename, size, ls) 776 return util.interpolate_bytes(b"SIZE (%s) = %d%s", filename, size, ls)
662 if value is None: 777 if value is None:
681 796
682 Supports the iterator and context manager protocol. 797 Supports the iterator and context manager protocol.
683 798
684 """ 799 """
685 800
686 PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z") # empty lines 801 PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z") # empty lines
687 PATTERN1 = re.compile(br"\A(VERSION|FSENCODING|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z") # noqa: E501 line too long 802 PATTERN1 = re.compile(br"\A(VERSION|FSENCODING|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z") # noqa: E501 line too long
688 PATTERN2 = re.compile(br"\A(ROOT|COMMENT)[ \t]*\((.*)\)[ \t]*\r?\n\Z") 803 PATTERN2 = re.compile(br"\A(ROOT|COMMENT|ERROR)[ \t]*\((.*)\)[ \t]*\r?\n\Z") # noqa: E501 line too long
689 PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z") # noqa: E501 line too long 804 PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z") # noqa: E501 line too long
690 PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z") # noqa: E501 line too long 805 PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z") # noqa: E501 line too long
691 806
692 def __init__(self, _fp, _filename, _own_fp): 807 def __init__(self, _fp, _filename, _own_fp):
693 self._fp = _fp 808 self._fp = _fp
694 self._own_fp = _own_fp 809 self._own_fp = _own_fp
695 self._filename = _filename 810 self._filename = _filename
810 return (util.n(mo.group(1)), util.n(mo.group(2))) 925 return (util.n(mo.group(1)), util.n(mo.group(2)))
811 else: 926 else:
812 mo = self.PATTERN2.search(line) 927 mo = self.PATTERN2.search(line)
813 if mo: 928 if mo:
814 self._update_crc(line) 929 self._update_crc(line)
815 if mo.group(1) == b"COMMENT": 930 if mo.group(1) in (b"COMMENT", b"ERROR"):
816 return ("COMMENT", util.u(mo.group(2), "utf-8")) 931 return (util.u(mo.group(1)), util.u(mo.group(2), "utf-8"))
817 elif mo.group(1) == b"ROOT": 932 elif mo.group(1) == b"ROOT":
818 return ("ROOT", mo.group(2)) 933 return ("ROOT", mo.group(2))
819 assert False, line 934 assert False, line
820 else: 935 else:
821 mo = self.PATTERN3.search(line) 936 mo = self.PATTERN3.search(line)