Mercurial > hgrepos > Python > apps > py-cutils
comparison cutils/treesum.py @ 188:2784fdcc99e5
Implement basic parsing of treesum output.
Including CRC32 checks.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Wed, 15 Jan 2025 14:41:36 +0100 |
| parents | 53614a724bf0 |
| children | 959c6d37b014 |
comparison
equal
deleted
inserted
replaced
| 187:7b41cd8692fc | 188:2784fdcc99e5 |
|---|---|
| 17 import base64 | 17 import base64 |
| 18 import binascii | 18 import binascii |
| 19 import datetime | 19 import datetime |
| 20 import logging | 20 import logging |
| 21 import os | 21 import os |
| | 22 import re |
| 22 import stat | 23 import stat |
| 23 import sys | 24 import sys |
| 24 import time | 25 import time |
| 25 import zlib | 26 import zlib |
| 26 | 27 |
| 454 dir_dgst.update(util.interpolate_bytes( | 455 dir_dgst.update(util.interpolate_bytes( |
| 455 b"1:d,%d:%s,", len(fso.fsname), fso.fsname)) | 456 b"1:d,%d:%s,", len(fso.fsname), fso.fsname)) |
| 456 dir_dgst.update(util.interpolate_bytes( | 457 dir_dgst.update(util.interpolate_bytes( |
| 457 b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst)) | 458 b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst)) |
| 458 if self._with_metadata_full_mode: | 459 if self._with_metadata_full_mode: |
| 459 modestr = normalized_mode_str(fso.stat.st_mode) | 460 modestr = util.b(normalized_mode_str(fso.stat.st_mode)) |
| 460 if not isinstance(modestr, bytes): | |
| 461 modestr = modestr.encode("ascii") | |
| 462 dir_dgst.update(util.interpolate_bytes( | 461 dir_dgst.update(util.interpolate_bytes( |
| 463 b"8:fullmode,%d:%s,", len(modestr), modestr)) | 462 b"8:fullmode,%d:%s,", len(modestr), modestr)) |
| 464 elif self._with_metadata_mode: | 463 elif self._with_metadata_mode: |
| 465 modestr = normalized_compatible_mode_str( | 464 modestr = util.b(normalized_compatible_mode_str( |
| 466 fso.stat.st_mode) | 465 fso.stat.st_mode)) |
| 467 if not isinstance(modestr, bytes): | |
| 468 modestr = modestr.encode("ascii") | |
| 469 dir_dgst.update(util.interpolate_bytes( | 466 dir_dgst.update(util.interpolate_bytes( |
| 470 b"4:mode,%d:%s,", len(modestr), modestr)) | 467 b"4:mode,%d:%s,", len(modestr), modestr)) |
| 471 else: | 468 else: |
| 472 dir_dgst.update(util.interpolate_bytes( | 469 dir_dgst.update(util.interpolate_bytes( |
| 473 b"1:f,%d:%s,", len(fso.fsname), fso.fsname)) | 470 b"1:f,%d:%s,", len(fso.fsname), fso.fsname)) |
| 474 dir_size += fso.stat.st_size | 471 dir_size += fso.stat.st_size |
| 475 if self._with_metadata_mtime: | 472 if self._with_metadata_mtime: |
| 476 mtime = datetime.datetime.utcfromtimestamp( | 473 mtime = datetime.datetime.utcfromtimestamp( |
| 477 int(fso.stat.st_mtime)) | 474 int(fso.stat.st_mtime)) |
| 478 mtime = mtime.isoformat("T") + "Z" | 475 mtime = util.b(mtime.isoformat("T") + "Z") |
| 479 if not isinstance(mtime, bytes): | |
| 480 mtime = mtime.encode("ascii") | |
| 481 dir_dgst.update(util.interpolate_bytes( | 476 dir_dgst.update(util.interpolate_bytes( |
| 482 b"5:mtime,%d:%s,", len(mtime), mtime)) | 477 b"5:mtime,%d:%s,", len(mtime), mtime)) |
| 483 if self._with_metadata_full_mode: | 478 if self._with_metadata_full_mode: |
| 484 modestr = normalized_mode_str(fso.stat.st_mode) | 479 modestr = util.b(normalized_mode_str(fso.stat.st_mode)) |
| 485 if not isinstance(modestr, bytes): | |
| 486 modestr = modestr.encode("ascii") | |
| 487 dir_dgst.update(util.interpolate_bytes( | 480 dir_dgst.update(util.interpolate_bytes( |
| 488 b"8:fullmode,%d:%s,", len(modestr), modestr)) | 481 b"8:fullmode,%d:%s,", len(modestr), modestr)) |
| 489 elif self._with_metadata_mode: | 482 elif self._with_metadata_mode: |
| 490 modestr = normalized_compatible_mode_str(fso.stat.st_mode) | 483 modestr = util.b(normalized_compatible_mode_str( |
| 491 if not isinstance(modestr, bytes): | 484 fso.stat.st_mode)) |
| 492 modestr = modestr.encode("ascii") | |
| 493 dir_dgst.update(util.interpolate_bytes( | 485 dir_dgst.update(util.interpolate_bytes( |
| 494 b"4:mode,%d:%s,", len(modestr), modestr)) | 486 b"4:mode,%d:%s,", len(modestr), modestr)) |
| 495 if not self._size_only: | 487 if not self._size_only: |
| 496 dgst = digest.compute_digest_file( | 488 dgst = digest.compute_digest_file( |
| 497 self._algorithm[0], fso.path, use_mmap=self._use_mmap) | 489 self._algorithm[0], fso.path, use_mmap=self._use_mmap) |
| 598 modestr = "0" + modestr | 590 modestr = "0" + modestr |
| 599 return modestr | 591 return modestr |
| 600 | 592 |
| 601 | 593 |
| 602 def format_bsd_line(what, value, filename, use_base64, size=None): | 594 def format_bsd_line(what, value, filename, use_base64, size=None): |
| 603 ls = os.linesep if isinstance(os.linesep, bytes) \ | 595 ls = util.b(os.linesep) |
| 604 else os.linesep.encode("utf-8") | |
| 605 if not isinstance(what, bytes): | 596 if not isinstance(what, bytes): |
| 606 what = what.encode("ascii") | 597 what = what.encode("ascii") |
| 607 if what == b"TIMESTAMP": | 598 if what == b"TIMESTAMP": |
| 608 assert filename is None | 599 assert filename is None |
| 609 return util.interpolate_bytes(b"TIMESTAMP = %d%s", value, ls) | 600 return util.interpolate_bytes(b"TIMESTAMP = %d%s", value, ls) |
| 610 if what in (b"ISOTIMESTAMP", b"FLAGS", b"VERSION", b"CRC32"): | 601 if what in (b"ISOTIMESTAMP", b"FLAGS", b"VERSION", b"CRC32"): |
| 611 assert filename is None | 602 assert filename is None |
| 612 if not isinstance(value, bytes): | 603 return util.interpolate_bytes(b"%s = %s%s", what, util.b(value), ls) |
| 613 value = value.encode("ascii") | |
| 614 return util.interpolate_bytes(b"%s = %s%s", what, value, ls) | |
| 615 assert filename is not None | 604 assert filename is not None |
| 616 if what == b"COMMENT": | 605 if what == b"COMMENT": |
| 617 if not isinstance(filename, bytes): | 606 return util.interpolate_bytes( |
| 618 filename = filename.encode("utf-8") | 607 b"COMMENT (%s)%s", util.b(filename, "utf-8"), ls) |
| 619 return util.interpolate_bytes(b"COMMENT (%s)%s", filename, ls) | |
| 620 if not isinstance(filename, bytes): | 608 if not isinstance(filename, bytes): |
| 621 filename = util.fsencode(filename) | 609 filename = util.fsencode(filename) |
| 622 if what == b"SIZE": | 610 if what == b"SIZE": |
| 623 return util.interpolate_bytes(b"SIZE (%s) = %d%s", filename, size, ls) | 611 return util.interpolate_bytes(b"SIZE (%s) = %d%s", filename, size, ls) |
| 624 if value is None: | 612 if value is None: |
| 635 else: | 623 else: |
| 636 return util.interpolate_bytes( | 624 return util.interpolate_bytes( |
| 637 b"%s (%s) = %s,%d%s", what, filename, value, size, ls) | 625 b"%s (%s) = %s,%d%s", what, filename, value, size, ls) |
| 638 | 626 |
| 639 | 627 |
| | 628 class TreesumReader(object): |
| | 629 |
| | 630 """Reader to read and/or verify treesum digest files. |
| | 631 |
| | 632 Supports the iterator and context manager protocol. |
| | 633 |
| | 634 """ |
| | 635 |
| | 636 PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z") # empty lines |
| | 637 PATTERN1 = re.compile(br"\A(VERSION|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z") # noqa: E501 line too long |
| | 638 PATTERN2 = re.compile(br"\A(ROOT|COMMENT)[ \t]*\((.*)\)[ \t]*\r?\n\Z") |
| | 639 PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z") # noqa: E501 line too long |
| | 640 PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z") # noqa: E501 line too long |
| | 641 |
| | 642 def __init__(self, _fp, _filename, _own_fp): |
| | 643 self._fp = _fp |
| | 644 self._own_fp = _own_fp |
| | 645 self._filename = _filename |
| | 646 self._line_no = 0 |
| | 647 self._reset_crc() |
| | 648 self._expect_crc = None # NOTE: tristate: None is different from False |
| | 649 self._current_algo_name = self._current_algo_digest_size = None |
| | 650 |
| | 651 @classmethod |
| | 652 def from_path(cls_, path): |
| | 653 """Open file at `path` and return a reader that owns the file object""" |
| | 654 return cls_(open(path, "rb"), path, True) |
| | 655 |
| | 656 @classmethod |
| | 657 def from_binary_buffer(cls_, binary_fp, filename): |
| | 658 return cls_(binary_fp, filename, False) |
| | 659 |
| | 660 def __enter__(self): |
| | 661 return self |
| | 662 |
| | 663 def __exit__(self, *args): |
| | 664 self.close() |
| | 665 |
| | 666 def close(self): |
| | 667 if self._fp is not None: |
| | 668 try: |
| | 669 if self._own_fp: |
| | 670 self._fp.close() |
| | 671 finally: |
| | 672 self._fp = None |
| | 673 |
| | 674 def __iter__(self): |
| | 675 return self |
| | 676 |
| | 677 def __next__(self): |
| | 678 rec = self.read_record() |
| | 679 if rec is None: |
| | 680 raise StopIteration() |
| | 681 return rec |
| | 682 |
| | 683 if util.PY2: |
| | 684 next = __next__ |
| | 685 |
| | 686 def all_records(self): |
| | 687 """Iterator over all remaining records""" |
| | 688 while True: |
| | 689 rec = self.read_record() |
| | 690 if rec is None: |
| | 691 return |
| | 692 yield rec |
| | 693 |
| | 694 def read_record(self): |
| | 695 """Read and parse the "next" line. |
| | 696 |
| | 697 :returns: `None` at EOF or the parsed contents of the line |
| | 698 :rtype: tuple or None |
| | 699 |
| | 700 """ |
| | 701 # Loop to skip empty lines |
| | 702 while True: |
| | 703 line = self._get_next_line() |
| | 704 if not line: |
| | 705 # |
| | 706 # Skip for empty files at the very beginning. |
| | 707 # Check only after the first VERSION line. |
| | 708 # |
| | 709 if self._expect_crc is not None: |
| | 710 if self._expect_crc: |
| | 711 logging.warning("CRC32 is missing at EOF") |
| | 712 return None |
| | 713 if not self.PATTERN0.search(line): |
| | 714 break |
| | 715 self._update_crc(line) |
| | 716 # |
| | 717 # At the beginning transparently skip an eventually embedded signify |
| | 718 # signature |
| | 719 # |
| | 720 if self._line_no == 1: |
| | 721 if line.startswith(b"untrusted comment: "): |
| | 722 line = self._get_next_line() |
| | 723 if not line.endswith(b"\n"): |
| | 724 raise binascii.Error("No valid signify signature value") |
| | 725 # Try to decode for an early error check |
| | 726 base64.b64decode(line[:-1]) |
| | 727 mo = self.PATTERN1.search(line) |
| | 728 if mo: |
| | 729 if mo.group(1) == b"VERSION": |
| | 730 if self._expect_crc: |
| | 731 logging.warning("CRC32 missing before line %d", |
| | 732 self._line_no) |
| | 733 self._reset_crc() |
| | 734 self._expect_crc = True |
| | 735 self._update_crc(line) |
| | 736 return ("VERSION", util.n(mo.group(2))) |
| | 737 if mo.group(1) == b"CRC32": |
| | 738 # TODO: check |
| | 739 if self._expect_crc is None: |
| | 740 logging.warning("Lone CRC32 before VERSION in line %d", |
| | 741 self._line_no) |
| | 742 else: |
| | 743 if self._expect_crc: |
| | 744 if (self._hex_crc() |
| | 745 != mo.group(2).decode("latin1").upper()): |
| | 746 logging.warning( |
| | 747 "CRC32 mismatch in line %d:" |
| | 748 " expected: %s, given: %s", |
| | 749 self._line_no, |
| | 750 self._hex_crc(), |
| | 751 mo.group(2).decode("latin1").upper()) |
| | 752 else: |
| | 753 logging.warning("CRC32 before VERSION in line %d", |
| | 754 self._line_no) |
| | 755 # Do not update the CRC here but reset the state |
| | 756 self._expect_crc = False |
| | 757 return ("CRC32", util.n(mo.group(2))) |
| | 758 else: |
| | 759 self._update_crc(line) |
| | 760 return (util.n(mo.group(1)), util.n(mo.group(2))) |
| | 761 else: |
| | 762 mo = self.PATTERN2.search(line) |
| | 763 if mo: |
| | 764 self._update_crc(line) |
| | 765 if mo.group(1) == b"COMMENT": |
| | 766 return ("COMMENT", util.u(mo.group(2), "utf-8")) |
| | 767 elif mo.group(1) == b"ROOT": |
| | 768 return ("ROOT", mo.group(2)) |
| | 769 assert False, line |
| | 770 else: |
| | 771 mo = self.PATTERN3.search(line) |
| | 772 if mo: |
| | 773 self._update_crc(line) |
| | 774 return ("SIZE", mo.group(1), int(util.n(mo.group(2)), 10)) |
| | 775 else: |
| | 776 mo = self.PATTERN4.search(line) |
| | 777 if mo: |
| | 778 self._update_crc(line) |
| | 779 algo_name = util.n(mo.group(1)) |
| | 780 if (len(mo.group(3)) == |
| | 781 2 * self._get_digest_size(algo_name)): |
| | 782 # hex |
| | 783 digest = binascii.unhexlify(mo.group(3)) |
| | 784 else: |
| | 785 # base64 |
| | 786 digest = base64.b64decode(mo.group(3)) |
| | 787 if mo.group(4): |
| | 788 size = int(util.n(mo.group(5)), 10) |
| | 789 else: |
| | 790 size = None |
| | 791 return (algo_name, mo.group(2), digest, size) |
| | 792 else: |
| | 793 assert False, line |
| | 794 return line |
| | 795 |
| | 796 def _get_next_line(self): |
| | 797 line = self._fp.readline(2048) |
| | 798 if line: |
| | 799 self._line_no += 1 |
| | 800 return line |
| | 801 |
| | 802 def _reset_crc(self): |
| | 803 self._crc32 = zlib.crc32(b"") |
| | 804 |
| | 805 def _update_crc(self, data): |
| | 806 self._crc32 = zlib.crc32(data, self._crc32) |
| | 807 |
| | 808 def _hex_crc(self): |
| | 809 return (hex(self._crc32)[2:]).upper() |
| | 810 |
| | 811 def _get_digest_size(self, algo_name): |
| | 812 if self._current_algo_name == algo_name: |
| | 813 return self._current_algo_digest_size |
| | 814 h = util.algotag2algotype(algo_name)() |
| | 815 self._current_algo_name = algo_name |
| | 816 self._current_algo_digest_size = h.digest_size |
| | 817 return self._current_algo_digest_size |
| | 818 |
| | 819 |
| 640 if __name__ == "__main__": | 820 if __name__ == "__main__": |
| 641 sys.exit(main()) | 821 sys.exit(main()) |
