comparison cutils/treesum.py @ 188:2784fdcc99e5

Implement basic parsing of treesum output, including CRC32 checks.
author Franz Glasner <fzglas.hg@dom66.de>
date Wed, 15 Jan 2025 14:41:36 +0100
parents 53614a724bf0
children 959c6d37b014
comparison
equal deleted inserted replaced
187:7b41cd8692fc 188:2784fdcc99e5
17 import base64 17 import base64
18 import binascii 18 import binascii
19 import datetime 19 import datetime
20 import logging 20 import logging
21 import os 21 import os
22 import re
22 import stat 23 import stat
23 import sys 24 import sys
24 import time 25 import time
25 import zlib 26 import zlib
26 27
454 dir_dgst.update(util.interpolate_bytes( 455 dir_dgst.update(util.interpolate_bytes(
455 b"1:d,%d:%s,", len(fso.fsname), fso.fsname)) 456 b"1:d,%d:%s,", len(fso.fsname), fso.fsname))
456 dir_dgst.update(util.interpolate_bytes( 457 dir_dgst.update(util.interpolate_bytes(
457 b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst)) 458 b"%d:%s,", len(sub_dir_dgst), sub_dir_dgst))
458 if self._with_metadata_full_mode: 459 if self._with_metadata_full_mode:
459 modestr = normalized_mode_str(fso.stat.st_mode) 460 modestr = util.b(normalized_mode_str(fso.stat.st_mode))
460 if not isinstance(modestr, bytes):
461 modestr = modestr.encode("ascii")
462 dir_dgst.update(util.interpolate_bytes( 461 dir_dgst.update(util.interpolate_bytes(
463 b"8:fullmode,%d:%s,", len(modestr), modestr)) 462 b"8:fullmode,%d:%s,", len(modestr), modestr))
464 elif self._with_metadata_mode: 463 elif self._with_metadata_mode:
465 modestr = normalized_compatible_mode_str( 464 modestr = util.b(normalized_compatible_mode_str(
466 fso.stat.st_mode) 465 fso.stat.st_mode))
467 if not isinstance(modestr, bytes):
468 modestr = modestr.encode("ascii")
469 dir_dgst.update(util.interpolate_bytes( 466 dir_dgst.update(util.interpolate_bytes(
470 b"4:mode,%d:%s,", len(modestr), modestr)) 467 b"4:mode,%d:%s,", len(modestr), modestr))
471 else: 468 else:
472 dir_dgst.update(util.interpolate_bytes( 469 dir_dgst.update(util.interpolate_bytes(
473 b"1:f,%d:%s,", len(fso.fsname), fso.fsname)) 470 b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
474 dir_size += fso.stat.st_size 471 dir_size += fso.stat.st_size
475 if self._with_metadata_mtime: 472 if self._with_metadata_mtime:
476 mtime = datetime.datetime.utcfromtimestamp( 473 mtime = datetime.datetime.utcfromtimestamp(
477 int(fso.stat.st_mtime)) 474 int(fso.stat.st_mtime))
478 mtime = mtime.isoformat("T") + "Z" 475 mtime = util.b(mtime.isoformat("T") + "Z")
479 if not isinstance(mtime, bytes):
480 mtime = mtime.encode("ascii")
481 dir_dgst.update(util.interpolate_bytes( 476 dir_dgst.update(util.interpolate_bytes(
482 b"5:mtime,%d:%s,", len(mtime), mtime)) 477 b"5:mtime,%d:%s,", len(mtime), mtime))
483 if self._with_metadata_full_mode: 478 if self._with_metadata_full_mode:
484 modestr = normalized_mode_str(fso.stat.st_mode) 479 modestr = util.b(normalized_mode_str(fso.stat.st_mode))
485 if not isinstance(modestr, bytes):
486 modestr = modestr.encode("ascii")
487 dir_dgst.update(util.interpolate_bytes( 480 dir_dgst.update(util.interpolate_bytes(
488 b"8:fullmode,%d:%s,", len(modestr), modestr)) 481 b"8:fullmode,%d:%s,", len(modestr), modestr))
489 elif self._with_metadata_mode: 482 elif self._with_metadata_mode:
490 modestr = normalized_compatible_mode_str(fso.stat.st_mode) 483 modestr = util.b(normalized_compatible_mode_str(
491 if not isinstance(modestr, bytes): 484 fso.stat.st_mode))
492 modestr = modestr.encode("ascii")
493 dir_dgst.update(util.interpolate_bytes( 485 dir_dgst.update(util.interpolate_bytes(
494 b"4:mode,%d:%s,", len(modestr), modestr)) 486 b"4:mode,%d:%s,", len(modestr), modestr))
495 if not self._size_only: 487 if not self._size_only:
496 dgst = digest.compute_digest_file( 488 dgst = digest.compute_digest_file(
497 self._algorithm[0], fso.path, use_mmap=self._use_mmap) 489 self._algorithm[0], fso.path, use_mmap=self._use_mmap)
598 modestr = "0" + modestr 590 modestr = "0" + modestr
599 return modestr 591 return modestr
600 592
601 593
602 def format_bsd_line(what, value, filename, use_base64, size=None): 594 def format_bsd_line(what, value, filename, use_base64, size=None):
603 ls = os.linesep if isinstance(os.linesep, bytes) \ 595 ls = util.b(os.linesep)
604 else os.linesep.encode("utf-8")
605 if not isinstance(what, bytes): 596 if not isinstance(what, bytes):
606 what = what.encode("ascii") 597 what = what.encode("ascii")
607 if what == b"TIMESTAMP": 598 if what == b"TIMESTAMP":
608 assert filename is None 599 assert filename is None
609 return util.interpolate_bytes(b"TIMESTAMP = %d%s", value, ls) 600 return util.interpolate_bytes(b"TIMESTAMP = %d%s", value, ls)
610 if what in (b"ISOTIMESTAMP", b"FLAGS", b"VERSION", b"CRC32"): 601 if what in (b"ISOTIMESTAMP", b"FLAGS", b"VERSION", b"CRC32"):
611 assert filename is None 602 assert filename is None
612 if not isinstance(value, bytes): 603 return util.interpolate_bytes(b"%s = %s%s", what, util.b(value), ls)
613 value = value.encode("ascii")
614 return util.interpolate_bytes(b"%s = %s%s", what, value, ls)
615 assert filename is not None 604 assert filename is not None
616 if what == b"COMMENT": 605 if what == b"COMMENT":
617 if not isinstance(filename, bytes): 606 return util.interpolate_bytes(
618 filename = filename.encode("utf-8") 607 b"COMMENT (%s)%s", util.b(filename, "utf-8"), ls)
619 return util.interpolate_bytes(b"COMMENT (%s)%s", filename, ls)
620 if not isinstance(filename, bytes): 608 if not isinstance(filename, bytes):
621 filename = util.fsencode(filename) 609 filename = util.fsencode(filename)
622 if what == b"SIZE": 610 if what == b"SIZE":
623 return util.interpolate_bytes(b"SIZE (%s) = %d%s", filename, size, ls) 611 return util.interpolate_bytes(b"SIZE (%s) = %d%s", filename, size, ls)
624 if value is None: 612 if value is None:
635 else: 623 else:
636 return util.interpolate_bytes( 624 return util.interpolate_bytes(
637 b"%s (%s) = %s,%d%s", what, filename, value, size, ls) 625 b"%s (%s) = %s,%d%s", what, filename, value, size, ls)
638 626
639 627
class TreesumReader(object):

    """Reader to read and/or verify treesum digest files.

    Supports the iterator and context manager protocol.

    Every record is returned as a tuple whose first item is the record tag:

    - ``("VERSION", value)``, ``("FLAGS", value)``, ``("TIMESTAMP", value)``,
      ``("ISOTIMESTAMP", value)`` and ``("CRC32", value)``
    - ``("COMMENT", text)`` and ``("ROOT", raw_bytes)``
    - ``("SIZE", raw_filename, size)``
    - ``(algorithm_tag, raw_filename, digest_bytes, size_or_None)``

    CRC32 problems (missing, misplaced or mismatching checksums) are
    reported with :func:`logging.warning` only; they never raise.

    Lines are read with a limit of 2048 bytes; a longer physical line is
    therefore seen in pieces and will not be recognized as a valid record.

    """

    PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z")   # empty lines
    # The value must not swallow the "\r" of a CRLF line ending
    PATTERN1 = re.compile(br"\A(VERSION|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t\r\n]+)[ \t]*\r?\n\Z")      # noqa: E501 line too long
    PATTERN2 = re.compile(br"\A(ROOT|COMMENT)[ \t]*\((.*)\)[ \t]*\r?\n\Z")
    PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z")      # noqa: E501 line too long
    PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z")      # noqa: E501 line too long
    # A CRC32 value as found in a file: 1 to 8 hexadecimal digits
    _PATTERN_HEX_CRC = re.compile(br"\A[0-9A-Fa-f]{1,8}\Z")

    def __init__(self, _fp, _filename, _own_fp):
        """Use :meth:`from_path` or :meth:`from_binary_buffer` instead.

        :param _fp: a file object opened for reading in binary mode
        :param _filename: the name that belongs to `_fp`
        :param bool _own_fp: whether :meth:`close` also closes `_fp`

        """
        self._fp = _fp
        self._own_fp = _own_fp
        self._filename = _filename
        self._line_no = 0
        self._reset_crc()
        self._expect_crc = None  # NOTE: tristate: None is different from False
        self._current_algo_name = self._current_algo_digest_size = None

    @classmethod
    def from_path(cls_, path):
        """Open file at `path` and return a reader that owns the file object"""
        return cls_(open(path, "rb"), path, True)

    @classmethod
    def from_binary_buffer(cls_, binary_fp, filename):
        """Return a reader for the already opened binary file `binary_fp`.

        The reader does not own `binary_fp`: :meth:`close` leaves it open.

        """
        return cls_(binary_fp, filename, False)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        """Detach from the file object and close it if the reader owns it"""
        if self._fp is not None:
            try:
                if self._own_fp:
                    self._fp.close()
            finally:
                self._fp = None

    def __iter__(self):
        return self

    def __next__(self):
        rec = self.read_record()
        if rec is None:
            raise StopIteration()
        return rec

    if util.PY2:
        next = __next__

    def all_records(self):
        """Iterator over all remaining records"""
        while True:
            rec = self.read_record()
            if rec is None:
                return
            yield rec

    def read_record(self):
        """Read and parse the "next" line.

        Empty lines are skipped.  An embedded signify signature at the very
        beginning of the file (an ``untrusted comment: `` line followed by
        a base64 encoded line) is skipped transparently.

        :returns: `None` at EOF or the parsed contents of the line
        :rtype: tuple or None
        :raises AssertionError: if a line is not a valid record
        :raises binascii.Error: if a signify signature value is truncated

        """
        line = self._next_nonempty_line()
        if line is None:
            return None
        #
        # At the beginning transparently skip an eventually embedded signify
        # signature
        #
        if self._line_no == 1 and line.startswith(b"untrusted comment: "):
            signature = self._get_next_line()
            if not signature.endswith(b"\n"):
                raise binascii.Error("No valid signify signature value")
            # Try to decode for an early error check
            base64.b64decode(signature[:-1])
            # The signature itself is no record: continue with the first
            # real line behind it
            line = self._next_nonempty_line()
            if line is None:
                return None

        mo = self.PATTERN1.search(line)
        if mo:
            return self._parse_header_record(line, mo)

        mo = self.PATTERN2.search(line)
        if mo:
            self._update_crc(line)
            if mo.group(1) == b"COMMENT":
                return ("COMMENT", util.u(mo.group(2), "utf-8"))
            # PATTERN2 admits nothing but COMMENT and ROOT
            return ("ROOT", mo.group(2))

        mo = self.PATTERN3.search(line)
        if mo:
            self._update_crc(line)
            return ("SIZE", mo.group(1), int(util.n(mo.group(2)), 10))

        mo = self.PATTERN4.search(line)
        if mo:
            self._update_crc(line)
            algo_name = util.n(mo.group(1))
            encoded = mo.group(3)
            if len(encoded) == 2 * self._get_digest_size(algo_name):
                # hex
                digest = binascii.unhexlify(encoded)
            else:
                # base64
                digest = base64.b64decode(encoded)
            if mo.group(4):
                size = int(util.n(mo.group(5)), 10)
            else:
                size = None
            return (algo_name, mo.group(2), digest, size)

        #
        # Raise explicitly instead of using an `assert` statement: it would
        # vanish with "python -O" and garbage would be returned as a record.
        # The exception type is the one an `assert` would have raised.
        #
        raise AssertionError(line)

    def _next_nonempty_line(self):
        """Return the next line that is not empty or `None` at EOF.

        Skipped empty lines are part of the checksummed data and update
        the running CRC.

        """
        while True:
            line = self._get_next_line()
            if not line:
                #
                # Skip for empty files at the very beginning.
                # Check only after the first VERSION line.
                #
                if self._expect_crc:
                    logging.warning("CRC32 is missing at EOF")
                return None
            if not self.PATTERN0.search(line):
                return line
            self._update_crc(line)

    def _parse_header_record(self, line, mo):
        """Handle a ``TAG = value`` line that has been matched by PATTERN1"""
        tag = mo.group(1)
        if tag == b"VERSION":
            # A VERSION line starts a new checksummed section
            if self._expect_crc:
                logging.warning("CRC32 missing before line %d",
                                self._line_no)
            self._reset_crc()
            self._expect_crc = True
            self._update_crc(line)
            return ("VERSION", util.n(mo.group(2)))
        if tag == b"CRC32":
            given = mo.group(2)
            if self._expect_crc is None:
                logging.warning("Lone CRC32 before VERSION in line %d",
                                self._line_no)
            elif self._expect_crc:
                if not self._crc_matches(given):
                    logging.warning(
                        "CRC32 mismatch in line %d:"
                        " expected: %s, given: %s",
                        self._line_no,
                        self._hex_crc(),
                        given.decode("latin1").upper())
            else:
                logging.warning("CRC32 before VERSION in line %d",
                                self._line_no)
            # Do not update the CRC here but reset the state
            self._expect_crc = False
            return ("CRC32", util.n(given))
        self._update_crc(line)
        return (util.n(tag), util.n(mo.group(2)))

    def _crc_matches(self, given):
        """Check whether the hex value `given` equals the running CRC.

        The comparison is done numerically: letter case and the presence
        or absence of leading zeros in `given` do not matter.

        """
        if not self._PATTERN_HEX_CRC.search(given):
            return False
        return int(given.decode("latin1"), 16) == (self._crc32 & 0xFFFFFFFF)

    def _get_next_line(self):
        """Read the next raw line (at most 2048 bytes) and count it"""
        line = self._fp.readline(2048)
        if line:
            self._line_no += 1
        return line

    def _reset_crc(self):
        self._crc32 = zlib.crc32(b"")

    def _update_crc(self, data):
        self._crc32 = zlib.crc32(data, self._crc32)

    def _hex_crc(self):
        """Return the running CRC as 8 upper-case hexadecimal digits"""
        #
        # Mask to 32 bits: Python 2 yields signed values.  Format with a
        # fixed width because `hex()` drops leading zeros (and appends an
        # "L" for longs on Python 2).
        #
        return "%08X" % (self._crc32 & 0xFFFFFFFF,)

    def _get_digest_size(self, algo_name):
        """Return the digest size in bytes of the algorithm `algo_name`.

        The most recently used algorithm is cached because consecutive
        records typically use the same one.

        """
        if self._current_algo_name == algo_name:
            return self._current_algo_digest_size
        h = util.algotag2algotype(algo_name)()
        self._current_algo_name = algo_name
        self._current_algo_digest_size = h.digest_size
        return self._current_algo_digest_size
819
640 if __name__ == "__main__": 820 if __name__ == "__main__":
641 sys.exit(main()) 821 sys.exit(main())