comparison cutils/treesum.py @ 266:0add8276e6b8

treesum: Handle errors like broken symlinks properly
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 18 Feb 2025 12:39:04 +0100
parents c3d6599c1b5e
children b9aa65a30b4c
comparison
equal deleted inserted replaced
265:188f448ab5e9 266:0add8276e6b8
16 import argparse 16 import argparse
17 import base64 17 import base64
18 import binascii 18 import binascii
19 import collections 19 import collections
20 import datetime 20 import datetime
21 import errno
21 import logging 22 import logging
22 import os 23 import os
23 import re 24 import re
24 import stat 25 import stat
25 import sys 26 import sys
574 self._writer.finish() 575 self._writer.finish()
575 576
576 def _generate(self, root, top): 577 def _generate(self, root, top):
577 logging.debug("Handling %s/%r", root, top) 578 logging.debug("Handling %s/%r", root, top)
578 path = os.path.join(root, *top) if top else root 579 path = os.path.join(root, *top) if top else root
579 with walk.ScanDir(path) as dirscan: 580 try:
580 fsobjects = list(dirscan) 581 with walk.ScanDir(path) as dirscan:
582 fsobjects = list(dirscan)
583 except OSError as e:
584 if self._utf8_mode:
585 opath = walk.WalkDirEntry.alt_u8(path)
586 else:
587 opath = walk.WalkDirEntry.alt_fs(path)
588 if e.errno == errno.ENOTDIR:
589 # object exists but is not a directory
590 errmsg = b"not a directory"
591 elif e.errno in (errno.EACCES, errno.EPERM,
592 getattr(errno, "ENOTCAPABLE", errno.EACCES)):
593 # no permissions
594 errmsg = (
595 b"access denied / no permissions / missing capabilities")
596 elif e.errno == errno.ENOENT:
597 # given object does not exist
598 errmsg = b"no such file or directory"
599 else:
600 raise
601 self._writer.write_error(util.interpolate_bytes(
602 b"`%s': %s", opath, errmsg))
603 opath = join_output_path(top, None)
604 if opath:
605 if self._utf8_mode:
606 opath = walk.WalkDirEntry.alt_u8(opath)
607 else:
608 opath = walk.WalkDirEntry.alt_fs(opath)
609 if self._size_only:
610 self._writer.write_size(opath, None)
611 else:
612 self._writer.write_file_digest(self._algorithm[1], opath, None)
613 self._writer.flush()
614 return (None, None)
581 if self._utf8_mode: 615 if self._utf8_mode:
582 fsobjects.sort(key=walk.WalkDirEntry.sort_key_u8) 616 fsobjects.sort(key=walk.WalkDirEntry.sort_key_u8)
583 else: 617 else:
584 fsobjects.sort(key=walk.WalkDirEntry.sort_key_fs) 618 fsobjects.sort(key=walk.WalkDirEntry.sort_key_fs)
585 dir_dgst = self._algorithm[0]() 619 dir_dgst = self._algorithm[0]()
664 698
665 # Get subdir data from recursing into it 699 # Get subdir data from recursing into it
666 sub_dir_dgst, sub_dir_size = self._generate( 700 sub_dir_dgst, sub_dir_size = self._generate(
667 root, top + (fso.name, )) 701 root, top + (fso.name, ))
668 702
703 if sub_dir_dgst is None or sub_dir_size is None:
704 #
705 # This should not happen:
706 # - top-level directories are handled above
707 # - other filesystem objects should also have been
708 # handled already
709 #
710 assert False
711
669 dir_size += sub_dir_size 712 dir_size += sub_dir_size
670 if self._utf8_mode: 713 if self._utf8_mode:
671 if fso.u8name is None: 714 if fso.u8name is None:
672 dir_tainted = True 715 dir_tainted = True
673 dir_dgst.update(util.interpolate_bytes( 716 dir_dgst.update(util.interpolate_bytes(
792 len(fso.alt_fsname), 835 len(fso.alt_fsname),
793 fso.alt_fsname)) 836 fso.alt_fsname))
794 else: 837 else:
795 dir_dgst.update(util.interpolate_bytes( 838 dir_dgst.update(util.interpolate_bytes(
796 b"1:f,%d:%s,", len(fso.fsname), fso.fsname)) 839 b"1:f,%d:%s,", len(fso.fsname), fso.fsname))
797 dir_size += fso.stat.st_size
798 if self._with_metadata_mtime:
799 mtime = datetime.datetime.utcfromtimestamp(
800 int(fso.stat.st_mtime))
801 mtime = util.b(mtime.isoformat("T") + "Z")
802 dir_dgst.update(util.interpolate_bytes(
803 b"5:mtime,%d:%s,", len(mtime), mtime))
804 if self._with_metadata_full_mode:
805 modestr = util.b(normalized_mode_str(fso.stat.st_mode))
806 dir_dgst.update(util.interpolate_bytes(
807 b"8:fullmode,%d:%s,", len(modestr), modestr))
808 elif self._with_metadata_mode:
809 modestr = util.b(normalized_compatible_mode_str(
810 fso.stat.st_mode))
811 dir_dgst.update(util.interpolate_bytes(
812 b"4:mode,%d:%s,", len(modestr), modestr))
813 if not self._size_only:
814 dgst = digest.compute_digest_file(
815 self._algorithm[0],
816 fso.path,
817 use_mmap=self._use_mmap)
818 dir_dgst.update(util.interpolate_bytes(
819 b"%d:%s,", len(dgst), dgst))
820 opath = join_output_path(top, fso.name) 840 opath = join_output_path(top, fso.name)
821 if self._utf8_mode: 841 if self._utf8_mode:
822 opath = walk.WalkDirEntry.alt_u8(opath) 842 opath = walk.WalkDirEntry.alt_u8(opath)
823 else: 843 else:
824 opath = walk.WalkDirEntry.alt_fs(opath) 844 opath = walk.WalkDirEntry.alt_fs(opath)
825 if self._size_only: 845 if fso.stat is None:
826 self._writer.write_size(opath, fso.stat.st_size) 846 #
847 # Error: most likely a broken symlink here
848 #
849 dir_tainted = True
850 dir_dgst.update(util.interpolate_bytes(
851 b"5:errno,%d:%s,",
852 len(str(fso.stat_errno)),
853 util.b(str(fso.stat_errno))))
854 self._writer.write_error(util.interpolate_bytes(
855 b"errno %d: %s",
856 fso.stat_errno,
857 util.b(fso.stat_errstr, "utf-8")))
858 logging.error(
859 "Directory entry has symlink problems: %r",
860 opath)
861 if self._size_only:
862 self._writer.write_size(opath, None)
863 else:
864 self._writer.write_file_digest(
865 self._algorithm[1], opath, None)
827 else: 866 else:
828 sz = fso.stat.st_size if self._print_size else None 867 #
829 self._writer.write_file_digest( 868 # Ok: File has normal stat info
830 self._algorithm[1], opath, dgst, 869 #
831 use_base64=self._use_base64, 870 # XXX FIXME: Handle special files (fifo, socket,
832 size=sz) 871 # block or char devices, ...).
872 #
873 dir_size += fso.stat.st_size
874 if self._with_metadata_mtime:
875 mtime = datetime.datetime.utcfromtimestamp(
876 int(fso.stat.st_mtime))
877 mtime = util.b(mtime.isoformat("T") + "Z")
878 dir_dgst.update(util.interpolate_bytes(
879 b"5:mtime,%d:%s,", len(mtime), mtime))
880 if self._with_metadata_full_mode:
881 modestr = util.b(
882 normalized_mode_str(fso.stat.st_mode))
883 dir_dgst.update(util.interpolate_bytes(
884 b"8:fullmode,%d:%s,", len(modestr), modestr))
885 elif self._with_metadata_mode:
886 modestr = util.b(normalized_compatible_mode_str(
887 fso.stat.st_mode))
888 dir_dgst.update(util.interpolate_bytes(
889 b"4:mode,%d:%s,", len(modestr), modestr))
890 if not self._size_only:
891 dgst = digest.compute_digest_file(
892 self._algorithm[0],
893 fso.path,
894 use_mmap=self._use_mmap)
895 dir_dgst.update(util.interpolate_bytes(
896 b"%d:%s,", len(dgst), dgst))
897 if self._size_only:
898 self._writer.write_size(opath, fso.stat.st_size)
899 else:
900 sz = fso.stat.st_size if self._print_size else None
901 self._writer.write_file_digest(
902 self._algorithm[1], opath, dgst,
903 use_base64=self._use_base64,
904 size=sz)
833 self._writer.flush() 905 self._writer.flush()
834 opath = join_output_path(top, None) 906 opath = join_output_path(top, None)
835 if opath: 907 if opath:
836 if self._utf8_mode: 908 if self._utf8_mode:
837 opath = walk.WalkDirEntry.alt_u8(opath) 909 opath = walk.WalkDirEntry.alt_u8(opath)
838 else: 910 else:
839 opath = walk.WalkDirEntry.alt_fs(opath) 911 opath = walk.WalkDirEntry.alt_fs(opath)
912 if dir_tainted:
913 #
914 # IMPORTANT: Print errors BEFORE the associated digest or size
915 # line. Otherwise the "info" command has a problem.
916 #
917 self._writer.write_error(b"directory is tainted")
918 logging.error("Directory has filename and/or symlink problems: %r",
919 opath)
840 if self._size_only: 920 if self._size_only:
841 self._writer.write_size(opath, dir_size) 921 self._writer.write_size(opath, dir_size)
842 else: 922 else:
843 if dir_tainted:
844 #
845 # IMPORTANT: Print errors BEFORE the associated digest line.
846 # Otherwise the "info" command has a problem.
847 #
848 self._writer.write_error(b"directory is tainted")
849 logging.error("Directory has filename problems: %r", opath)
850 sz = dir_size if self._print_size else None 923 sz = dir_size if self._print_size else None
851 self._writer.write_file_digest( 924 self._writer.write_file_digest(
852 self._algorithm[1], opath, dir_dgst.digest(), 925 self._algorithm[1], opath, dir_dgst.digest(),
853 use_base64=self._use_base64, size=sz) 926 use_base64=self._use_base64, size=sz)
854 self._writer.flush() 927 self._writer.flush()
1012 1085
1013 def write_size(self, filename, sz): 1086 def write_size(self, filename, sz):
1014 assert isinstance(filename, bytes) 1087 assert isinstance(filename, bytes)
1015 self.write(b"SIZE (") 1088 self.write(b"SIZE (")
1016 self.write(filename) 1089 self.write(filename)
1017 self.write(b") = ") 1090 self.write(b")")
1018 self.writeln(util.b(str(sz))) 1091 if sz is not None:
1092 self.write(b" = ")
1093 self.write(util.b(str(sz)))
1094 self.writeln(b"")
1019 1095
1020 def write_file_digest(self, algorithm, filename, digest, 1096 def write_file_digest(self, algorithm, filename, digest,
1021 use_base64=False, size=None): 1097 use_base64=False, size=None):
1022 digest = (base64.b64encode(digest) 1098 if digest is not None:
1023 if use_base64 1099 digest = (base64.b64encode(digest)
1024 else binascii.hexlify(digest)) 1100 if use_base64
1101 else binascii.hexlify(digest))
1025 if filename != b"./@/": 1102 if filename != b"./@/":
1026 filename = util.normalize_filename(filename, True) 1103 filename = util.normalize_filename(filename, True)
1027 self.write(util.b(algorithm)) 1104 self.write(util.b(algorithm))
1028 self.write(b" (") 1105 self.write(b" (")
1029 self.write(filename) 1106 self.write(filename)
1030 self.write(b") = ") 1107 self.write(b")")
1031 self.write(digest) 1108 if digest is not None or size is not None:
1032 if size is not None: 1109 self.write(b" = ")
1033 self.write(b",") 1110 if digest is not None:
1034 self.writeln(util.b(str(size))) 1111 self.write(digest)
1035 else: 1112 if size is not None:
1036 self.writeln(b"") 1113 self.write(b",")
1114 self.write(util.b(str(size)))
1115 self.writeln(b"")
1037 1116
1038 def finish(self): 1117 def finish(self):
1039 """Finish a block and write the current CRC""" 1118 """Finish a block and write the current CRC"""
1040 crc = self._crc.hexdigest() 1119 crc = self._crc.hexdigest()
1041 self.write(b"CRC32 = ") 1120 self.write(b"CRC32 = ")
1074 """ 1153 """
1075 1154
1076 PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z") # empty lines 1155 PATTERN0 = re.compile(br"\A[ \t]*\r?\n\Z") # empty lines
1077 PATTERN1 = re.compile(br"\A(VERSION|FSENCODING|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z") # noqa: E501 line too long 1156 PATTERN1 = re.compile(br"\A(VERSION|FSENCODING|FLAGS|TIMESTAMP|ISOTIMESTAMP|CRC32)[ \t]*=[ \t]*([^ \t]+)[ \t]*\r?\n\Z") # noqa: E501 line too long
1078 PATTERN2 = re.compile(br"\A(ROOT|COMMENT|ERROR|GENERATOR)[ \t]*\((.*)\)[ \t]*\r?\n\Z") # noqa: E501 line too long 1157 PATTERN2 = re.compile(br"\A(ROOT|COMMENT|ERROR|GENERATOR)[ \t]*\((.*)\)[ \t]*\r?\n\Z") # noqa: E501 line too long
1079 PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)[ \t]*=[ \t]*(\d+)[ \t]*\r?\n\Z") # noqa: E501 line too long 1158 PATTERN3 = re.compile(br"\ASIZE[ \t]*\((.*)\)([ \t]*=[ \t]*(\d+))?[ \t]*\r?\n\Z") # noqa: E501 line too long
1080 PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)[ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?[ \t]*\r?\n\Z") # noqa: E501 line too long 1159 PATTERN4 = re.compile(br"\A([A-Za-z0-9_-]+)[ \t]*\((.*)\)([ \t]*=[ \t]*([A-Za-z0-9=+/]+)(,(\d+))?)?[ \t]*\r?\n\Z") # noqa: E501 line too long
1081 1160
1082 def __init__(self, _fp, _filename, _own_fp): 1161 def __init__(self, _fp, _filename, _own_fp):
1083 self._fp = _fp 1162 self._fp = _fp
1084 self._own_fp = _own_fp 1163 self._own_fp = _own_fp
1085 self._filename = _filename 1164 self._filename = _filename
1209 assert False, line 1288 assert False, line
1210 else: 1289 else:
1211 mo = self.PATTERN3.search(line) 1290 mo = self.PATTERN3.search(line)
1212 if mo: 1291 if mo:
1213 self._update_crc(line) 1292 self._update_crc(line)
1214 return ("SIZE", mo.group(1), int(util.n(mo.group(2)), 10)) 1293 if mo.group(2):
1294 return ("SIZE", mo.group(1),
1295 int(util.n(mo.group(3)), 10))
1296 else:
1297 return ("SIZE", mo.group(1), None)
1215 else: 1298 else:
1216 mo = self.PATTERN4.search(line) 1299 mo = self.PATTERN4.search(line)
1217 if mo: 1300 if mo:
1218 self._update_crc(line) 1301 self._update_crc(line)
1219 algo_name = util.n(mo.group(1)) 1302 algo_name = util.n(mo.group(1))
1220 if (len(mo.group(3)) == 1303 if mo.group(3):
1221 2 * self._get_digest_size(algo_name)): 1304 if (len(mo.group(4)) ==
1222 # hex 1305 2 * self._get_digest_size(algo_name)):
1223 digest = binascii.unhexlify(mo.group(3)) 1306 # hex
1224 else: 1307 digest = binascii.unhexlify(mo.group(4))
1225 # base64 1308 else:
1226 digest = base64.b64decode(mo.group(3)) 1309 # base64
1227 if mo.group(4): 1310 digest = base64.b64decode(mo.group(4))
1228 size = int(util.n(mo.group(5)), 10) 1311 if mo.group(5):
1229 else: 1312 size = int(util.n(mo.group(6)), 10)
1230 size = None 1313 else:
1231 return (algo_name, mo.group(2), digest, size) 1314 size = None
1315 return (algo_name, mo.group(2), digest, size)
1316 else:
1317 return (algo_name, mo.group(2), None, None)
1232 else: 1318 else:
1233 assert False, line 1319 assert False, line
1234 return line 1320 return line
1235 1321
1236 def _get_next_line(self): 1322 def _get_next_line(self):