comparison mupdf-source/scripts/cmapclean.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #!/usr/bin/env python3
2
3 # Parse a CMap file and dump it back out.
4
5 import sys
6
7 # Decode a subset of CMap syntax (only what is needed for our built-in resources)
8 # We require that tokens are whitespace separated.
9
def cleancmap(filename):
    """Parse the CMap file *filename* and print a normalized copy to stdout.

    Only a subset of CMap syntax is understood, and tokens must be separated
    by whitespace.  The parser is line based:

    * lines whose first character is ``%`` are skipped,
    * ``/CMapVersion``, ``/CMapName``, ``/WMode``, ``/Registry``,
      ``/Ordering`` and ``/Supplement`` lines update the header values,
    * ``<name> usecmap`` records the referenced CMap,
    * ``begin...`` / ``end...`` lines switch the current section, and
    * data lines are interpreted according to the current section; data
      lines with an unexpected number of tokens are ignored.

    All mappings are expanded into one dictionary keyed by character code and
    then re-compressed into single entries, contiguous ranges and
    one-to-many entries before being printed.

    Raises whatever ``open`` raises when the file cannot be read, and
    ``ValueError`` / ``IndexError`` when a recognized line is malformed.
    """
    codespacerange = []
    usecmap = ""
    cmapname = ""
    cmapversion = "1.0"
    csi_registry = "(Adobe)"
    csi_ordering = "(Unknown)"
    csi_supplement = 1
    wmode = 0
    isbf = False

    # code -> int (single target value) or list of ints (one-to-many target)
    mapping = {}

    def tocode(s):
        """Convert a ``<hex>`` token or a plain decimal token to an int."""
        if s[0] == '<' and s[-1] == '>':
            return int(s[1:-1], 16)
        return int(s, 10)

    def map_cidchar(lo, v):
        """Map the single code *lo* to the value *v*."""
        mapping[lo] = v

    def map_cidrange(lo, hi, v):
        """Map codes lo..hi (inclusive) to consecutive values starting at *v*."""
        while lo <= hi:
            mapping[lo] = v
            lo = lo + 1
            v = v + 1

    def add_bf(lo, v):
        """Store the list of 16-bit units *v* as the target of code *lo*."""
        # Decode unicode surrogate pairs
        if len(v) == 2 and 0xd800 <= v[0] <= 0xdbff and 0xdc00 <= v[1] <= 0xdfff:
            mapping[lo] = ((v[0] - 0xd800) << 10) + (v[1] - 0xdc00) + 0x10000
        elif len(v) == 1:
            mapping[lo] = v[0]
        elif len(v) <= 8:
            # Copy: map_bfrange keeps mutating its list between calls.
            mapping[lo] = v[:]
        else:
            print("/* warning: too long one-to-many mapping: %s */" % (v))

    def parse_bf(bf):
        """Split a ``<hex>`` token into a list of 16-bit units (4 hex digits each)."""
        bf = bf[1:-1]  # drop < >
        return [int(bf[i:i + 4], 16) for i in range(0, len(bf), 4)]

    def map_bfchar(lo, bf):
        """Map the single code *lo* to the hex string token *bf*."""
        add_bf(lo, parse_bf(bf))

    def map_bfrange(lo, hi, bf):
        """Map codes lo..hi (inclusive), incrementing the last unit of *bf* each step."""
        v = parse_bf(bf)
        while lo <= hi:
            add_bf(lo, v)
            lo = lo + 1
            v[-1] = v[-1] + 1

    # Section keyword -> (section name, whether it marks the file as a bf map)
    sections = {
        'begincodespacerange': ('codespacerange', False),
        'begincidrange': ('cidrange', False),
        'beginbfrange': ('bfrange', True),
        'begincidchar': ('cidchar', False),
        'beginbfchar': ('bfchar', True),
    }

    current = None
    # The context manager guarantees the file handle is closed after parsing.
    with open(filename, "r") as infile:
        for raw in infile:
            if raw[0] == '%':
                continue
            line = raw.strip().split()
            if len(line) == 0:
                continue
            first = line[0]
            second = line[1] if len(line) > 1 else None
            if first == '/CMapVersion':
                cmapversion = line[1]
            elif first == '/CMapName':
                cmapname = line[1][1:]
            elif first == '/WMode':
                wmode = int(line[1])
            elif first == '/Registry':
                csi_registry = line[1]
            elif first == '/Ordering':
                csi_ordering = line[1]
            elif first == '/Supplement':
                csi_supplement = line[1]
            elif second == 'usecmap':
                usecmap = first[1:]
            elif second in sections:
                # "<count> begin..." form
                current, marks_bf = sections[second]
                isbf = isbf or marks_bf
            elif first in sections:
                # bare "begin..." form without a leading count
                current, marks_bf = sections[first]
                isbf = isbf or marks_bf
            elif first.startswith("end"):
                current = None
            elif current == 'codespacerange' and len(line) == 2:
                # Byte width of the codes: two hex digits per byte, minus the
                # enclosing < >.  Integer division keeps the width an int.
                n = (len(first) - 2) // 2
                codespacerange.append((n, tocode(first), tocode(line[1])))
            elif current == 'cidrange' and len(line) == 3:
                map_cidrange(tocode(first), tocode(line[1]), tocode(line[2]))
            elif current == 'cidchar' and len(line) == 2:
                map_cidchar(tocode(first), tocode(line[1]))
            elif current == 'bfchar' and len(line) == 2:
                map_bfchar(tocode(first), line[1])
            elif current == 'bfrange' and len(line) == 3:
                map_bfrange(tocode(first), tocode(line[1]), line[2])

    # Create ranges

    singles = []
    ranges = []
    mranges = []

    # A negative out_lo means "no run is currently open".
    out_lo = -100
    out_hi = -100
    out_v_lo = 0
    out_v_hi = 0

    def flush_range():
        """Emit the currently open run (if any) as a single or a range."""
        if out_lo >= 0:
            if out_lo == out_hi:
                singles.append((out_lo, out_v_lo))
            else:
                ranges.append((out_lo, out_hi, out_v_lo))

    for code in sorted(mapping):
        v = mapping[code]
        if not isinstance(v, int):
            # One-to-many entries never take part in a run.
            flush_range()
            out_lo = out_hi = -100
            mranges.append((code, v))
        elif code != out_hi + 1 or v != out_v_hi + 1:
            # Either the code or the value is not contiguous: start a new run.
            flush_range()
            out_lo = out_hi = code
            out_v_lo = out_v_hi = v
        else:
            out_hi = out_hi + 1
            out_v_hi = out_v_hi + 1
    flush_range()

    # Print CMap file

    print("%!PS-Adobe-3.0 Resource-CMap")
    print("%%DocumentNeededResources: procset (CIDInit)")
    print("%%IncludeResource: procset (CIDInit)")
    print("%%%%BeginResource: CMap (%s)" % cmapname)
    print("%%%%Version: %s" % cmapversion)
    print("%%EndComments")
    print("/CIDInit /ProcSet findresource begin")
    print("12 dict begin")
    print("begincmap")
    if usecmap:
        print("/%s usecmap" % usecmap)
    print("/CIDSystemInfo 3 dict dup begin")
    print(" /Registry %s def" % csi_registry)
    print(" /Ordering %s def" % csi_ordering)
    print(" /Supplement %s def" % csi_supplement)
    print("end def")
    print("/CMapName /%s def" % cmapname)
    print("/CMapVersion %s def" % cmapversion)
    print("/CMapType 1 def")
    print("/WMode %d def" % wmode)

    if codespacerange:
        print("%d begincodespacerange" % len(codespacerange))
        for r in codespacerange:
            # Zero-pad both bounds to the byte width recorded for the range.
            fmt = "<%%0%dx> <%%0%dx>" % (r[0] * 2, r[0] * 2)
            print(fmt % (r[1], r[2]))
        print("endcodespacerange")

    if singles:
        if isbf:
            print("%d beginbfchar" % len(singles))
            for s in singles:
                print("<%04x> <%04x>" % s)
            print("endbfchar")
        else:
            print("%d begincidchar" % len(singles))
            for s in singles:
                print("<%04x> %d" % s)
            print("endcidchar")

    if ranges:
        if isbf:
            print("%d beginbfrange" % len(ranges))
            for r in ranges:
                print("<%04x> <%04x> <%04x>" % r)
            print("endbfrange")
        else:
            print("%d begincidrange" % len(ranges))
            for r in ranges:
                print("<%04x> <%04x> %d" % r)
            print("endcidrange")

    if mranges:
        print("%d beginbfchar" % len(mranges))
        for cid, v in mranges:
            print("<%04x> <%s>" % (cid, "".join(["%04x" % ch for ch in v])))
        print("endbfchar")

    print("endcmap")
    print("CMapName currentdict /CMap defineresource pop")
    print("end")
    print("end")
    print("%%EndResource")
    print("%%EOF")
204
# Clean every CMap file named on the command line, in the order given;
# each cleaned CMap is written to standard output.
for arg in sys.argv[1:]:
    cleancmap(arg)