comparison mupdf-source/thirdparty/harfbuzz/src/gen-arabic-table.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #!/usr/bin/env python3
2
3 """usage: ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt
4
5 Input files:
6 * https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
7 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
8 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
9 """
10
11 import os.path, sys
12
# The script needs exactly three input paths (see the usage text in the
# module docstring); anything else prints that usage text and exits.
if len(sys.argv) != 4:
    sys.exit(__doc__)

# Opened in command-line order: files[0] is ArabicShaping.txt, files[1] is
# UnicodeData.txt and files[2] is Blocks.txt.
files = [open(x, encoding='utf-8') for x in sys.argv[1:]]

# Keep the first two lines of ArabicShaping.txt and of Blocks.txt so they can
# be echoed into the generated file's banner. UnicodeData.txt gets a fixed
# placeholder instead.
headers = [
    [files[0].readline(), files[0].readline()],
    [files[2].readline(), files[2].readline()],
]
headers.append(["UnicodeData.txt does not have a header."])

# Skip the rest of the ArabicShaping.txt preamble, up to and including the
# first line containing a long run of '#'. readline() returns '' at EOF, so
# stop there with an error instead of spinning forever when the separator
# line is missing.
for _preamble_line in iter(files[0].readline, ''):
    if '##################' in _preamble_line:
        break
else:
    sys.exit("ArabicShaping.txt: no '##################' separator line found")
22
# Maps every code point covered by a block range to that block's name.
blocks = {}


def read_blocks(f):
    """Fill the module-level ``blocks`` map from Blocks.txt-style lines.

    ``f`` is any iterable of text lines. Text from the first ``#`` on a line
    is ignored. A data line is ``START..END; Name`` or ``CODE; Name`` with
    hexadecimal code points; lines without a ``;`` are skipped. Every code
    point in the inclusive range is mapped to ``Name``, and later lines
    overwrite earlier ones.
    """
    for raw_line in f:
        # Drop the trailing comment, if any, before splitting into fields.
        content = raw_line.split('#', 1)[0]

        fields = [piece.strip() for piece in content.split(';')]
        if len(fields) == 1:
            continue

        first, *rest = fields[0].split('..')
        start = int(first, 16)
        end = int(rest[0], 16) if rest else start

        # Update in place so every holder of the dict sees the new entries.
        blocks.update(dict.fromkeys(range(start, end + 1), fields[1]))
47
def print_joining_table(f):
    """Print the C ``joining_table`` array and ``joining_type`` lookup function.

    ``f`` is an iterable of ArabicShaping.txt-style lines
    (``code; name; joining type; joining group``). Lines whose first
    character is ``#`` and lines without a ``;`` are skipped. The
    module-level ``blocks`` map supplies the block name comments.

    Everything is written to standard output; nothing is returned.

    Raises:
        KeyError: a listed code point is missing from ``blocks``.
        ZeroDivisionError: ``f`` contains no data lines at all.
    """
    # Code point -> name of the C constant describing its joining behaviour.
    values = {}
    for line in f:
        if line[0] == '#':
            continue

        fields = [piece.strip() for piece in line.split(';')]
        if len(fields) == 1:
            continue

        codepoint = int(fields[0], 16)
        group = fields[3]
        # Two joining groups get their own constant; everything else is
        # classified by its joining type alone.
        if group in ("ALAPH", "DALATH RISH"):
            values[codepoint] = "JOINING_GROUP_" + group.replace(' ', '_')
        else:
            values[codepoint] = "JOINING_TYPE_" + fields[2]

    # Abbreviate each constant to the initials of the words after the second
    # underscore; the abbreviations are used as table cells.
    short_value = {}
    for value in sorted(set(values.values()) | {'JOINING_TYPE_X'}):
        short = ''.join(word[0] for word in value.split('_')[2:])
        assert short not in short_value.values()
        short_value[value] = short

    print()
    for value, short in short_value.items():
        print("#define %s %s" % (short, value))

    codepoints = sorted(values)
    num = len(values)
    all_blocks = {blocks[cp] for cp in codepoints}

    # Merge code points into [first, last] ranges; a gap of more than
    # 1+16*5 code points starts a new range.
    ranges = []
    previous = -100000
    for cp in codepoints:
        if cp - previous > 1 + 16 * 5:
            ranges.append([cp, cp])
        else:
            ranges[-1][-1] = cp
        previous = cp

    print()
    print("static const uint8_t joining_table[] =")
    print("{")
    last_block = None
    table_size = 0
    for start, end in ranges:

        print()
        print("#define joining_offset_0x%04xu %d" % (start, table_size))

        for cp in range(start, end + 1):

            block = blocks.get(cp, last_block)
            value = values.get(cp, "JOINING_TYPE_X")

            if cp == start or block != last_block:
                if cp != start:
                    print()
                label = block if block in all_blocks else "FILLER"
                print("\n /* %s */" % label)
                last_block = block
                # A section that starts mid-row gets the row label plus
                # padding for the cells that precede it.
                if cp % 32 != 0:
                    print()
                    print(" /* %04X */" % (cp // 32 * 32), " " * (cp % 32), end="")

            if cp % 32 == 0:
                print()
                print(" /* %04X */ " % cp, end="")
            print("%s," % short_value[value], end="")
        print()

        table_size += end - start + 1
    print()
    occupancy = num * 100. / table_size
    print("}; /* Table items: %d; occupancy: %d%% */" % (table_size, occupancy))
    print()

    page_bits = 12
    print()
    print("static unsigned int")
    print("joining_type (hb_codepoint_t u)")
    print("{")
    print(" switch (u >> %d)" % page_bits)
    print(" {")
    # One switch case per page that contains a range boundary.
    pages = {bound >> page_bits for bounds in ranges for bound in bounds}
    for page in sorted(pages):
        print(" case 0x%0Xu:" % page)
        for start, end in ranges:
            if page in (start >> page_bits, end >> page_bits):
                offset_name = "joining_offset_0x%04xu" % start
                print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return joining_table[u - 0x%04Xu + %s];" % (start, end, start, offset_name))
        print(" break;")
        print("")
    print(" default:")
    print(" break;")
    print(" }")
    print(" return X;")
    print("}")
    print()
    for short in short_value.values():
        print("#undef %s" % (short))
    print()
155
# Code points of the ligatures that print_shaping_table keeps; any other
# multi-component decomposition is skipped there. Only membership is tested,
# so each code point is listed once (0xF2EE used to appear at both the start
# and the end of the tuple).
LIGATURES = (
    0xFC08, 0xFC0E, 0xFC12, 0xFC32, 0xFC3F, 0xFC40, 0xFC41, 0xFC42,
    0xFC44, 0xFC4E, 0xFC5E, 0xFC60, 0xFC61, 0xFC62, 0xFC6A, 0xFC6D, 0xFC6F,
    0xFC70, 0xFC73, 0xFC75, 0xFC86, 0xFC8F, 0xFC91, 0xFC94, 0xFC9C, 0xFC9D,
    0xFC9E, 0xFC9F, 0xFCA1, 0xFCA2, 0xFCA3, 0xFCA4, 0xFCA8, 0xFCAA, 0xFCAC,
    0xFCB0, 0xFCC9, 0xFCCA, 0xFCCB, 0xFCCC, 0xFCCD, 0xFCCE, 0xFCCF, 0xFCD0,
    0xFCD1, 0xFCD2, 0xFCD3, 0xFCD5, 0xFCDA, 0xFCDB, 0xFCDC, 0xFCDD, 0xFD30,
    0xFD88, 0xFEF5, 0xFEF6, 0xFEF7, 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC,
    # Entries whose UnicodeData-style lines are appended inside
    # print_shaping_table rather than read from the input file.
    0xF201, 0xF211, 0xF2EE,
)
166
def print_shaping_table(f):
    """Print the C shaping table and the three ligature tables.

    ``f`` must provide ``readlines()`` returning UnicodeData.txt-style lines;
    three extra ``F201``/``F211``/``F2EE`` lines are appended to them. Only
    entries whose decomposition field is tagged ``<initial>``, ``<medial>``,
    ``<isolated>`` or ``<final>`` are used:

    * a single-component decomposition records a positional form of that
      base code point;
    * a multi-component decomposition is a ligature, kept only if its code
      point is listed in the module-level ``LIGATURES``.

    Everything is written to standard output; nothing is returned.

    Raises:
        Exception: a kept ligature has an unsupported shape or component count.
        ValueError: no positional forms, or one of the ligature tables is empty.
        KeyError: a ligature needs a positional form that was never recorded.
    """
    shapes = {}     # base code point -> {shape name: presentation form}
    ligatures = {}  # component tuple -> {shape name or None: ligature}
    names = {}      # code point -> text used in the generated comments

    lines = f.readlines()
    lines += [
        "F201;PUA ARABIC LIGATURE LELLAH ISOLATED FORM;Lo;0;AL;<isolated> 0644 0644 0647;;;;N;;;;;",
        "F211;PUA ARABIC LIGATURE LAM WITH MEEM WITH JEEM INITIAL FORM;Lo;0;AL;<initial> 0644 0645 062C;;;;N;;;;;",
        "F2EE;PUA ARABIC LIGATURE SHADDA WITH FATHATAN ISOLATED FORM;Lo;0;AL;<isolated> 0020 064B 0651;;;;N;;;;;",
    ]
    for line in lines:

        fields = [piece.strip() for piece in line.split(';')]
        decomposition = fields[5]
        if not decomposition.startswith('<'):
            continue

        tag, *components = decomposition.split(' ')
        shape = tag[1:-1]
        items = tuple(int(x, 16) for x in components)
        c = int(fields[0], 16)

        if shape not in ('initial', 'medial', 'isolated', 'final'):
            continue

        if len(items) == 1:
            # Positional form of a single letter. The comment text becomes
            # the common prefix of the names of all its forms.
            base = items[0]
            if base in names:
                names[base] = os.path.commonprefix([names[base], fields[1]]).strip()
            else:
                names[base] = fields[1]
            shapes.setdefault(base, {})[shape] = c
            continue

        # Mark ligatures start with space and are in visual order, so we
        # remove the space and reverse the items.
        if items[0] == 0x0020:
            items = items[:0:-1]
            shape = None
        # We only care about a subset of ligatures
        if c not in LIGATURES:
            continue

        names[c] = fields[1]
        ligatures.setdefault(items, {})[shape] = c

    print()
    print("static const uint16_t shaping_table[][4] =")
    print("{")

    keys = shapes.keys()
    min_u, max_u = min(keys), max(keys)
    for u in range(min_u, max_u + 1):
        forms_of_u = shapes.get(u, {})
        row = [forms_of_u.get(shape, 0)
               for shape in ('initial', 'medial', 'final', 'isolated')]
        value = ', '.join("0x%04Xu" % c for c in row)
        print(" {%s}, /* U+%04X %s */" % (value, u, names.get(u, "")))

    print("};")
    print()
    print("#define SHAPING_TABLE_FIRST 0x%04Xu" % min_u)
    print("#define SHAPING_TABLE_LAST 0x%04Xu" % max_u)
    print()

    # Positional form each component takes, keyed by the ligature's shape.
    forms_for_3 = {
        'isolated': ('initial', 'medial', 'final'),
        'final': ('medial', 'medial', 'final'),
        'initial': ('initial', 'medial', 'medial'),
    }
    forms_for_2 = {
        'isolated': ('initial', 'final'),
        'final': ('medial', 'final'),
        'initial': ('initial', 'medial'),
    }

    ligas_2 = {}
    ligas_3 = {}
    ligas_mark_2 = {}
    for key, by_shape in ligatures.items():
        for shape, c in by_shape.items():
            if len(key) == 3:
                forms = forms_for_3.get(shape)
                if forms is None:
                    raise Exception("Unexpected shape", shape)
                liga = tuple(shapes[cp][form] for cp, form in zip(key, forms))
                ligas_3.setdefault(liga[0], []).append((liga[1], liga[2], c))
            elif len(key) == 2:
                if shape is None:
                    # Mark ligature: keyed on the raw components themselves.
                    ligas_mark_2.setdefault(key[0], []).append((key[1], c))
                    continue
                forms = forms_for_2.get(shape)
                if forms is None:
                    raise Exception("Unexpected shape", shape)
                liga = tuple(shapes[cp][form] for cp, form in zip(key, forms))
                ligas_2.setdefault(liga[0], []).append((liga[1], c))
            else:
                raise Exception("Unexpected number of ligature components", key)

    def emit_ligature_table(table, struct_lines, table_line, row_format):
        # Print one struct declaration plus its initializer; each entry of
        # ``table`` is ``first -> [(remaining components..., ligature)]``.
        max_i = max(len(entries) for entries in table.values())
        print()
        for text in struct_lines:
            print(text)
        print(" } ligatures[%d];" % max_i)
        print(table_line)
        print("{")
        for first in sorted(table):

            print(" { 0x%04Xu, {" % (first))
            for liga in table[first]:
                print(row_format % (liga + (names[liga[-1]],)))
            print(" }},")

        print("};")
        print()

    emit_ligature_table(
        ligas_2,
        (
            "static const struct ligature_set_t {",
            " uint16_t first;",
            " struct ligature_pairs_t {",
            " uint16_t components[1];",
            " uint16_t ligature;",
        ),
        "} ligature_table[] =",
        " { {0x%04Xu}, 0x%04Xu }, /* %s */",
    )

    emit_ligature_table(
        ligas_mark_2,
        (
            "static const struct ligature_mark_set_t {",
            " uint16_t first;",
            " struct ligature_pairs_t {",
            " uint16_t components[1];",
            " uint16_t ligature;",
        ),
        "} ligature_mark_table[] =",
        " { {0x%04Xu}, 0x%04Xu }, /* %s */",
    )

    emit_ligature_table(
        ligas_3,
        (
            "static const struct ligature_3_set_t {",
            " uint16_t first;",
            " struct ligature_triplets_t {",
            " uint16_t components[2];",
            " uint16_t ligature;",
        ),
        "} ligature_3_table[] =",
        " { {0x%04Xu, 0x%04Xu}, 0x%04Xu}, /* %s */",
    )
331
332
333
# Banner: how the table was produced and which input file versions were
# used (the header lines captured when the inputs were opened).
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt")
print(" *")
print(" * on files with these headers:")
print(" *")
for h in headers:
    for l in h:
        print(" * %s" % (l.strip()))
print(" */")
print()
print("#ifndef HB_OT_SHAPER_ARABIC_TABLE_HH")
print("#define HB_OT_SHAPER_ARABIC_TABLE_HH")
print()

# Blocks.txt must be read first: print_joining_table looks code points up in
# the block map that read_blocks fills. The input files are not needed after
# these three passes, so close them even if one of the passes fails.
try:
    read_blocks(files[2])
    print_joining_table(files[0])
    print_shaping_table(files[1])
finally:
    for opened_file in files:
        opened_file.close()

print()
print("#endif /* HB_OT_SHAPER_ARABIC_TABLE_HH */")
print()
print("/* == End of generated table == */")