comparison mupdf-source/thirdparty/harfbuzz/src/gen-use-table.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: the expanded directory no longer includes a version number.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
#!/usr/bin/env python3
# flake8: noqa: F821

"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
* ms-use/IndicSyllabicCategory-Additional.txt
* ms-use/IndicPositionalCategory-Additional.txt
"""
# BUGFIX: the usage string above must be the module's FIRST statement.
# It previously followed `import logging`, so it was an ordinary expression
# statement, __doc__ was None, and `sys.exit(__doc__)` below exited with
# status 0 and no message on a wrong command line.  With the docstring
# first, sys.exit prints the usage text and exits with a nonzero status.

import logging
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

import sys

# Exactly nine input files are required (argv[0] is the script itself).
if len (sys.argv) != 10:
    sys.exit (__doc__)
25
# Scripts whose characters are dropped from the generated table: they are
# filtered out of `combined` below rather than classified by USE.
DISABLED_SCRIPTS = {
    'Arabic',
    'Lao',
    'Samaritan',
    'Syriac',
    'Thai',
}

# One open handle per input file, in the argv order documented in the
# usage string (index 4 is UnicodeData.txt).
files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

# Capture the first two lines of every file except UnicodeData.txt (index
# 4), which has no header; these lines are echoed into the generated C file.
headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
# The two ms-use "Additional" files (indices 7 and 8) carry a longer
# header: keep reading lines until the first blank one.  Because index 4
# was skipped above, file j corresponds to headers[j - 1] here.
for j in range(7, 9):
    for line in files[j]:
        line = line.rstrip()
        if not line:
            break
        headers[j - 1].append(line)
headers.append (["UnicodeData.txt does not have a header."])

# unicode_data[i][codepoint] -> property-value string for property i;
# values[i][value]           -> rough count of code points with that value.
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
    for line in f:

        # Strip trailing '#' comments.
        j = line.find ('#')
        if j >= 0:
            line = line[:j]

        fields = [x.strip () for x in line.split (';')]
        if len (fields) == 1:
            continue

        # First field is a single hex code point or an inclusive
        # "start..end" range.
        uu = fields[0].split ('..')
        start = int (uu[0], 16)
        if len (uu) == 1:
            end = start
        else:
            end = int (uu[1], 16)

        # ArabicShaping.txt (2) and UnicodeData.txt (4) keep the wanted
        # property in their third field; all other files use the second.
        t = fields[1 if i not in [2, 4] else 2]

        if i == 2:
            # Prefix Joining_Type values so they cannot clash with other
            # property-value names.
            t = 'jt_' + t
        elif i == 3 and t != 'Default_Ignorable_Code_Point':
            # From DerivedCoreProperties.txt only Default_Ignorable matters.
            continue
        elif i == 7 and t == 'Consonant_Final_Modifier':
            # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
            t = 'Syllable_Modifier'
        elif i == 8 and t == 'NA':
            t = 'Not_Applicable'

        # The "Additional" files (7, 8) are folded into the buckets of the
        # UCD files they extend (0, 1), overriding duplicated code points.
        i0 = i if i < 7 else i - 7
        for u in range (start, end + 1):
            unicode_data[i0][u] = t
            values[i0][t] = values[i0].get (t, 0) + end - start + 1

# Default value per merged property, in bucket order: ISC, IPC,
# Joining_Type, Default_Ignorable (empty string == falsy), General_Category,
# Block, Script.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
for i,v in enumerate (defaults):
    values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (unicode_data):
    for u,v in d.items ():
        if not u in combined:
            # Only the first four properties may introduce a new code
            # point; the later ones just annotate points already present.
            if i >= 4:
                continue
            combined[u] = list (defaults)
        combined[u][i] = v
# Drop characters belonging to scripts USE does not handle (index 6 is Script).
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
96
97
# Every property value that the classifier below references by bare name.
# Order of sections: General_Category, Indic_Syllabic_Category,
# Indic_Positional_Category, Joining_Type (the latter 'jt_'-prefixed).
property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other', 'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
    'Pure_Killer', 'Invisible_Stacker', 'Vowel_Independent',
    'Vowel_Dependent', 'Vowel', 'Consonant_Placeholder', 'Consonant',
    'Consonant_Dead', 'Consonant_With_Stacker', 'Consonant_Prefixed',
    'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
    'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
    'Consonant_Head_Letter', 'Consonant_Initial_Postfixed',
    'Modifying_Letter', 'Tone_Letter', 'Tone_Mark', 'Gemination_Mark',
    'Cantillation_Mark', 'Register_Shifter', 'Syllable_Modifier',
    'Consonant_Killer', 'Non_Joiner', 'Joiner', 'Number_Joiner',
    'Number', 'Brahmi_Joining_Number', 'Symbol_Modifier', 'Hieroglyph',
    'Hieroglyph_Joiner', 'Hieroglyph_Segment_Begin',
    'Hieroglyph_Segment_End',
    # Indic_Positional_Category
    'Not_Applicable', 'Right', 'Left', 'Visual_Order_Left',
    'Left_And_Right', 'Top', 'Bottom', 'Top_And_Bottom',
    'Top_And_Bottom_And_Left', 'Top_And_Right', 'Top_And_Left',
    'Top_And_Left_And_Right', 'Bottom_And_Left', 'Bottom_And_Right',
    'Top_And_Bottom_And_Right', 'Overstruck',
    # Joining_Type
    'jt_C', 'jt_D', 'jt_L', 'jt_R', 'jt_T', 'jt_U', 'jt_X',
]
171
class PropertyValue(object):
    """An interned property value that compares and hashes like its name."""

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        # Accept either a plain string or another PropertyValue.
        if isinstance(other, str):
            return self.name == other
        return self.name == other.name

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Hash like the underlying string so a PropertyValue and its name
        # collide in sets and dicts.
        return hash(self.name)
183
# Build one interned PropertyValue per known name, then inject them all
# into the module namespace so later code can reference them bare, e.g.
# `Virama` (this is why the file carries `# flake8: noqa: F821`).
property_values = {}

for prop_name in property_names:
    prop_value = PropertyValue(prop_name)
    assert prop_value not in property_values  # property names are unique
    assert prop_value not in globals()        # must not shadow an existing name
    property_values[prop_name] = prop_value
globals().update(property_values)
192
193
# ---------------------------------------------------------------------------
# USE category predicates.  Each one takes:
#   U    - code point (int)
#   UISC - Indic_Syllabic_Category value
#   UDI  - truthy iff the character is Default_Ignorable_Code_Point
#   UGC  - General_Category value
#   AJT  - Joining_Type value ('jt_'-prefixed)
# The bare names used below (Number, Virama, jt_C, ...) are the
# PropertyValue objects injected into globals() above.  map_to_use()
# asserts that exactly one predicate matches each code point, so any edit
# here must keep the predicates mutually exclusive.
# ---------------------------------------------------------------------------

def is_BASE(U, UISC, UDI, UGC, AJT):
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
            Tone_Letter,
            Vowel_Independent,
            ] or
        # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
        AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
        (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    if UISC == Consonant_Placeholder: return True
    # Plus a fixed list of generic placeholder code points.
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
    # Also includes VARIATION_SELECTOR and ZWJ
    return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    return ((UISC == Consonant_Final and UGC != Lo) or
        UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return (UISC == Consonant_Medial and UGC != Lo or
        UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
    return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
    # Virama, except the single code point split into HVM below.
    return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return U == 0x0DCA
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Segment_Begin
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Segment_End
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return (UISC == Invisible_Stacker
        and not is_SAKOT(U, UISC, UDI, UGC, AJT)
        )
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
    # Also includes BASE_IND and SYM
    # Catch-all: only matches when none of the more specific predicates do.
    return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
        and not is_BASE(U, UISC, UDI, UGC, AJT)
        and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
        and not is_CGJ(U, UISC, UDI, UGC, AJT)
        and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
        and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
        )
def is_REPHA(U, UISC, UDI, UGC, AJT):
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    return (UISC == Pure_Killer or
        UGC != Lo and UISC in [Vowel, Vowel_Dependent])
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
        UGC != Lo and UISC == Bindu)
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
    # Also includes Rsv
    # Default-ignorables other than a fixed list of exceptions, plus all
    # unassigned (Cn) code points.
    return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
        and UISC == Other
        and not is_CGJ(U, UISC, UDI, UGC, AJT)
        ) or UGC == Cn
276
# USE category code -> classifying predicate.  The short codes are the
# ones emitted into the generated C table.
use_mapping = {
    'B': is_BASE,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'IS': is_INVISIBLE_STACKER,
    'G': is_HIEROGLYPH,
    'J': is_HIEROGLYPH_JOINER,
    'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
    'SE': is_HIEROGLYPH_SEGMENT_END,
    'ZWNJ': is_ZWNJ,
    'O': is_OTHER,
    'R': is_REPHA,
    'Sk': is_SAKOT,
    'SM': is_SYM_MOD,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
    'WJ': is_Word_Joiner,
}
305
# Positional sub-categories per USE category.  When a category maps to a
# dict, map_to_use() appends the matching suffix key to the category code
# (e.g. 'V' + 'Abv' -> 'VAbv'); the lists give the Indic_Positional_Category
# values that select each suffix.  A value of None means the category never
# takes a positional suffix.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'HVM': None,
    'IS': None,
    'B': None,
    'FM': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Not_Applicable],
    },
    'R': None,
    'SUB': None,
}
350
def map_to_use(data):
    """Classify every code point in *data* into its USE category, plus a
    positional suffix where the category takes one.

    *data* maps codepoint -> [UISC, UIPC, AJT, UDI, UGC, Block, Script];
    the result maps codepoint -> (USE category string, Block name).
    """
    result = {}
    category_tests = use_mapping.items()
    for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x1CE2 <= U <= 0x1CE8:
            UISC = Cantillation_Mark

        # Tibetan:
        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F:
            UISC = Vowel_Dependent

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED:
            UISC = Tone_Mark

        # Exactly one category predicate must accept the code point.
        matches = [cat for cat, test in category_tests if test(U, UISC, UDI, UGC, AJT)]
        assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
        # and https://github.com/harfbuzz/harfbuzz/issues/1631
        if U in [0x11302, 0x11303, 0x114C1]:
            UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
            USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            # Exactly one positional suffix must match UIPC.
            suffixes = [suf for suf, positions in pos_mapping.items() if positions and UIPC in positions]
            assert len(suffixes) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, suffixes)
            USE = USE + suffixes[0]

        result[U] = (USE, UBlock)
    return result
390
use_data = map_to_use(combined)

# Emit the C header preamble: a provenance comment (the generating command
# line and the headers of every input file), then the include guard and
# the #includes the table depends on.
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * %s IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt" % sys.argv[0])
print(" *")
print(" * on files with these headers:")
print(" *")
for header_lines in headers:
    for header_line in header_lines:
        print(" * " + header_line.strip())
print(" */")
print()
print("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
print("#define HB_OT_SHAPER_USE_TABLE_HH")
print()
print('#include "hb.hh"')
print()
print('#include "hb-ot-shaper-use-machine.hh"')
print()
413
# Running statistics updated by print_block().
total = 0            # code points covered by emitted rows
used = 0             # of those, how many had explicit USE data
last_block = None    # last Unicode block banner printed, to avoid repeats
def print_block (block, start, end, use_data):
    """Emit one run of table entries for codepoints [start, end], 16 per row.

    Prints a '/* block */' banner when the block name changes, pads the
    first row when start is not 16-aligned, and falls back to 'O' for
    points present in UnicodeData.txt and 'WJ' otherwise.
    NOTE(review): this helper does not appear to be called anywhere in
    this file (the packTab path below emits the table) -- presumably kept
    for reference; confirm before removing.
    """
    global total, used, last_block
    if block and block != last_block:
        print ()
        print ()
        print (" /* %s */" % block)
    if start % 16:
        # Pad so the first value lands in the right column: each row has a
        # 20-character prefix and each value occupies 6 characters.
        print (' ' * (20 + (start % 16 * 6)), end='')
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range (start, end+1):
        if u % 16 == 0:
            print ()
            print (" /* %04X */" % u, end='')
        if u in use_data:
            num += 1
        d = use_data.get (u)
        if d is not None:
            d = d[0]   # (USE, block) tuple -> USE category code
        elif u in unicode_data[4]:
            d = 'O'    # assigned in UnicodeData.txt but filtered out above
        else:
            d = 'WJ'   # everything else
        print ("%6s," % d, end='')

    total += end - start + 1
    used += num
    if block:
        last_block = block
447
uu = sorted (use_data.keys ())

# NOTE(review): `uu` and the five bookkeeping variables below are never
# read later in this file; they look like leftovers from an older table
# emitter -- confirm before removing.
last = -100000
num = 0
offset = 0
starts = []
ends = []
# Emit one '#define <tag> USE(<tag>)' per category, annotated with the
# predicate name minus its 'is_' prefix.  Categories that take positional
# suffixes are instead defined once per suffixed tag (e.g. VAbv).
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k,v in sorted(use_mapping.items()):
    if k in use_positions and use_positions[k]: continue
    print ("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
for k,v in sorted(use_positions.items()):
    if not v: continue
    for suf in v.keys():
        tag = k + suf
        print ("#define %s USE(%s)" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
467
468
import packTab
# USE category code per code point; the block name (v[1]) is not needed here.
data = {u: v[0] for u, v in use_data.items()}

# packTab compression levels: DEFAULT favors lookup speed, COMPACT favors
# size.  Both tables are emitted; the C side picks one via HB_OPTIMIZE_SIZE.
DEFAULT = 5
COMPACT = 9
for compression in (DEFAULT, COMPACT):

    # Lazy %-args: logging formats only if the record is actually emitted.
    logging.info(' Compression=%d:', compression)
    print()
    if compression == DEFAULT:
        print('#ifndef HB_OPTIMIZE_SIZE')
    elif compression == COMPACT:
        print('#else')
    else:
        assert False
    print()

    code = packTab.Code('hb_use')
    sol = packTab.pack_table(data, compression=compression, default='O')
    logging.info(' FullCost=%d', sol.fullCost)
    sol.genCode(code, 'get_category')  # was a placeholder-less f-string
    code.print_c(linkage='static inline')
    print ()

print('#endif')
494
# Mirror the #define section: #undef every macro that was emitted, then
# close the include guard opened in the header preamble.
print()
for category in sorted(use_mapping):
    takes_position = category in use_positions and use_positions[category]
    if not takes_position:
        print("#undef " + category)
for category, suffix_map in sorted(use_positions.items()):
    if suffix_map:
        for suffix in suffix_map:
            print("#undef " + category + suffix)
print()
print()
print("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
print("/* == End of generated table == */")