Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/harfbuzz/src/gen-indic-table.py @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 """usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt | |
| 4 | |
| 5 Input files: | |
| 6 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt | |
| 7 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt | |
| 8 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt | |
| 9 """ | |
| 10 | |
| 11 import sys | |
| 12 | |
# Exactly three input files must be given on the command line; otherwise
# print the module docstring (the usage text) and stop.
if len(sys.argv) != 4:
    sys.exit(__doc__)
| 15 | |
# Individual code points that are kept in the merged data regardless of
# which block they belong to (NO-BREAK SPACE and DOTTED CIRCLE).
ALLOWED_SINGLES = [0x00A0, 0x25CC]

# Names of the Unicode blocks (as spelled in Blocks.txt) whose code points
# are kept in the merged data; everything else is filtered out.
ALLOWED_BLOCKS = [
    'Basic Latin',
    'Latin-1 Supplement',
    'Devanagari',
    'Bengali',
    'Gurmukhi',
    'Gujarati',
    'Oriya',
    'Tamil',
    'Telugu',
    'Kannada',
    'Malayalam',
    'Myanmar',
    'Khmer',
    'Vedic Extensions',
    'General Punctuation',
    'Superscripts and Subscripts',
    'Devanagari Extended',
    'Myanmar Extended-B',
    'Myanmar Extended-A',
]
| 38 | |
# Load the three UCD input files named on the command line.
#
# headers[i]      -- the first two lines of file i (echoed into the generated
#                    output so the data version is recorded).
# unicode_data[i] -- dict mapping each code point listed in file i to the
#                    value of the second ';'-separated field on its line.
#
# Each file is read inside a `with` block so its handle is closed as soon
# as it has been parsed.
headers = []
unicode_data = []
for path in sys.argv[1:]:
    data = {}
    with open(path, encoding='utf-8') as f:
        headers.append([f.readline() for _ in range(2)])

        for line in f:
            # Drop a trailing '#' comment, if any.
            line = line.partition('#')[0]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                # Blank or comment-only line: no ';' separator present.
                continue

            # First field is either a single code point or "START..END" (hex).
            bounds = fields[0].split('..')
            start = int(bounds[0], 16)
            end = start if len(bounds) == 1 else int(bounds[1], 16)

            value = fields[1]
            for u in range(start, end + 1):
                data[u] = value
    unicode_data.append(data)
| 66 | |
# Merge data into one dict:
#   combined[u] == [syllabic category, positional category, block name]
# A code point enters the dict only if one of the first two files mentions
# it; the third file (index 2, the block names) merely annotates entries
# that already exist.
defaults = ('Other', 'Not_Applicable', 'No_Block')
combined = {}
for i, d in enumerate(unicode_data):
    for u, v in d.items():
        if u not in combined:
            if i == 2:
                continue
            combined[u] = list(defaults)
        combined[u][i] = v

# Keep only the explicitly allowed code points and blocks.
combined = {k: v for k, v in combined.items()
            if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
| 78 | |
| 79 | |
| 80 # Convert categories & positions types | |
| 81 | |
# Category names per shaper.  The output stage emits one
# '#include "hb-ot-shaper-<shaper>-machine.hh"' per key and, for every
# category, either a '#define OT_<cat> <S>_Cat(<cat>)' (first occurrence)
# or a static_assert that a later shaper agrees with that definition,
# where <S> is the upper-cased first letter of the shaper name.
categories = {
    'indic': [
        'X',
        'C',
        'V',
        'N',
        'H',
        'ZWNJ',
        'ZWJ',
        'M',
        'SM',
        'A',
        'VD',
        'PLACEHOLDER',
        'DOTTEDCIRCLE',
        'RS',
        'MPst',
        'Repha',
        'Ra',
        'CM',
        'Symbol',
        'CS',
    ],
    'khmer': [
        'VAbv',
        'VBlw',
        'VPre',
        'VPst',

        'Robatic',
        'Xgroup',
        'Ygroup',
    ],
    'myanmar': [
        'VAbv',
        'VBlw',
        'VPre',
        'VPst',

        'IV',
        'As',
        'DB',
        'GB',
        'MH',
        'MR',
        'MW',
        'MY',
        'PT',
        'VS',
        'ML',
    ],
}
| 134 | |
# Maps the category value read from the first input file
# (IndicSyllabicCategory.txt) to the shaper category name used in the
# generated table.
category_map = {
    'Other': 'X',
    'Avagraha': 'Symbol',
    'Bindu': 'SM',
    'Brahmi_Joining_Number': 'PLACEHOLDER',  # Don't care.
    'Cantillation_Mark': 'A',
    'Consonant': 'C',
    'Consonant_Dead': 'C',
    'Consonant_Final': 'CM',
    'Consonant_Head_Letter': 'C',
    'Consonant_Initial_Postfixed': 'C',  # TODO
    'Consonant_Killer': 'M',  # U+17CD only.
    'Consonant_Medial': 'CM',
    'Consonant_Placeholder': 'PLACEHOLDER',
    'Consonant_Preceding_Repha': 'Repha',
    'Consonant_Prefixed': 'X',  # Don't care.
    'Consonant_Subjoined': 'CM',
    'Consonant_Succeeding_Repha': 'CM',
    'Consonant_With_Stacker': 'CS',
    'Gemination_Mark': 'SM',  # https://github.com/harfbuzz/harfbuzz/issues/552
    'Invisible_Stacker': 'H',
    'Joiner': 'ZWJ',
    'Modifying_Letter': 'X',
    'Non_Joiner': 'ZWNJ',
    'Nukta': 'N',
    'Number': 'PLACEHOLDER',
    'Number_Joiner': 'PLACEHOLDER',  # Don't care.
    'Pure_Killer': 'M',  # Is like a vowel matra.
    'Register_Shifter': 'RS',
    'Syllable_Modifier': 'SM',
    'Tone_Letter': 'X',
    'Tone_Mark': 'N',
    'Virama': 'H',
    'Visarga': 'SM',
    'Vowel': 'V',
    'Vowel_Dependent': 'M',
    'Vowel_Independent': 'V',
}
# Maps the value read from the second input file
# (IndicPositionalCategory.txt) to the position name used in the
# generated table.
position_map = {
    'Not_Applicable': 'END',

    'Left': 'PRE_C',
    'Top': 'ABOVE_C',
    'Bottom': 'BELOW_C',
    'Right': 'POST_C',

    # These should resolve to the position of the last part of the split sequence.
    'Bottom_And_Right': 'POST_C',
    'Left_And_Right': 'POST_C',
    'Top_And_Bottom': 'BELOW_C',
    'Top_And_Bottom_And_Left': 'BELOW_C',
    'Top_And_Bottom_And_Right': 'POST_C',
    'Top_And_Left': 'ABOVE_C',
    'Top_And_Left_And_Right': 'POST_C',
    'Top_And_Right': 'POST_C',

    'Overstruck': 'AFTER_MAIN',
    'Visual_order_left': 'PRE_M',
}
| 194 | |
# Per-code-point category overrides.  After the input data has been mapped
# through category_map, each entry here replaces the category of its code
# point (creating the entry from the defaults if the code point was absent).
category_overrides = {

    # These are the variation-selectors. They only appear in the Myanmar grammar
    # but are not Myanmar-specific
    **{u: 'VS' for u in range(0xFE00, 0xFE0F + 1)},

    # These appear in the OT Myanmar spec, but are not Myanmar-specific
    0x2015: 'PLACEHOLDER',
    0x2022: 'PLACEHOLDER',
    0x25FB: 'PLACEHOLDER',
    0x25FC: 'PLACEHOLDER',
    0x25FD: 'PLACEHOLDER',
    0x25FE: 'PLACEHOLDER',


    # Indic

    0x0930: 'Ra',  # Devanagari
    0x09B0: 'Ra',  # Bengali
    0x09F0: 'Ra',  # Bengali
    0x0A30: 'Ra',  # Gurmukhi No Reph
    0x0AB0: 'Ra',  # Gujarati
    0x0B30: 'Ra',  # Oriya
    0x0BB0: 'Ra',  # Tamil No Reph
    0x0C30: 'Ra',  # Telugu Reph formed only with ZWJ
    0x0CB0: 'Ra',  # Kannada
    0x0D30: 'Ra',  # Malayalam No Reph, Logical Repha

    # The following act more like the Bindus.
    0x0953: 'SM',
    0x0954: 'SM',

    # U+0A40 GURMUKHI VOWEL SIGN II may be preceded by U+0A02 GURMUKHI SIGN BINDI.
    0x0A40: 'MPst',

    # The following act like consonants.
    0x0A72: 'C',
    0x0A73: 'C',
    0x1CF5: 'C',
    0x1CF6: 'C',

    # TODO: The following should only be allowed after a Visarga.
    # For now, just treat them like regular tone marks.
    0x1CE2: 'A',
    0x1CE3: 'A',
    0x1CE4: 'A',
    0x1CE5: 'A',
    0x1CE6: 'A',
    0x1CE7: 'A',
    0x1CE8: 'A',

    # TODO: The following should only be allowed after some of
    # the nasalization marks, maybe only for U+1CE9..U+1CF1.
    # For now, just treat them like tone marks.
    0x1CED: 'A',

    # The following take marks in standalone clusters, similar to Avagraha.
    0xA8F2: 'Symbol',
    0xA8F3: 'Symbol',
    0xA8F4: 'Symbol',
    0xA8F5: 'Symbol',
    0xA8F6: 'Symbol',
    0xA8F7: 'Symbol',
    0x1CE9: 'Symbol',
    0x1CEA: 'Symbol',
    0x1CEB: 'Symbol',
    0x1CEC: 'Symbol',
    0x1CEE: 'Symbol',
    0x1CEF: 'Symbol',
    0x1CF0: 'Symbol',
    0x1CF1: 'Symbol',

    0x0A51: 'M',  # https://github.com/harfbuzz/harfbuzz/issues/524

    # According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil,
    # so the Indic shaper needs to know their categories.
    0x11301: 'SM',
    0x11302: 'SM',
    0x11303: 'SM',
    0x1133B: 'N',
    0x1133C: 'N',

    0x0AFB: 'N',  # https://github.com/harfbuzz/harfbuzz/issues/552
    0x0B55: 'N',  # https://github.com/harfbuzz/harfbuzz/issues/2849

    0x09FC: 'PLACEHOLDER',  # https://github.com/harfbuzz/harfbuzz/pull/1613
    0x0C80: 'PLACEHOLDER',  # https://github.com/harfbuzz/harfbuzz/pull/623
    0x0D04: 'PLACEHOLDER',  # https://github.com/harfbuzz/harfbuzz/pull/3511

    0x25CC: 'DOTTEDCIRCLE',


    # Khmer

    0x179A: 'Ra',

    0x17CC: 'Robatic',
    0x17C9: 'Robatic',
    0x17CA: 'Robatic',

    0x17C6: 'Xgroup',
    0x17CB: 'Xgroup',
    0x17CD: 'Xgroup',
    0x17CE: 'Xgroup',
    0x17CF: 'Xgroup',
    0x17D0: 'Xgroup',
    0x17D1: 'Xgroup',

    0x17C7: 'Ygroup',
    0x17C8: 'Ygroup',
    0x17DD: 'Ygroup',
    0x17D3: 'Ygroup',  # Just guessing. Uniscribe doesn't categorize it.

    0x17D9: 'PLACEHOLDER',  # https://github.com/harfbuzz/harfbuzz/issues/2384


    # Myanmar

    # https://docs.microsoft.com/en-us/typography/script-development/myanmar#analyze

    0x104E: 'C',  # The spec says C, IndicSyllableCategory says Consonant_Placeholder

    0x1004: 'Ra',
    0x101B: 'Ra',
    0x105A: 'Ra',

    0x1032: 'A',
    0x1036: 'A',

    0x103A: 'As',

    #0x1040: 'D0', # XXX The spec says D0, but Uniscribe doesn't seem to do.

    0x103E: 'MH',
    0x1060: 'ML',
    0x103C: 'MR',
    0x103D: 'MW',
    0x1082: 'MW',
    0x103B: 'MY',
    0x105E: 'MY',
    0x105F: 'MY',

    0x1063: 'PT',
    0x1064: 'PT',
    0x1069: 'PT',
    0x106A: 'PT',
    0x106B: 'PT',
    0x106C: 'PT',
    0x106D: 'PT',
    0xAA7B: 'PT',

    0x1038: 'SM',
    0x1087: 'SM',
    0x1088: 'SM',
    0x1089: 'SM',
    0x108A: 'SM',
    0x108B: 'SM',
    0x108C: 'SM',
    0x108D: 'SM',
    0x108F: 'SM',
    0x109A: 'SM',
    0x109B: 'SM',
    0x109C: 'SM',

    0x104A: 'PLACEHOLDER',
}
# Per-code-point position overrides, applied last (after all derived
# positions have been computed), so they always win.
position_overrides = {

    0x0A51: 'BELOW_C',  # https://github.com/harfbuzz/harfbuzz/issues/524

    0x0B01: 'BEFORE_SUB',  # Oriya Bindu is BeforeSub in the spec.
}
| 382 | |
def matra_pos_left(u, block):
    """Return the position name for a left-side (pre-base) matra.

    Both arguments are ignored; the signature parallels the other
    ``matra_pos_*`` helpers.  The result is always ``'PRE_M'``.
    """
    return "PRE_M"
def matra_pos_right(u, block):
    """Return the position name for a right-side matra.

    u     -- code point of the matra; only consulted for Telugu and Kannada.
    block -- Unicode block name of the code point.

    Blocks not listed explicitly (and Devanagari) get ``'AFTER_SUB'``.
    """
    if block == 'Telugu':
        return 'BEFORE_SUB' if u <= 0x0C42 else 'AFTER_SUB'
    if block == 'Kannada':
        # Only code points inside U+0CC3..U+0CD6 go after the subjoined forms.
        return 'AFTER_SUB' if 0x0CC3 <= u <= 0x0CD6 else 'BEFORE_SUB'
    if block in ('Bengali', 'Gurmukhi', 'Gujarati', 'Oriya', 'Tamil', 'Malayalam'):
        return 'AFTER_POST'
    return 'AFTER_SUB'
def matra_pos_top(u, block):
    """Return the position name for a matra placed above the base.

    u is unused; block is the Unicode block name.  Blocks without an
    explicit rule fall back to ``'AFTER_SUB'``.
    """
    # BENG and MLYM don't have top matras.
    if block == 'Gurmukhi':
        return 'AFTER_POST'  # Deviate from spec
    if block == 'Oriya':
        return 'AFTER_MAIN'
    if block in ('Telugu', 'Kannada'):
        return 'BEFORE_SUB'
    # Devanagari, Gujarati, Tamil and every other block.
    return 'AFTER_SUB'
def matra_pos_bottom(u, block):
    """Return the position name for a matra placed below the base.

    u is unused; block is the Unicode block name.  Blocks without an
    explicit rule fall back to ``'AFTER_SUB'``.
    """
    if block in ('Gurmukhi', 'Gujarati', 'Tamil', 'Malayalam'):
        return 'AFTER_POST'
    if block in ('Telugu', 'Kannada'):
        return 'BEFORE_SUB'
    # Devanagari, Bengali, Oriya and every other block.
    return "AFTER_SUB"
def indic_matra_position(u, pos, block):  # Reposition matra
    """Translate a generic matra position into a block-specific one.

    u     -- code point of the matra.
    pos   -- one of 'PRE_C', 'POST_C', 'ABOVE_C', 'BELOW_C'.
    block -- Unicode block name of the code point.

    Dispatches to the matching ``matra_pos_*`` helper and returns its
    result.  Raises AssertionError for any other value of ``pos``; the
    raise is explicit so the check still fires when Python runs with -O.
    """
    if pos == 'PRE_C':
        return matra_pos_left(u, block)
    if pos == 'POST_C':
        return matra_pos_right(u, block)
    if pos == 'ABOVE_C':
        return matra_pos_top(u, block)
    if pos == 'BELOW_C':
        return matra_pos_bottom(u, block)
    raise AssertionError("unexpected matra position %r for U+%04X" % (pos, u))
| 423 | |
def position_to_category(pos):
    """Map a generic matra position to the matching vowel category name.

    'PRE_C' -> 'VPre', 'ABOVE_C' -> 'VAbv', 'BELOW_C' -> 'VBlw',
    'POST_C' -> 'VPst'.  Raises AssertionError for any other value; the
    raise is explicit so the check still fires when Python runs with -O.
    """
    if pos == 'PRE_C':
        return 'VPre'
    if pos == 'ABOVE_C':
        return 'VAbv'
    if pos == 'BELOW_C':
        return 'VBlw'
    if pos == 'POST_C':
        return 'VPst'
    raise AssertionError("unexpected matra position %r" % (pos,))
| 430 | |
| 431 | |
# From here on `defaults` holds the *mapped* default triple
# (category, position, block) instead of the raw property values.
defaults = (category_map[defaults[0]], position_map[defaults[1]], defaults[2])

# indic_data[u] == (category, position, block) with the raw property values
# translated through category_map / position_map.
indic_data = {
    k: (category_map[cat], position_map[pos], block)
    for k, (cat, pos, block) in combined.items()
}

# Category overrides: replace the category, keep the position, and take the
# block name straight from the Blocks data (index 2).
for k, new_cat in category_overrides.items():
    _, pos, _ = indic_data.get(k, defaults)
    indic_data[k] = (new_cat, pos, unicode_data[2][k])

# We only expect position for certain types
positioned_categories = ('CM', 'SM', 'RS', 'H', 'M', 'MPst')
for k, (cat, pos, block) in indic_data.items():
    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated.
    if cat not in positioned_categories:
        indic_data[k] = (cat, 'END', block)

# Position overrides are more complicated

# Keep in sync with CONSONANT_FLAGS in the shaper
consonant_categories = ('C', 'CS', 'Ra', 'CM', 'V', 'PLACEHOLDER', 'DOTTEDCIRCLE')
matra_categories = ('M', 'MPst')
smvd_categories = ('SM', 'VD', 'A', 'Symbol')
for k, (cat, pos, block) in indic_data.items():
    if cat in consonant_categories:
        pos = 'BASE_C'
    elif cat in matra_categories:
        if block.startswith(('Khmer', 'Myanmar')):
            # Khmer/Myanmar matras: the position becomes the category.
            cat = position_to_category(pos)
        else:
            pos = indic_matra_position(k, pos, block)
    elif cat in smvd_categories:
        pos = 'SMVD'
    indic_data[k] = (cat, pos, block)

# Explicit position overrides are applied last so they always win.
for k, new_pos in position_overrides.items():
    cat, _, _ = indic_data.get(k, defaults)
    indic_data[k] = (cat, new_pos, unicode_data[2][k])
| 472 | |
| 473 | |
# values[i] counts how many entries use each distinct value of field i
# (0: category, 1: position, 2: block).  Every counter is seeded with the
# default value so the defaults always get an alias in the output.
values = [{default: 1} for default in defaults]
for vv in indic_data.values():
    for i, v in enumerate(vv):
        values[i][v] = values[i].get(v, 0) + 1


# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
# (they are emitted as special cases rather than as table entries).
singles = {}
for u in ALLOWED_SINGLES:
    singles[u] = indic_data.pop(u)
| 487 | |
# Emit the banner comment of the generated C source, including the first
# two lines of every input file so the data version is recorded.
# NOTE(review): spacing inside the emitted strings is reproduced exactly as
# found; runs of spaces may have been collapsed when this listing was
# captured -- confirm against upstream before regenerating.
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
print(" *")
print(" * on files with these headers:")
print(" *")
for h in headers:
    for l in h:
        print(" * %s" % (l.strip()))
print(" */")
print()
print('#include "hb.hh"')
print()
print('#ifndef HB_NO_OT_SHAPE')
print()
print('#include "hb-ot-shaper-indic.hh"')
print()
print('#pragma GCC diagnostic push')
print('#pragma GCC diagnostic ignored "-Wunused-macros"')
print()

# Print categories
for shaper in categories:
    print('#include "hb-ot-shaper-%s-machine.hh"' % shaper)
print()
# Categories already #define'd; a category shared by several shapers is
# defined once and then only checked with a static_assert.
done = set()
for shaper, shaper_cats in categories.items():
    print('/* %s */' % shaper)
    v = shaper[0].upper()
    for cat in shaper_cats:
        if cat not in done:
            print("#define OT_%s %s_Cat(%s)" % (cat, v, cat))
            done.add(cat)
        else:
            print('static_assert (OT_%s == %s_Cat(%s), "");' % (cat, v, cat))
    print()
| 526 | |
| 527 # Shorten values | |
# Shorten values
# short[0] maps category names, short[1] position names, to the short
# aliases used inside the generated table.  Names not listed here get an
# alias made of their upper-case letters (computed below).
short = [{
    "Repha": 'Rf',
    "PLACEHOLDER": 'GB',
    "DOTTEDCIRCLE": 'DC',
    "VPst": 'VR',
    "VPre": 'VL',
    "Robatic": 'Rt',
    "Xgroup": 'Xg',
    "Ygroup": 'Yg',
    "As": 'As',
}, {
    "END": 'X',
    "BASE_C": 'C',
    "ABOVE_C": 'T',
    "BELOW_C": 'B',
    "POST_C": 'R',
    "PRE_C": 'L',
    "PRE_M": 'LM',
    "AFTER_MAIN": 'A',
    "AFTER_SUB": 'AS',
    "BEFORE_SUB": 'BS',
    "AFTER_POST": 'AP',
    "SMVD": 'SM',
}]
# Reverse maps (alias -> full name), used to detect alias collisions.
all_shorts = [{}, {}]

# Add some of the values, to make them more readable, and to avoid duplicates

for i in range(2):
    for v, s in short[i].items():
        all_shorts[i][s] = v

what = ["OT", "POS"]
what_short = ["_OT", "_POS"]
# One tuple per value that occurs in the data:
# (alias macro name, full macro name, occurrence count as text, original value)
cat_defs = []
for i in range(2):
    for v in sorted(values[i]):
        if v in short[i]:
            s = short[i][v]
        else:
            # Derive an alias from the upper-case ASCII letters of the name.
            v_no_and = v.replace('_And_', '_')
            s = ''.join(c for c in v_no_and if 'A' <= c <= 'Z')
            if s in all_shorts[i]:
                raise Exception("Duplicate short value alias", v, all_shorts[i][s])
            all_shorts[i][s] = v
            short[i][v] = s
        cat_defs.append((what_short[i] + '_' + s,
                         what[i] + '_' + (v.upper() if i else v),
                         str(values[i][v]),
                         v))

# Column widths so the emitted #define lines align.
maxlen_s = max(len(c[0]) for c in cat_defs)
maxlen_l = max(len(c[1]) for c in cat_defs)
maxlen_n = max(len(c[2]) for c in cat_defs)
for s in what_short:
    print()
    for c in cat_defs:
        if s not in c[0]:
            continue
        print("#define %s %s /* %s chars; %s */" %
              (c[0].ljust(maxlen_s), c[1].ljust(maxlen_l), c[2].rjust(maxlen_n), c[3]))
print()
print('#pragma GCC diagnostic pop')
print()
print("#define INDIC_COMBINE_CATEGORIES(S,M) ((S) | ((M) << 8))")
print()
print("#define _(S,M) INDIC_COMBINE_CATEGORIES (%s_##S, %s_##M)" % tuple(what_short))
print()
print()
| 592 print () | |
| 593 | |
# Running totals updated by print_block():
total = 0          # number of table slots emitted so far
used = 0           # how many of those slots hold real (non-default) data
last_block = None  # name of the last block for which a heading was printed
def print_block(block, start, end, data):
    """Print the table entries for code points start..end (inclusive).

    block -- block name for the heading comment, or None/empty for filler
             rows; a heading is printed only when the name differs from
             the previously printed one.
    start -- first code point; must be a multiple of 8.
    end   -- last code point; end + 1 must be a multiple of 8.
    data  -- dict mapping code point to (category, position, block);
             missing code points are printed using the module-level
             `defaults`.

    Updates the module-level counters `total`, `used` and `last_block`.
    Raises AssertionError if the range is not aligned to rows of 8; the
    raises are explicit so the checks still fire when Python runs with -O.
    """
    global total, used, last_block
    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)
    if start % 8 != 0:
        raise AssertionError("start 0x%04X is not a multiple of 8" % start)
    if (end + 1) % 8 != 0:
        raise AssertionError("end 0x%04X does not finish a row of 8" % end)
    num = 0
    for u in range(start, end + 1):
        if u % 8 == 0:
            # Start a new row, labelled with its first code point.
            print()
            print(" /* %04X */" % u, end="")
        if u in data:
            num += 1
        d = data.get(u, defaults)
        print("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")

    total += end - start + 1
    used += num
    if block:
        last_block = block
| 619 | |
# Emit the packed lookup table followed by the C accessor function.
# NOTE(review): spacing inside the emitted strings is reproduced exactly as
# found; runs of spaces may have been collapsed when this listing was
# captured -- confirm against upstream before regenerating.
uu = sorted(indic_data)

last = -100000
num = 0
offset = 0
starts = []
ends = []
print("static const uint16_t indic_table[] = {")
for u in uu:
    if u <= last:
        # Already covered by the range printed for an earlier code point.
        continue
    block = indic_data[u][2]

    # Extend the run while consecutive code points exist in the same block,
    # then round the range outwards to whole rows of 8.  Membership is
    # tested on the dict (same keys as `uu`) to avoid a linear list scan.
    start = u//8*8
    end = start+1
    while end in indic_data and block == indic_data[end][2]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*2:
            # Small gap: pad with default entries instead of opening a new range.
            print_block(None, last+1, start-1, indic_data)
        else:
            # Large gap: close the previous range (if any) and open a new one.
            if last >= 0:
                ends.append(last + 1)
                offset += ends[-1] - starts[-1]
            print()
            print()
            print("#define indic_offset_0x%04xu %d" % (start, offset))
            starts.append(start)

    print_block(block, start, end, indic_data)
    last = end
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print()
print("uint16_t")
print("hb_indic_get_categories (hb_codepoint_t u)")
print("{")
print(" switch (u >> %d)" % page_bits)
print(" {")
# One `case` per 2**page_bits-sized page touched by a range or a single.
pages = {u >> page_bits for u in starts + ends + list(singles.keys())}
for p in sorted(pages):
    print(" case 0x%0Xu:" % p)
    for u, d in singles.items():
        if p != u >> page_bits:
            continue
        print(" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
    for (start, end) in zip(starts, ends):
        if p not in [start >> page_bits, end >> page_bits]:
            continue
        offset = "indic_offset_0x%04xu" % start
        print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    print(" break;")
    print("")
print(" default:")
print(" break;")
print(" }")
print(" return _(X,X);")
print("}")
print()
print("#undef _")
print("#undef INDIC_COMBINE_CATEGORIES")
for i in range(2):
    print()
    for v in sorted(values[i]):
        print("#undef %s_%s" %
              (what_short[i], short[i][v]))
print()
print('#endif')
print()
print("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
    raise Exception("Table too sparse, please investigate: ", occupancy)
