Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/harfbuzz/src/gen-ucd-table.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 """usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h] | |
| 4 | |
| 5 Input file: | |
| 6 * https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip | |
| 7 """ | |
| 8 | |
| 9 import sys, re | |
| 10 import logging | |
| 11 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO) | |
| 12 | |
# The script takes one mandatory argument (the UCD XML file) and one
# optional argument (the path to hb-common.h).  Any other argument count
# prints the module docstring as the usage message and exits.
if not 2 <= len(sys.argv) <= 3:
    sys.exit(__doc__)
| 15 | |
| 16 # https://github.com/harfbuzz/packtab | |
| 17 import packTab | |
| 18 import packTab.ucdxml | |
| 19 | |
# Parse the UCD XML file named by the first command-line argument and
# extract the repertoire; the tables below enumerate it, using the list
# index as the code point.
logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# hb-common.h is read from the current directory unless a second
# command-line argument supplies another path.
hb_common_h = sys.argv[2] if len(sys.argv) >= 3 else 'hb-common.h'

logging.info('Preparing data tables...')
| 27 | |
| 28 | |
# Encoding of the per-code-point properties:
#
#   * General_Category (gc), Canonical_Combining_Class (ccc) and
#     Script (sc) are encoded as integers.
#
#   * The mirroring character (bmg) is encoded as its difference from
#     the original character; 0 is used when there is no mirror.
#
#   * Composition & Decomposition (dm) use a more elaborate encoding,
#     discussed further below.

gc = [record['gc'] for record in ucd]
ccc = [int(record['ccc']) for record in ucd]
bmg = [int(mirror, 16) - cp if mirror else 0
       for cp, mirror in enumerate(record['bmg'] for record in ucd)]
sc = [record['sc'] for record in ucd]
| 44 | |
| 45 | |
# Prepare Compose / Decompose data
#
# This code is very dense.  See hb_ucd_compose() / hb_ucd_decompose() for the logic.

# dm maps a code point to its canonical decomposition, a tuple of code
# points.  Entries with no mapping ('#'), entries whose decomposition type
# is not 'can', and the 11172 code points starting at U+AC00 are left out.
# NOTE(review): the skipped range looks like the Hangul syllables, presumably
# handled algorithmically on the C side -- confirm.
dm = {cp: tuple(int(part, 16) for part in record['dm'].split())
      for cp, record in enumerate(ucd)
      if record['dm'] != '#' and record['dt'] == 'can'
      and not (0xAC00 <= cp < 0xAC00 + 11172)}
# Code points whose Comp_Ex property is 'Y'.
ce = {cp for cp, record in enumerate(ucd) if record['Comp_Ex'] == 'Y'}

# Every decomposition must have exactly one or two elements.  Check the
# length itself rather than the tuple's truthiness, so that an empty
# decomposition is caught here as well instead of slipping through.
assert all(len(v) in (1, 2) for v in dm.values())

# Singleton decompositions.  Only targets in planes 0 and 2 are expected;
# each plane gets its own array holding the low 16 bits of the target.
dm1 = sorted(set(v for v in dm.values() if len(v) == 1))
assert all((v[0] >> 16) in (0, 2) for v in dm1)
dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
# Index 0 is reserved for "no decomposition" (see dm_order below), so the
# singletons are numbered from 1.
dm1_order = {v: i + 1 for i, v in enumerate(dm1)}

# Pair decompositions.  Each item is ((first, second, composed), (first, second)),
# where `composed` is the decomposing code point itself, or 0 when that code
# point is in `ce` or has a non-zero ccc.
dm2 = sorted((pair + (cp if cp not in ce and not ccc[cp] else 0,), pair)
             for cp, pair in dm.items() if len(pair) == 2)


def _fits_u32(triple):
    """Return True if *triple* passes the bit masks for the 32-bit encoding.

    The first element must be below 0x800, the second must lie in
    0x0300..0x037F, and the third must have the bits in 0xFFF0C000 clear.
    """
    # NOTE(review): the third mask leaves bits 16-19 unchecked; confirm this
    # matches what HB_CODEPOINT_ENCODE3_11_7_14 can represent.
    return ((triple[0] & 0xFFFFF800) == 0x0000 and
            (triple[1] & 0xFFFFFF80) == 0x0300 and
            (triple[2] & 0xFFF0C000) == 0x0000)


dm2_u32_array = [v for v in dm2 if _fits_u32(v[0])]
dm2_u64_array = [v for v in dm2 if not _fits_u32(v[0])]
# dm2_order below numbers the pairs by their position in dm2, so the two
# arrays concatenated must reproduce dm2 exactly.
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u32_array]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u64_array]

# Pair indices continue after the reserved 0 and all singleton entries.
dm2_base = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {v[1]: i + dm2_base for i, v in enumerate(dm2)}

# Combined mapping from a decomposition tuple (or None) to its table index.
dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)
| 79 | |
| 80 | |
# Prepare General_Category / Script mapping arrays

# gc_order is a two-way lookup: position -> category code and
# category code -> position, following the order of the tuple below.
gc_order = {}
for index, category in enumerate((
        'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
        'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
        'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
)):
    gc_order.update({index: category, category: index})
| 89 | |
# Build the script tables from the HB_SCRIPT_* definitions in hb-common.h.
#
# sc_array collects the enumerator names in the order they appear in the
# header; sc_order maps each four-character tag to its index in sc_array
# and each index back to its tag.
sc_order = dict()
sc_array = []
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# Open the header in a `with` block so the file handle is closed as soon as
# parsing finishes rather than being left to the garbage collector.
with open(hb_common_h) as header:
    for line in header:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        # Groups 2..5 are the four characters of the HB_TAG.
        tag = ''.join(m.group(i) for i in range(2, 6))
        i = len(sc_array)
        sc_order[tag] = i
        sc_order[i] = tag
        sc_array.append(name)
| 102 | |
| 103 | |
| 104 # Write out main data | |
| 105 | |
# Names of the three table variants that are generated.
DEFAULT, COMPACT, SLOPPY = 'DEFAULT', 'COMPACT', 'SLOPPY'

# Compression setting used for each variant.
compression_level = dict((
    (DEFAULT, 5),
    (COMPACT, 9),
    (SLOPPY, 9),
))
| 115 | |
logging.info('Generating output...')

# Banner comment recording how the table was produced.
for banner_line in (
        "/* == Start of generated table == */",
        "/*",
        " * The following table is generated by running:",
        " *",
        " * ./gen-ucd-table.py ucd.nounihan.grouped.xml",
        " *",
):
    print(banner_line)
print(" * on file with this description:", ucdxml.description)

# End of the banner, then the include guard and the hb.hh include.
for header_line in (
        " */",
        "",
        "#ifndef HB_UCD_TABLE_HH",
        "#define HB_UCD_TABLE_HH",
        "",
        '#include "hb.hh"',
        "",
):
    print(header_line)
| 131 | |
| 132 | |
# Write mapping data

# All lookup arrays are registered on a single Code object and printed in
# one go.  Each addArray() call returns a pair; its first item is rebound
# to the array's variable and the second item is discarded.
code = packTab.Code('_hb_ucd')
(sc_array, _) = code.addArray('hb_script_t', 'sc_map', sc_array)
(dm1_p0_array, _) = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
(dm1_p2_array, _) = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
(dm2_u32_array, _) = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
(dm2_u64_array, _) = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
code.print_c(linkage='static inline')

# One row per packed table:
#   (table name, per-code-point data, default value, value mapping or None)
datasets = [
    ('gc',  gc,  'Cn',   gc_order),
    ('ccc', ccc, 0,      None),
    ('bmg', bmg, 0,      None),
    ('sc',  sc,  'Zzzz', sc_order),
    ('dm',  dm,  None,   dm_order),
]
| 150 | |
| 151 | |
# Write main data

def _spread_over_default(values, default):
    """Replace *default* entries of *values* with a neighbour's value, in place.

    A forward pass copies the preceding element into every slot equal to
    *default*; a backward pass then copies the following element into any
    slot that still equals *default*.  Neither pass copies across an index
    that is a multiple of 128.
    """
    for pos in range(len(values)):
        if pos % 128 and values[pos] == default:
            values[pos] = values[pos - 1]
    for pos in reversed(range(len(values) - 1)):
        if (pos + 1) % 128 and values[pos] == default:
            values[pos] = values[pos + 1]


# Preprocessor line that opens each variant's section of the output.
section_guard = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#elif !defined(HB_NO_UCD_UNASSIGNED)',
    SLOPPY: '#else',
}

for step in (DEFAULT, COMPACT, SLOPPY):
    compression = compression_level[step]
    logging.info(' Compression=%d:', compression)
    print()
    print(section_guard[step])
    print()

    if step == SLOPPY:
        # gc and sc must be modified in place: `datasets` holds references
        # to these very lists, so rebinding the names would have no effect.
        _spread_over_default(gc, 'Cn')
        _spread_over_default(sc, 'Zzzz')

    code = packTab.Code('_hb_ucd')

    for name, data, default, mapping in datasets:
        sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
        logging.info(' Dataset=%-8s FullCost=%d', name, sol.fullCost)
        sol.genCode(code, name)

    code.print_c(linkage='static inline')

    print()


# Close the #ifndef / #elif / #else chain opened inside the loop.
print('#endif')
print()

print()
print("#endif /* HB_UCD_TABLE_HH */")
print()
print("/* == End of generated table == */")
logging.info('Done.')
