Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/harfbuzz/src/gen-vowel-constraints.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 """Generator of the function to prohibit certain vowel sequences. | |
| 4 | |
| 5 It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted | |
| 6 circles into sequences prohibited by the USE script development spec. | |
| 7 This function should be used as the ``preprocess_text`` of an | |
| 8 ``hb_ot_shaper_t``. | |
| 9 | |
| 10 usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt | |
| 11 | |
| 12 Input file: | |
| 13 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt | |
| 14 """ | |
| 15 | |
| 16 import collections | |
def write (s):
    """Emit the text ``s`` on standard output as UTF-8 encoded bytes.

    The text layer of ``sys.stdout`` is flushed before the encoded bytes
    are handed to its underlying binary buffer, so text that was written
    earlier is pushed out before these bytes.

    Args:
        s (str): The text to write.
    """
    stream = sys.stdout
    stream.flush ()
    stream.buffer.write (s.encode ('utf-8'))
| 20 import sys | |
| 21 | |
# Both input paths are mandatory; otherwise exit with the module docstring
# as the usage message.
if len (sys.argv) != 3:
    sys.exit (__doc__)

# Read the file named by the second argument: keep its first two lines as a
# header, map every listed code point to its script name, and remember the
# start of the first range seen for each script.
with open (sys.argv[2], encoding='utf-8') as f:
    scripts_header = [f.readline (), f.readline ()]
    scripts = {}
    script_order = {}
    for line in f:
        # Everything from the first '#' onwards is a comment.
        data = line.split ('#', 1)[0]
        fields = [field.strip () for field in data.split (';')]
        if len (fields) == 1:
            # No ';' on the line, so there is no property to record.
            continue
        # The first field is either one code point or a 'start..end' range.
        bounds = fields[0].split ('..')
        start = int (bounds[0], 16)
        end = start if len (bounds) == 1 else int (bounds[1], 16)
        script = fields[1]
        scripts.update (dict.fromkeys (range (start, end + 1), script))
        # Only the first occurrence of a script fixes its order key.
        script_order.setdefault (script, start)
| 47 | |
class ConstraintSet (object):
    """A set of prohibited code point sequences.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        A sequence that is a prefix of a stored sequence replaces it, and
        a sequence that merely extends a stored sequence is ignored, so
        only the shortest prohibited prefixes are kept. This holds
        regardless of the order in which sequences are added.

        Args:
            constraint (List[int]): A prohibited code point sequence.
                An empty list is ignored.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                # The new sequence is a prefix of the stored one.
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                # Neither is a prefix of the other: branch on the first
                # code point and fall through to the dictionary case.
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
        if isinstance (self._c, dict):
            if rest and first in self._c:
                self._c[first].add (rest)
            else:
                # Either a new branch, or the new sequence ends exactly at
                # an existing branch.  In the latter case it is a prefix
                # of everything stored below `first`, so it replaces that
                # whole sub-set (recursing with an empty remainder would
                # silently drop it).
                self._c[first] = ConstraintSet (rest)

    @staticmethod
    def _indent (depth):
        """Return the leading whitespace for indentation level ``depth``."""
        # NOTE(review): both space literals are reproduced exactly as they
        # appear in the listing this was taken from, which seems to have
        # collapsed runs of whitespace -- confirm their widths against the
        # upstream script.
        return (' ' * depth).replace (' ', '\t')

    def __str__ (self, index=0, depth=4):
        """Return the C statements that set ``matched`` for this set.

        Args:
            index (int): Offset, relative to the buffer's current
                position, of the code point this set starts at; ``0`` is
                rendered as an empty argument to ``buffer->cur ()``.
            depth (int): Indentation level of the emitted statements.

        Raises:
            AssertionError: If a zero- or one-element list is reached at
                an offset other than 2 or 1 respectively.
        """
        s = []
        indent = self._indent (depth)
        if isinstance (self._c, list):
            if len (self._c) == 0:
                assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append ('{}matched = true;\n'.format (indent))
            elif len (self._c) == 1:
                assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, next (iter (self._c)), index or ''))
            else:
                # Two or more code points: one `if` that compares each of
                # them, then consumes `index` glyphs before flagging.
                s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
                if index:
                    s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
                for i, cp in enumerate (self._c[1:], start=1):
                    s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                        self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
                s.append ('{}{{\n'.format (indent))
                for i in range (index):
                    s.append ('{}(void) buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
                s.append ('{}matched = true;\n'.format (self._indent (depth + 1)))
                s.append ('{}}}\n'.format (indent))
        else:
            s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
            s.append ('{}{{\n'.format (indent))
            # Group code points whose generated bodies are identical so
            # that they share one run of `case` labels.
            cases = collections.defaultdict (set)
            for first, rest in sorted (self._c.items ()):
                cases[rest.__str__ (index + 1, depth + 2)].add (first)
            for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
                # At most four `case` labels per output line.
                for i, cp in enumerate (sorted (labels)):
                    if i % 4 == 0:
                        s.append (self._indent (depth + 1))
                    else:
                        s.append (' ')
                    s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
                if len (labels) % 4 != 0:
                    s.append ('\n')
                s.append (body)
                s.append ('{}break;\n'.format (self._indent (depth + 2)))
            s.append ('{}}}\n'.format (indent))
        return ''.join (s)
| 126 | |
# Read the prohibited sequences from the file named by the first argument
# and group them into one ConstraintSet per script, keyed by the script of
# each sequence's first code point.
constraints = {}
with open (sys.argv[1], encoding='utf-8') as f:
    # The header is every line before the first line that is a bare '#'.
    constraints_header = []
    while True:
        line = f.readline ()
        if not line:
            # readline () returns '' only at end of file.  Stop here rather
            # than loop forever when the file has no bare '#' line; the
            # final assertion then reports that nothing was found.
            break
        line = line.strip ()
        if line == '#':
            break
        constraints_header.append (line)
    for line in f:
        # Everything from the first '#' onwards is a comment.
        j = line.find ('#')
        if j >= 0:
            line = line[:j]
        # The sequence is the whitespace-separated hex code points before
        # the first ';'.
        constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
        if not constraint:
            continue
        assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add (constraint)
        else:
            constraints[script] = ConstraintSet (constraint)
    assert constraints, 'No constraints found'
| 148 | |
# Emit the generated C source on standard output.
#
# NOTE(review): every string literal below is reproduced exactly as it
# appears in the listing this was taken from, which seems to have collapsed
# runs of whitespace -- confirm leading indentation inside the literals
# against the upstream script.

# Opening banner, up to the point where the input file headers are quoted.
for text in (
    '/* == Start of generated functions == */',
    '/*',
    ' * The following functions are generated by running:',
    ' *',
    ' * %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0],
    ' *',
    ' * on files with these headers:',
    ' *',
):
    print (text)
for header_line in constraints_header:
    print (' * %s' % header_line.strip ())
print (' *')
for header_line in scripts_header:
    print (' * %s' % header_line.strip ())
print (' */')

# Includes, the two static helpers, and the start of the public function up
# to the opening brace of its `switch`.  An empty string is a blank line.
for text in (
    '',
    '#include "hb.hh"',
    '',
    '#ifndef HB_NO_OT_SHAPE',
    '',
    '#include "hb-ot-shaper-vowel-constraints.hh"',
    '',
    'static void',
    '_output_dotted_circle (hb_buffer_t *buffer)',
    '{',
    ' (void) buffer->output_glyph (0x25CCu);',
    ' _hb_glyph_info_reset_continuation (&buffer->prev());',
    '}',
    '',
    'static void',
    '_output_with_dotted_circle (hb_buffer_t *buffer)',
    '{',
    ' _output_dotted_circle (buffer);',
    ' (void) buffer->next_glyph ();',
    '}',
    '',
    'void',
    '_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,',
    '\t\t\t\t hb_buffer_t *buffer,',
    '\t\t\t\t hb_font_t *font HB_UNUSED)',
    '{',
    '#ifdef HB_NO_OT_SHAPER_VOWEL_CONSTRAINTS',
    ' return;',
    '#endif',
    ' if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)',
    ' return;',
    '',
    ' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of',
    ' * vowel-sequences that look like another vowel. Data for each script',
    ' * collected from the USE script development spec.',
    ' *',
    ' * https://github.com/harfbuzz/harfbuzz/issues/1019',
    ' */',
    ' buffer->clear_output ();',
    ' unsigned int count = buffer->len;',
    ' switch ((unsigned) buffer->props.script)',
    ' {',
):
    print (text)

# One `case` per script, ordered by the start of the first range recorded
# for that script.
for script in sorted (constraints, key=lambda name: script_order[name]):
    constraint_set = constraints[script]
    print (' case HB_SCRIPT_{}:'.format (script.upper ()))
    print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
    print (' {')
    print ('\tbool matched = false;')
    # The generated matcher goes through write () rather than print ().
    write (str (constraint_set))
    print ('\t(void) buffer->next_glyph ();')
    print ('\tif (matched) _output_with_dotted_circle (buffer);')
    print (' }')
    print (' break;')
    print ()

# Default case, end of the function, and the closing banner.
for text in (
    ' default:',
    ' break;',
    ' }',
    ' buffer->sync ();',
    '}',
    '',
    '',
    '#endif',
    '/* == End of generated functions == */',
):
    print (text)
