diff mupdf-source/thirdparty/harfbuzz/src/gen-ucd-table.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/harfbuzz/src/gen-ucd-table.py	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+
+"""usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]
+
+Input file:
+* https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
+"""
+
+import sys, re
+import logging
+logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
+
+if len (sys.argv) not in (2, 3):
+	sys.exit (__doc__)
+
+# https://github.com/harfbuzz/packtab
+import packTab
+import packTab.ucdxml
+
+logging.info('Loading UCDXML...')
+ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
+ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)
+
+hb_common_h = 'hb-common.h' if len (sys.argv) < 3 else sys.argv[2]
+
+logging.info('Preparing data tables...')
+
+
+# This is how the data is encoded:
+#
+# General_Category (gc), Canonical_Combining_Class (ccc),
+# and Script (sc) are encoded as integers.
+#
+# Mirroring character (bmg) is encoded as difference from
+# the original character.
+#
+# Composition & Decomposition (dm) are encoded elaborately,
+# as discussed below.
+
+gc = [u['gc'] for u in ucd]
+ccc = [int(u['ccc']) for u in ucd]
+bmg = [int(v, 16) - int(u) if v else 0 for u,v in enumerate(u['bmg'] for u in ucd)]
+sc = [u['sc'] for u in ucd]
+
+
+# Prepare Compose / Decompose data
+#
+# This code is very dense.  See hb_ucd_compose() / hb_ucd_decompose() for the logic.
+
+dm = {i:tuple(int(v, 16) for v in u['dm'].split()) for i,u in enumerate(ucd)
+      if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)}
+ce = {i for i,u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}
+
+assert not any(v for v in dm.values() if len(v) not in (1,2))
+dm1 = sorted(set(v for v in dm.values() if len(v) == 1))
+assert all((v[0] >> 16) in (0,2) for v in dm1)
+dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
+dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
+dm1_order = {v:i+1 for i,v in enumerate(dm1)}
+
+dm2 = sorted((v+(i if i not in ce and not ccc[i] else 0,), v)
+             for i,v in dm.items() if len(v) == 2)
+
+filt = lambda v: ((v[0] & 0xFFFFF800) == 0x0000 and
+                  (v[1] & 0xFFFFFF80) == 0x0300 and
+                  (v[2] & 0xFFF0C000) == 0x0000)
+dm2_u32_array = [v for v in dm2 if filt(v[0])]
+dm2_u64_array = [v for v in dm2 if not filt(v[0])]
+assert dm2_u32_array + dm2_u64_array == dm2
+dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u32_array]
+dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u64_array]
+
+l = 1 + len(dm1_p0_array) + len(dm1_p2_array)
+dm2_order = {v[1]:i+l for i,v in enumerate(dm2)}
+
+dm_order = {None: 0}
+dm_order.update(dm1_order)
+dm_order.update(dm2_order)
+
+
+# Prepare General_Category / Script mapping arrays
+
+gc_order = dict()
+for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
+                      'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
+                      'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',)):
+    gc_order[i] = v
+    gc_order[v] = i
+
+sc_order = dict()
+sc_array = []
+sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
+for line in open(hb_common_h):
+    m = sc_re.search (line)
+    if not m: continue
+    name = m.group(1)
+    tag = ''.join(m.group(i) for i in range(2, 6))
+    i = len(sc_array)
+    sc_order[tag] = i
+    sc_order[i] = tag
+    sc_array.append(name)
+
+
+# Write out main data
+
+DEFAULT = 'DEFAULT'
+COMPACT = 'COMPACT'
+SLOPPY  = 'SLOPPY'
+
+compression_level = {
+    DEFAULT: 5,
+    COMPACT: 9,
+    SLOPPY:  9,
+}
+
+logging.info('Generating output...')
+print("/* == Start of generated table == */")
+print("/*")
+print(" * The following table is generated by running:")
+print(" *")
+print(" *   ./gen-ucd-table.py ucd.nounihan.grouped.xml")
+print(" *")
+print(" * on file with this description:", ucdxml.description)
+print(" */")
+print()
+print("#ifndef HB_UCD_TABLE_HH")
+print("#define HB_UCD_TABLE_HH")
+print()
+print('#include "hb.hh"')
+print()
+
+
+# Write mapping data
+
+code = packTab.Code('_hb_ucd')
+sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
+dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
+dm1_p2_array, _ = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
+dm2_u32_array, _ = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
+dm2_u64_array, _ = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
+code.print_c(linkage='static inline')
+
+datasets = [
+    ('gc', gc, 'Cn', gc_order),
+    ('ccc', ccc, 0, None),
+    ('bmg', bmg, 0, None),
+    ('sc', sc, 'Zzzz', sc_order),
+    ('dm', dm, None, dm_order),
+]
+
+
+# Write main data
+
+for step in (DEFAULT, COMPACT, SLOPPY):
+    compression = compression_level[step]
+    logging.info('  Compression=%d:' % compression)
+    print()
+    if step == DEFAULT:
+        print('#ifndef HB_OPTIMIZE_SIZE')
+    elif step == COMPACT:
+        print('#elif !defined(HB_NO_UCD_UNASSIGNED)')
+    elif step == SLOPPY:
+        print('#else')
+    else:
+        assert False
+    print()
+
+    if step == SLOPPY:
+        for i in range(len(gc)):
+            if (i % 128) and gc[i] == 'Cn':
+                gc[i] = gc[i - 1]
+        for i in range(len(gc) - 2, -1, -1):
+            if ((i + 1) % 128) and gc[i] == 'Cn':
+                gc[i] = gc[i + 1]
+        for i in range(len(sc)):
+            if (i % 128) and sc[i] == 'Zzzz':
+                sc[i] = sc[i - 1]
+        for i in range(len(sc) - 2, -1, -1):
+            if ((i + 1) % 128) and sc[i] == 'Zzzz':
+                sc[i] = sc[i + 1]
+
+
+    code = packTab.Code('_hb_ucd')
+
+    for name,data,default,mapping in datasets:
+        sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
+        logging.info('      Dataset=%-8s FullCost=%d' % (name, sol.fullCost))
+        sol.genCode(code, name)
+
+    code.print_c(linkage='static inline')
+
+    print()
+
+
+print('#endif')
+print()
+
+print()
+print("#endif /* HB_UCD_TABLE_HH */")
+print()
+print("/* == End of generated table == */")
+logging.info('Done.')