comparison mupdf-source/scripts/makeencoding.py @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 #!/usr/bin/env python3
2
3 # Convert unicode mapping table to C arrays mapping glyph names and unicode values.
4 #
5 # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT
6 # ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT
7 # ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT
8 # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
9 # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
10 # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
11 #
12
# Glyph names that are never used as the name for a code point when the
# glyph table is built from the glyph list.
BANNED = [
    # "control*" names
    "controlSTX", "controlSOT", "controlETX", "controlEOT",
    "controlENQ", "controlACK", "controlBEL", "controlBS",
    "controlHT", "controlLF", "controlVT", "controlFF",
    "controlCR", "controlSO", "controlSI", "controlDLE",
    "controlDC1", "controlDC2", "controlDC3", "controlDC4",
    "controlNAK", "controlSYN", "controlETB", "controlCAN",
    "controlEM", "controlSUB", "controlESC", "controlFS",
    "controlGS", "controlRS", "controlUS",
    # "SF*" names
    "SF100000", "SF110000", "SF010000", "SF030000",
    "SF020000", "SF040000", "SF080000", "SF090000",
    "SF060000", "SF070000", "SF050000", "SF430000",
    "SF240000", "SF510000", "SF390000", "SF250000",
    "SF500000", "SF490000", "SF380000", "SF280000",
    "SF260000", "SF360000", "SF370000", "SF420000",
    "SF190000", "SF230000", "SF410000", "SF450000",
    "SF460000", "SF400000", "SF540000", "SF440000",
]
28
# Map each Unicode code point to the first non-banned glyph name listed for
# it in the glyph list. Data lines have the form "name;codepoints"; only
# entries whose code point field is exactly four hex digits are recorded.
glyphs = {}
with open("scripts/glyphlist.txt") as glyph_file:
    for line in glyph_file:
        line = line.rstrip()
        # Skip blank lines (which would otherwise fail to unpack below)
        # and '#' comment lines.
        if not line or line[0] == '#':
            continue
        n, u = line.split(';')
        if len(u) != 4:
            continue
        u = int(u, base=16)
        # The first acceptable name for a code point wins.
        if u not in glyphs and n not in BANNED:
            glyphs[u] = n
37
def load_table(fn):
    """Read a character mapping file into a 256-entry list.

    Each data line is split on whitespace; the first field is the byte
    code and the second the Unicode value, both hexadecimal with a
    two-character prefix (e.g. "0x") that is dropped before parsing.
    Blank lines, lines starting with '#', and lines ending in
    "#UNDEFINED" are ignored.

    Returns a list of 256 ints indexed by byte code; codes without a
    mapping keep the value 0.
    """
    table = [0] * 256
    # Use a context manager so the file is closed even if parsing fails.
    with open(fn) as mapping_file:
        for line in mapping_file:
            line = line.strip()
            # An empty line has no first character to test, so it must be
            # rejected before the comment check.
            if not line or line[0] == '#' or line.endswith("#UNDEFINED"):
                continue
            fields = line.split()
            c = int(fields[0][2:], base=16)
            u = int(fields[1][2:], base=16)
            table[c] = u
    return table
48
def dump_table(name, table):
    """Print the C arrays for one encoding to standard output.

    name  -- identifier fragment inserted into the generated C names.
    table -- sequence of Unicode values indexed by byte code.

    Three definitions are written, each followed by an empty line:
    the byte code -> Unicode value array, the byte code -> glyph name
    array (using _notdef where the module-level glyphs dict has no
    name), and a sorted list of {unicode, code} pairs for values >= 128
    that have a glyph name.
    """
    # Byte code -> Unicode value.
    print("unsigned short fz_unicode_from_%s[256] = {" % name)
    for codepoint in table:
        print('\t%d,' % codepoint)
    print("};")
    print()

    # Byte code -> glyph name.
    print("const char *fz_glyph_name_from_%s[%d] = {" % (name, len(table)))
    for codepoint in table:
        if codepoint in glyphs:
            entry = '"%s"' % glyphs[codepoint]
        else:
            entry = '_notdef'
        print('\t%s,' % entry)
    print("};")
    print()

    # Unicode value -> byte code. The entries are sorted as text, so the
    # zero-padded hex field determines the order.
    pairs = sorted(
        '{0x%04x,%d},' % (codepoint, code)
        for code, codepoint in enumerate(table)
        if codepoint in glyphs and codepoint >= 128
    )

    print("static const struct { unsigned short u, c; } %s_from_unicode[] = {" % name)
    for pair in pairs:
        print("\t" + pair)
    print("};")
    print()
79
# Emit the tables for every supported encoding, in a fixed order:
# (C identifier fragment, mapping file).
for encoding_name, mapping_path in (
    ("iso8859_1", "scripts/8859-1.TXT"),
    ("iso8859_7", "scripts/8859-7.TXT"),
    ("koi8u", "scripts/KOI8-U.TXT"),
    ("windows_1250", "scripts/CP1250.TXT"),
    ("windows_1251", "scripts/CP1251.TXT"),
    ("windows_1252", "scripts/CP1252.TXT"),
):
    dump_table(encoding_name, load_table(mapping_path))