Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/scripts/makeencoding.py @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # Convert unicode mapping table to C arrays mapping glyph names and unicode values. | |
| 4 # | |
| 5 # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT | |
| 6 # ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT | |
| 7 # ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT | |
| 8 # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT | |
| 9 # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT | |
| 10 # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT | |
| 11 # | |
| 12 | |
# Glyph names that are never accepted as the name of a code point
# (the "control*" and "SF*" entries of the glyph list).
BANNED = [
    "controlSTX", "controlSOT", "controlETX", "controlEOT", "controlENQ",
    "controlACK", "controlBEL", "controlBS", "controlHT", "controlLF",
    "controlVT", "controlFF", "controlCR", "controlSO", "controlSI",
    "controlDLE", "controlDC1", "controlDC2", "controlDC3", "controlDC4",
    "controlNAK", "controlSYN", "controlETB", "controlCAN", "controlEM",
    "controlSUB", "controlESC", "controlFS", "controlGS", "controlRS",
    "controlUS",
    "SF100000", "SF110000", "SF010000", "SF030000", "SF020000", "SF040000",
    "SF080000", "SF090000", "SF060000", "SF070000", "SF050000", "SF430000",
    "SF240000", "SF510000", "SF390000", "SF250000", "SF500000", "SF490000",
    "SF380000", "SF280000", "SF260000", "SF360000", "SF370000", "SF420000",
    "SF190000", "SF230000", "SF410000", "SF450000", "SF460000", "SF400000",
    "SF540000", "SF440000",
]

# Maps a code point (int) to the first non-banned glyph name listed for it
# in scripts/glyphlist.txt.
glyphs = {}
# The context manager closes the glyph list even if a line fails to parse.
with open("scripts/glyphlist.txt") as glyphlist:
    for line in glyphlist:
        line = line.rstrip()
        # Skip comments, and blank lines that would otherwise break the
        # two-field unpacking below.
        if not line or line[0] == '#':
            continue
        n, u = line.split(';')
        # Only code fields of exactly four characters are used; anything
        # else is skipped.
        if len(u) == 4:
            u = int(u, base=16)
            # The first acceptable name wins; later names for the same
            # code point are ignored.
            if u not in glyphs and n not in BANNED:
                glyphs[u] = n
| 37 | |
def load_table(fn):
    """Read a character mapping file into a 256-entry table.

    Blank lines, lines starting with ``#`` and lines ending in
    ``#UNDEFINED`` are skipped. On every other line the first two
    whitespace-separated fields are parsed as hexadecimal numbers after
    dropping their first two characters (presumably a ``0x`` prefix):
    the first field is the table index, the second the value stored there.

    fn: path of the mapping file to read.

    Returns a list of 256 integers; indices that never appear in the file
    keep the value 0.

    Raises IndexError if a parsed index is 256 or more, and ValueError
    if a field is not valid hexadecimal.
    """
    table = [0] * 256
    # The context manager closes the file even if parsing raises.
    with open(fn) as mapping:
        for line in mapping:
            line = line.strip()
            # An empty line has no first character to inspect, so it must be
            # rejected before the comment check.
            if not line or line[0] == '#' or line.endswith("#UNDEFINED"):
                continue
            fields = line.split()
            c = int(fields[0][2:], base=16)
            u = int(fields[1][2:], base=16)
            table[c] = u
    return table
| 48 | |
def dump_table(name, table):
    """Print the C lookup tables for one encoding to standard output.

    Three definitions are written, each followed by an empty line:

    * ``fz_unicode_from_<name>``: every entry of *table* as a decimal number.
    * ``fz_glyph_name_from_<name>``: for every entry, the quoted glyph name
      found in the module-level ``glyphs`` mapping, or the bare token
      ``_notdef`` when the entry has no glyph name.
    * ``<name>_from_unicode``: ``{u, c}`` pairs (value, index in *table*)
      for entries that are at least 128 and have a glyph name, in sorted
      order.

    name: identifier fragment inserted into the generated C names.
    table: sequence of integer values, indexed by character code.
    """
    # Both arrays are sized from the table itself so their declarations
    # always agree with the number of initializers printed.
    size = len(table)

    print("unsigned short fz_unicode_from_%s[%d] = {" % (name, size))
    for u in table:
        print('\t%d,' % u)
    print("};")
    print()

    print("const char *fz_glyph_name_from_%s[%d] = {" % (name, size))
    for u in table:
        if u in glyphs:
            print('\t"%s",' % glyphs[u])
        else:
            print('\t_notdef,')
    print("};")
    print()

    # The formatted strings themselves are sorted. The value is written as
    # zero-padded lowercase hex, so for values up to 0xffff string order
    # matches numeric order.
    rev = sorted(
        '{0x%04x,%d},' % (u, c)
        for c, u in enumerate(table)
        if u >= 128 and u in glyphs
    )

    print("static const struct { unsigned short u, c; } %s_from_unicode[] = {" % name)
    for entry in rev:
        print("\t" + entry)
    print("};")
    print()
| 79 | |
# Generate the tables for each encoding, one after another, in this order.
for encoding, mapping_file in (
    ("iso8859_1", "scripts/8859-1.TXT"),
    ("iso8859_7", "scripts/8859-7.TXT"),
    ("koi8u", "scripts/KOI8-U.TXT"),
    ("windows_1250", "scripts/CP1250.TXT"),
    ("windows_1251", "scripts/CP1251.TXT"),
    ("windows_1252", "scripts/CP1252.TXT"),
):
    dump_table(encoding, load_table(mapping_file))
