Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/scripts/makeencoding.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
# Diff header of the changeset this script was extracted from:
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/mupdf-source/scripts/makeencoding.py Mon Sep 15 11:43:07 2025 +0200
# @@ -0,0 +1,85 @@

#!/usr/bin/env python3

# Convert unicode mapping table to C arrays mapping glyph names and unicode values.
#
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
#

# Glyph names that must never be picked as the name for a code point.
BANNED = [
    "controlSTX", "controlSOT", "controlETX", "controlEOT", "controlENQ",
    "controlACK", "controlBEL", "controlBS", "controlHT", "controlLF",
    "controlVT", "controlFF", "controlCR", "controlSO", "controlSI",
    "controlDLE", "controlDC1", "controlDC2", "controlDC3", "controlDC4",
    "controlNAK", "controlSYN", "controlETB", "controlCAN", "controlEM",
    "controlSUB", "controlESC", "controlFS", "controlGS", "controlRS",
    "controlUS",
    "SF100000", "SF110000", "SF010000", "SF030000", "SF020000", "SF040000",
    "SF080000", "SF090000", "SF060000", "SF070000", "SF050000", "SF430000",
    "SF240000", "SF510000", "SF390000", "SF250000", "SF500000", "SF490000",
    "SF380000", "SF280000", "SF260000", "SF360000", "SF370000", "SF420000",
    "SF190000", "SF230000", "SF410000", "SF450000", "SF460000", "SF400000",
    "SF540000", "SF440000",
]

# Unicode code point -> glyph name. Read by dump_table(); populated by main()
# from the glyph list before any table is dumped.
glyphs = {}

# (C identifier suffix, mapping file) pairs, dumped in this order.
TABLES = [
    ("iso8859_1", "scripts/8859-1.TXT"),
    ("iso8859_7", "scripts/8859-7.TXT"),
    ("koi8u", "scripts/KOI8-U.TXT"),
    ("windows_1250", "scripts/CP1250.TXT"),
    ("windows_1251", "scripts/CP1251.TXT"),
    ("windows_1252", "scripts/CP1252.TXT"),
]


def load_glyphs(fn="scripts/glyphlist.txt"):
    """Read a ``name;HEX`` glyph list and return a {code point: name} dict.

    Lines starting with '#' and blank lines are skipped. Entries whose value
    is not exactly four characters long are ignored. For each code point the
    first name that is not in BANNED wins.
    """
    banned = set(BANNED)  # build once: O(1) membership instead of a list scan
    names = {}
    with open(fn) as f:
        for line in f:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            n, u = line.split(';')
            if len(u) != 4:
                continue
            u = int(u, base=16)
            if u not in names and n not in banned:
                names[u] = n
    return names


def load_table(fn):
    """Read a ``0xBYTE 0xUNICODE`` mapping file into a 256-entry list.

    The list is indexed by byte value and holds the mapped code point, or 0
    for bytes the file does not define. Comment lines, blank lines and lines
    ending in "#UNDEFINED" are skipped.
    """
    table = [0] * 256
    with open(fn) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#') or line.endswith("#UNDEFINED"):
                continue
            fields = line.split()
            # Both columns carry a "0x" prefix, dropped by the [2:] slice.
            c = int(fields[0][2:], base=16)
            u = int(fields[1][2:], base=16)
            table[c] = u
    return table


def dump_table(name, table):
    """Print three C arrays for one encoding to standard output.

    name  -- suffix used in the generated C identifiers.
    table -- list of code points indexed by byte value (see load_table).

    Emits, in order: the byte -> code point array, the byte -> glyph name
    array (``_notdef`` where the module-level ``glyphs`` has no name), and a
    sorted code point -> byte array restricted to named code points >= 128.
    """
    print("unsigned short fz_unicode_from_%s[256] = {" % name)
    for u in table:
        print('\t%d,' % u)
    print("};")
    print()

    print("const char *fz_glyph_name_from_%s[%d] = {" % (name, len(table)))
    for u in table:
        if u in glyphs:
            print('\t"%s",' % glyphs[u])
        else:
            print('\t_notdef,')
    print("};")
    print()

    # Sorting the formatted strings orders by code point because the hex
    # field is zero-padded to four lowercase digits.
    rev = sorted(
        '{0x%04x,%d},' % (u, i)
        for i, u in enumerate(table)
        if u in glyphs and u >= 128
    )

    print("static const struct { unsigned short u, c; } %s_from_unicode[] = {" % name)
    for s in rev:
        print("\t" + s)
    print("};")
    print()


def main():
    """Load the glyph list, then dump every table in TABLES to stdout."""
    glyphs.update(load_glyphs())
    for name, fn in TABLES:
        dump_table(name, load_table(fn))


if __name__ == "__main__":
    main()
