diff mupdf-source/scripts/cmapdump.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: the expanded directory no longer contains a version number.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/scripts/cmapdump.py	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+
+# Parse a CMap file and dump it as a C struct.
+
+import sys
+
+# Decode a subset of CMap syntax (only what is needed for our built-in resources)
+# We require that tokens are whitespace separated.
+
+def dumpcmap(filename):
+	"""Parse the CMap file 'filename' and print it to stdout as C source.
+
+	Only the subset of CMap syntax needed for the built-in resources is
+	understood, and tokens must be separated by whitespace.  The output
+	declares whichever pdf_range / pdf_xrange / pdf_mrange arrays and
+	one-to-many data table the mappings need, then a 'static pdf_cmap'
+	struct that points at them.
+	"""
+	codespacerange = []
+	usecmap = ""
+	cmapname = ""
+	wmode = 0
+
+	# code -> int (one-to-one) or list of ints (one-to-many)
+	mapping = {}
+
+	def tocode(s):
+		"""Convert a '<hex>' token or a plain decimal token to an int."""
+		if s[0] == '<' and s[-1] == '>':
+			return int(s[1:-1], 16)
+		return int(s, 10)
+
+	def map_cidrange(lo, hi, v):
+		"""Map codes lo..hi (inclusive) to consecutive values starting at v."""
+		for code in range(lo, hi + 1):
+			mapping[code] = v + (code - lo)
+
+	def add_bf(lo, v):
+		"""Record the list of UTF-16 code units v as the target of code lo."""
+		# Decode unicode surrogate pairs
+		if len(v) == 2 and 0xd800 <= v[0] <= 0xdbff and 0xdc00 <= v[1] <= 0xdfff:
+			mapping[lo] = ((v[0] - 0xd800) << 10) + (v[1] - 0xdc00) + 0x10000
+		elif len(v) == 1:
+			mapping[lo] = v[0]
+		elif len(v) <= 8:
+			# Store a copy: map_bfrange goes on modifying the list it passes.
+			mapping[lo] = v[:]
+		else:
+			print("/* warning: too long one-to-many mapping: %s */" % (v))
+
+	def parse_bf(bf):
+		"""Split a '<hhhh...>' token into a list of 16-bit values."""
+		bf = bf[1:-1] # drop < >
+		return [int(bf[i:i+4], 16) for i in range(0, len(bf), 4)]
+
+	def map_bfrange(lo, hi, bf):
+		"""Map codes lo..hi, adding one to the last value of bf per code."""
+		v = parse_bf(bf)
+		while lo <= hi:
+			add_bf(lo, v)
+			lo = lo + 1
+			v[-1] = v[-1] + 1
+
+	# Operators that open a section; the section name is what follows 'begin'.
+	begin_ops = ('begincodespacerange', 'begincidrange', 'beginbfrange',
+		'begincidchar', 'beginbfchar')
+
+	# Read everything up front so the file is closed before parsing starts.
+	with open(filename, "r") as f:
+		lines = f.readlines()
+
+	current = None
+	for line in lines:
+		if line[0] == '%':
+			continue
+		line = line.strip().split()
+		if len(line) == 0:
+			continue
+		# A header key without a value token is ignored instead of crashing.
+		if line[0] == '/CMapName' and len(line) > 1:
+			cmapname = line[1][1:]
+		elif line[0] == '/WMode' and len(line) > 1:
+			wmode = int(line[1])
+		elif len(line) > 1 and line[1] == 'usecmap':
+			usecmap = line[0][1:]
+		elif len(line) > 1 and line[1] in begin_ops:
+			current = line[1][len('begin'):]
+		elif line[0] in begin_ops:
+			current = line[0][len('begin'):]
+		elif line[0].startswith("end"):
+			current = None
+		elif current == 'codespacerange' and len(line) == 2:
+			# Bytes per code: the token is '<' + two hex digits per byte + '>'.
+			# '//' keeps the count an int under Python 3.
+			n, a, b = (len(line[0])-2)//2, tocode(line[0]), tocode(line[1])
+			codespacerange.append((n, a, b))
+		elif current == 'cidrange' and len(line) == 3:
+			map_cidrange(tocode(line[0]), tocode(line[1]), tocode(line[2]))
+		elif current == 'cidchar' and len(line) == 2:
+			a, b = tocode(line[0]), tocode(line[1])
+			mapping[a] = b
+		elif current == 'bfchar' and len(line) == 2:
+			add_bf(tocode(line[0]), parse_bf(line[1]))
+		elif current == 'bfrange' and len(line) == 3:
+			map_bfrange(tocode(line[0]), tocode(line[1]), line[2])
+
+	# Create ranges
+
+	ranges = []
+	xranges = []
+	mranges = []
+	mdata = []
+
+	# The open run of consecutive codes mapping to consecutive values;
+	# a negative out_lo means no run is open.
+	out_lo = -100
+	out_hi = -100
+	out_v_lo = 0
+	out_v_hi = 0
+
+	def flush_range():
+		"""Emit the open run, if any: to xranges when a code or the first
+		value exceeds 0xffff, otherwise to ranges."""
+		if out_lo >= 0:
+			if out_lo > 0xffff or out_hi > 0xffff or out_v_lo > 0xffff:
+				xranges.append((out_lo, out_hi, out_v_lo))
+			else:
+				ranges.append((out_lo, out_hi, out_v_lo))
+
+	for code in sorted(mapping):
+		v = mapping[code]
+		if isinstance(v, list):
+			# One-to-many: mdata gets the length followed by the values, and
+			# mranges records the index in mdata where that entry starts.
+			flush_range()
+			out_lo = out_hi = -100
+			mranges.append((code, len(mdata)))
+			mdata.append(len(v))
+			mdata.extend(v)
+		elif code != out_hi + 1 or v != out_v_hi + 1:
+			flush_range()
+			out_lo = out_hi = code
+			out_v_lo = out_v_hi = v
+		else:
+			out_hi = out_hi + 1
+			out_v_hi = out_v_hi + 1
+	flush_range()
+
+	# Print C file
+
+	cname = cmapname.replace('-', '_')
+	# (C element type, array name suffix, data) for each table that is printed.
+	range_tables = (("pdf_range", "ranges", ranges), ("pdf_xrange", "xranges", xranges))
+	all_tables = range_tables + (("pdf_mrange", "mranges", mranges), ("int", "table", mdata))
+
+	print()
+	print("/*", cmapname, "*/")
+	print()
+
+	for ctype, suffix, table in range_tables:
+		if len(table) > 0:
+			print("static const %s cmap_%s_%s[] = {" % (ctype, cname, suffix))
+			for r in table:
+				print("{0x%x,0x%x,0x%x}," % r)
+			print("};")
+			print()
+	if len(mranges) > 0:
+		print("static const pdf_mrange cmap_%s_mranges[] = {" % cname)
+		for r in mranges:
+			print("{0x%x,0x%x}," % r)
+		print("};")
+		print()
+		print("static const int cmap_%s_table[] = {" % cname)
+		# One entry (its length, then its values) per output line.
+		n = mdata[0]
+		i = 0
+		for r in mdata:
+			if i <= n:
+				sys.stdout.write("0x%x," % r)
+				i = i + 1
+			else:
+				sys.stdout.write("\n0x%x," % r)
+				i = 1
+				n = r
+		sys.stdout.write("\n")
+		print("};")
+		print()
+
+	print("static pdf_cmap cmap_%s = {" % cname)
+	print("\t{ -1, pdf_drop_cmap_imp },")
+	print("\t/* cmapname */ \"%s\"," % cmapname)
+	print("\t/* usecmap */ \"%s\", NULL," % usecmap)
+	print("\t/* wmode */ %d," % wmode)
+	print("\t/* codespaces */ %d, {" % len(codespacerange))
+	if len(codespacerange) > 0:
+		for codespace in codespacerange:
+			# Zero-pad both bounds to two hex digits per byte of the code.
+			fmt = "\t\t{ %%d, 0x%%0%dx, 0x%%0%dx }," % (codespace[0]*2, codespace[0]*2)
+			print(fmt % codespace)
+	else:
+		print("\t\t{ 0, 0, 0 },")
+	print("\t},")
+
+	# Each table: its length twice and a cast pointer, or an empty placeholder.
+	for ctype, suffix, table in all_tables:
+		if len(table) > 0:
+			print("\t%d, %d, (%s*)cmap_%s_%s," % (len(table), len(table), ctype, cname, suffix))
+		else:
+			print("\t0, 0, NULL, /* %s */" % suffix)
+
+	print("\t0, 0, 0, NULL /* splay tree */")
+	print("};")
+
+sys.stdout.write("/* This is an automatically generated file. Do not edit. */\n")
+# Dump every CMap file named on the command line, in argument order.
+for arg in sys.argv[1:]:
+	dumpcmap(arg)