diff mupdf-source/scripts/cmapshare.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/scripts/cmapshare.py	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+# Find and extract common CMap subsets.
+# Takes flattened CMaps as input, using only the 'cidchar' sections.
+# The outputs are truncated, so use 'cmapflatten.py' to clean them up.
+
+import sys, os
+
+def load_cmap_set(filename):
+	cmap = set()
+	active = False
+	for line in open(filename).readlines():
+		line = line.strip()
+		if line.endswith("endcidchar"): active = False
+		if active: cmap.add(line)
+		if line.endswith("begincidchar"): active = True
+	return cmap
+
+def load_cmap_prologue(filename):
+	prologue = []
+	for line in open(filename).readlines():
+		line = line.strip()
+		if line.endswith("begincidchar"):
+			break
+		prologue.append(line)
+	return prologue
+
+epilogue = [
+	'endcidchar',
+]
+
+common_name = os.path.basename(sys.argv[1])
+
+# First find the common subset
+common = load_cmap_set(sys.argv[2])
+for f in sys.argv[3:]:
+	common &= load_cmap_set(f)
+
+def print_cmap(filename, prologue, cmap):
+	out = open(filename, "w")
+	for line in prologue:
+		if not line.endswith("usecmap"):
+			print(line, file=out)
+		if line == 'begincmap':
+			print("/"+common_name, "usecmap", file=out)
+	print(len(cmap), "begincidchar", file=out)
+	for line in sorted(cmap):
+		print(line, file=out)
+	for line in epilogue:
+		print(line, file=out)
+
+# Print common subset
+print_cmap(sys.argv[1], ["/CMapName /%s" % common_name], common)
+
+# Now find unique bits
+for f in sys.argv[2:]:
+	cmap = load_cmap_set(f) - common
+	prologue = load_cmap_prologue(f)
+	print_cmap(f+".shared", prologue, cmap)