comparison tests/test_font.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children a6bc019ac0b2
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 """
2 Tests for the Font class.
3 """
4 import os
5 import platform
6 import pymupdf
7 import subprocess
8 import textwrap
9
10 import util
11
12
13 def test_font1():
14 text = "PyMuPDF"
15 font = pymupdf.Font("helv")
16 assert font.name == "Helvetica"
17 tl = font.text_length(text, fontsize=20)
18 cl = font.char_lengths(text, fontsize=20)
19 assert len(text) == len(cl)
20 assert abs(sum(cl) - tl) < pymupdf.EPSILON
21 for i in range(len(cl)):
22 assert cl[i] == font.glyph_advance(ord(text[i])) * 20
23 font2 = pymupdf.Font(fontbuffer=font.buffer)
24 codepoints1 = font.valid_codepoints()
25 codepoints2 = font2.valid_codepoints()
26 print('')
27 print(f'{len(codepoints1)=}')
28 print(f'{len(codepoints2)=}')
29 if 0:
30 for i, (ucs1, ucs2) in enumerate(zip(codepoints1, codepoints2)):
31 print(f' {i}: {ucs1=} {ucs2=} {"" if ucs2==ucs2 else "*"}')
32 assert font2.valid_codepoints() == font.valid_codepoints()
33
34 # Also check we can get font's bbox.
35 bbox1 = font.bbox
36 print(f'{bbox1=}')
37 if hasattr(pymupdf, 'mupdf'):
38 bbox2 = font.this.fz_font_bbox()
39 assert bbox2 == bbox1
40
41
42 def test_font2():
43 """Old and new length computation must be the same."""
44 font = pymupdf.Font("helv")
45 text = "PyMuPDF"
46 assert font.text_length(text) == pymupdf.get_text_length(text)
47
48
49 def test_fontname():
50 """Assert a valid PDF fontname."""
51 doc = pymupdf.open()
52 page = doc.new_page()
53 assert page.insert_font() # assert: a valid fontname works!
54 detected = False # preset indicator
55 try: # fontname check will fail first - don't need a font at all here
56 page.insert_font(fontname="illegal/char", fontfile="unimportant")
57 except ValueError as e:
58 if str(e).startswith("bad fontname chars"):
59 detected = True # illegal fontname detected
60 assert detected
61
62 def test_2608():
63 flags = (pymupdf.TEXT_DEHYPHENATE | pymupdf.TEXT_MEDIABOX_CLIP)
64 with pymupdf.open(os.path.abspath(f'{__file__}/../../tests/resources/2201.00069.pdf')) as doc:
65 page = doc[0]
66 blocks = page.get_text_blocks(flags=flags)
67 text = blocks[10][4]
68 with open(os.path.abspath(f'{__file__}/../../tests/test_2608_out'), 'wb') as f:
69 f.write(text.encode('utf8'))
70 path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_2608_expected')
71 path_expected_1_26 = os.path.normpath(f'{__file__}/../../tests/resources/test_2608_expected_1.26')
72 if pymupdf.mupdf_version_tuple >= (1, 27):
73 path_expected2 = path_expected
74 else:
75 path_expected2 = path_expected_1_26
76 with open(path_expected2, 'rb') as f:
77 expected = f.read().decode('utf8')
78 # Github windows x32 seems to insert \r characters; maybe something to
79 # do with the Python installation's line endings settings.
80 expected = expected.replace('\r', '')
81 print(f'test_2608(): {text.encode("utf8")=}')
82 print(f'test_2608(): {expected.encode("utf8")=}')
83 assert text == expected
84
85 def test_fontarchive():
86 import subprocess
87 arch = pymupdf.Archive()
88 css = pymupdf.css_for_pymupdf_font("notos", archive=arch, name="sans-serif")
89 print(css)
90 print(arch.entry_list)
91 assert arch.entry_list == \
92 [
93 {
94 'fmt': 'tree',
95 'entries':
96 [
97 'notosbo', 'notosbi', 'notosit', 'notos'
98 ],
99 'path': None
100 }
101 ]
102
103 def test_load_system_font():
104 if not hasattr(pymupdf, 'mupdf'):
105 print(f'test_load_system_font(): Not running on classic.')
106 return
107 trace = list()
108 def font_f(name, bold, italic, needs_exact_metrics):
109 trace.append((name, bold, italic, needs_exact_metrics))
110 #print(f'test_load_system_font():font_f(): Looking for font: {name=} {bold=} {italic=} {needs_exact_metrics=}.')
111 return None
112 def f_cjk(name, ordering, serif):
113 trace.append((name, ordering, serif))
114 #print(f'test_load_system_font():f_cjk(): Looking for font: {name=} {ordering=} {serif=}.')
115 return None
116 def f_fallback(script, language, serif, bold, italic):
117 trace.append((script, language, serif, bold, italic))
118 #print(f'test_load_system_font():f_fallback(): looking for font: {script=} {language=} {serif=} {bold=} {italic=}.')
119 return None
120 pymupdf.mupdf.fz_install_load_system_font_funcs(font_f, f_cjk, f_fallback)
121 f = pymupdf.mupdf.fz_load_system_font("some-font-name", 0, 0, 0)
122 assert trace == [
123 ('some-font-name', 0, 0, 0),
124 ], f'Incorrect {trace=}.'
125 print(f'test_load_system_font(): {f.m_internal=}')
126
127
128 def test_mupdf_subset_fonts2():
129 if not hasattr(pymupdf, 'mupdf'):
130 print('Not running on rebased.')
131 return
132 path = os.path.abspath(f'{__file__}/../../tests/resources/2.pdf')
133 with pymupdf.open(path) as doc:
134 n = len(doc)
135 pages = [i*2 for i in range(n//2)]
136 print(f'{pages=}.')
137 pymupdf.mupdf.pdf_subset_fonts2(pymupdf._as_pdf_document(doc), pages)
138
139
140 def test_3677():
141 pymupdf.TOOLS.set_subset_fontnames(True)
142 try:
143 path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf')
144 font_names_expected = [
145 'BCDEEE+Aptos',
146 'BCDFEE+Aptos',
147 'BCDGEE+Calibri-Light',
148 'BCDHEE+Calibri-Light',
149 ]
150 font_names = list()
151 with pymupdf.open(path) as document:
152 for page in document:
153 for block in page.get_text('dict')['blocks']:
154 if block['type'] == 0:
155 if 'lines' in block.keys():
156 for line in block['lines']:
157 for span in line['spans']:
158 font_name=span['font']
159 print(font_name)
160 font_names.append(font_name)
161 assert font_names == font_names_expected, f'{font_names=}'
162 finally:
163 pymupdf.TOOLS.set_subset_fontnames(False)
164
165
166 def test_3933():
167 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3933.pdf')
168 with pymupdf.open(path) as document:
169 page = document[0]
170 print(f'{len(page.get_fonts())=}')
171
172 expected = {
173 'BCDEEE+Calibri': 39,
174 'BCDFEE+SwissReSan-Regu': 53,
175 'BCDGEE+SwissReSan-Ital': 20,
176 'BCDHEE+SwissReSan-Bold': 20,
177 'BCDIEE+SwissReSan-Regu': 53,
178 'BCDJEE+Calibri': 39,
179 }
180
181 for xref, _, _, name, _, _ in page.get_fonts():
182 _, _, _, content = document.extract_font(xref)
183
184 if content:
185 font = pymupdf.Font(fontname=name, fontbuffer=content)
186 supported_symbols = font.valid_codepoints()
187 print(f'Font {name}: {len(supported_symbols)=}.', flush=1)
188 assert len(supported_symbols) == expected.get(name)
189
190
191 def test_3780():
192 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3780.pdf')
193 with pymupdf.open(path) as document:
194 for page_i, page in enumerate(document):
195 for itm in page.get_fonts():
196 buff=document.extract_font(itm[0])[-1]
197 font=pymupdf.Font(fontbuffer=buff)
198 print(f'{page_i=}: xref {itm[0]} {font.name=} {font.ascender=} {font.descender=}.')
199 if page_i == 0:
200 d = page.get_text('dict')
201 #for n, v in d.items():
202 # print(f' {n}: {v!r}')
203 for i, block in enumerate(d['blocks']):
204 print(f'block {i}:')
205 for j, line in enumerate(block['lines']):
206 print(f' line {j}:')
207 for k, span in enumerate(line['spans']):
208 print(f' span {k}:')
209 for n, v in span.items():
210 print(f' {n}: {v!r}')
211
212
213 def test_3887():
214 print(f'{pymupdf.version=}')
215 path = os.path.normpath(f'{__file__}/../../tests/resources/test_3887.pdf')
216
217 path2 = os.path.normpath(f'{__file__}/../../tests/resources/test_3887.pdf.ez.pdf')
218 with pymupdf.open(path) as document:
219 document.subset_fonts(fallback=False)
220 document.ez_save(path2)
221
222 with pymupdf.open(path2) as document:
223 text = f"\u0391\u3001\u0392\u3001\u0393\u3001\u0394\u3001\u0395\u3001\u0396\u3001\u0397\u3001\u0398\u3001\u0399\u3001\u039a\u3001\u039b\u3001\u039c\u3001\u039d\u3001\u039e\u3001\u039f\u3001\u03a0\u3001\u03a1\u3001\u03a3\u3001\u03a4\u3001\u03a5\u3001\u03a6\u3001\u03a7\u3001\u03a8\u3001\u03a9\u3002\u03b1\u3001\u03b2\u3001\u03b3\u3001\u03b4\u3001\u03b5\u3001\u03b6\u3001\u03b7\u3001\u03b8\u3001\u03b9\u3001\u03ba\u3001\u03bb\u3001\u03bc\u3001\u03bd\u3001\u03be\u3001\u03bf\u3001\u03c0\u3001\u03c1\u3001\u03c2\u3001\u03c4\u3001\u03c5\u3001\u03c6\u3001\u03c7\u3001\u03c8\u3001\u03c9\u3002"
224 page = document[0]
225 chars = [c for b in page.get_text("rawdict",flags=0)["blocks"] for l in b["lines"] for s in l["spans"] for c in s["chars"]]
226 output = [c["c"] for c in chars]
227 print(f'text:\n {text}')
228 print(f'output:\n {output}')
229 pixmap = page.get_pixmap()
230 path_pixmap = f'{path}.0.png'
231 pixmap.save(path_pixmap)
232 print(f'Have saved to: {path_pixmap=}')
233 assert set(output)==set(text)
234
235
236 def test_4457():
237 print()
238 files = (
239 ('https://github.com/user-attachments/files/20862923/test_4457_a.pdf', 'test_4457_a.pdf', None, 4),
240 ('https://github.com/user-attachments/files/20862922/test_4457_b.pdf', 'test_4457_b.pdf', None, 9),
241 )
242 for url, name, size, rms_old_after_max in files:
243 path = util.download(url, name, size)
244
245 with pymupdf.open(path) as document:
246 page = document[0]
247
248 pixmap = document[0].get_pixmap()
249 path_pixmap = f'{path}.png'
250 pixmap.save(path_pixmap)
251 print(f'Have created: {path_pixmap=}')
252
253 text = page.get_text()
254 path_before = f'{path}.before.pdf'
255 path_after = f'{path}.after.pdf'
256 document.ez_save(path_before, garbage=4)
257 print(f'Have created {path_before=}')
258
259 document.subset_fonts()
260 document.ez_save(path_after, garbage=4)
261 print(f'Have created {path_after=}')
262
263 with pymupdf.open(path_before) as document:
264 text_before = document[0].get_text()
265 pixmap_before = document[0].get_pixmap()
266 path_pixmap_before = f'{path_before}.png'
267 pixmap_before.save(path_pixmap_before)
268 print(f'Have created: {path_pixmap_before=}')
269
270 with pymupdf.open(path_after) as document:
271 text_after = document[0].get_text()
272 pixmap_after = document[0].get_pixmap()
273 path_pixmap_after = f'{path_after}.png'
274 pixmap_after.save(path_pixmap_after)
275 print(f'Have created: {path_pixmap_after=}')
276
277 import gentle_compare
278 rms_before = gentle_compare.pixmaps_rms(pixmap, pixmap_before)
279 rms_after = gentle_compare.pixmaps_rms(pixmap, pixmap_after)
280 print(f'{rms_before=}')
281 print(f'{rms_after=}')
282
283 # Create .png file showing differences between <path> and <path_after>.
284 path_pixmap_after_diff = f'{path_after}.diff.png'
285 pixmap_after_diff = gentle_compare.pixmaps_diff(pixmap, pixmap_after)
286 pixmap_after_diff.save(path_pixmap_after_diff)
287 print(f'Have created: {path_pixmap_after_diff}')
288
289 # Extract text from <path>, <path_before> and <path_after> and write to
290 # files so we can show differences with `diff`.
291 path_text = os.path.normpath(f'{__file__}/../../tests/test_4457.txt')
292 path_text_before = f'{path_text}.before.txt'
293 path_text_after = f'{path_text}.after.txt'
294 with open(path_text, 'w', encoding='utf8') as f:
295 f.write(text)
296 with open(path_text_before, 'w', encoding='utf8') as f:
297 f.write(text_before)
298 with open(path_text_after, 'w', encoding='utf8') as f:
299 f.write(text_after)
300
301 # Can't write text to stdout on Windows because of encoding errors.
302 if platform.system() != 'Windows':
303 print(f'text:\n{textwrap.indent(text, " ")}')
304 print(f'text_before:\n{textwrap.indent(text_before, " ")}')
305 print(f'text_after:\n{textwrap.indent(text_after, " ")}')
306 print(f'{path_text=}')
307 print(f'{path_text_before=}')
308 print(f'{path_text_after=}')
309
310 command = f'diff -u {path_text} {path_text_before}'
311 print(f'Running: {command}', flush=1)
312 subprocess.run(command, shell=1)
313
314 command = f'diff -u {path_text} {path_text_after}'
315 print(f'Running: {command}', flush=1)
316 subprocess.run(command, shell=1)
317
318 assert text_before == text
319 assert rms_before == 0
320
321 if pymupdf.mupdf_version_tuple >= (1, 26, 6):
322 assert rms_after == 0
323 else:
324 # As of 2025-05-20 there are some differences in some characters,
325 # e.g. the non-ascii characters in `Philipp Krahenbuhl`. See
326 # <path_pixmap> and <path_pixmap_after>.
327 assert abs(rms_after - rms_old_after_max) < 2
328
329 # Avoid test failure caused by mupdf warnings.
330 wt = pymupdf.TOOLS.mupdf_warnings()
331 print(f'{wt=}')
332 assert wt == 'bogus font ascent/descent values (0 / 0)\n... repeated 5 times...'