Mercurial > hgrepos > Python2 > PyMuPDF
diff tests/test_font.py @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | 1d09e1dec1d9 |
| children | a6bc019ac0b2 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_font.py Mon Sep 15 11:44:09 2025 +0200 @@ -0,0 +1,332 @@ +""" +Tests for the Font class. +""" +import os +import platform +import pymupdf +import subprocess +import textwrap + +import util + + +def test_font1(): + text = "PyMuPDF" + font = pymupdf.Font("helv") + assert font.name == "Helvetica" + tl = font.text_length(text, fontsize=20) + cl = font.char_lengths(text, fontsize=20) + assert len(text) == len(cl) + assert abs(sum(cl) - tl) < pymupdf.EPSILON + for i in range(len(cl)): + assert cl[i] == font.glyph_advance(ord(text[i])) * 20 + font2 = pymupdf.Font(fontbuffer=font.buffer) + codepoints1 = font.valid_codepoints() + codepoints2 = font2.valid_codepoints() + print('') + print(f'{len(codepoints1)=}') + print(f'{len(codepoints2)=}') + if 0: + for i, (ucs1, ucs2) in enumerate(zip(codepoints1, codepoints2)): + print(f' {i}: {ucs1=} {ucs2=} {"" if ucs2==ucs2 else "*"}') + assert font2.valid_codepoints() == font.valid_codepoints() + + # Also check we can get font's bbox. + bbox1 = font.bbox + print(f'{bbox1=}') + if hasattr(pymupdf, 'mupdf'): + bbox2 = font.this.fz_font_bbox() + assert bbox2 == bbox1 + + +def test_font2(): + """Old and new length computation must be the same.""" + font = pymupdf.Font("helv") + text = "PyMuPDF" + assert font.text_length(text) == pymupdf.get_text_length(text) + + +def test_fontname(): + """Assert a valid PDF fontname.""" + doc = pymupdf.open() + page = doc.new_page() + assert page.insert_font() # assert: a valid fontname works! + detected = False # preset indicator + try: # fontname check will fail first - don't need a font at all here + page.insert_font(fontname="illegal/char", fontfile="unimportant") + except ValueError as e: + if str(e).startswith("bad fontname chars"): + detected = True # illegal fontname detected + assert detected + +def test_2608(): + flags = (pymupdf.TEXT_DEHYPHENATE | pymupdf.TEXT_MEDIABOX_CLIP) + with pymupdf.open(os.path.abspath(f'{__file__}/../../tests/resources/2201.00069.pdf')) as doc: + page = doc[0] + blocks = page.get_text_blocks(flags=flags) + text = blocks[10][4] + with open(os.path.abspath(f'{__file__}/../../tests/test_2608_out'), 'wb') as f: + f.write(text.encode('utf8')) + path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_2608_expected') + path_expected_1_26 = os.path.normpath(f'{__file__}/../../tests/resources/test_2608_expected_1.26') + if pymupdf.mupdf_version_tuple >= (1, 27): + path_expected2 = path_expected + else: + path_expected2 = path_expected_1_26 + with open(path_expected2, 'rb') as f: + expected = f.read().decode('utf8') + # Github windows x32 seems to insert \r characters; maybe something to + # do with the Python installation's line endings settings. + expected = expected.replace('\r', '') + print(f'test_2608(): {text.encode("utf8")=}') + print(f'test_2608(): {expected.encode("utf8")=}') + assert text == expected + +def test_fontarchive(): + import subprocess + arch = pymupdf.Archive() + css = pymupdf.css_for_pymupdf_font("notos", archive=arch, name="sans-serif") + print(css) + print(arch.entry_list) + assert arch.entry_list == \ + [ + { + 'fmt': 'tree', + 'entries': + [ + 'notosbo', 'notosbi', 'notosit', 'notos' + ], + 'path': None + } + ] + +def test_load_system_font(): + if not hasattr(pymupdf, 'mupdf'): + print(f'test_load_system_font(): Not running on classic.') + return + trace = list() + def font_f(name, bold, italic, needs_exact_metrics): + trace.append((name, bold, italic, needs_exact_metrics)) + #print(f'test_load_system_font():font_f(): Looking for font: {name=} {bold=} {italic=} {needs_exact_metrics=}.') + return None + def f_cjk(name, ordering, serif): + trace.append((name, ordering, serif)) + #print(f'test_load_system_font():f_cjk(): Looking for font: {name=} {ordering=} {serif=}.') + return None + def f_fallback(script, language, serif, bold, italic): + trace.append((script, language, serif, bold, italic)) + #print(f'test_load_system_font():f_fallback(): looking for font: {script=} {language=} {serif=} {bold=} {italic=}.') + return None + pymupdf.mupdf.fz_install_load_system_font_funcs(font_f, f_cjk, f_fallback) + f = pymupdf.mupdf.fz_load_system_font("some-font-name", 0, 0, 0) + assert trace == [ + ('some-font-name', 0, 0, 0), + ], f'Incorrect {trace=}.' + print(f'test_load_system_font(): {f.m_internal=}') + + +def test_mupdf_subset_fonts2(): + if not hasattr(pymupdf, 'mupdf'): + print('Not running on rebased.') + return + path = os.path.abspath(f'{__file__}/../../tests/resources/2.pdf') + with pymupdf.open(path) as doc: + n = len(doc) + pages = [i*2 for i in range(n//2)] + print(f'{pages=}.') + pymupdf.mupdf.pdf_subset_fonts2(pymupdf._as_pdf_document(doc), pages) + + +def test_3677(): + pymupdf.TOOLS.set_subset_fontnames(True) + try: + path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf') + font_names_expected = [ + 'BCDEEE+Aptos', + 'BCDFEE+Aptos', + 'BCDGEE+Calibri-Light', + 'BCDHEE+Calibri-Light', + ] + font_names = list() + with pymupdf.open(path) as document: + for page in document: + for block in page.get_text('dict')['blocks']: + if block['type'] == 0: + if 'lines' in block.keys(): + for line in block['lines']: + for span in line['spans']: + font_name=span['font'] + print(font_name) + font_names.append(font_name) + assert font_names == font_names_expected, f'{font_names=}' + finally: + pymupdf.TOOLS.set_subset_fontnames(False) + + +def test_3933(): + path = os.path.normpath(f'{__file__}/../../tests/resources/test_3933.pdf') + with pymupdf.open(path) as document: + page = document[0] + print(f'{len(page.get_fonts())=}') + + expected = { + 'BCDEEE+Calibri': 39, + 'BCDFEE+SwissReSan-Regu': 53, + 'BCDGEE+SwissReSan-Ital': 20, + 'BCDHEE+SwissReSan-Bold': 20, + 'BCDIEE+SwissReSan-Regu': 53, + 'BCDJEE+Calibri': 39, + } + + for xref, _, _, name, _, _ in page.get_fonts(): + _, _, _, content = document.extract_font(xref) + + if content: + font = pymupdf.Font(fontname=name, fontbuffer=content) + supported_symbols = font.valid_codepoints() + print(f'Font {name}: {len(supported_symbols)=}.', flush=1) + assert len(supported_symbols) == expected.get(name) + + +def test_3780(): + path = os.path.normpath(f'{__file__}/../../tests/resources/test_3780.pdf') + with pymupdf.open(path) as document: + for page_i, page in enumerate(document): + for itm in page.get_fonts(): + buff=document.extract_font(itm[0])[-1] + font=pymupdf.Font(fontbuffer=buff) + print(f'{page_i=}: xref {itm[0]} {font.name=} {font.ascender=} {font.descender=}.') + if page_i == 0: + d = page.get_text('dict') + #for n, v in d.items(): + # print(f' {n}: {v!r}') + for i, block in enumerate(d['blocks']): + print(f'block {i}:') + for j, line in enumerate(block['lines']): + print(f' line {j}:') + for k, span in enumerate(line['spans']): + print(f' span {k}:') + for n, v in span.items(): + print(f' {n}: {v!r}') + + +def test_3887(): + print(f'{pymupdf.version=}') + path = os.path.normpath(f'{__file__}/../../tests/resources/test_3887.pdf') + + path2 = os.path.normpath(f'{__file__}/../../tests/resources/test_3887.pdf.ez.pdf') + with pymupdf.open(path) as document: + document.subset_fonts(fallback=False) + document.ez_save(path2) + + with pymupdf.open(path2) as document: + text = f"\u0391\u3001\u0392\u3001\u0393\u3001\u0394\u3001\u0395\u3001\u0396\u3001\u0397\u3001\u0398\u3001\u0399\u3001\u039a\u3001\u039b\u3001\u039c\u3001\u039d\u3001\u039e\u3001\u039f\u3001\u03a0\u3001\u03a1\u3001\u03a3\u3001\u03a4\u3001\u03a5\u3001\u03a6\u3001\u03a7\u3001\u03a8\u3001\u03a9\u3002\u03b1\u3001\u03b2\u3001\u03b3\u3001\u03b4\u3001\u03b5\u3001\u03b6\u3001\u03b7\u3001\u03b8\u3001\u03b9\u3001\u03ba\u3001\u03bb\u3001\u03bc\u3001\u03bd\u3001\u03be\u3001\u03bf\u3001\u03c0\u3001\u03c1\u3001\u03c2\u3001\u03c4\u3001\u03c5\u3001\u03c6\u3001\u03c7\u3001\u03c8\u3001\u03c9\u3002" + page = document[0] + chars = [c for b in page.get_text("rawdict",flags=0)["blocks"] for l in b["lines"] for s in l["spans"] for c in s["chars"]] + output = [c["c"] for c in chars] + print(f'text:\n {text}') + print(f'output:\n {output}') + pixmap = page.get_pixmap() + path_pixmap = f'{path}.0.png' + pixmap.save(path_pixmap) + print(f'Have saved to: {path_pixmap=}') + assert set(output)==set(text) + + +def test_4457(): + print() + files = ( + ('https://github.com/user-attachments/files/20862923/test_4457_a.pdf', 'test_4457_a.pdf', None, 4), + ('https://github.com/user-attachments/files/20862922/test_4457_b.pdf', 'test_4457_b.pdf', None, 9), + ) + for url, name, size, rms_old_after_max in files: + path = util.download(url, name, size) + + with pymupdf.open(path) as document: + page = document[0] + + pixmap = document[0].get_pixmap() + path_pixmap = f'{path}.png' + pixmap.save(path_pixmap) + print(f'Have created: {path_pixmap=}') + + text = page.get_text() + path_before = f'{path}.before.pdf' + path_after = f'{path}.after.pdf' + document.ez_save(path_before, garbage=4) + print(f'Have created {path_before=}') + + document.subset_fonts() + document.ez_save(path_after, garbage=4) + print(f'Have created {path_after=}') + + with pymupdf.open(path_before) as document: + text_before = document[0].get_text() + pixmap_before = document[0].get_pixmap() + path_pixmap_before = f'{path_before}.png' + pixmap_before.save(path_pixmap_before) + print(f'Have created: {path_pixmap_before=}') + + with pymupdf.open(path_after) as document: + text_after = document[0].get_text() + pixmap_after = document[0].get_pixmap() + path_pixmap_after = f'{path_after}.png' + pixmap_after.save(path_pixmap_after) + print(f'Have created: {path_pixmap_after=}') + + import gentle_compare + rms_before = gentle_compare.pixmaps_rms(pixmap, pixmap_before) + rms_after = gentle_compare.pixmaps_rms(pixmap, pixmap_after) + print(f'{rms_before=}') + print(f'{rms_after=}') + + # Create .png file showing differences between <path> and <path_after>. + path_pixmap_after_diff = f'{path_after}.diff.png' + pixmap_after_diff = gentle_compare.pixmaps_diff(pixmap, pixmap_after) + pixmap_after_diff.save(path_pixmap_after_diff) + print(f'Have created: {path_pixmap_after_diff}') + + # Extract text from <path>, <path_before> and <path_after> and write to + # files so we can show differences with `diff`. + path_text = os.path.normpath(f'{__file__}/../../tests/test_4457.txt') + path_text_before = f'{path_text}.before.txt' + path_text_after = f'{path_text}.after.txt' + with open(path_text, 'w', encoding='utf8') as f: + f.write(text) + with open(path_text_before, 'w', encoding='utf8') as f: + f.write(text_before) + with open(path_text_after, 'w', encoding='utf8') as f: + f.write(text_after) + + # Can't write text to stdout on Windows because of encoding errors. + if platform.system() != 'Windows': + print(f'text:\n{textwrap.indent(text, " ")}') + print(f'text_before:\n{textwrap.indent(text_before, " ")}') + print(f'text_after:\n{textwrap.indent(text_after, " ")}') + print(f'{path_text=}') + print(f'{path_text_before=}') + print(f'{path_text_after=}') + + command = f'diff -u {path_text} {path_text_before}' + print(f'Running: {command}', flush=1) + subprocess.run(command, shell=1) + + command = f'diff -u {path_text} {path_text_after}' + print(f'Running: {command}', flush=1) + subprocess.run(command, shell=1) + + assert text_before == text + assert rms_before == 0 + + if pymupdf.mupdf_version_tuple >= (1, 26, 6): + assert rms_after == 0 + else: + # As of 2025-05-20 there are some differences in some characters, + # e.g. the non-ascii characters in `Philipp Krahenbuhl`. See + # <path_pixmap> and <path_pixmap_after>. + assert abs(rms_after - rms_old_after_max) < 2 + + # Avoid test failure caused by mupdf warnings. + wt = pymupdf.TOOLS.mupdf_warnings() + print(f'{wt=}') + assert wt == 'bogus font ascent/descent values (0 / 0)\n... repeated 5 times...'
