view tests/test_font.py @ 46:7ee69f120f19 default tip

>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Oct 2025 17:17:30 +0200
parents a6bc019ac0b2
children
line wrap: on
line source

"""
Tests for the Font class.
"""
import os
import platform
import pymupdf
import subprocess
import textwrap

import util


def test_font1():
    """Check text metrics, buffer round-trip and bbox of the "helv" font.

    Verifies that the per-character lengths add up to the total text length,
    that each one equals the glyph advance scaled by the font size, that a
    font rebuilt from `font.buffer` reports the same valid codepoints, and -
    when `pymupdf.mupdf` is available - that `font.bbox` equals the value
    returned by `fz_font_bbox()`.
    """
    text = "PyMuPDF"
    fontsize = 20
    font = pymupdf.Font("helv")
    assert font.name == "Helvetica"
    tl = font.text_length(text, fontsize=fontsize)
    cl = font.char_lengths(text, fontsize=fontsize)
    assert len(text) == len(cl)
    assert abs(sum(cl) - tl) < pymupdf.EPSILON
    # Lengths were checked to match above, so zip() pairs every character
    # with its own length.
    for char, length in zip(text, cl):
        assert length == font.glyph_advance(ord(char)) * fontsize

    # A font created from the first font's buffer must expose the same
    # codepoints.
    font2 = pymupdf.Font(fontbuffer=font.buffer)
    codepoints1 = font.valid_codepoints()
    codepoints2 = font2.valid_codepoints()
    print('')
    print(f'{len(codepoints1)=}')
    print(f'{len(codepoints2)=}')
    if 0:
        # Disabled diagnostics: list both codepoint sequences side by side
        # and mark differing entries with '*'.
        for i, (ucs1, ucs2) in enumerate(zip(codepoints1, codepoints2)):
            print(f'    {i}: {ucs1=} {ucs2=} {"" if ucs1==ucs2 else "*"}')
    assert codepoints2 == codepoints1

    # Also check we can get font's bbox.
    bbox1 = font.bbox
    print(f'{bbox1=}')
    if hasattr(pymupdf, 'mupdf'):
        bbox2 = font.this.fz_font_bbox()
        assert bbox2 == bbox1


def test_font2():
    """Font.text_length() and pymupdf.get_text_length() must agree."""
    sample = "PyMuPDF"
    helvetica = pymupdf.Font("helv")
    length_from_font = helvetica.text_length(sample)
    length_from_module = pymupdf.get_text_length(sample)
    assert length_from_font == length_from_module


def test_fontname():
    """Check that insert_font() works with defaults and rejects a bad fontname."""
    document = pymupdf.open()
    first_page = document.new_page()

    # With default arguments the call must return something truthy.
    assert first_page.insert_font()

    # The fontname is expected to be rejected first, so the bogus font file
    # never needs to exist.
    try:
        first_page.insert_font(fontname="illegal/char", fontfile="unimportant")
    except ValueError as exc:
        rejected = str(exc).startswith("bad fontname chars")
    else:
        # No exception at all means the illegal name went undetected.
        rejected = False
    assert rejected

def test_2608():
    """Compare one text block of 2201.00069.pdf against a stored expectation.

    Extracts the text of block 10 on page 0 with the DEHYPHENATE and
    MEDIABOX_CLIP flags, writes it to `tests/test_2608_out`, and compares it
    with the expected file chosen according to the MuPDF version.
    """
    tests_dir = f'{__file__}/../../tests'
    path_pdf = os.path.abspath(f'{tests_dir}/resources/2201.00069.pdf')
    path_out = os.path.abspath(f'{tests_dir}/test_2608_out')
    path_expected = os.path.normpath(f'{tests_dir}/resources/test_2608_expected')
    path_expected_1_26 = os.path.normpath(f'{tests_dir}/resources/test_2608_expected_1.26')

    flags = pymupdf.TEXT_DEHYPHENATE | pymupdf.TEXT_MEDIABOX_CLIP
    with pymupdf.open(path_pdf) as doc:
        text = doc[0].get_text_blocks(flags=flags)[10][4]
        with open(path_out, 'wb') as out:
            out.write(text.encode('utf8'))

        # MuPDF 1.27 and later use the default expectation file; older
        # versions use the 1.26 one.
        newer_mupdf = pymupdf.mupdf_version_tuple >= (1, 27)
        path_expected2 = path_expected if newer_mupdf else path_expected_1_26
        with open(path_expected2, 'rb') as stored:
            expected = stored.read().decode('utf8')

        # Github windows x32 seems to insert \r characters; maybe something to
        # do with the Python installation's line endings settings.
        expected = expected.replace('\r', '')
        print(f'test_2608(): {text.encode("utf8")=}')
        print(f'test_2608(): {expected.encode("utf8")=}')
        assert text == expected

def test_fontarchive():
    """Check the archive entries created by css_for_pymupdf_font() for "notos".

    Skipped on Pyodide (detected via the PYODIDE_ROOT environment variable),
    where the font code 'notos' is reported as not found.
    """
    if os.environ.get('PYODIDE_ROOT'):
        print('test_fontarchive(): not running on Pyodide - we get ValueError: No font code \'notos\' found in pymupdf-fonts..')
        return

    arch = pymupdf.Archive()
    css = pymupdf.css_for_pymupdf_font("notos", archive=arch, name="sans-serif")
    print(css)
    print(arch.entry_list)
    # The archive must hold exactly one 'tree' entry listing the four font
    # codes in this order.
    assert arch.entry_list == \
            [
                {
                    'fmt': 'tree',
                    'entries':
                    [
                        'notosbo', 'notosbi', 'notosit', 'notos'
                    ],
                    'path': None
                }
            ]

def test_load_system_font():
    """Check that fz_load_system_font() invokes the installed Python callback.

    Three recording callbacks are installed; loading "some-font-name" must
    result in exactly one recorded call carrying the name and three zero
    flags.  Does nothing when `pymupdf.mupdf` is unavailable.
    """
    if not hasattr(pymupdf, 'mupdf'):
        print('test_load_system_font(): Not running on classic.')
        return

    # Every callback appends its arguments here and declines by returning
    # None.
    trace = []

    def record_font(name, bold, italic, needs_exact_metrics):
        """Record a plain font lookup."""
        trace.append((name, bold, italic, needs_exact_metrics))
        return None

    def record_cjk(name, ordering, serif):
        """Record a CJK font lookup."""
        trace.append((name, ordering, serif))
        return None

    def record_fallback(script, language, serif, bold, italic):
        """Record a fallback font lookup."""
        trace.append((script, language, serif, bold, italic))
        return None

    pymupdf.mupdf.fz_install_load_system_font_funcs(
            record_font,
            record_cjk,
            record_fallback,
            )
    f = pymupdf.mupdf.fz_load_system_font("some-font-name", 0, 0, 0)
    expected_trace = [
            ('some-font-name', 0, 0, 0),
            ]
    assert trace == expected_trace, f'Incorrect {trace=}.'
    print(f'test_load_system_font(): {f.m_internal=}')


def test_mupdf_subset_fonts2():
    """Call pdf_subset_fonts2() with the even page numbers of 2.pdf.

    Does nothing when `pymupdf.mupdf` is unavailable.  There is no explicit
    assertion; the test passes if the call completes without raising.
    """
    if not hasattr(pymupdf, 'mupdf'):
        print('Not running on rebased.')
        return
    path = os.path.abspath(f'{__file__}/../../tests/resources/2.pdf')
    with pymupdf.open(path) as doc:
        # 0, 2, 4, ... - one entry for each complete pair of pages.
        pages = list(range(0, len(doc) // 2 * 2, 2))
        print(f'{pages=}.')
        pdf_document = pymupdf._as_pdf_document(doc)
        pymupdf.mupdf.pdf_subset_fonts2(pdf_document, pages)


def test_3677():
    """Span font names of test_3677.pdf must match the expected prefixed names.

    `set_subset_fontnames(True)` is enabled for the duration of the test and
    switched back to False afterwards, even on failure.
    """
    pymupdf.TOOLS.set_subset_fontnames(True)
    try:
        path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf')
        font_names_expected = [
                'BCDEEE+Aptos',
                'BCDFEE+Aptos',
                'BCDGEE+Calibri-Light',
                'BCDHEE+Calibri-Light',
                ]
        font_names = []
        with pymupdf.open(path) as document:
            for page in document:
                for block in page.get_text('dict')['blocks']:
                    # Only blocks of type 0 that carry a 'lines' entry are
                    # inspected.
                    if block['type'] != 0 or 'lines' not in block:
                        continue
                    for line in block['lines']:
                        for span in line['spans']:
                            font_name = span['font']
                            print(font_name)
                            font_names.append(font_name)
        assert font_names == font_names_expected, f'{font_names=}'
    finally:
        pymupdf.TOOLS.set_subset_fontnames(False)


def test_3933():
    """Fonts on page 0 of test_3933.pdf must report the expected codepoint counts.

    Fonts for which extract_font() yields no content are skipped.
    """
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3933.pdf')

    # Font name -> expected number of valid codepoints.
    expected = {
            'BCDEEE+Calibri': 39,
            'BCDFEE+SwissReSan-Regu': 53,
            'BCDGEE+SwissReSan-Ital': 20,
            'BCDHEE+SwissReSan-Bold': 20,
            'BCDIEE+SwissReSan-Regu': 53,
            'BCDJEE+Calibri': 39,
            }

    with pymupdf.open(path) as document:
        page = document[0]
        print(f'{len(page.get_fonts())=}')

        for xref, _, _, name, _, _ in page.get_fonts():
            _, _, _, content = document.extract_font(xref)
            if not content:
                continue

            font = pymupdf.Font(fontname=name, fontbuffer=content)
            supported_symbols = font.valid_codepoints()
            print(f'Font {name}: {len(supported_symbols)=}.', flush=1)
            # A name missing from `expected` yields None and so fails here.
            assert len(supported_symbols) == expected.get(name)


def test_3780():
    """Load every font of test_3780.pdf and dump the spans of the first page.

    There are no explicit assertions; the test passes if every font can be
    built from its extracted buffer and the first page's text dictionary can
    be walked without raising.
    """
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3780.pdf')
    with pymupdf.open(path) as document:
        for page_i, page in enumerate(document):
            for font_item in page.get_fonts():
                xref = font_item[0]
                buff = document.extract_font(xref)[-1]
                font = pymupdf.Font(fontbuffer=buff)
                print(f'{page_i=}: xref {xref} {font.name=} {font.ascender=} {font.descender=}.')

            # Span details are only printed for the first page.
            if page_i != 0:
                continue
            text_dict = page.get_text('dict')
            for block_no, block in enumerate(text_dict['blocks']):
                print(f'block {block_no}:')
                for line_no, line in enumerate(block['lines']):
                    print(f'    line {line_no}:')
                    for span_no, span in enumerate(line['spans']):
                        print(f'        span {span_no}:')
                        for key, value in span.items():
                            print(f'            {key}: {value!r}')


def test_3887():
    """After subset_fonts(fallback=False), page 0 must still yield the expected characters.

    The subsetted document is saved with ez_save() and reopened; the set of
    characters extracted from its first page must equal the set of characters
    in the reference text.  A PNG of the page is also saved.
    """
    print(f'{pymupdf.version=}')
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3887.pdf')
    path2 = os.path.normpath(f'{__file__}/../../tests/resources/test_3887.pdf.ez.pdf')

    # Reference text, split into two literals for readability.
    text = (
            "\u0391\u3001\u0392\u3001\u0393\u3001\u0394\u3001\u0395\u3001\u0396\u3001\u0397\u3001\u0398\u3001\u0399\u3001\u039a\u3001\u039b\u3001\u039c\u3001\u039d\u3001\u039e\u3001\u039f\u3001\u03a0\u3001\u03a1\u3001\u03a3\u3001\u03a4\u3001\u03a5\u3001\u03a6\u3001\u03a7\u3001\u03a8\u3001\u03a9\u3002"
            "\u03b1\u3001\u03b2\u3001\u03b3\u3001\u03b4\u3001\u03b5\u3001\u03b6\u3001\u03b7\u3001\u03b8\u3001\u03b9\u3001\u03ba\u3001\u03bb\u3001\u03bc\u3001\u03bd\u3001\u03be\u3001\u03bf\u3001\u03c0\u3001\u03c1\u3001\u03c2\u3001\u03c4\u3001\u03c5\u3001\u03c6\u3001\u03c7\u3001\u03c8\u3001\u03c9\u3002"
            )

    with pymupdf.open(path) as document:
        document.subset_fonts(fallback=False)
        document.ez_save(path2)

    with pymupdf.open(path2) as document:
        page = document[0]
        # Collect every character of every span on the page.
        output = []
        for block in page.get_text("rawdict", flags=0)["blocks"]:
            for line in block["lines"]:
                for span in line["spans"]:
                    for char in span["chars"]:
                        output.append(char["c"])
        print(f'text:\n    {text}')
        print(f'output:\n    {output}')
        pixmap = page.get_pixmap()
        path_pixmap = f'{path}.0.png'
        pixmap.save(path_pixmap)
        print(f'Have saved to: {path_pixmap=}')
        assert set(output) == set(text)


def test_4457():
    """Check text and rendering of two PDFs before and after subset_fonts().

    For each downloaded file the first page is rendered and its text
    extracted.  The document is then saved unchanged (`.before.pdf`) and
    again after `subset_fonts()` (`.after.pdf`).  The unchanged copy must give
    identical text and a zero RMS pixmap difference.  The subsetted copy must
    give a zero RMS difference on MuPDF >= 1.26.6, and otherwise an RMS within
    2 of the per-file value listed in `files`.  Finally the accumulated MuPDF
    warnings are compared against a fixed string.

    Pixmaps, a difference image and text dumps are written to disk as
    diagnostics.  Skipped on Pyodide (detected via PYODIDE_ROOT).
    """
    if os.environ.get('PYODIDE_ROOT'):
        print('test_4457(): not running on Pyodide - cannot run child processes.')
        return

    # Imported locally: `shlex` is only needed to quote paths for the
    # diagnostic `diff` commands; `gentle_compare` is a test helper module.
    import shlex
    import gentle_compare

    print()
    # (url, local name, size passed to util.download, expected RMS after
    # subsetting on older MuPDF).
    files = (
            ('https://github.com/user-attachments/files/20862923/test_4457_a.pdf', 'test_4457_a.pdf', None, 4),
            ('https://github.com/user-attachments/files/20862922/test_4457_b.pdf', 'test_4457_b.pdf', None, 9),
            )
    for url, name, size, rms_old_after_max in files:
        path = util.download(url, name, size)

        # NOTE(review): the PyMuPDF calls below are kept in their original
        # order and number; the warnings assertion at the end presumably
        # depends on them.
        with pymupdf.open(path) as document:
            page = document[0]

            pixmap = document[0].get_pixmap()
            path_pixmap = f'{path}.png'
            pixmap.save(path_pixmap)
            print(f'Have created: {path_pixmap=}')

            text = page.get_text()
            path_before = f'{path}.before.pdf'
            path_after = f'{path}.after.pdf'
            document.ez_save(path_before, garbage=4)
            print(f'Have created {path_before=}')

            document.subset_fonts()
            document.ez_save(path_after, garbage=4)
            print(f'Have created {path_after=}')

        with pymupdf.open(path_before) as document:
            text_before = document[0].get_text()
            pixmap_before = document[0].get_pixmap()
            path_pixmap_before = f'{path_before}.png'
            pixmap_before.save(path_pixmap_before)
            print(f'Have created: {path_pixmap_before=}')

        with pymupdf.open(path_after) as document:
            text_after = document[0].get_text()
            pixmap_after = document[0].get_pixmap()
            path_pixmap_after = f'{path_after}.png'
            pixmap_after.save(path_pixmap_after)
            print(f'Have created: {path_pixmap_after=}')

        rms_before = gentle_compare.pixmaps_rms(pixmap, pixmap_before)
        rms_after = gentle_compare.pixmaps_rms(pixmap, pixmap_after)
        print(f'{rms_before=}')
        print(f'{rms_after=}')

        # Create .png file showing differences between <path> and <path_after>.
        path_pixmap_after_diff = f'{path_after}.diff.png'
        pixmap_after_diff = gentle_compare.pixmaps_diff(pixmap, pixmap_after)
        pixmap_after_diff.save(path_pixmap_after_diff)
        print(f'Have created: {path_pixmap_after_diff}')

        # Extract text from <path>, <path_before> and <path_after> and write to
        # files so we can show differences with `diff`.
        path_text = os.path.normpath(f'{__file__}/../../tests/test_4457.txt')
        path_text_before = f'{path_text}.before.txt'
        path_text_after = f'{path_text}.after.txt'
        with open(path_text, 'w', encoding='utf8') as f:
            f.write(text)
        with open(path_text_before, 'w', encoding='utf8') as f:
            f.write(text_before)
        with open(path_text_after, 'w', encoding='utf8') as f:
            f.write(text_after)

        # Can't write text to stdout on Windows because of encoding errors.
        if platform.system() != 'Windows':
            print(f'text:\n{textwrap.indent(text, "    ")}')
            print(f'text_before:\n{textwrap.indent(text_before, "    ")}')
            print(f'text_after:\n{textwrap.indent(text_after, "    ")}')
            print(f'{path_text=}')
            print(f'{path_text_before=}')
            print(f'{path_text_after=}')

            # Purely diagnostic: show how each saved variant's text differs
            # from the original.  Paths are shell-quoted so that directories
            # containing spaces or shell metacharacters do not break the
            # command.  The exit status is deliberately ignored.
            for path_other in (path_text_before, path_text_after):
                command = f'diff -u {shlex.quote(path_text)} {shlex.quote(path_other)}'
                print(f'Running: {command}', flush=1)
                subprocess.run(command, shell=1)

        assert text_before == text
        assert rms_before == 0

        if pymupdf.mupdf_version_tuple >= (1, 26, 6):
            assert rms_after == 0
        else:
            # As of 2025-05-20 there are some differences in some characters,
            # e.g. the non-ascii characters in `Philipp Krahenbuhl`.  See
            # <path_pixmap> and <path_pixmap_after>.
            assert abs(rms_after - rms_old_after_max) < 2

    # Avoid test failure caused by mupdf warnings.
    wt = pymupdf.TOOLS.mupdf_warnings()
    print(f'{wt=}')
    assert wt == 'bogus font ascent/descent values (0 / 0)\n... repeated 5 times...'