Mercurial > hgrepos > Python2 > PyMuPDF

"""
Fill a given text in a rectangle on some PDF page using
1. TextWriter object
2. Basic text output

Check text is indeed contained in given rectangle.
"""
import pymupdf

import gentle_compare

import os
import textwrap

# Multi-paragraph German sample text (contains non-ASCII characters such as
# umlauts and sharp s). It is the text inserted by test_textbox1 through
# test_textbox4; later tests define their own local text instead.
# codespell:ignore-begin
text = """Der Kleine Schwertwal (Pseudorca crassidens), auch bekannt als Unechter oder Schwarzer Schwertwal, ist eine Art der Delfine (Delphinidae) und der einzige rezente Vertreter der Gattung Pseudorca.

Er ähnelt dem Orca in Form und Proportionen, ist aber einfarbig schwarz und mit einer Maximallänge von etwa sechs Metern deutlich kleiner.

Kleine Schwertwale bilden Schulen von durchschnittlich zehn bis fünfzig Tieren, wobei sie sich auch mit anderen Delfinen vergesellschaften und sich meistens abseits der Küsten aufhalten.

Sie sind in allen Ozeanen gemäßigter, subtropischer und tropischer Breiten beheimatet, sind jedoch vor allem in wärmeren Jahreszeiten auch bis in die gemäßigte bis subpolare Zone südlich der Südspitze Südamerikas, vor Nordeuropa und bis vor Kanada anzutreffen."""
# codespell:ignore-end

def test_textbox1():
    """Fill a rectangle through a TextWriter and verify text containment.

    The module-level sample text is laid out left-aligned at 12pt inside
    the rectangle, written to a fresh page, and then the full page text is
    compared with the text extracted from inside the rectangle only.
    """
    pdf = pymupdf.open()
    page = pdf.new_page()
    box = pymupdf.Rect(50, 50, 400, 400)
    text_color = (0, 0, 1)  # blue
    writer = pymupdf.TextWriter(page.rect, color=text_color)
    writer.fill_textbox(box, text, align=pymupdf.TEXT_ALIGN_LEFT, fontsize=12)

    # Write with a morph anchored at the rectangle's "tl" point.
    morph = (box.tl, pymupdf.Matrix(1, 1))
    writer.write_text(page, morph=morph)

    # Nothing may lie outside the rectangle: clipping must not drop text.
    whole_page_text = page.get_text()
    clipped_text = page.get_text(clip=box)
    assert whole_page_text == clipped_text

    # Additionally call the page-level method with the same writer.
    page.write_text(writers=writer)


def test_textbox2():
    """Fill a rectangle through Page.insert_textbox and verify containment.

    The text is inserted left-aligned at 12pt, colored "lightblue" and
    attached to an optional content group created on the document.
    """
    pdf = pymupdf.open()
    layer = pdf.add_ocg("ocg1")
    page = pdf.new_page()
    box = pymupdf.Rect(50, 50, 400, 400)
    text_color = pymupdf.utils.getColor("lightblue")
    # The result is not used below; presumably the call is kept so that
    # getColorHSV() is invoked at least once by this test.
    _unused_hsv = pymupdf.utils.getColorHSV("red")

    page.insert_textbox(
        box,
        text,
        align=pymupdf.TEXT_ALIGN_LEFT,
        fontsize=12,
        color=text_color,
        oc=layer,
    )

    # Nothing may lie outside the rectangle: clipping must not drop text.
    whole_page_text = page.get_text()
    clipped_text = page.get_text(clip=box)
    assert whole_page_text == clipped_text


def test_textbox3():
    """Fill a rectangle via TextWriter using the "cjk" font, right-to-left.

    After the containment check the document is additionally passed
    through scrub() and subset_fonts().
    """
    pdf = pymupdf.open()
    page = pdf.new_page()
    cjk_font = pymupdf.Font("cjk")
    box = pymupdf.Rect(50, 50, 400, 400)
    text_color = (0, 0, 1)  # blue
    writer = pymupdf.TextWriter(page.rect, color=text_color)
    writer.fill_textbox(
        box,
        text,
        align=pymupdf.TEXT_ALIGN_LEFT,
        font=cjk_font,
        fontsize=12,
        right_to_left=True,
    )

    morph = (box.tl, pymupdf.Matrix(1, 1))
    writer.write_text(page, morph=morph)

    # Nothing may lie outside the rectangle: clipping must not drop text.
    whole_page_text = page.get_text()
    clipped_text = page.get_text(clip=box)
    assert whole_page_text == clipped_text

    # Post-processing calls; they only need to complete without raising.
    pdf.scrub()
    pdf.subset_fonts()


def test_textbox4():
    """Fill a rectangle via TextWriter using the "cour" font, right-to-left.

    The text is written under an optional content group and must be fully
    contained in the (taller, 350 x 550) target rectangle.
    """
    pdf = pymupdf.open()
    layer = pdf.add_ocg("ocg1")
    page = pdf.new_page()
    box = pymupdf.Rect(50, 50, 400, 600)
    text_color = (0, 0, 1)  # blue
    writer = pymupdf.TextWriter(page.rect, color=text_color)
    writer.fill_textbox(
        box,
        text,
        align=pymupdf.TEXT_ALIGN_LEFT,
        fontsize=12,
        font=pymupdf.Font("cour"),
        right_to_left=True,
    )

    morph = (box.tl, pymupdf.Matrix(1, 1))
    writer.write_text(page, oc=layer, morph=morph)

    # Nothing may lie outside the rectangle: clipping must not drop text.
    whole_page_text = page.get_text()
    clipped_text = page.get_text(clip=box)
    assert whole_page_text == clipped_text


def test_textbox5():
    """Shrink the font size until justified text fits a small rectangle.

    Runs with the global "small glyph heights" setting switched on and
    restores the previous value afterwards, whatever the outcome.
    """
    previous_setting = pymupdf.TOOLS.set_small_glyph_heights()
    pymupdf.TOOLS.set_small_glyph_heights(True)
    try:
        pdf = pymupdf.open()
        page = pdf.new_page()
        box = pymupdf.Rect(100, 100, 150, 150)
        words = "words and words and words and more words..."
        size = 12
        page.draw_rect(box)

        # Retry with a font size reduced by half a point each time for as
        # long as insert_textbox() returns a negative value.
        while True:
            result = page.insert_textbox(
                box,
                words,
                fontsize=size,
                align=pymupdf.TEXT_ALIGN_JUSTIFY,
            )
            size -= 0.5
            if result >= 0:
                break

        # The first extracted text block must lie within the rectangle.
        first_block = page.get_text("blocks")[0]
        block_bbox = pymupdf.Rect(first_block[:4])
        assert block_bbox in box
    finally:
        # Always put the global setting back; leaving it changed can make
        # other tests fail.
        pymupdf.TOOLS.set_small_glyph_heights(previous_setting)


def test_2637():
    """Verify that text reported as fitting really lies inside the rectangle.

    Starting at 50pt, the font size is lowered one point at a time until
    insert_textbox() stops returning a negative value; the resulting text
    block must then be contained in the target rectangle.
    """
    pdf = pymupdf.open()
    page = pdf.new_page()
    sentences = (
        "The morning sun painted the sky with hues of orange and pink. ",
        "Birds chirped harmoniously, greeting the new day. ",
        "Nature awakened, filling the air with life and promise.",
    )
    content = "".join(sentences)
    box = pymupdf.Rect(50, 50, 500, 280)

    # Search downwards for the largest font size at which the text fits.
    size = 50
    while True:
        result = page.insert_textbox(box, content, fontname="hebo", fontsize=size)
        size -= 1
        if result >= 0:
            break

    # The inserted text must not extend beyond the rectangle.
    first_block = page.get_text("blocks")[0]
    block_bbox = pymupdf.Rect(first_block[:4])
    assert block_bbox in box


def test_htmlbox1():
    """Insert styled HTML into a rectangle at four rotations and verify it.

    One page is created per rotation (0, 90, 180, 270). On each page:
    - inserting with scale_low=1 must report a negative spare height and a
      scale of 1, because the rectangle is too small for unscaled text,
    - inserting with scale_low=0 must succeed with a scale strictly
      between 0 and 1 and the expected spare height,
    - the hyperlink contained in the HTML must be found with its URI,
    - the extracted plain text must equal the unstyled reference text,
    - every text line must have the writing direction of that rotation,
    - the bold, italic and colored words must carry exactly those
      properties, and all other spans must be neither bold nor italic.

    The document is saved to the tests directory after each insertion.
    Does nothing except print a notice when pymupdf has no "mupdf"
    attribute.
    """
    if not hasattr(pymupdf, "mupdf"):
        print("'test_htmlbox1' not executed in classic.")
        return

    # Too small for the text at full size; a fit needs down-scaling.
    box = pymupdf.Rect(100, 100, 200, 200)

    plain_text = """Lorem ipsum dolor sit amet, consectetur adipisici elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquid ex ea commodi consequat. Quis aute iure reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint obcaecat cupiditat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."""

    html = """Lorem ipsum dolor sit amet, consectetur adipisici elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation <b>ullamco</b> <i>laboris</i> nisi ut aliquid ex ea commodi consequat. Quis aute iure reprehenderit in <span style="color: #0f0;font-weight:bold;">voluptate</span> velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint obcaecat cupiditat non proident, sunt in culpa qui <a href="https://www.artifex.com">officia</a> deserunt mollit anim id est laborum."""

    # Expected line direction, indexed by page number (one page per rotation).
    directions = ((1, 0), (0, -1), (-1, 0), (0, 1))
    # Expected (bold, italic, color name) for the specially styled words.
    styled_words = {
        "ullamco": (True, False, "black"),
        "laboris": (False, True, "black"),
        "voluptate": (True, False, "green"),
    }
    out_path = os.path.normpath(f'{__file__}/../../tests/test_htmlbox1.pdf')

    pdf = pymupdf.Document()

    for rot in (0, 90, 180, 270):
        page = pdf.new_page()

        # Without permission to scale down, the text cannot fit.
        spare_height, scale = page.insert_htmlbox(box, html, rotate=rot, scale_low=1)
        assert spare_height < 0
        assert scale == 1

        # With unrestricted down-scaling the insertion succeeds.
        spare_height, scale = page.insert_htmlbox(box, html, rotate=rot, scale_low=0)
        page.draw_rect(box, (1, 0, 0))
        pdf.save(out_path)
        assert abs(spare_height - 3.8507) < 0.001
        assert 0 < scale < 1

        page = pdf.reload_page(page)
        first_link = page.get_links()[0]
        assert first_link["uri"] == "https://www.artifex.com"

        # Compare the complete plain text: extract with flags=0, drop the
        # final character and turn line breaks into spaces.
        extracted = page.get_text(flags=0)[:-1].replace("\n", " ")
        assert plain_text == extracted

        hits = 0  # number of specially styled words seen on this page
        for block in page.get_text("dict")["blocks"]:
            for line in block["lines"]:
                assert line["dir"] == directions[page.number]
                for span in line["spans"]:
                    color = pymupdf.sRGB_to_pdf(span["color"])
                    bold = bool(span["flags"] & 16)
                    italic = bool(span["flags"] & 2)
                    expected = styled_words.get(span["text"])
                    if expected is None:
                        # Ordinary text: neither bold nor italic.
                        assert bold is False
                        assert italic is False
                        continue
                    hits += 1
                    want_bold, want_italic, want_color = expected
                    assert bold is want_bold
                    assert italic is want_italic
                    assert color == pymupdf.pdfcolor[want_color]
        # Each of the three styled words was found as its own span.
        assert hits == 3


def test_htmlbox2():
    """Insert a short HTML text without scaling at four rotations.

    With scale_low=1 every insertion must report scale 1 and a positive
    spare height below the rectangle height, and that spare height must be
    identical for all rotations. Does nothing except print a notice when
    pymupdf has no "mupdf" attribute.
    """
    if not hasattr(pymupdf, "mupdf"):
        print("'test_htmlbox2' not executed in classic.")
        return

    pdf = pymupdf.open()
    box = pymupdf.Rect(100, 100, 200, 200)  # roomy enough for the text
    page = pdf.new_page()

    spare_heights = set()
    for rot in (0, 90, 180, 270):
        result = page.insert_htmlbox(box, "Hello, World!", scale_low=1, rotate=rot)
        spare_height, scale = result
        assert scale == 1
        assert 0 < spare_height < box.height
        spare_heights.add(spare_height)

    # A single distinct value means rotation did not affect the result.
    assert len(spare_heights) == 1


def test_htmlbox3():
    """Insert HTML text with opacity 0.5 and read the opacity back.

    Does nothing except print a notice when pymupdf has no "mupdf"
    attribute.
    """
    if not hasattr(pymupdf, "mupdf"):
        print("'test_htmlbox3' not executed in classic.")
        return

    box = pymupdf.Rect(100, 250, 300, 350)
    html = """<span style="color:red;font-size:20px;">Just some text.</span>"""
    pdf = pymupdf.open()
    page = pdf.new_page()

    page.insert_htmlbox(box, html, opacity=0.5)

    # The text trace exposes the opacity of each span; check the first one.
    first_span = page.get_texttrace()[0]
    assert first_span["opacity"] == 0.5


def test_3559():
    """Insert HTML whose body holds only an empty <h3> element.

    There are no assertions; the test passes when insert_htmlbox()
    returns without raising.
    """
    pdf = pymupdf.Document()
    page = pdf.new_page()
    box = pymupdf.Rect(100, 100, 200, 200)
    empty_heading_html = """<body><h3></h3></body>"""
    page.insert_htmlbox(box, empty_heading_html)


def test_3916():
    """Insert HTML into a 1 x 1 rectangle with scale_low=0.5.

    The rectangle is too small for the text, and the reported spare
    height must be exactly -1.
    """
    pdf = pymupdf.open()
    tiny_box = pymupdf.Rect(100, 100, 101, 101)
    page = pdf.new_page()
    result = page.insert_htmlbox(tiny_box, "Hello, World!", scale_low=0.5)
    spare_height, _scale = result
    assert spare_height == -1


def test_4400():
    """Call TextWriter.fill_textbox() with a start position inside the box.

    There are no assertions; the test passes when the call returns
    without raising.
    """
    with pymupdf.open() as pdf:
        page = pdf.new_page()
        tw = pymupdf.TextWriter(page.rect)
        digits = '111111111'
        box = pymupdf.Rect(0, 0, 100, 20)
        # Flushed progress message printed just before the call under test.
        print('Calling writer.fill_textbox().', flush=True)
        tw.fill_textbox(rect=box, pos=(80, 0), text=digits, fontsize=8)


def test_4613():
    """Check insert_htmlbox() scaling for a Story made of long words.

    The same Story and rectangle are used in three scenarios, each on a
    fresh document:

    1. Default call: the page is saved and rendered at 300 dpi, the
       rendering must match the reference PNG exactly (rms 0), spare
       height and scale must be close to the recorded values, and the text
       extracted from inside the rectangle must equal the input text.
    2. _scale_word_width=False: spare height must be 0, the scale close
       to 0.914, and the text inside the rectangle is the truncated
       beginning of each line.
    3. scale_low=0.6: no fit is possible, so spare height must be -1 and
       the returned scale must equal scale_low.
    """

    def in_tests_dir(name):
        # Path of *name* relative to the "tests" directory, normalized.
        return os.path.normpath(f'{__file__}/../../tests/{name}')

    print()
    content = 3 * 'abcdefghijklmnopqrstuvwxyz\nABCDEFGHIJKLMNOPQRSTUVWXYZ\n'
    story = pymupdf.Story(content)
    rect = pymupdf.Rect(10, 10, 100, 100)

    # Scenario 1: default operation. The long words cause additional
    # down-scaling.
    print('test_4613(): ### Testing default operation.')
    with pymupdf.open() as pdf:
        page = pdf.new_page()
        spare_height, scale = page.insert_htmlbox(rect, story)
        print(f'test_4613(): {spare_height=} {scale=}')
        # The extra down-scaling due to word widths leaves vertical space
        # unused.
        page.draw_rect(rect, (1, 0, 0))
        pdf.save(in_tests_dir('test_4613.pdf'))

        rendered_path = in_tests_dir('test_4613.png')
        reference_path = in_tests_dir('resources/test_4613.png')
        rendered = page.get_pixmap(dpi=300)
        rendered.save(rendered_path)

        # Store a visual diff first so it is available if the comparison
        # below fails.
        diff = gentle_compare.pixmaps_diff(reference_path, rendered)
        diff.save(in_tests_dir('test_4613-diff.png'))

        rms = gentle_compare.pixmaps_rms(rendered, reference_path)
        print(f'{rms=}')
        assert rms == 0, f'{rms=}'

        assert abs(spare_height - 45.7536) < 0.1
        assert abs(scale - 0.4009) < 0.01

        extracted = page.get_text('text', clip=rect)
        print('test_4613(): new_text:')
        print(textwrap.indent(extracted, '    '))
        assert extracted == content

    # Scenario 2: _scale_word_width=False, i.e. too-wide words are ignored.
    print('test_4613(): ### Testing with _scale_word_width=False.')
    with pymupdf.open() as pdf:
        page = pdf.new_page()
        spare_height, scale = page.insert_htmlbox(rect, story, _scale_word_width=False)
        print(f'test_4613(): _scale_word_width=False: {spare_height=} {scale=}')
        # Long words may now extend beyond the rectangle, so no vertical
        # space is left over and only little down-scaling is needed.
        assert spare_height == 0
        assert abs(scale - 0.914) < 0.01
        extracted = page.get_text('text', clip=rect)
        print('test_4613(): new_text:')
        print(textwrap.indent(extracted, '    '))
        # Only the leading part of each line lies inside the rectangle.
        assert extracted == 3 * 'abcdefghijklmno\nABCDEFGHIJKLM\n'

    # Scenario 3: scale_low is too high to allow a fit.
    print('test_4613(): ### Testing with scale_low too high to allow a fit.')
    with pymupdf.open() as pdf:
        page = pdf.new_page()
        scale_low = 0.6
        spare_height, scale = page.insert_htmlbox(rect, story, scale_low=scale_low)
        print(f'test_4613(): {scale_low=}: {spare_height=} {scale=}')
        assert spare_height == -1
        assert scale == scale_low
author	Franz Glasner <fzglas.hg@dom66.de>
date	Sat, 11 Oct 2025 17:17:30 +0200
parents	a6bc019ac0b2
children