# Mercurial > hgrepos > Python2 > PyMuPDF

"""
* Join multiple PDFs into a new one.
* Compare with stored earlier result:
    - must have identical object definitions
    - must have different trailers
* Try inserting files in a loop.
"""

import io
import os
import re
import pymupdf
from pymupdf import mupdf

# Absolute path of the directory that contains this test module.
scriptdir = os.path.abspath(os.path.dirname(__file__))
# Directory with the PDF fixtures opened by the tests below.
resources = os.path.join(scriptdir, "resources")

def approx_parse( text):
    '''
    Splits <text> into a list of (text, number) pairs.

    Each pair holds a run of non-digit characters and the float value of the
    run of [0-9.] characters that follows it. Where that run is empty or is
    not convertible to a number (e.g. '4.5.6'), it is appended to the pair's
    text and <number> is None.

    If <text> starts with a digit, the first pair has an empty text part, so
    that a leading number is not silently dropped.
    '''
    ret = []
    # The text group may be empty so that a number at the very start of the
    # input is still matched.
    for m in re.finditer(r'([^0-9]*)([0-9.]*)', text):
        if not m.group(0):
            # Zero-length match (produced at the end of the input); there is
            # nothing to record.
            continue
        chunk, digits = m.groups()
        try:
            number = float( digits)
        except ValueError:
            # Empty or malformed digit run: keep it verbatim in the text.
            chunk += digits
            number = None
        ret.append( (chunk, number))
    return ret

def approx_compare( a, b, max_delta):
    '''
    Compares <a> and <b> after splitting both with approx_parse().

    Text parts must be identical; numbers are treated as different once
    their absolute difference reaches <max_delta>. Returns 0 if the strings
    match and 1 otherwise. A diagnostic is printed for every mismatch except
    when the two strings split into a different number of pairs.
    '''
    parsed_a = approx_parse( a)
    parsed_b = approx_parse( b)
    if len(parsed_a) != len(parsed_b):
        return 1

    def pairs_match():
        # Walk both sequences in lockstep; stop at the first mismatch.
        for (text_a, num_a), (text_b, num_b) in zip( parsed_a, parsed_b):
            if text_a != text_b:
                return False
            if num_a is None or num_b is None:
                # Only acceptable when both sides lack a number.
                if (num_a is None) != (num_b is None):
                    return False
                continue
            if abs( num_a - num_b) >= max_delta:
                print( f'diff={num_a-num_b}: an={num_a} bn={num_b}')
                return False
        return True

    if pairs_match():
        return 0
    print( f'Differ:\n    a={a!r}\n    b={b!r}')
    return 1


def test_insert():
    """Insert four generated PDFs into an empty one and compare page texts.

    Every input page carries a unique text; after the inserts, the texts read
    from the combined document must equal the input texts in the same order.
    """
    page_counts = (5, 4, 3, 6)  # just arbitrary numbers of pages
    expected_texts = []  # text on input pages
    sources = []

    # prepare input PDFs
    for doc_no, page_count in enumerate(page_counts, start=1):
        source = pymupdf.open()
        for page_no in range(page_count):
            label = f"doc {doc_no}, page {page_no}"  # the 'globally' unique text
            page = source.new_page()
            page.insert_text((100, 72), label)
            expected_texts.append(label)
        sources.append(source)

    # make combined PDF of input files
    combined = pymupdf.open()
    for source in sources:
        combined.insert_pdf(source)

    # read text from all resulting pages; the lists must be equal
    combined_texts = [page.get_text().replace("\n", "") for page in combined]
    assert combined_texts == expected_texts


def test_issue1417_insertpdf_in_loop():
    """Using a context manager instead of explicitly closing files.

    Inserts the same PDF into one target document many times and checks
    after every iteration that a freshly opened raw file descriptor still
    gets the same number as before the loop.
    """
    f = os.path.join(resources, "1.pdf")
    big_doc = pymupdf.open()
    # Reference descriptor: every os.open() inside the loop is expected to
    # return this same number again.
    fd1 = os.open( f, os.O_RDONLY)
    os.close( fd1)
    # NOTE(review): 1025 iterations presumably exceed a common per-process
    # limit of 1024 open files - confirm.
    for n in range(0, 1025):
        with pymupdf.open(f) as pdf:
            big_doc.insert_pdf(pdf)
        # Create a raw file descriptor. If the above pymupdf.open() context leaks
        # a file descriptor, fd will be seen to increment.
        fd2 = os.open( f, os.O_RDONLY)
        assert fd2 == fd1
        os.close( fd2)
    big_doc.close()


def _test_insert_adobe():
    """Insert adobe.pdf from a sibling PyMuPDF-performance checkout.

    Prints a notice and does nothing else if that file does not exist.
    """
    path = os.path.abspath( f'{__file__}/../../../PyMuPDF-performance/adobe.pdf')
    if os.path.exists(path):
        target = pymupdf.Document()
        source = pymupdf.Document(path)
        target.insert_pdf(source)
    else:
        print(f'Not running test_insert_adobe() because does not exist: {os.path.relpath(path)}')


def _2861_2871_merge_pdf(content: bytes, coverpage: bytes):
    """Append the PDF in <content> to the PDF in <coverpage>.

    Both arguments are in-memory PDF files. Returns the result of write()
    on the merged cover page document.
    """
    with pymupdf.Document(stream=coverpage, filetype="pdf") as cover_doc, \
            pymupdf.Document(stream=content, filetype="pdf") as body_doc:
        cover_doc.insert_pdf(body_doc)
        return cover_doc.write()

def test_2861():
    """Merge test_2861.pdf with itself, read twice as in-memory bytes."""
    path = os.path.abspath(f'{__file__}/../../tests/resources/test_2861.pdf')
    with open(path, "rb") as content_file, open(path, "rb") as cover_file:
        content = content_file.read()
        coverpage = cover_file.read()
    # Both files are fully read, so the merge no longer needs them open.
    _2861_2871_merge_pdf(content, coverpage)

def test_2871():
    """Merge test_2871.pdf with itself, read twice as in-memory bytes."""
    path = os.path.abspath(f'{__file__}/../../tests/resources/test_2871.pdf')
    with open(path, "rb") as content_file, open(path, "rb") as cover_file:
        content = content_file.read()
        coverpage = cover_file.read()
    # Both files are fully read, so the merge no longer needs them open.
    _2861_2871_merge_pdf(content, coverpage)


def test_3789():
    """Split a cleaned copy of test_3789.pdf into files of five pages each.

    Every chunk is inserted into a new empty document and saved as
    test_3789_out_<i>.pdf; the last chunk uses to_page=-1.
    """
    pages_per_split = 5
    file_path = os.path.abspath(f'{__file__}/../../tests/resources/test_3789.pdf')
    result_path = os.path.abspath(f'{__file__}/../../tests/test_3789_out')

    # Clean pdf: write it to memory and reopen it from those bytes.
    doc = pymupdf.open(file_path)
    with io.BytesIO() as tmp:
        tmp.write(doc.write(garbage=4, deflate=True))
        source_doc = pymupdf.Document('pdf', tmp.getvalue())

    # First page number of every split file.
    first_pages = range(0, source_doc.page_count, pages_per_split)
    last_split = len(first_pages) - 1

    # Create one new PDF file per split range.
    for i, start in enumerate(first_pages):
        output_doc = pymupdf.open()

        # All but the final split end pages_per_split - 1 pages after start.
        if i < last_split:
            to_page = start + pages_per_split - 1
        else:
            to_page = -1
        output_doc.insert_pdf(source_doc, from_page=start, to_page=to_page)

        # Save the output document to its own file.
        path = f'{result_path}_{i}.pdf'
        output_doc.save(path, garbage=2)
        print(f'Have saved to {path=}.')

        # The final split has been written; stop.
        if to_page == -1:
            break


def test_widget_insert():
    """Confirm copy of form fields / widgets.

    After inserting the source PDF, the page count and the lengths of the
    target's AcroForm "Fields" and "CO" arrays must each equal the sum of
    the target's and the source's values before the insert.
    """
    fields_path = "Root/AcroForm/Fields"
    co_path = "Root/AcroForm/CO"

    def trailer_array_len(pdf, path):
        # Length of the array found at <path> below the PDF trailer.
        return mupdf.pdf_array_len(
            mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), path)
        )

    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    pages_before = tar.page_count
    src = pymupdf.open(os.path.join(resources, "interfield-calculation.pdf"))
    pages_added = src.page_count

    # Counts before the insert.
    tarpdf = pymupdf._as_pdf_document(tar)
    fields_before = trailer_array_len(tarpdf, fields_path)
    co_before = trailer_array_len(tarpdf, co_path)
    srcpdf = pymupdf._as_pdf_document(src)
    fields_added = trailer_array_len(srcpdf, fields_path)
    co_added = trailer_array_len(srcpdf, co_path)

    tar.insert_pdf(src)

    # Counts after the insert.
    fields_after = trailer_array_len(tarpdf, fields_path)
    co_after = trailer_array_len(tarpdf, co_path)

    assert tar.page_count == pages_before + pages_added
    assert fields_after == fields_before + fields_added
    assert co_after == co_before + co_added


def names_and_kids(doc):
    """Describe the root form fields of <doc>.

    Returns a list with one dictionary per entry of "Root/AcroForm/Fields":
    key "name" holds the text string of the field's "T" entry and key "kids"
    the array length of its "Kids" entry. The list is empty if "Fields" is
    not an array or has no entries.
    """
    pdf = pymupdf._as_pdf_document(doc)
    fields = mupdf.pdf_dict_getl(
        mupdf.pdf_trailer(pdf),
        pymupdf.PDF_NAME("Root"),
        pymupdf.PDF_NAME("AcroForm"),
        pymupdf.PDF_NAME("Fields"),
    )
    if not fields.pdf_is_array():
        return []

    summary = []
    for index in range(fields.pdf_array_len()):
        field = fields.pdf_array_get(index)
        kid_count = field.pdf_dict_get(pymupdf.PDF_NAME("Kids")).pdf_array_len()
        name = field.pdf_dict_get_text_string(pymupdf.PDF_NAME("T"))
        summary.append({"name": name, "kids": kid_count})
    return summary


def test_merge_checks1():
    """Merge Form PDFs making any duplicate names unique.

    With join_duplicates=False the number of root fields after the insert
    must be the sum of the target's and the source's root field counts.
    """
    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    target_before = names_and_kids(tar)
    src = pymupdf.open(os.path.join(resources, "merge-form2.pdf"))
    source_fields = names_and_kids(src)

    tar.insert_pdf(src, join_duplicates=False)

    target_after = names_and_kids(tar)
    assert len(target_after) == len(target_before) + len(source_fields)


def test_merge_checks2():
    """Join / merge Form PDFs joining any duplicate names in the src PDF.

    After the insert, the target must have one root field per distinct name
    from both inputs, and the total kid count must equal the count expected
    from the fields whose names occur in both PDFs.
    """
    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    target_fields = names_and_kids(tar)  # root names and kid counts in target
    target_names = [field["name"] for field in target_fields]

    src = pymupdf.open(os.path.join(resources, "merge-form2.pdf"))
    source_fields = names_and_kids(src)  # root names and kid counts in source
    source_names = [field["name"] for field in source_fields]

    # Expected kids after the merge: only names present in both PDFs count.
    expected_kids = 0
    for src_field in source_fields:
        if src_field["name"] in target_names:
            target_kids = sum(
                field["kids"]
                for field in target_fields
                if field["name"] == src_field["name"]
            )
            # A field without kids counts as one kid, on either side.
            expected_kids += (target_kids or 1) + (src_field["kids"] or 1)

    tar.insert_pdf(src, join_duplicates=True)  # join merging any duplicate names

    merged_fields = names_and_kids(tar)  # names and kid counts in resulting PDF
    merged_names = [field["name"] for field in merged_fields]
    merged_kids = sum(field["kids"] for field in merged_fields)

    assert len(set(target_names + source_names)) == len(merged_names)
    assert merged_kids == expected_kids


# Input PDF opened by test_4412().
test_4412_path = os.path.normpath(f'{__file__}/../../tests/resources/test_4412.pdf')

def test_4412():
    """Insert one page of a PDF containing widgets, with and without widgets.

    This tests whether a page from a PDF containing widgets found in the wild
    can be inserted into a new document with the default option
    (widgets=True) and with widgets=False. In both cases the new document
    must be savable and contain exactly one page.
    """
    print()
    for widget in True, False:
        print(f'{widget=}', flush=1)
        with pymupdf.open(test_4412_path) as doc, pymupdf.open() as new_doc:
            buf = io.BytesIO()
            # Pass the loop variable on; otherwise both iterations would run
            # the identical default (widgets=True) case.
            new_doc.insert_pdf(doc, from_page=1, to_page=1, widgets=widget)
            new_doc.save(buf)
            assert len(new_doc)==1


def test_4571():
    """Insert test_4571.pdf into a new PDF and check the saved page tree.

    The expected /Pages object in the saved file depends on the MuPDF
    version: from 1.26.6 on it lists six distinct kids, older versions
    repeat the first three.
    """
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4571.pdf')
    path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4571_out.pdf')

    with pymupdf.open() as newdocument:
        with pymupdf.open(path) as document:
            newdocument.insert_pdf(document)
        # The source document is closed before the target is saved.
        newdocument.save(path_out, garbage=4, clean=False)
        print(f'Have saved to: {path_out=}')

    with open(path_out, 'rb') as f:
        content = f.read()

    if pymupdf.mupdf_version_tuple >= (1, 26, 6):
        # Correct.
        expected = b'<</Type/Pages/Count 6/Kids[4 0 R 6 0 R 12 0 R 13 0 R 14 0 R 15 0 R]>>'
    else:
        # Incorrect.
        expected = b'<</Type/Pages/Count 6/Kids[4 0 R 6 0 R 12 0 R 4 0 R 6 0 R 12 0 R]>>'
    assert expected in content
# author: Franz Glasner <fzglas.hg@dom66.de>
# date: Fri, 19 Sep 2025 19:59:23 +0200
# parents: 1d09e1dec1d9
# children: