Mercurial > hgrepos > Python2 > PyMuPDF
diff tests/test_insertpdf.py @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children |
line wrap: on
line diff
"""
* Join multiple PDFs into a new one.
* Compare with stored earlier result:
    - must have identical object definitions
    - must have different trailers
* Try inserting files in a loop.
"""

import io
import os
import re
import pymupdf
from pymupdf import mupdf

scriptdir = os.path.abspath(os.path.dirname(__file__))
resources = os.path.join(scriptdir, "resources")


def approx_parse(text):
    '''
    Splits <text> into a list of (text, number) pairs.

    Each pair is a run of non-digit characters followed by the run of
    [0-9.] characters that comes directly after it. Where that run is not
    convertible to a number (e.g. '4.5.6', or an empty run), it is appended
    to the pair's text and <number> is None.

    Note: every pair must start with at least one non-digit character, so a
    run of digits at the very beginning of <text> is skipped.
    '''
    ret = []
    for m in re.finditer('([^0-9]+)([0-9.]*)', text):
        prefix = m.group(1)
        digits = m.group(2)
        try:
            number = float(digits)
        except ValueError:
            # Not a valid float: keep the characters as plain text instead.
            prefix += digits
            number = None
        ret.append((prefix, number))
    return ret


def approx_compare(a, b, max_delta):
    '''
    Compares strings <a> and <b>, allowing embedded numbers to differ by
    less than <max_delta>.

    Returns 0 if the strings are approximately equal, otherwise 1. When a
    pair-by-pair comparison fails, a diagnostic is printed.
    '''
    aa = approx_parse(a)
    bb = approx_parse(b)
    if len(aa) != len(bb):
        return 1
    ret = 1
    for (at, an), (bt, bn) in zip(aa, bb):
        if at != bt:
            break
        if an is not None and bn is not None:
            if abs(an - bn) >= max_delta:
                print(f'diff={an-bn}: an={an} bn={bn}')
                break
        elif (an is None) != (bn is None):
            # One side has a number where the other has none.
            break
    else:
        # Only reached when no pair caused a break.
        ret = 0
    if ret:
        print(f'Differ:\n a={a!r}\n b={b!r}')
    return ret


def test_insert():
    """Join four generated PDFs and check the page texts arrive in order."""
    all_text_original = []  # text on input pages
    all_text_combined = []  # text on resulting output pages

    # Prepare input PDFs: (document number, arbitrary number of pages).
    sources = []
    for doc_no, page_count in ((1, 5), (2, 4), (3, 3), (4, 6)):
        doc = pymupdf.open()
        for i in range(page_count):
            text = f"doc {doc_no}, page {i}"  # the 'globally' unique text
            page = doc.new_page()
            page.insert_text((100, 72), text)
            all_text_original.append(text)
        sources.append(doc)

    new_doc = pymupdf.open()  # make combined PDF of input files
    for doc in sources:
        new_doc.insert_pdf(doc)

    # read text from all pages and store in list
    for page in new_doc:
        all_text_combined.append(page.get_text().replace("\n", ""))
    # the lists must be equal
    assert all_text_combined == all_text_original


def test_issue1417_insertpdf_in_loop():
    """Using a context manager instead of explicitly closing files"""
    f = os.path.join(resources, "1.pdf")
    big_doc = pymupdf.open()
    # Remember which descriptor number a fresh os.open() yields right now.
    fd1 = os.open(f, os.O_RDONLY)
    os.close(fd1)
    for _ in range(0, 1025):
        with pymupdf.open(f) as pdf:
            big_doc.insert_pdf(pdf)
        # Create a raw file descriptor. If the above pymupdf.open() context
        # leaks a file descriptor, fd will be seen to increment.
        fd2 = os.open(f, os.O_RDONLY)
        # Close before asserting so a failing check does not itself leak.
        os.close(fd2)
        assert fd2 == fd1
    big_doc.close()


def _test_insert_adobe():
    """Insert a large external PDF; skipped when the file is not present.

    The leading underscore keeps pytest from collecting this function.
    """
    path = os.path.abspath(f'{__file__}/../../../PyMuPDF-performance/adobe.pdf')
    if not os.path.exists(path):
        print(f'Not running test_insert_adobe() because does not exist: {os.path.relpath(path)}')
        return
    a = pymupdf.Document()
    b = pymupdf.Document(path)
    a.insert_pdf(b)


def _2861_2871_merge_pdf(content: bytes, coverpage: bytes):
    """Append the PDF in <content> to the PDF in <coverpage>.

    Both arguments are in-memory PDF files. Returns the result of
    Document.write() on the merged document.
    """
    with pymupdf.Document(stream=coverpage, filetype="pdf") as coverpage_pdf:
        with pymupdf.Document(stream=content, filetype="pdf") as content_pdf:
            coverpage_pdf.insert_pdf(content_pdf)
            doc = coverpage_pdf.write()
            return doc


def _merge_resource_with_itself(name):
    """Read tests/resources/<name> twice and merge the two copies."""
    path = os.path.abspath(f'{__file__}/../../tests/resources/{name}')
    with open(path, "rb") as content_pdf:
        with open(path, "rb") as coverpage_pdf:
            content = content_pdf.read()
            coverpage = coverpage_pdf.read()
            _2861_2871_merge_pdf(content, coverpage)


def test_2861():
    """Merging test_2861.pdf with itself must not raise."""
    _merge_resource_with_itself('test_2861.pdf')


def test_2871():
    """Merging test_2871.pdf with itself must not raise."""
    _merge_resource_with_itself('test_2871.pdf')


def test_3789():
    """Split a cleaned copy of test_3789.pdf into files of five pages each.

    The statement sequence is kept as in the original reproducer; the test
    passes if no step raises.
    """
    file_path = os.path.abspath(f'{__file__}/../../tests/resources/test_3789.pdf')
    result_path = os.path.abspath(f'{__file__}/../../tests/test_3789_out')
    pages_per_split = 5

    # Clean pdf
    doc = pymupdf.open(file_path)
    tmp = io.BytesIO()
    tmp.write(doc.write(garbage=4, deflate=True))

    source_doc = pymupdf.Document('pdf', tmp.getvalue())
    tmp.close()

    # Calculate the number of pages per split file and the number of split files
    page_range = pages_per_split - 1
    split_range = range(0, source_doc.page_count, pages_per_split)
    num_splits = len(split_range)

    # Loop through each split range and create a new PDF file
    for i, start in enumerate(split_range):
        output_doc = pymupdf.open()

        # Determine the ending page for this split file; the last split uses
        # -1 as its end page.
        to_page = start + page_range if i < num_splits - 1 else -1
        output_doc.insert_pdf(source_doc, from_page=start, to_page=to_page)

        # Save the output document to a file
        path = f'{result_path}_{i}.pdf'
        output_doc.save(path, garbage=2)
        print(f'Have saved to {path=}.')

        # If this is the last split file, exit the loop
        if to_page == -1:
            break


def _acroform_array_len(pdf, key):
    """Return the length of the array at Root/AcroForm/<key> in the trailer."""
    return mupdf.pdf_array_len(
        mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), f"Root/AcroForm/{key}")
    )


def test_widget_insert():
    """Confirm copy of form fields / widgets."""
    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    pc0 = tar.page_count  # for later assertion
    src = pymupdf.open(os.path.join(resources, "interfield-calculation.pdf"))
    pc1 = src.page_count  # for later assertion

    tarpdf = pymupdf._as_pdf_document(tar)
    tar_field_count = _acroform_array_len(tarpdf, "Fields")
    tar_co_count = _acroform_array_len(tarpdf, "CO")

    srcpdf = pymupdf._as_pdf_document(src)
    src_field_count = _acroform_array_len(srcpdf, "Fields")
    src_co_count = _acroform_array_len(srcpdf, "CO")

    tar.insert_pdf(src)

    # Re-read the target's counts through the same PDF handle as before.
    new_field_count = _acroform_array_len(tarpdf, "Fields")
    new_co_count = _acroform_array_len(tarpdf, "CO")

    assert tar.page_count == pc0 + pc1
    assert new_field_count == tar_field_count + src_field_count
    assert new_co_count == tar_co_count + src_co_count


def names_and_kids(doc):
    """Return a list of dictionaries with keys "name" and "kids".

    "name" is the name of a root field in "Root/AcroForm/Fields", and
    "kids" is the count of its immediate children.

    An empty list is returned when "Fields" is not an array or is empty.
    """
    rc = []
    pdf = pymupdf._as_pdf_document(doc)
    fields = mupdf.pdf_dict_getl(
        mupdf.pdf_trailer(pdf),
        pymupdf.PDF_NAME("Root"),
        pymupdf.PDF_NAME("AcroForm"),
        pymupdf.PDF_NAME("Fields"),
    )
    if not fields.pdf_is_array():
        return rc
    for i in range(fields.pdf_array_len()):
        field = fields.pdf_array_get(i)
        kids = field.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
        kid_count = kids.pdf_array_len()
        T = field.pdf_dict_get_text_string(pymupdf.PDF_NAME("T"))
        rc.append({"name": T, "kids": kid_count})
    return rc


def test_merge_checks1():
    """Merge Form PDFs making any duplicate names unique."""
    merge_file1 = os.path.join(resources, "merge-form1.pdf")
    merge_file2 = os.path.join(resources, "merge-form2.pdf")
    tar = pymupdf.open(merge_file1)
    rc0 = names_and_kids(tar)
    src = pymupdf.open(merge_file2)
    rc1 = names_and_kids(src)
    tar.insert_pdf(src, join_duplicates=False)
    rc2 = names_and_kids(tar)
    # Nothing is joined, so every root field of both files must be present.
    assert len(rc2) == len(rc0) + len(rc1)


def test_merge_checks2():
    """Join / merge Form PDFs joining any duplicate names in the src PDF."""
    merge_file1 = os.path.join(resources, "merge-form1.pdf")
    merge_file2 = os.path.join(resources, "merge-form2.pdf")
    tar = pymupdf.open(merge_file1)
    rc0 = names_and_kids(tar)  # list of root names and kid counts
    names0 = [itm["name"] for itm in rc0]  # root names in target
    kids0 = sum(itm["kids"] for itm in rc0)  # number of kids in target

    src = pymupdf.open(merge_file2)
    rc1 = names_and_kids(src)  # list of root names and kids in source PDF
    dup_count = 0  # counts duplicate names in source PDF
    dup_kids = 0  # counts the expected kids after merge

    for itm in rc1:  # walk root fields of source pdf
        if itm["name"] not in names0:  # not a duplicate name
            continue
        # if target field has kids, add their count, else add 1
        dup_kids0 = sum(i["kids"] for i in rc0 if i["name"] == itm["name"])
        dup_kids += dup_kids0 if dup_kids0 else 1
        # if source field has kids add their count, else add 1
        dup_kids += itm["kids"] if itm["kids"] else 1

    names1 = [itm["name"] for itm in rc1]  # names in source

    tar.insert_pdf(src, join_duplicates=True)  # join merging any duplicate names

    rc2 = names_and_kids(tar)  # get names and kid counts in resulting PDF
    names2 = [itm["name"] for itm in rc2]  # resulting names in target
    kids2 = sum(itm["kids"] for itm in rc2)  # total resulting kid count

    assert len(set(names0 + names1)) == len(names2)
    assert kids2 == dup_kids


test_4412_path = os.path.normpath(f'{__file__}/../../tests/resources/test_4412.pdf')


def test_4412():
    """Insert a widget-bearing page with and without copying widgets.

    This tests whether a page from a PDF containing widgets found in the
    wild can be inserted into a new document with default options
    (widgets=True) and with widgets=False.
    """
    print()
    for widget in True, False:
        print(f'{widget=}', flush=True)
        with pymupdf.open(test_4412_path) as doc, pymupdf.open() as new_doc:
            buf = io.BytesIO()
            # Pass the loop value on; without it both iterations would run
            # with the default and the widget-less path would go untested.
            new_doc.insert_pdf(doc, from_page=1, to_page=1, widgets=widget)
            new_doc.save(buf)
            assert len(new_doc) == 1


def test_4571():
    """Check the /Kids array of the page tree written after insert_pdf().

    The expected bytes depend on the MuPDF version: releases before 1.26.6
    are known to write repeated page references.
    """
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4571.pdf')
    path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4571_out.pdf')
    with pymupdf.open() as newdocument:
        with pymupdf.open(path) as document:
            newdocument.insert_pdf(document)
            newdocument.save(path_out, garbage=4, clean=False)
            print(f'Have saved to: {path_out=}')
    with open(path_out, 'rb') as f:
        content = f.read()
    if pymupdf.mupdf_version_tuple >= (1, 26, 6):
        # Correct.
        assert b'<</Type/Pages/Count 6/Kids[4 0 R 6 0 R 12 0 R 13 0 R 14 0 R 15 0 R]>>' in content
    else:
        # Incorrect.
        assert b'<</Type/Pages/Count 6/Kids[4 0 R 6 0 R 12 0 R 4 0 R 6 0 R 12 0 R]>>' in content
