Mercurial > hgrepos > Python2 > PyMuPDF
comparison tests/test_insertpdf.py @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 1:1d09e1dec1d9 |
|---|---|
| 1 """ | |
| 2 * Join multiple PDFs into a new one. | |
| 3 * Compare with stored earlier result: | |
| 4 - must have identical object definitions | |
| 5 - must have different trailers | |
| 6 * Try inserting files in a loop. | |
| 7 """ | |
| 8 | |
| 9 import io | |
| 10 import os | |
| 11 import re | |
| 12 import pymupdf | |
| 13 from pymupdf import mupdf | |
| 14 | |
| 15 scriptdir = os.path.abspath(os.path.dirname(__file__)) | |
| 16 resources = os.path.join(scriptdir, "resources") | |
| 17 | |
def approx_parse( text):
    '''
    Splits <text> into a list of (text, number) pairs.

    Each pair is one match of "a run of non-digit characters followed by an
    optional run of [0-9.]". Where the [0-9.] run is not convertible to a
    number (e.g. '4.5.6', or an empty run), it is appended to the pair's text
    and <number> will be None.

    Digits at the very start of <text> are not part of any match, because
    every match must begin with a non-digit character; they are dropped.
    '''
    ret = []
    for m in re.finditer(r'([^0-9]+)([0-9.]*)', text):
        prefix, digits = m.group(1), m.group(2)
        try:
            number = float(digits)
        except ValueError:
            # Not a valid float literal: keep the characters as plain text.
            prefix += digits
            number = None
        ret.append((prefix, number))
    return ret
| 34 | |
def approx_compare( a, b, max_delta):
    '''
    Compares <a> and <b> after splitting both with approx_parse().

    Text parts must be identical; where both sides have a number, the numbers
    must differ by less than <max_delta>. Returns 0 if the strings match under
    these rules, otherwise 1. Prints a diagnostic when a mismatch is found
    while walking the pairs (nothing is printed if the pair counts differ).
    '''
    pairs_a = approx_parse(a)
    pairs_b = approx_parse(b)
    if len(pairs_a) != len(pairs_b):
        return 1

    matched = True
    for (text_a, num_a), (text_b, num_b) in zip(pairs_a, pairs_b):
        if text_a != text_b:
            matched = False
            break
        has_a = num_a is not None
        has_b = num_b is not None
        if has_a and has_b:
            if abs(num_a - num_b) >= max_delta:
                print( f'diff={num_a-num_b}: an={num_a} bn={num_b}')
                matched = False
                break
        elif has_a != has_b:
            # Exactly one side has a number.
            matched = False
            break

    if matched:
        return 0
    print( f'Differ:\n a={a!r}\n b={b!r}')
    return 1
| 58 | |
| 59 | |
def test_insert():
    """Insert four generated PDFs into an empty one and compare page texts.

    Every source page gets a text that is unique across all sources. After
    inserting the sources in order, the texts read back from the combined
    document must equal the texts written, in the same order.
    """
    pages_per_doc = (5, 4, 3, 6)  # just arbitrary numbers of pages
    expected_texts = []  # text on input pages
    sources = []
    for doc_no, page_total in enumerate(pages_per_doc, start=1):
        source = pymupdf.open()
        for pno in range(page_total):
            label = f"doc {doc_no}, page {pno}"  # the 'globally' unique text
            page = source.new_page()
            page.insert_text((100, 72), label)
            expected_texts.append(label)
        sources.append(source)

    combined = pymupdf.open()  # make combined PDF of input files
    for source in sources:
        combined.insert_pdf(source)

    # text on resulting output pages, with line breaks removed
    found_texts = [page.get_text().replace("\n", "") for page in combined]
    # the lists must be equal
    assert found_texts == expected_texts
| 102 | |
| 103 | |
def test_issue1417_insertpdf_in_loop():
    """Using a context manager instead of explicitly closing files"""
    pdf_path = os.path.join(resources, "1.pdf")
    big_doc = pymupdf.open()
    # Record the descriptor number obtained for a fresh open, then release it.
    baseline_fd = os.open(pdf_path, os.O_RDONLY)
    os.close(baseline_fd)
    for _ in range(1025):
        with pymupdf.open(pdf_path) as pdf:
            big_doc.insert_pdf(pdf)
        # Open a raw file descriptor again. If the pymupdf.open() context
        # above leaked a descriptor, this one would come back with a
        # different (incremented) number.
        probe_fd = os.open(pdf_path, os.O_RDONLY)
        assert probe_fd == baseline_fd
        os.close(probe_fd)
    big_doc.close()
| 119 | |
| 120 | |
def _test_insert_adobe():
    """Insert the external adobe.pdf into an empty document, if it exists.

    The file is looked up in a sibling 'PyMuPDF-performance' directory; when
    it is missing, a message is printed and nothing else happens.
    """
    path = os.path.abspath( f'{__file__}/../../../PyMuPDF-performance/adobe.pdf')
    if os.path.exists(path):
        target = pymupdf.Document()
        source = pymupdf.Document(path)
        target.insert_pdf(source)
    else:
        print(f'Not running test_insert_adobe() because does not exist: {os.path.relpath(path)}')
| 129 | |
| 130 | |
def _2861_2871_merge_pdf(content: bytes, coverpage: bytes):
    """Append the PDF in <content> to the PDF in <coverpage>.

    Both arguments are PDF files held in memory. Returns the value of
    write() called on the cover page document after the insertion.
    """
    with pymupdf.Document(stream=coverpage, filetype="pdf") as coverpage_pdf, \
            pymupdf.Document(stream=content, filetype="pdf") as content_pdf:
        coverpage_pdf.insert_pdf(content_pdf)
        return coverpage_pdf.write()
| 137 | |
def test_2861():
    """Merge test_2861.pdf with itself (as both content and cover page)."""
    path = os.path.abspath(f'{__file__}/../../tests/resources/test_2861.pdf')
    # The same file is opened and read twice, once per role.
    with open(path, "rb") as content_pdf, open(path, "rb") as coverpage_pdf:
        _2861_2871_merge_pdf(content_pdf.read(), coverpage_pdf.read())
| 145 | |
def test_2871():
    """Merge test_2871.pdf with itself (as both content and cover page)."""
    path = os.path.abspath(f'{__file__}/../../tests/resources/test_2871.pdf')
    # The same file is opened and read twice, once per role.
    with open(path, "rb") as content_pdf, open(path, "rb") as coverpage_pdf:
        _2861_2871_merge_pdf(content_pdf.read(), coverpage_pdf.read())
| 153 | |
| 154 | |
def test_3789():
    """Split a cleaned copy of test_3789.pdf into files of five pages each.

    The last output file receives all remaining pages (to_page=-1). Output
    files are written next to the tests as test_3789_out_<i>.pdf.
    """
    file_path = os.path.abspath(f'{__file__}/../../tests/resources/test_3789.pdf')
    result_path = os.path.abspath(f'{__file__}/../../tests/test_3789_out')
    pages_per_split = 5

    # Clean pdf: write it with garbage collection and compression into a
    # memory buffer and reopen it from those bytes.
    doc = pymupdf.open(file_path)
    tmp = io.BytesIO()
    tmp.write(doc.write(garbage=4, deflate=True))
    cleaned_bytes = tmp.getvalue()
    source_doc = pymupdf.Document('pdf', cleaned_bytes)
    tmp.close()

    # First page number of every split file.
    split_starts = range(0, source_doc.page_count, pages_per_split)
    last_split = len(split_starts) - 1

    # Create one new PDF file per split.
    for i, start in enumerate(split_starts):
        output_doc = pymupdf.open()

        # All but the last split end pages_per_split - 1 pages after start;
        # the last one runs to the end of the source.
        if i < last_split:
            to_page = start + pages_per_split - 1
        else:
            to_page = -1
        output_doc.insert_pdf(source_doc, from_page=start, to_page=to_page)

        # Save the output document to a file.
        path = f'{result_path}_{i}.pdf'
        output_doc.save(path, garbage=2)
        print(f'Have saved to {path=}.')

        # If this is the last split file, exit the loop
        if to_page == -1:
            break
| 190 | |
| 191 | |
def test_widget_insert():
    """Confirm copy of form fields / widgets."""

    def trailer_array_len(pdf, path):
        # Length of the array found at <path> below the PDF trailer.
        return mupdf.pdf_array_len(
            mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), path)
        )

    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    target_pages = tar.page_count  # for later assertion
    src = pymupdf.open(os.path.join(resources, "interfield-calculation.pdf"))
    source_pages = src.page_count  # for later assertion

    # Field and CO array lengths of target and source before the insertion.
    tarpdf = pymupdf._as_pdf_document(tar)
    tar_field_count = trailer_array_len(tarpdf, "Root/AcroForm/Fields")
    tar_co_count = trailer_array_len(tarpdf, "Root/AcroForm/CO")
    srcpdf = pymupdf._as_pdf_document(src)
    src_field_count = trailer_array_len(srcpdf, "Root/AcroForm/Fields")
    src_co_count = trailer_array_len(srcpdf, "Root/AcroForm/CO")

    tar.insert_pdf(src)

    # After the insertion the target must hold the sums of both documents.
    new_field_count = trailer_array_len(tarpdf, "Root/AcroForm/Fields")
    new_co_count = trailer_array_len(tarpdf, "Root/AcroForm/CO")
    assert tar.page_count == target_pages + source_pages
    assert new_field_count == tar_field_count + src_field_count
    assert new_co_count == tar_co_count + src_co_count
| 224 | |
| 225 | |
def names_and_kids(doc):
    """Return a list of dictionaries with keys "name" and "kids".

    One dictionary is produced per entry of the "Root/AcroForm/Fields" array:
    "name" is the text string stored under the entry's "T" key and "kids" is
    the length of the array under its "Kids" key. The list is empty when
    "Fields" is not an array or has no entries.
    """
    pdf = pymupdf._as_pdf_document(doc)
    fields = mupdf.pdf_dict_getl(
        mupdf.pdf_trailer(pdf),
        pymupdf.PDF_NAME("Root"),
        pymupdf.PDF_NAME("AcroForm"),
        pymupdf.PDF_NAME("Fields"),
    )
    if not fields.pdf_is_array():
        return []

    summary = []
    for idx in range(fields.pdf_array_len()):
        field = fields.pdf_array_get(idx)
        kid_total = field.pdf_dict_get(pymupdf.PDF_NAME("Kids")).pdf_array_len()
        name = field.pdf_dict_get_text_string(pymupdf.PDF_NAME("T"))
        summary.append({"name": name, "kids": kid_total})
    return summary
| 253 | |
| 254 | |
def test_merge_checks1():
    """Merge Form PDFs making any duplicate names unique."""
    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    roots_before = names_and_kids(tar)
    src = pymupdf.open(os.path.join(resources, "merge-form2.pdf"))
    roots_source = names_and_kids(src)

    tar.insert_pdf(src, join_duplicates=False)

    # Without joining, the root field count is expected to be the plain sum.
    roots_after = names_and_kids(tar)
    assert len(roots_after) == len(roots_before) + len(roots_source)
| 266 | |
| 267 | |
def test_merge_checks2():
    """Join / merge Form PDFs joining any duplicate names in the src PDF."""
    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    target_roots = names_and_kids(tar)  # root names and kid counts in target
    target_names = [root["name"] for root in target_roots]
    # Kid count per root name in the target (summed if a name repeats).
    target_kids = {}
    for root in target_roots:
        name = root["name"]
        target_kids[name] = target_kids.get(name, 0) + root["kids"]

    src = pymupdf.open(os.path.join(resources, "merge-form2.pdf"))
    source_roots = names_and_kids(src)  # root names and kid counts in source
    source_names = [root["name"] for root in source_roots]

    # Expected total kid count after the merge: for every source root whose
    # name also exists in the target, count the kids on both sides, where a
    # field without kids counts as 1.
    expected_kids = 0
    for root in source_roots:
        if root["name"] not in target_kids:  # not a duplicate name
            continue
        expected_kids += target_kids[root["name"]] or 1
        expected_kids += root["kids"] or 1

    tar.insert_pdf(src, join_duplicates=True)  # join merging any duplicate names

    merged_roots = names_and_kids(tar)  # names and kid counts in resulting PDF
    merged_kid_total = sum(root["kids"] for root in merged_roots)

    assert len(set(target_names + source_names)) == len(merged_roots)
    assert merged_kid_total == expected_kids
| 301 | |
| 302 | |
test_4412_path = os.path.normpath(f'{__file__}/../../tests/resources/test_4412.pdf')

def test_4412():
    """Insert one page of a widget-containing PDF with and without widgets.

    This tests whether a page from a PDF containing widgets found in the wild
    can be inserted into a new document both with widgets copied
    (widgets=True, the default) and with widgets skipped (widgets=False).
    """
    print()
    for widget in True, False:
        print(f'{widget=}', flush=1)
        with pymupdf.open(test_4412_path) as doc, pymupdf.open() as new_doc:
            buf = io.BytesIO()
            # Pass the loop value on; otherwise both iterations would run
            # with the default and widgets=False would never be exercised.
            new_doc.insert_pdf(doc, from_page=1, to_page=1, widgets=widget)
            new_doc.save(buf)
            assert len(new_doc)==1
| 317 | |
| 318 | |
def test_4571():
    """Check the page tree written after inserting test_4571.pdf.

    The document is inserted into a new PDF, saved with garbage=4, and the
    saved bytes are searched for the expected /Pages object. MuPDF versions
    before 1.26.6 are expected to write a /Kids array that repeats the first
    three page references.
    """
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4571.pdf')
    path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4571_out.pdf')
    with pymupdf.open() as newdocument:
        with pymupdf.open(path) as document:
            newdocument.insert_pdf(document)
        newdocument.save(path_out, garbage=4, clean=False)
    print(f'Have saved to: {path_out=}')

    with open(path_out, 'rb') as saved:
        content = saved.read()

    if pymupdf.mupdf_version_tuple >= (1, 26, 6):
        # Correct: six distinct page objects.
        expected = b'<</Type/Pages/Count 6/Kids[4 0 R 6 0 R 12 0 R 13 0 R 14 0 R 15 0 R]>>'
    else:
        # Incorrect: the first three pages are listed twice.
        expected = b'<</Type/Pages/Count 6/Kids[4 0 R 6 0 R 12 0 R 4 0 R 6 0 R 12 0 R]>>'
    assert expected in content
| 335 |
