comparison tests/test_insertpdf.py @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents 1d09e1dec1d9
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 """
2 * Join multiple PDFs into a new one.
3 * Compare with stored earlier result:
4 - must have identical object definitions
5 - must have different trailers
6 * Try inserting files in a loop.
7 """
8
9 import io
10 import os
11 import re
12 import pymupdf
13 from pymupdf import mupdf
14
# Absolute directory of this test module, and the "resources" folder next to it
# from which several tests below load their input PDFs.
scriptdir = os.path.dirname(os.path.abspath(__file__))
resources = os.path.join(scriptdir, "resources")
17
def approx_parse(text):
    '''
    Break <text> into a list of (text, number) pairs.

    Each pair is a run of non-digit characters followed by the run of
    [0-9.] characters after it. If that second run cannot be converted to a
    float (it is empty, or something like '4.5.6'), it is appended to the
    text part and <number> is None.
    '''
    pairs = []
    for match in re.finditer('([^0-9]+)([0-9.]*)', text):
        prefix, digits = match.groups()
        try:
            value = float(digits)
        except Exception:
            # Not a number: keep the raw characters as part of the text.
            prefix += digits
            value = None
        pairs.append((prefix, value))
    return pairs
34
def approx_compare(a, b, max_delta):
    '''
    Compare strings <a> and <b>, tolerating small differences in the numbers
    they contain.

    Both strings are split with approx_parse(). The text parts must be equal,
    and each pair of numbers must differ by less than <max_delta>.

    Returns 0 if the strings match and 1 otherwise. Diagnostics are printed
    when a pairwise mismatch is found; a differing number of parsed pairs
    returns 1 without printing anything.
    '''
    parsed_a = approx_parse(a)
    parsed_b = approx_parse(b)
    if len(parsed_a) != len(parsed_b):
        return 1

    differ = 0
    for (text_a, num_a), (text_b, num_b) in zip(parsed_a, parsed_b):
        if text_a != text_b:
            differ = 1
            break
        if num_a is None or num_b is None:
            # A number on one side only is a mismatch; none on either side
            # is fine.
            if (num_a is None) != (num_b is None):
                differ = 1
                break
            continue
        if abs(num_a - num_b) >= max_delta:
            print(f'diff={num_a-num_b}: an={num_a} bn={num_b}')
            differ = 1
            break

    if differ:
        print(f'Differ:\n a={a!r}\n b={b!r}')
    return differ
58
59
def test_insert():
    """Join four generated PDFs and check the page texts of the result.

    Every input page carries a unique text, so the texts read back from the
    combined document must equal the texts written, in the same order.
    """
    # (document number, page count); the page counts are arbitrary
    layout = ((1, 5), (2, 4), (3, 3), (4, 6))

    expected_texts = []  # text on input pages
    sources = []  # prepared input PDFs
    for doc_no, page_count in layout:
        source = pymupdf.open()
        for page_no in range(page_count):
            text = f"doc {doc_no}, page {page_no}"  # the 'globally' unique text
            page = source.new_page()
            page.insert_text((100, 72), text)
            expected_texts.append(text)
        sources.append(source)

    combined = pymupdf.open()  # make combined PDF of input files
    for source in sources:
        combined.insert_pdf(source)

    # read text from all output pages; the lists must be equal
    combined_texts = [page.get_text().replace("\n", "") for page in combined]
    assert combined_texts == expected_texts
102
103
def test_issue1417_insertpdf_in_loop():
    """Using a context manager instead of explicitly closing files.

    Repeatedly opens the same PDF in a `with` block and inserts it into one
    growing document, checking after every round that no file descriptor
    was leaked.
    """
    path = os.path.join(resources, "1.pdf")
    with pymupdf.open() as big_doc:
        # Open and immediately close a raw descriptor to learn which number
        # is handed out while nothing is leaked.
        fd1 = os.open(path, os.O_RDONLY)
        os.close(fd1)
        # NOTE(review): 1025 rounds is presumably chosen to exceed a typical
        # 1024 open-file limit - confirm.
        for _ in range(1025):
            with pymupdf.open(path) as pdf:
                big_doc.insert_pdf(pdf)
            # Create a raw file descriptor. If the above pymupdf.open() context
            # leaks a file descriptor, fd will be seen to increment.
            fd2 = os.open(path, os.O_RDONLY)
            try:
                assert fd2 == fd1
            finally:
                # Close the probe even when the assertion fails, so a failing
                # run does not itself leak a descriptor.
                os.close(fd2)
119
120
def _test_insert_adobe():
    """Insert an external 'adobe.pdf' into an empty document.

    The input lives outside this repository (in a sibling
    'PyMuPDF-performance' directory); if it is missing, a note is printed and
    nothing else happens. The leading underscore keeps pytest from
    collecting this function.
    """
    path = os.path.abspath( f'{__file__}/../../../PyMuPDF-performance/adobe.pdf')
    if os.path.exists(path):
        target = pymupdf.Document()
        source = pymupdf.Document(path)
        target.insert_pdf(source)
    else:
        print(f'Not running test_insert_adobe() because does not exist: {os.path.relpath(path)}')
129
130
def _2861_2871_merge_pdf(content: bytes, coverpage: bytes):
    """Append the PDF in <content> to the PDF in <coverpage>.

    Both arguments are in-memory PDF files. Returns whatever
    `Document.write()` produces for the merged cover-page document.
    """
    with pymupdf.Document(stream=coverpage, filetype="pdf") as coverpage_pdf:
        with pymupdf.Document(stream=content, filetype="pdf") as content_pdf:
            coverpage_pdf.insert_pdf(content_pdf)
        # The content document is already closed when the result is written.
        return coverpage_pdf.write()
137
def test_2861():
    """Merge test_2861.pdf with itself; passes if no exception is raised."""
    path = os.path.abspath(f'{__file__}/../../tests/resources/test_2861.pdf')
    # The same file serves as both content and cover page.
    with open(path, "rb") as content_pdf, open(path, "rb") as coverpage_pdf:
        _2861_2871_merge_pdf(content_pdf.read(), coverpage_pdf.read())
145
def test_2871():
    """Merge test_2871.pdf with itself; passes if no exception is raised."""
    path = os.path.abspath(f'{__file__}/../../tests/resources/test_2871.pdf')
    # The same file serves as both content and cover page.
    with open(path, "rb") as content_pdf, open(path, "rb") as coverpage_pdf:
        _2861_2871_merge_pdf(content_pdf.read(), coverpage_pdf.read())
153
154
def test_3789():
    """Split test_3789.pdf into files of five pages each.

    The input is first rewritten in memory with garbage=4 and deflate=True,
    then consecutive page ranges are copied into new documents with
    insert_pdf() and saved as '<tests>/test_3789_out_<i>.pdf'. The test
    passes if no exception is raised.
    """
    file_path = os.path.abspath(f'{__file__}/../../tests/resources/test_3789.pdf')
    result_path = os.path.abspath(f'{__file__}/../../tests/test_3789_out')
    pages_per_split = 5

    # Clean pdf
    doc = pymupdf.open(file_path)
    with io.BytesIO() as tmp:
        tmp.write(doc.write(garbage=4, deflate=True))
        source_doc = pymupdf.Document('pdf', tmp.getvalue())

    # First page number of every split file; the last index identifies the
    # final split.
    starts = range(0, source_doc.page_count, pages_per_split)
    last_index = len(starts) - 1

    # Create one new PDF file per split range
    for i, start in enumerate(starts):
        output_doc = pymupdf.open()

        # Every split but the last ends pages_per_split - 1 pages after its
        # start; the last one passes to_page=-1.
        if i < last_index:
            to_page = start + pages_per_split - 1
        else:
            to_page = -1
        output_doc.insert_pdf(source_doc, from_page=start, to_page=to_page)

        # Save the output document to a file
        path = f'{result_path}_{i}.pdf'
        output_doc.save(path, garbage=2)
        print(f'Have saved to {path=}.')

        # If this is the last split file, exit the loop
        if to_page == -1:
            break
190
191
def test_widget_insert():
    """Confirm copy of form fields / widgets."""

    def trailer_array_len(pdf, path):
        # Length of the array found at <path> below the trailer of <pdf>.
        return mupdf.pdf_array_len(
            mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), path)
        )

    fields_path = "Root/AcroForm/Fields"
    co_path = "Root/AcroForm/CO"

    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    pc0 = tar.page_count  # for later assertion
    src = pymupdf.open(os.path.join(resources, "interfield-calculation.pdf"))
    pc1 = src.page_count  # for later assertion

    # Counts before the merge, for target and source.
    tarpdf = pymupdf._as_pdf_document(tar)
    tar_field_count = trailer_array_len(tarpdf, fields_path)
    tar_co_count = trailer_array_len(tarpdf, co_path)
    srcpdf = pymupdf._as_pdf_document(src)
    src_field_count = trailer_array_len(srcpdf, fields_path)
    src_co_count = trailer_array_len(srcpdf, co_path)

    tar.insert_pdf(src)

    # After the merge the target must hold the sum of both inputs.
    assert tar.page_count == pc0 + pc1
    assert trailer_array_len(tarpdf, fields_path) == tar_field_count + src_field_count
    assert trailer_array_len(tarpdf, co_path) == tar_co_count + src_co_count
224
225
def names_and_kids(doc):
    """Return a list of dictionaries with keys "name" and "kids".

    One dictionary is produced per entry of "Root/AcroForm/Fields": "name"
    is the text string of the entry's "T" key and "kids" is the length of
    its "Kids" array. The list is empty if "Fields" is not an array or has
    no entries.
    """
    pdf = pymupdf._as_pdf_document(doc)
    fields = mupdf.pdf_dict_getl(
        mupdf.pdf_trailer(pdf),
        pymupdf.PDF_NAME("Root"),
        pymupdf.PDF_NAME("AcroForm"),
        pymupdf.PDF_NAME("Fields"),
    )
    if not fields.pdf_is_array():
        return []

    summary = []
    for index in range(fields.pdf_array_len()):
        field = fields.pdf_array_get(index)
        kid_count = field.pdf_dict_get(pymupdf.PDF_NAME("Kids")).pdf_array_len()
        name = field.pdf_dict_get_text_string(pymupdf.PDF_NAME("T"))
        summary.append({"name": name, "kids": kid_count})
    return summary
253
254
def test_merge_checks1():
    """Merge Form PDFs making any duplicate names unique."""
    tar = pymupdf.open(os.path.join(resources, "merge-form1.pdf"))
    fields_before = names_and_kids(tar)
    src = pymupdf.open(os.path.join(resources, "merge-form2.pdf"))
    fields_src = names_and_kids(src)

    tar.insert_pdf(src, join_duplicates=False)

    # With join_duplicates=False the target must end up with the root fields
    # of both inputs.
    fields_after = names_and_kids(tar)
    assert len(fields_after) == len(fields_before) + len(fields_src)
266
267
def test_merge_checks2():
    """Merge Form PDFs joining any duplicate names in the source PDF."""
    merge_file1 = os.path.join(resources, "merge-form1.pdf")
    merge_file2 = os.path.join(resources, "merge-form2.pdf")

    tar = pymupdf.open(merge_file1)
    rc0 = names_and_kids(tar)  # list of root names and kid counts
    names0 = [itm["name"] for itm in rc0]  # root names in target

    src = pymupdf.open(merge_file2)
    rc1 = names_and_kids(src)  # list of root names and kids in source PDF
    names1 = [itm["name"] for itm in rc1]  # names in source

    dup_kids = 0  # counts the expected kids after merge
    for itm in rc1:  # walk root fields of source pdf
        if itm["name"] not in names0:  # not a duplicate name
            continue
        # if target field has kids, add their count, else add 1
        dup_kids0 = sum(i["kids"] for i in rc0 if i["name"] == itm["name"])
        dup_kids += dup_kids0 or 1
        # if source field has kids add their count, else add 1
        dup_kids += itm["kids"] or 1

    tar.insert_pdf(src, join_duplicates=True)  # join merging any duplicate names

    rc2 = names_and_kids(tar)  # get names and kid counts in resulting PDF
    names2 = [itm["name"] for itm in rc2]  # resulting names in target
    kids2 = sum(itm["kids"] for itm in rc2)  # total resulting kid count

    # Every distinct root name must appear exactly once in the result ...
    assert len(set(names0 + names1)) == len(names2)
    # ... and the total kid count must equal the count computed above.
    assert kids2 == dup_kids
301
302
test_4412_path = os.path.normpath(f'{__file__}/../../tests/resources/test_4412.pdf')

def test_4412():
    # This tests whether a page from a PDF containing widgets found in the wild
    # can be inserted into a new document both with widgets=True (the default)
    # and with widgets=False.
    print()
    for widget in True, False:
        print(f'{widget=}', flush=1)
        with pymupdf.open(test_4412_path) as doc, pymupdf.open() as new_doc:
            buf = io.BytesIO()
            # Pass the loop value on; without it both iterations would run
            # with the default and the widgets=False case would go untested.
            new_doc.insert_pdf(doc, from_page=1, to_page=1, widgets=widget)
            new_doc.save(buf)
            assert len(new_doc)==1
317
318
def test_4571():
    """Insert test_4571.pdf into a new document and inspect the saved /Pages.

    The output is saved with garbage=4 and must contain a specific
    /Pages dictionary, which depends on the MuPDF version in use.
    """
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4571.pdf')
    path_out = os.path.normpath(f'{__file__}/../../tests/resources/test_4571_out.pdf')
    with pymupdf.open() as newdocument:
        with pymupdf.open(path) as document:
            newdocument.insert_pdf(document)
        # The source document is already closed when the result is saved.
        newdocument.save(path_out, garbage=4, clean=False)
    print(f'Have saved to: {path_out=}')

    with open(path_out, 'rb') as f:
        content = f.read()

    if pymupdf.mupdf_version_tuple >= (1, 26, 6):
        # Correct.
        expected = b'<</Type/Pages/Count 6/Kids[4 0 R 6 0 R 12 0 R 13 0 R 14 0 R 15 0 R]>>'
    else:
        # Incorrect.
        expected = b'<</Type/Pages/Count 6/Kids[4 0 R 6 0 R 12 0 R 4 0 R 6 0 R 12 0 R]>>'
    assert expected in content
335