comparison tests/test_mupdf_regressions.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF, which is normally downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 import pymupdf
2 import os
3 import gentle_compare
4
# Absolute path of the directory that contains this test module; the tests
# below look up their PDF inputs in its "resources" subdirectory.
scriptdir = os.path.dirname(os.path.abspath(__file__))
6
7
def test_707448():
    """Check that sanitizing page contents keeps the extracted words.

    The words of the first page of ``resources/test-707448.pdf`` are
    extracted before and after ``clean_contents(sanitize=True)`` and the two
    lists are handed to ``gentle_compare.gentle_compare``.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test-707448.pdf")
    # Keep the document bound to a local name while its page is in use.
    document = pymupdf.open(pdf_path)
    first_page = document[0]

    words_before = first_page.get_text("words")
    first_page.clean_contents(sanitize=True)
    words_after = first_page.get_text("words")

    unchanged = gentle_compare.gentle_compare(words_before, words_after)
    assert unchanged
17
18
def test_707673():
    """Check that sanitizing page contents keeps the extracted words.

    Same procedure as for bug 707448, applied to the first page of
    ``resources/test-707673.pdf``.

    History: fails starting with MuPDF v1.23.9; fixed in commit
    779b8234529cb82aa1e92826854c7bb98b19e44b (golden/master).
    """
    pdf_path = os.path.join(scriptdir, "resources", "test-707673.pdf")
    # Keep the document bound to a local name while its page is in use.
    document = pymupdf.open(pdf_path)
    first_page = document[0]

    words_before = first_page.get_text("words")
    first_page.clean_contents(sanitize=True)
    words_after = first_page.get_text("words")

    assert gentle_compare.gentle_compare(words_before, words_after)
35
36
def test_707727():
    """Confirm page content cleaning does not destroy page appearance.

    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707727

    The first page of ``resources/test_3362.pdf`` is rendered to a pixmap
    before and after ``clean_contents(sanitize=True)``. The two pixmaps are
    compared with ``gentle_compare.pixmaps_rms`` and the result is checked
    against a threshold that depends on the MuPDF version in use.

    Both renderings are written as PNG files into the directory of this
    test module *before* the assertion runs, so they are available for
    inspection when the comparison fails.
    """
    filename = os.path.join(scriptdir, "resources", "test_3362.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    pix0 = page.get_pixmap()
    page.clean_contents(sanitize=True)
    page = doc.reload_page(page)  # required to prevent re-use
    pix1 = page.get_pixmap()
    rms = gentle_compare.pixmaps_rms(pix0, pix1)
    print(f'{rms=}', flush=True)

    # Build the output paths from `scriptdir`, like every other path in this
    # module, instead of walking up from __file__ and back down into a
    # hard-coded "tests" directory that may not exist in other layouts.
    pix0.save(os.path.join(scriptdir, 'test_707727_pix0.png'))
    pix1.save(os.path.join(scriptdir, 'test_707727_pix1.png'))

    if pymupdf.mupdf_version_tuple >= (1, 25, 2):
        # New sanitising gives small fp rounding errors.
        assert rms < 0.05
    else:
        assert rms == 0
58
59
def test_707721():
    """Check text extraction for nested MCID with Type 3 fonts.

    PyMuPDF issue: https://github.com/pymupdf/PyMuPDF/issues/3357
    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707721

    Passes when ``get_text()`` on the first page of
    ``resources/test_3357.pdf`` returns a truthy (non-empty) result.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_3357.pdf")
    # Keep the document bound to a local name while its page is in use.
    document = pymupdf.open(pdf_path)
    first_page = document[0]

    extracted = first_page.get_text()
    assert extracted
70
71
def test_3376():
    """Check the fix of MuPDF bug 707733.

    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707733
    PyMuPDF issue: https://github.com/pymupdf/PyMuPDF/issues/3376

    The test file contains a redaction covering its first three words,
    "Table of Contents". Procedure:

    1. Extract all words of the first page, sorted.
    2. Verify that the first three words read "Table of Contents".
    3. Apply the redactions.
    4. Extract the sorted words again and compare them against the words
       that followed the first three in step 1.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_3376.pdf")
    # Keep the document bound to a local name while its page is in use.
    document = pymupdf.open(pdf_path)
    first_page = document[0]

    all_words = first_page.get_text("words", sort=True)
    # Split into the three words under the redaction and everything else.
    redacted_words, expected_words = all_words[:3], all_words[3:]
    # Index 4 of each word entry is joined here as the word's text.
    heading = " ".join(entry[4] for entry in redacted_words)
    assert heading == "Table of Contents"

    first_page.apply_redactions()

    remaining_words = first_page.get_text("words", sort=True)

    assert gentle_compare.gentle_compare(expected_words, remaining_words)