diff tests/test_mupdf_regressions.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_mupdf_regressions.py	Mon Sep 15 11:37:51 2025 +0200
@@ -0,0 +1,98 @@
+import pymupdf
+import os
+import gentle_compare
+
+# Absolute path of the directory containing this test module. The PDF files
+# used by the tests below are looked up in its "resources" subdirectory.
+scriptdir = os.path.abspath(os.path.dirname(__file__))
+
+
+def test_707448():
+    """Confirm page content cleaning does not destroy page appearance."""
+    filename = os.path.join(scriptdir, "resources", "test-707448.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    words0 = page.get_text("words")
+    page.clean_contents(sanitize=True)
+    words1 = page.get_text("words")
+    assert gentle_compare.gentle_compare(words0, words1)
+
+
+def test_707673():
+    """Confirm page content cleaning does not destroy page appearance.
+
+    Fails starting with MuPDF v1.23.9.
+
+    Fixed in:
+    commit 779b8234529cb82aa1e92826854c7bb98b19e44b (golden/master)
+    """
+    filename = os.path.join(scriptdir, "resources", "test-707673.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    words0 = page.get_text("words")
+    page.clean_contents(sanitize=True)
+    words1 = page.get_text("words")
+    ok = gentle_compare.gentle_compare(words0, words1)
+    assert ok
+
+
+def test_707727():
+    """Confirm page content cleaning does not destroy page appearance.
+
+    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707727
+    """
+    filename = os.path.join(scriptdir, "resources", "test_3362.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    pix0 = page.get_pixmap()
+    page.clean_contents(sanitize=True)
+    page = doc.reload_page(page)  # required to prevent re-use
+    pix1 = page.get_pixmap()
+    rms = gentle_compare.pixmaps_rms(pix0, pix1)
+    print(f'{rms=}', flush=1)
+    pix0.save(os.path.normpath(f'{__file__}/../../tests/test_707727_pix0.png'))
+    pix1.save(os.path.normpath(f'{__file__}/../../tests/test_707727_pix1.png'))
+    if pymupdf.mupdf_version_tuple >= (1, 25, 2):
+        # New sanitising gives small fp rounding errors.
+        assert rms < 0.05
+    else:
+        assert rms == 0
+
+
+def test_707721():
+    """Confirm text extraction works for nested MCID with Type 3 fonts.
+    PyMuPDF issue https://github.com/pymupdf/PyMuPDF/issues/3357
+    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707721
+    """
+    filename = os.path.join(scriptdir, "resources", "test_3357.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    ok = page.get_text()
+    assert ok
+
+
+def test_3376():
+    """Check fix of MuPDF bug 707733.
+
+    https://bugs.ghostscript.com/show_bug.cgi?id=707733
+    PyMuPDF issue https://github.com/pymupdf/PyMuPDF/issues/3376
+
+    Test file contains a redaction for the first 3 words: "Table of Contents".
+    Test strategy:
+    - extract all words (sorted)
+    - apply redactions
+    - extract words again
+    - confirm: we now have 3 words less and remaining words are equal.
+    """
+    filename = os.path.join(scriptdir, "resources", "test_3376.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    words0 = page.get_text("words", sort=True)
+    words0_s = words0[:3]  # first 3 words
+    words0_e = words0[3:]  # remaining words
+    assert " ".join([w[4] for w in words0_s]) == "Table of Contents"
+
+    page.apply_redactions()
+
+    words1 = page.get_text("words", sort=True)
+
+    ok = gentle_compare.gentle_compare(words0_e, words1)
+    assert ok