diff tests/test_tables.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children a6bc019ac0b2
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_tables.py	Mon Sep 15 11:37:51 2025 +0200
@@ -0,0 +1,460 @@
+import os
+import io
+from pprint import pprint
+import textwrap
+import pickle
+import platform
+
+import pymupdf
+
+scriptdir = os.path.abspath(os.path.dirname(__file__))
+filename = os.path.join(scriptdir, "resources", "chinese-tables.pdf")
+pickle_file = os.path.join(scriptdir, "resources", "chinese-tables.pickle")
+
+
+def test_table1():
+    """Compare pickled tables with those of the current run."""
+    pickle_in = open(pickle_file, "rb")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tabs = page.find_tables()
+    cells = tabs[0].cells + tabs[1].cells  # all table cell tuples on page
+    extracts = [tabs[0].extract(), tabs[1].extract()]  # all table cell content
+    old_data = pickle.load(pickle_in)  # previously saved data
+
+    # Compare cell contents
+    assert old_data["extracts"] == extracts  # same cell contents
+
+    # Compare cell coordinates.
+    # Cell rectangles may get somewhat larger due to more cautious border
+    # computations, but any differences must be small.
+    old_cells = old_data["cells"][0] + old_data["cells"][1]
+    assert len(cells) == len(old_cells)
+    for i in range(len(cells)):
+        c1 = pymupdf.Rect(cells[i])  # new cell coordinates
+        c0 = pymupdf.Rect(old_cells[i])  # old cell coordinates
+        assert c0 in c1  # always: old contained in new
+        assert abs(c1 - c0) < 0.2  # difference must be small
+
+
+def test_table2():
+    """Confirm header properties."""
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tab1, tab2 = page.find_tables().tables
+    # both tables contain their header data
+    assert tab1.header.external == False
+    assert tab1.header.cells == tab1.rows[0].cells
+    assert tab2.header.external == False
+    assert tab2.header.cells == tab2.rows[0].cells
+
+
+def test_2812():
+    """Ensure table detection and extraction independent from page rotation.
+
+    Make 4 pages with rotations 0, 90, 180 and 270 degrees respectively.
+    Each page shows the same 8x5 table.
+    We will check that each table is detected and delivers the same content.
+    """
+    doc = pymupdf.open()
+    # Page 0: rotation 0
+    page = doc.new_page(width=842, height=595)
+    rect = page.rect + (72, 72, -72, -72)
+    cols = 5
+    rows = 8
+    # define the cells, draw the grid and insert unique text in each cell.
+    cells = pymupdf.make_table(rect, rows=rows, cols=cols)
+    for i in range(rows):
+        for j in range(cols):
+            page.draw_rect(cells[i][j])
+    for i in range(rows):
+        for j in range(cols):
+            page.insert_textbox(
+                cells[i][j],
+                f"cell[{i}][{j}]",
+                align=pymupdf.TEXT_ALIGN_CENTER,
+            )
+    page.clean_contents()
+
+    # Page 1: rotation 90 degrees
+    page = doc.new_page()
+    rect = page.rect + (72, 72, -72, -72)
+    cols = 8
+    rows = 5
+    cells = pymupdf.make_table(rect, rows=rows, cols=cols)
+    for i in range(rows):
+        for j in range(cols):
+            page.draw_rect(cells[i][j])
+    for i in range(rows):
+        for j in range(cols):
+            page.insert_textbox(
+                cells[i][j],
+                f"cell[{j}][{rows-i-1}]",
+                rotate=90,
+                align=pymupdf.TEXT_ALIGN_CENTER,
+            )
+    page.set_rotation(90)
+    page.clean_contents()
+
+    # Page 2: rotation 180 degrees
+    page = doc.new_page(width=842, height=595)
+    rect = page.rect + (72, 72, -72, -72)
+    cols = 5
+    rows = 8
+    cells = pymupdf.make_table(rect, rows=rows, cols=cols)
+    for i in range(rows):
+        for j in range(cols):
+            page.draw_rect(cells[i][j])
+    for i in range(rows):
+        for j in range(cols):
+            page.insert_textbox(
+                cells[i][j],
+                f"cell[{rows-i-1}][{cols-j-1}]",
+                rotate=180,
+                align=pymupdf.TEXT_ALIGN_CENTER,
+            )
+    page.set_rotation(180)
+    page.clean_contents()
+
+    # Page 3: rotation 270 degrees
+    page = doc.new_page()
+    rect = page.rect + (72, 72, -72, -72)
+    cols = 8
+    rows = 5
+    cells = pymupdf.make_table(rect, rows=rows, cols=cols)
+    for i in range(rows):
+        for j in range(cols):
+            page.draw_rect(cells[i][j])
+    for i in range(rows):
+        for j in range(cols):
+            page.insert_textbox(
+                cells[i][j],
+                f"cell[{cols-j-1}][{i}]",
+                rotate=270,
+                align=pymupdf.TEXT_ALIGN_CENTER,
+            )
+    page.set_rotation(270)
+    page.clean_contents()
+
+    pdfdata = doc.tobytes()
+    # doc.ez_save("test-2812.pdf")
+    doc.close()
+
+    # -------------------------------------------------------------------------
+    # Test PDF prepared. Extract table on each page and
+    # ensure identical extracted table data.
+    # -------------------------------------------------------------------------
+    doc = pymupdf.open("pdf", pdfdata)
+    extracts = []
+    for page in doc:
+        tabs = page.find_tables()
+        assert len(tabs.tables) == 1
+        tab = tabs[0]
+        fp = io.StringIO()
+        pprint(tab.extract(), stream=fp)
+        extracts.append(fp.getvalue())
+        fp = None
+        assert tab.row_count == 8
+        assert tab.col_count == 5
+    e0 = extracts[0]
+    for e in extracts[1:]:
+        assert e == e0
+
+
+def test_2979():
+    """This tests fix #2979 and #3001.
+
+    2979: identical cell count for each row
+    3001: no change of global glyph heights
+    """
+    filename = os.path.join(scriptdir, "resources", "test_2979.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tab = page.find_tables()[0]  # extract the table
+    lengths = set()  # stores all row cell counts
+    for e in tab.extract():
+        lengths.add(len(e))  # store number of cells for row
+
+    # test 2979
+    assert len(lengths) == 1
+
+    # test 3001
+    assert (
+        pymupdf.TOOLS.set_small_glyph_heights() is False
+    ), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"
+
+    wt = pymupdf.TOOLS.mupdf_warnings()
+    if pymupdf.mupdf_version_tuple >= (1, 26, 0):
+        assert (
+            wt
+            == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
+        )
+    else:
+        assert not wt
+
+
+def test_3062():
+    """Tests the fix for #3062.
+    After table extraction, a rotated page should behave and look
+    like as before."""
+    if platform.python_implementation() == 'GraalVM':
+        print(f'test_3062(): Not running because slow on GraalVM.')
+        return
+    
+    filename = os.path.join(scriptdir, "resources", "test_3062.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tab0 = page.find_tables()[0]
+    cells0 = tab0.cells
+
+    page = None
+    page = doc[0]
+    tab1 = page.find_tables()[0]
+    cells1 = tab1.cells
+    assert cells1 == cells0
+
+
+def test_strict_lines():
+    """Confirm that ignoring borderless rectangles improves table detection."""
+    filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+
+    tab1 = page.find_tables()[0]
+    tab2 = page.find_tables(strategy="lines_strict")[0]
+    assert tab2.row_count < tab1.row_count
+    assert tab2.col_count < tab1.col_count
+
+
+def test_add_lines():
+    """Test new parameter add_lines for table recognition."""
+    if platform.python_implementation() == 'GraalVM':
+        print(f'test_add_lines(): Not running because breaks later tests on GraalVM.')
+        return
+    
+    filename = os.path.join(scriptdir, "resources", "small-table.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    assert page.find_tables().tables == []
+
+    more_lines = [
+        ((238.9949951171875, 200.0), (238.9949951171875, 300.0)),
+        ((334.5559997558594, 200.0), (334.5559997558594, 300.0)),
+        ((433.1809997558594, 200.0), (433.1809997558594, 300.0)),
+    ]
+
+    # these 3 additional vertical lines should additional 3 columns
+    tab2 = page.find_tables(add_lines=more_lines)[0]
+    assert tab2.col_count == 4
+    assert tab2.row_count == 5
+
+
+def test_3148():
+    """Ensure correct extraction text of rotated text."""
+    doc = pymupdf.open()
+    page = doc.new_page()
+    rect = pymupdf.Rect(100, 100, 300, 300)
+    text = (
+        "rotation 0 degrees",
+        "rotation 90 degrees",
+        "rotation 180 degrees",
+        "rotation 270 degrees",
+    )
+    degrees = (0, 90, 180, 270)
+    delta = (2, 2, -2, -2)
+    cells = pymupdf.make_table(rect, cols=3, rows=4)
+    for i in range(3):
+        for j in range(4):
+            page.draw_rect(cells[j][i])
+            k = (i + j) % 4
+            page.insert_textbox(cells[j][i] + delta, text[k], rotate=degrees[k])
+    # doc.save("multi-degree.pdf")
+    tabs = page.find_tables()
+    tab = tabs[0]
+    for extract in tab.extract():
+        for item in extract:
+            item = item.replace("\n", " ")
+            assert item in text
+
+
+def test_3179():
+    """Test correct separation of multiple tables on page."""
+    filename = os.path.join(scriptdir, "resources", "test_3179.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tabs = page.find_tables()
+    assert len(tabs.tables) == 3
+
+
+def test_battery_file():
+    """Tests correctly ignoring non-table suspects.
+
+    Earlier versions erroneously tried to identify table headers
+    where there existed no table at all.
+    """
+    filename = os.path.join(scriptdir, "resources", "battery-file-22.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tabs = page.find_tables()
+    assert len(tabs.tables) == 0
+
+
+def test_markdown():
+    """Confirm correct markdown output."""
+    filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tab = page.find_tables(strategy="lines_strict")[0]
+    if pymupdf.mupdf_version_tuple < (1, 26, 3):
+        md_expected = textwrap.dedent('''
+                |Header1|Header2|Header3|
+                |---|---|---|
+                |Col11<br>Col12|~~Col21~~<br>~~Col22~~|Col31<br>Col32<br>Col33|
+                |Col13|~~Col23~~|Col34<br>Col35|
+                |Col14|~~Col24~~|Col36|
+                |Col15|~~Col25~~<br>~~Col26~~||
+                
+                ''').lstrip()
+    else:
+        md_expected = (
+            "|Header1|Header2|Header3|\n"
+            "|---|---|---|\n"
+            "|Col11<br>Col12|Col21<br>Col22|Col31<br>Col32<br>Col33|\n"
+            "|Col13|Col23|Col34<br>Col35|\n"
+            "|Col14|Col24|Col36|\n"
+            "|Col15|Col25<br>Col26||\n\n"
+        )
+
+
+    md = tab.to_markdown()
+    assert md == md_expected, f'Incorrect md:\n{textwrap.indent(md, "    ")}'
+
+
+def test_paths_param():
+    """Confirm acceptance of supplied vector graphics list."""
+    filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tabs = page.find_tables(paths=[])  # will cause all tables are missed
+    assert tabs.tables == []
+
+
+def test_boxes_param():
+    """Confirm acceptance of supplied boxes list."""
+    filename = os.path.join(scriptdir, "resources", "small-table.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    paths = page.get_drawings()
+    box0 = page.cluster_drawings(drawings=paths)[0]
+    boxes = [box0]
+    words = page.get_text("words")
+    x_vals = [w[0] - 5 for w in words if w[4] in ("min", "max", "avg")]
+    for x in x_vals:
+        r = +box0
+        r.x1 = x
+        boxes.append(r)
+
+    y_vals = sorted(set([round(w[3]) for w in words]))
+    for y in y_vals[:-1]:  # skip last one to avoid empty row
+        r = +box0
+        r.y1 = y
+        boxes.append(r)
+
+    tabs = page.find_tables(paths=[], add_boxes=boxes)
+    tab = tabs.tables[0]
+    assert tab.extract() == [
+        ["Boiling Points °C", "min", "max", "avg"],
+        ["Noble gases", "-269", "-62", "-170.5"],
+        ["Nonmetals", "-253", "4827", "414.1"],
+        ["Metalloids", "335", "3900", "741.5"],
+        ["Metals", "357", ">5000", "2755.9"],
+    ]
+
+
+def test_dotted_grid():
+    """Confirm dotted lines are detected as gridlines."""
+    filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tabs = page.find_tables()
+    assert len(tabs.tables) == 3  # must be 3 tables
+    t0, t1, t2 = tabs  # extract them
+    # check that they have expected dimensions
+    assert t0.row_count, t0.col_count == (11, 12)
+    assert t1.row_count, t1.col_count == (25, 11)
+    assert t2.row_count, t2.col_count == (1, 10)
+
+
+def test_4017():
+    path = os.path.normpath(f"{__file__}/../../tests/resources/test_4017.pdf")
+    with pymupdf.open(path) as document:
+        page = document[0]
+
+        tables = page.find_tables(add_lines=None)
+        print(f"{len(tables.tables)=}.")
+        tables_text = list()
+        for i, table in enumerate(tables):
+            print(f"## {i=}.")
+            t = table.extract()
+            for tt in t:
+                print(f"    {tt}")
+
+        # 2024-11-29: expect current incorrect output for last two tables.
+
+        expected_a = [
+            ["Class A/B Overcollateralization", "131.44%", ">=", "122.60%", "", "PASS"],
+            [None, None, None, None, None, "PASS"],
+            ["Class D Overcollateralization", "112.24%", ">=", "106.40%", "", "PASS"],
+            [None, None, None, None, None, "PASS"],
+            ["Event of Default", "156.08%", ">=", "102.50%", "", "PASS"],
+            [None, None, None, None, None, "PASS"],
+            ["Class A/B Interest Coverage", "N/A", ">=", "120.00%", "", "N/A"],
+            [None, None, None, None, None, "N/A"],
+            ["Class D Interest Coverage", "N/A", ">=", "105.00%", "", "N/A"],
+        ]
+        assert tables[-2].extract() == expected_a
+
+        expected_b = [
+            [
+                "Moody's Maximum Rating Factor Test",
+                "2,577",
+                "<=",
+                "3,250",
+                "",
+                "PASS",
+                "2,581",
+            ],
+            [None, None, None, None, None, "PASS", None],
+            [
+                "Minimum Floating Spread",
+                "3.5006%",
+                ">=",
+                "2.0000%",
+                "",
+                "PASS",
+                "3.4871%",
+            ],
+            [None, None, None, None, None, "PASS", None],
+            [
+                "Minimum Weighted Average S&P Recovery\nRate Test",
+                "40.50%",
+                ">=",
+                "40.00%",
+                "",
+                "PASS",
+                "40.40%",
+            ],
+            [None, None, None, None, None, "PASS", None],
+            ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
+        ]
+        assert tables[-1].extract() == expected_b
+
+
+def test_md_styles():
+    """Test output of table with MD-styled cells."""
+    filename = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tabs = page.find_tables()[0]
+    text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"""
+    assert tabs.to_markdown() == text