Python2/PyMuPDF: tests/test_tables.py comparison

comparison tests/test_tables.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF, which is normally downloaded when building PyMuPDF.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:37:51 +0200
parents
children	a6bc019ac0b2

comparison

equal deleted inserted replaced

--1:000000000000
+:1d09e1dec1d9
+import os
+import io
+from pprint import pprint
+import textwrap
+import pickle
+import platform
+import pymupdf
# Absolute directory that contains this test module.
scriptdir = os.path.dirname(os.path.abspath(__file__))
# Fixture PDF used by test_table1() and test_table2().
filename = os.path.join(scriptdir, "resources", "chinese-tables.pdf")
# Pickled reference data that test_table1() compares against.
pickle_file = os.path.join(scriptdir, "resources", "chinese-tables.pickle")
def test_table1():
    """Compare pickled tables with those of the current run.

    The reference pickle holds the extracted cell contents ("extracts") and
    the cell rectangles ("cells") of the two tables on page 0 of the fixture
    PDF.  Contents must match exactly; rectangles may only grow slightly.
    """
    # The pickle is a fixture shipped with the test suite.  pickle.load() must
    # never be pointed at untrusted data.  The context manager guarantees the
    # file handle is closed even when an assertion below fails.
    with open(pickle_file, "rb") as pickle_in:
        old_data = pickle.load(pickle_in)  # previously saved data

    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    cells = tabs[0].cells + tabs[1].cells  # all table cell tuples on page
    extracts = [tabs[0].extract(), tabs[1].extract()]  # all table cell content

    # Compare cell contents
    assert old_data["extracts"] == extracts  # same cell contents

    # Compare cell coordinates.
    # Cell rectangles may get somewhat larger due to more cautious border
    # computations, but any differences must be small.
    old_cells = old_data["cells"][0] + old_data["cells"][1]
    assert len(cells) == len(old_cells)
    for new_cell, old_cell in zip(cells, old_cells):
        c1 = pymupdf.Rect(new_cell)  # new cell coordinates
        c0 = pymupdf.Rect(old_cell)  # old cell coordinates
        assert c0 in c1  # always: old contained in new
        assert abs(c1 - c0) < 0.2  # difference must be small
def test_table2():
    """Verify the header properties of both tables on the fixture page."""
    document = pymupdf.open(filename)
    first_page = document[0]
    # Unpacking insists on exactly two detected tables.
    first, second = first_page.find_tables().tables
    for table in (first, second):
        # Each table carries its header inside the table: the header is not
        # external and its cells are those of the first row.
        assert table.header.external == False
        assert table.header.cells == table.rows[0].cells
def test_2812():
    """Check that table detection and extraction ignore page rotation.

    Four pages are created with rotations 0, 90, 180 and 270 degrees.
    Each page shows the same 8x5 table.
    Every page must deliver exactly one table, and all four extractions
    must have identical content.
    """

    def build_page(page, rows, cols, label, rotation):
        """Draw a rows x cols grid on the page and write a label in each cell."""
        rect = page.rect + (72, 72, -72, -72)
        cells = pymupdf.make_table(rect, rows=rows, cols=cols)
        # First draw the complete grid ...
        for i in range(rows):
            for j in range(cols):
                page.draw_rect(cells[i][j])
        # ... then insert the unique text of every cell.  The unrotated page
        # passes no "rotate" argument at all.
        text_args = {"rotate": rotation} if rotation else {}
        for i in range(rows):
            for j in range(cols):
                page.insert_textbox(
                    cells[i][j],
                    label(i, j, rows, cols),
                    align=pymupdf.TEXT_ALIGN_CENTER,
                    **text_args,
                )
        if rotation:
            page.set_rotation(rotation)
        page.clean_contents()

    landscape = {"width": 842, "height": 595}
    portrait = {}  # default page size
    # One entry per page: (page size, rows, cols, rotation, cell label).
    layouts = (
        (landscape, 8, 5, 0,
         lambda i, j, rows, cols: f"cell[{i}][{j}]"),
        (portrait, 5, 8, 90,
         lambda i, j, rows, cols: f"cell[{j}][{rows-i-1}]"),
        (landscape, 8, 5, 180,
         lambda i, j, rows, cols: f"cell[{rows-i-1}][{cols-j-1}]"),
        (portrait, 5, 8, 270,
         lambda i, j, rows, cols: f"cell[{cols-j-1}][{i}]"),
    )

    doc = pymupdf.open()
    for page_args, rows, cols, rotation, label in layouts:
        page = doc.new_page(**page_args)
        build_page(page, rows, cols, label, rotation)
    pdfdata = doc.tobytes()
    doc.close()

    # -------------------------------------------------------------------------
    # Test PDF prepared. Extract the table on each page and
    # ensure identical extracted table data.
    # -------------------------------------------------------------------------
    doc = pymupdf.open("pdf", pdfdata)
    extracts = []
    for page in doc:
        found = page.find_tables()
        assert len(found.tables) == 1
        table = found[0]
        buffer = io.StringIO()
        pprint(table.extract(), stream=buffer)
        extracts.append(buffer.getvalue())
        assert table.row_count == 8
        assert table.col_count == 5
    reference = extracts[0]
    for text in extracts[1:]:
        assert text == reference
def test_2979():
    """Cover the fixes for #2979 and #3001.

    2979: every extracted row has the same number of cells
    3001: table extraction leaves the global glyph heights setting unchanged
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_2979.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables()[0]  # extract the table

    # test 2979: collect the distinct cell counts over all rows
    row_lengths = {len(row) for row in table.extract()}
    assert len(row_lengths) == 1

    # test 3001
    assert (
        pymupdf.TOOLS.set_small_glyph_heights() is False
    ), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"

    warnings_text = pymupdf.TOOLS.mupdf_warnings()
    if pymupdf.mupdf_version_tuple < (1, 26, 0):
        assert not warnings_text
    else:
        assert (
            warnings_text
            == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
        )
def test_3062():
    """Cover the fix for #3062.

    After table extraction, a rotated page should behave and look the same
    as before: a second extraction on the reloaded page must give the same
    cells.
    """
    if platform.python_implementation() == 'GraalVM':
        print(f'test_3062(): Not running because slow on GraalVM.')
        return

    pdf_path = os.path.join(scriptdir, "resources", "test_3062.pdf")
    doc = pymupdf.open(pdf_path)

    # First extraction.
    page = doc[0]
    first_table = page.find_tables()[0]
    first_cells = first_table.cells

    # Drop the page object and load the page again for the second extraction.
    page = None
    page = doc[0]
    second_table = page.find_tables()[0]
    second_cells = second_table.cells

    assert second_cells == first_cells
def test_strict_lines():
    """Check that ignoring borderless rectangles improves table detection.

    The "lines_strict" strategy must yield fewer rows and fewer columns than
    the default strategy on the fixture page.
    """
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    default_table = page.find_tables()[0]
    strict_table = page.find_tables(strategy="lines_strict")[0]
    assert strict_table.row_count < default_table.row_count
    assert strict_table.col_count < default_table.col_count
def test_add_lines():
    """Exercise the add_lines parameter of table recognition."""
    if platform.python_implementation() == 'GraalVM':
        print(f'test_add_lines(): Not running because breaks later tests on GraalVM.')
        return

    pdf_path = os.path.join(scriptdir, "resources", "small-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]

    # Without help, no table is found on this page.
    assert page.find_tables().tables == []

    # Three extra vertical lines, each given as a (start point, end point) pair.
    more_lines = [
        ((238.9949951171875, 200.0), (238.9949951171875, 300.0)),
        ((334.5559997558594, 200.0), (334.5559997558594, 300.0)),
        ((433.1809997558594, 200.0), (433.1809997558594, 300.0)),
    ]
    # These 3 additional vertical lines should add 3 columns.
    table = page.find_tables(add_lines=more_lines)[0]
    assert table.col_count == 4
    assert table.row_count == 5
def test_3148():
    """Check that rotated text is extracted correctly from table cells."""
    doc = pymupdf.open()
    page = doc.new_page()
    rect = pymupdf.Rect(100, 100, 300, 300)
    text = (
        "rotation 0 degrees",
        "rotation 90 degrees",
        "rotation 180 degrees",
        "rotation 270 degrees",
    )
    degrees = (0, 90, 180, 270)
    delta = (2, 2, -2, -2)  # added to each cell rectangle before inserting text
    cells = pymupdf.make_table(rect, cols=3, rows=4)

    # Walk the grid column by column; the text and rotation of a cell are
    # selected by (column + row) modulo 4.
    for col in range(3):
        for row in range(4):
            cell = cells[row][col]
            page.draw_rect(cell)
            choice = (col + row) % 4
            page.insert_textbox(cell + delta, text[choice], rotate=degrees[choice])

    found = page.find_tables()
    table = found[0]
    for table_row in table.extract():
        for content in table_row:
            # Line breaks inside a cell are treated as spaces.
            assert content.replace("\n", " ") in text
def test_3179():
    """Check that multiple tables on one page are separated correctly."""
    pdf_path = os.path.join(scriptdir, "resources", "test_3179.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    found = page.find_tables()
    # The fixture page must yield exactly three tables.
    assert len(found.tables) == 3
def test_battery_file():
    """Check that non-table suspects are ignored.

    Earlier versions erroneously tried to identify table headers
    on a page that contains no table at all.
    """
    pdf_path = os.path.join(scriptdir, "resources", "battery-file-22.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    found = page.find_tables()
    # No table may be reported for this page.
    assert len(found.tables) == 0
def test_markdown():
    """Check the markdown rendering of a table found with "lines_strict"."""
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables(strategy="lines_strict")[0]

    # The expected text depends on the MuPDF version: before 1.26.3 the second
    # column is expected with strikeout markup and a single trailing newline.
    if pymupdf.mupdf_version_tuple >= (1, 26, 3):
        md_expected = (
            "|Header1|Header2|Header3|\n"
            "|---|---|---|\n"
            "|Col11<br>Col12|Col21<br>Col22|Col31<br>Col32<br>Col33|\n"
            "|Col13|Col23|Col34<br>Col35|\n"
            "|Col14|Col24|Col36|\n"
            "|Col15|Col25<br>Col26||\n\n"
        )
    else:
        md_expected = textwrap.dedent('''
            |Header1|Header2|Header3|
            |---|---|---|
            |Col11<br>Col12|~~Col21~~<br>~~Col22~~|Col31<br>Col32<br>Col33|
            |Col13|~~Col23~~|Col34<br>Col35|
            |Col14|~~Col24~~|Col36|
            |Col15|~~Col25~~<br>~~Col26~~||
            ''').lstrip()

    md = table.to_markdown()
    assert md == md_expected, f'Incorrect md:\n{textwrap.indent(md, "    ")}'
def test_paths_param():
    """Check that a caller-supplied vector graphics list is accepted."""
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    # An empty list of paths means that no table is detected.
    found = page.find_tables(paths=[])
    assert found.tables == []
def test_boxes_param():
    """Check that a caller-supplied list of boxes is accepted.

    The boxes are derived from the first drawing cluster of the page and the
    positions of its words; the table found from them must have the expected
    content.
    """
    pdf_path = os.path.join(scriptdir, "resources", "small-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    drawings = page.get_drawings()
    outer_box = page.cluster_drawings(drawings=drawings)[0]
    boxes = [outer_box]
    words = page.get_text("words")

    # One box per column header word: a copy of the outer box whose right
    # edge lies 5 units left of the word.
    for word in words:
        if word[4] in ("min", "max", "avg"):
            column_box = +outer_box  # unary plus yields a copy
            column_box.x1 = word[0] - 5
            boxes.append(column_box)

    # One box per distinct (rounded) word bottom: a copy of the outer box
    # ending at that y value.
    bottoms = sorted({round(word[3]) for word in words})
    for bottom in bottoms[:-1]:  # skip last one to avoid empty row
        row_box = +outer_box
        row_box.y1 = bottom
        boxes.append(row_box)

    found = page.find_tables(paths=[], add_boxes=boxes)
    table = found.tables[0]
    assert table.extract() == [
        ["Boiling Points °C", "min", "max", "avg"],
        ["Noble gases", "-269", "-62", "-170.5"],
        ["Nonmetals", "-253", "4827", "414.1"],
        ["Metalloids", "335", "3900", "741.5"],
        ["Metals", "357", ">5000", "2755.9"],
    ]
def test_dotted_grid():
    """Confirm dotted lines are detected as gridlines.

    The fixture page must yield three tables with the expected
    (row_count, col_count) dimensions.
    """
    filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    assert len(tabs.tables) == 3  # must be 3 tables
    t0, t1, t2 = tabs  # extract them
    # Check that they have the expected dimensions.  The tuples must be
    # parenthesized: `assert a, b == (x, y)` is parsed as `assert a` with
    # `b == (x, y)` as the failure message, which verifies nothing.
    # NOTE(review): these expected values were never enforced by the former
    # spelling - confirm them against the fixture if an assertion fails.
    assert (t0.row_count, t0.col_count) == (11, 12)
    assert (t1.row_count, t1.col_count) == (25, 11)
    assert (t2.row_count, t2.col_count) == (1, 10)
def test_4017():
    """Check the extraction of the last two tables of test_4017.pdf.

    All detected tables are printed for diagnosis; only the contents of the
    last two are compared against the expected values.
    """
    # Locate the fixture relative to this module, like the other tests do,
    # instead of relying on the directory being named "tests".
    path = os.path.join(scriptdir, "resources", "test_4017.pdf")
    with pymupdf.open(path) as document:
        page = document[0]
        tables = page.find_tables(add_lines=None)
        print(f"{len(tables.tables)=}.")
        for i, table in enumerate(tables):
            print(f"## {i=}.")
            t = table.extract()
            for tt in t:
                print(f"    {tt}")

        # 2024-11-29: expect current incorrect output for last two tables.
        expected_a = [
            ["Class A/B Overcollateralization", "131.44%", ">=", "122.60%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Class D Overcollateralization", "112.24%", ">=", "106.40%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Event of Default", "156.08%", ">=", "102.50%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Class A/B Interest Coverage", "N/A", ">=", "120.00%", "", "N/A"],
            [None, None, None, None, None, "N/A"],
            ["Class D Interest Coverage", "N/A", ">=", "105.00%", "", "N/A"],
        ]
        assert tables[-2].extract() == expected_a

        expected_b = [
            [
                "Moody's Maximum Rating Factor Test",
                "2,577",
                "<=",
                "3,250",
                "",
                "PASS",
                "2,581",
            ],
            [None, None, None, None, None, "PASS", None],
            [
                "Minimum Floating Spread",
                "3.5006%",
                ">=",
                "2.0000%",
                "",
                "PASS",
                "3.4871%",
            ],
            [None, None, None, None, None, "PASS", None],
            [
                "Minimum Weighted Average S&P Recovery\nRate Test",
                "40.50%",
                ">=",
                "40.00%",
                "",
                "PASS",
                "40.40%",
            ],
            [None, None, None, None, None, "PASS", None],
            ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
        ]
        assert tables[-1].extract() == expected_b
def test_md_styles():
    """Check the markdown output of a table whose cells carry MD styling."""
    pdf_path = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables()[0]
    # Expected markdown, one literal per output line; the text ends with an
    # extra empty line.
    expected = (
        "|Column 1|Column 2|Column 3|\n"
        "|---|---|---|\n"
        "|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n"
        "|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n"
        "|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n"
        "|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"
    )
    assert table.to_markdown() == expected

Mercurial > hgrepos > Python2 > PyMuPDF

comparison tests/test_tables.py @ 1:1d09e1dec1d9 upstream