Mercurial > hgrepos > Python2 > PyMuPDF
diff tests/test_tables.py @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children | a6bc019ac0b2 |
line wrap: on
line diff
"""Tests for table detection and extraction via ``Page.find_tables()``."""

import os
import io
from pprint import pprint
import textwrap
import pickle
import platform

import pymupdf

scriptdir = os.path.abspath(os.path.dirname(__file__))
filename = os.path.join(scriptdir, "resources", "chinese-tables.pdf")
pickle_file = os.path.join(scriptdir, "resources", "chinese-tables.pickle")


def test_table1():
    """Compare pickled tables with those of the current run."""
    # The pickle is a fixture located next to this test file; load it inside
    # a context manager so the file handle is closed deterministically.
    with open(pickle_file, "rb") as pickle_in:
        old_data = pickle.load(pickle_in)  # previously saved data

    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    cells = tabs[0].cells + tabs[1].cells  # all table cell tuples on page
    extracts = [tabs[0].extract(), tabs[1].extract()]  # all table cell content

    # Compare cell contents
    assert old_data["extracts"] == extracts  # same cell contents

    # Compare cell coordinates.
    # Cell rectangles may get somewhat larger due to more cautious border
    # computations, but any differences must be small.
    old_cells = old_data["cells"][0] + old_data["cells"][1]
    assert len(cells) == len(old_cells)
    for new_cell, old_cell in zip(cells, old_cells):
        c1 = pymupdf.Rect(new_cell)  # new cell coordinates
        c0 = pymupdf.Rect(old_cell)  # old cell coordinates
        assert c0 in c1  # always: old contained in new
        assert abs(c1 - c0) < 0.2  # difference must be small


def test_table2():
    """Confirm header properties."""
    doc = pymupdf.open(filename)
    page = doc[0]
    tab1, tab2 = page.find_tables().tables
    # both tables contain their header data
    assert tab1.header.external == False
    assert tab1.header.cells == tab1.rows[0].cells
    assert tab2.header.external == False
    assert tab2.header.cells == tab2.rows[0].cells


def test_2812():
    """Ensure table detection and extraction independent from page rotation.

    Make 4 pages with rotations 0, 90, 180 and 270 degrees respectively.
    Each page shows the same 8x5 table.
    We will check that each table is detected and delivers the same content.
    """
    doc = pymupdf.open()
    # Page 0: rotation 0
    page = doc.new_page(width=842, height=595)
    rect = page.rect + (72, 72, -72, -72)
    cols = 5
    rows = 8
    # define the cells, draw the grid and insert unique text in each cell.
    cells = pymupdf.make_table(rect, rows=rows, cols=cols)
    for i in range(rows):
        for j in range(cols):
            page.draw_rect(cells[i][j])
    for i in range(rows):
        for j in range(cols):
            page.insert_textbox(
                cells[i][j],
                f"cell[{i}][{j}]",
                align=pymupdf.TEXT_ALIGN_CENTER,
            )
    page.clean_contents()

    # Page 1: rotation 90 degrees
    # The grid is built transposed (5 rows x 8 columns) and the cell labels
    # are remapped accordingly before the page rotation is set.
    page = doc.new_page()
    rect = page.rect + (72, 72, -72, -72)
    cols = 8
    rows = 5
    cells = pymupdf.make_table(rect, rows=rows, cols=cols)
    for i in range(rows):
        for j in range(cols):
            page.draw_rect(cells[i][j])
    for i in range(rows):
        for j in range(cols):
            page.insert_textbox(
                cells[i][j],
                f"cell[{j}][{rows-i-1}]",
                rotate=90,
                align=pymupdf.TEXT_ALIGN_CENTER,
            )
    page.set_rotation(90)
    page.clean_contents()

    # Page 2: rotation 180 degrees
    page = doc.new_page(width=842, height=595)
    rect = page.rect + (72, 72, -72, -72)
    cols = 5
    rows = 8
    cells = pymupdf.make_table(rect, rows=rows, cols=cols)
    for i in range(rows):
        for j in range(cols):
            page.draw_rect(cells[i][j])
    for i in range(rows):
        for j in range(cols):
            page.insert_textbox(
                cells[i][j],
                f"cell[{rows-i-1}][{cols-j-1}]",
                rotate=180,
                align=pymupdf.TEXT_ALIGN_CENTER,
            )
    page.set_rotation(180)
    page.clean_contents()

    # Page 3: rotation 270 degrees
    page = doc.new_page()
    rect = page.rect + (72, 72, -72, -72)
    cols = 8
    rows = 5
    cells = pymupdf.make_table(rect, rows=rows, cols=cols)
    for i in range(rows):
        for j in range(cols):
            page.draw_rect(cells[i][j])
    for i in range(rows):
        for j in range(cols):
            page.insert_textbox(
                cells[i][j],
                f"cell[{cols-j-1}][{i}]",
                rotate=270,
                align=pymupdf.TEXT_ALIGN_CENTER,
            )
    page.set_rotation(270)
    page.clean_contents()

    pdfdata = doc.tobytes()
    # doc.ez_save("test-2812.pdf")
    doc.close()

    # -------------------------------------------------------------------------
    # Test PDF prepared. Extract table on each page and
    # ensure identical extracted table data.
    # -------------------------------------------------------------------------
    doc = pymupdf.open("pdf", pdfdata)
    extracts = []
    for page in doc:
        tabs = page.find_tables()
        assert len(tabs.tables) == 1
        tab = tabs[0]
        # Compare the pretty-printed text form of each extraction.
        fp = io.StringIO()
        pprint(tab.extract(), stream=fp)
        extracts.append(fp.getvalue())
        assert tab.row_count == 8
        assert tab.col_count == 5
    e0 = extracts[0]
    for e in extracts[1:]:
        assert e == e0


def test_2979():
    """This tests fix #2979 and #3001.

    2979: identical cell count for each row
    3001: no change of global glyph heights
    """
    filename = os.path.join(scriptdir, "resources", "test_2979.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tab = page.find_tables()[0]  # extract the table
    lengths = {len(row) for row in tab.extract()}  # all row cell counts

    # test 2979
    assert len(lengths) == 1

    # test 3001
    assert (
        pymupdf.TOOLS.set_small_glyph_heights() is False
    ), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"

    wt = pymupdf.TOOLS.mupdf_warnings()
    if pymupdf.mupdf_version_tuple >= (1, 26, 0):
        assert (
            wt
            == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
        )
    else:
        assert not wt


def test_3062():
    """Tests the fix for #3062.

    After table extraction, a rotated page should behave and look
    as before."""
    if platform.python_implementation() == 'GraalVM':
        print('test_3062(): Not running because slow on GraalVM.')
        return

    filename = os.path.join(scriptdir, "resources", "test_3062.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tab0 = page.find_tables()[0]
    cells0 = tab0.cells

    # Drop the page object and load the page again: a second table search
    # must deliver the same cells as the first one.
    page = None
    page = doc[0]
    tab1 = page.find_tables()[0]
    cells1 = tab1.cells
    assert cells1 == cells0


def test_strict_lines():
    """Confirm that ignoring borderless rectangles improves table detection."""
    filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]

    tab1 = page.find_tables()[0]
    tab2 = page.find_tables(strategy="lines_strict")[0]
    assert tab2.row_count < tab1.row_count
    assert tab2.col_count < tab1.col_count


def test_add_lines():
    """Test new parameter add_lines for table recognition."""
    if platform.python_implementation() == 'GraalVM':
        print('test_add_lines(): Not running because breaks later tests on GraalVM.')
        return

    filename = os.path.join(scriptdir, "resources", "small-table.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    assert page.find_tables().tables == []

    more_lines = [
        ((238.9949951171875, 200.0), (238.9949951171875, 300.0)),
        ((334.5559997558594, 200.0), (334.5559997558594, 300.0)),
        ((433.1809997558594, 200.0), (433.1809997558594, 300.0)),
    ]

    # these 3 additional vertical lines should yield 3 additional columns
    tab2 = page.find_tables(add_lines=more_lines)[0]
    assert tab2.col_count == 4
    assert tab2.row_count == 5


def test_3148():
    """Ensure correct extraction text of rotated text."""
    doc = pymupdf.open()
    page = doc.new_page()
    rect = pymupdf.Rect(100, 100, 300, 300)
    text = (
        "rotation 0 degrees",
        "rotation 90 degrees",
        "rotation 180 degrees",
        "rotation 270 degrees",
    )
    degrees = (0, 90, 180, 270)
    delta = (2, 2, -2, -2)
    cells = pymupdf.make_table(rect, cols=3, rows=4)
    for i in range(3):
        for j in range(4):
            page.draw_rect(cells[j][i])
            k = (i + j) % 4  # cycle through the four rotations
            page.insert_textbox(cells[j][i] + delta, text[k], rotate=degrees[k])
    # doc.save("multi-degree.pdf")
    tabs = page.find_tables()
    tab = tabs[0]
    for extract in tab.extract():
        for item in extract:
            # line breaks inside a cell are irrelevant for this comparison
            assert item.replace("\n", " ") in text


def test_3179():
    """Test correct separation of multiple tables on page."""
    filename = os.path.join(scriptdir, "resources", "test_3179.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    assert len(tabs.tables) == 3


def test_battery_file():
    """Tests correctly ignoring non-table suspects.

    Earlier versions erroneously tried to identify table headers
    where there existed no table at all.
    """
    filename = os.path.join(scriptdir, "resources", "battery-file-22.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    assert len(tabs.tables) == 0


def test_markdown():
    """Confirm correct markdown output."""
    filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tab = page.find_tables(strategy="lines_strict")[0]
    # The expected markdown depends on the MuPDF version: before 1.26.3 the
    # second column is expected with strikeout (~~) markup.
    if pymupdf.mupdf_version_tuple < (1, 26, 3):
        md_expected = textwrap.dedent('''
                |Header1|Header2|Header3|
                |---|---|---|
                |Col11<br>Col12|~~Col21~~<br>~~Col22~~|Col31<br>Col32<br>Col33|
                |Col13|~~Col23~~|Col34<br>Col35|
                |Col14|~~Col24~~|Col36|
                |Col15|~~Col25~~<br>~~Col26~~||

                ''').lstrip()
    else:
        md_expected = (
            "|Header1|Header2|Header3|\n"
            "|---|---|---|\n"
            "|Col11<br>Col12|Col21<br>Col22|Col31<br>Col32<br>Col33|\n"
            "|Col13|Col23|Col34<br>Col35|\n"
            "|Col14|Col24|Col36|\n"
            "|Col15|Col25<br>Col26||\n\n"
        )

    md = tab.to_markdown()
    assert md == md_expected, f'Incorrect md:\n{textwrap.indent(md, " ")}'


def test_paths_param():
    """Confirm acceptance of supplied vector graphics list."""
    filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables(paths=[])  # causes all tables to be missed
    assert tabs.tables == []


def test_boxes_param():
    """Confirm acceptance of supplied boxes list."""
    filename = os.path.join(scriptdir, "resources", "small-table.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    paths = page.get_drawings()
    box0 = page.cluster_drawings(drawings=paths)[0]
    boxes = [box0]
    words = page.get_text("words")

    # One box per column header word: a copy of box0 whose right edge ends
    # slightly left of that word.
    x_vals = [w[0] - 5 for w in words if w[4] in ("min", "max", "avg")]
    for x in x_vals:
        r = +box0  # unary plus makes a copy of the rectangle
        r.x1 = x
        boxes.append(r)

    # One box per distinct (rounded) word bottom coordinate.
    y_vals = sorted({round(w[3]) for w in words})
    for y in y_vals[:-1]:  # skip last one to avoid empty row
        r = +box0
        r.y1 = y
        boxes.append(r)

    tabs = page.find_tables(paths=[], add_boxes=boxes)
    tab = tabs.tables[0]
    assert tab.extract() == [
        ["Boiling Points °C", "min", "max", "avg"],
        ["Noble gases", "-269", "-62", "-170.5"],
        ["Nonmetals", "-253", "4827", "414.1"],
        ["Metalloids", "335", "3900", "741.5"],
        ["Metals", "357", ">5000", "2755.9"],
    ]


def test_dotted_grid():
    """Confirm dotted lines are detected as gridlines."""
    filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    assert len(tabs.tables) == 3  # must be 3 tables
    t0, t1, t2 = tabs  # extract them
    # Check that they have expected dimensions.
    # The left-hand side must be a parenthesised tuple: in
    # "assert a, b == c" the part after the comma is only the failure
    # message, so nothing but the truthiness of "a" would be checked.
    assert (t0.row_count, t0.col_count) == (11, 12)
    assert (t1.row_count, t1.col_count) == (25, 11)
    assert (t2.row_count, t2.col_count) == (1, 10)


def test_4017():
    """Compare the last two tables of test_4017.pdf with expected output.

    As noted below, the expected values reflect the current (incorrect)
    extraction result for these two tables.
    """
    path = os.path.normpath(f"{__file__}/../../tests/resources/test_4017.pdf")
    with pymupdf.open(path) as document:
        page = document[0]

        tables = page.find_tables(add_lines=None)
        print(f"{len(tables.tables)=}.")
        # Diagnostic output: dump all rows of every table found.
        for i, table in enumerate(tables):
            print(f"## {i=}.")
            t = table.extract()
            for tt in t:
                print(f" {tt}")

        # 2024-11-29: expect current incorrect output for last two tables.

        expected_a = [
            ["Class A/B Overcollateralization", "131.44%", ">=", "122.60%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Class D Overcollateralization", "112.24%", ">=", "106.40%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Event of Default", "156.08%", ">=", "102.50%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Class A/B Interest Coverage", "N/A", ">=", "120.00%", "", "N/A"],
            [None, None, None, None, None, "N/A"],
            ["Class D Interest Coverage", "N/A", ">=", "105.00%", "", "N/A"],
        ]
        assert tables[-2].extract() == expected_a

        expected_b = [
            [
                "Moody's Maximum Rating Factor Test",
                "2,577",
                "<=",
                "3,250",
                "",
                "PASS",
                "2,581",
            ],
            [None, None, None, None, None, "PASS", None],
            [
                "Minimum Floating Spread",
                "3.5006%",
                ">=",
                "2.0000%",
                "",
                "PASS",
                "3.4871%",
            ],
            [None, None, None, None, None, "PASS", None],
            [
                "Minimum Weighted Average S&P Recovery\nRate Test",
                "40.50%",
                ">=",
                "40.00%",
                "",
                "PASS",
                "40.40%",
            ],
            [None, None, None, None, None, "PASS", None],
            ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
        ]
        assert tables[-1].extract() == expected_b


def test_md_styles():
    """Test output of table with MD-styled cells."""
    filename = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tab = page.find_tables()[0]  # first (index 0) table found on the page
    text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"""
    assert tab.to_markdown() == text
