Mercurial > hgrepos > Python2 > PyMuPDF
comparison tests/test_tables.py @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children | a6bc019ac0b2 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 1:1d09e1dec1d9 |
|---|---|
| 1 import os | |
| 2 import io | |
| 3 from pprint import pprint | |
| 4 import textwrap | |
| 5 import pickle | |
| 6 import platform | |
| 7 | |
| 8 import pymupdf | |
| 9 | |
# Absolute directory of this test module; all fixtures live in "resources".
scriptdir = os.path.abspath(os.path.dirname(__file__))
_resources = os.path.join(scriptdir, "resources")
# PDF with two tables, and the pickled reference data extracted from it.
filename = os.path.join(_resources, "chinese-tables.pdf")
pickle_file = os.path.join(_resources, "chinese-tables.pickle")
| 13 | |
| 14 | |
def test_table1():
    """Compare pickled tables with those of the current run.

    The reference pickle holds the cell rectangles ("cells") and the cell
    contents ("extracts") of the two tables on page 0 of the test PDF.
    """
    # The pickle is a fixture shipped with the test suite (trusted input).
    # Use a context manager so the file handle is closed deterministically.
    with open(pickle_file, "rb") as pickle_in:
        old_data = pickle.load(pickle_in)  # previously saved data

    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    cells = tabs[0].cells + tabs[1].cells  # all table cell tuples on page
    extracts = [tabs[0].extract(), tabs[1].extract()]  # all table cell content

    # Compare cell contents
    assert old_data["extracts"] == extracts  # same cell contents

    # Compare cell coordinates.
    # Cell rectangles may get somewhat larger due to more cautious border
    # computations, but any differences must be small.
    old_cells = old_data["cells"][0] + old_data["cells"][1]
    assert len(cells) == len(old_cells)
    for new_cell, old_cell in zip(cells, old_cells):
        c1 = pymupdf.Rect(new_cell)  # new cell coordinates
        c0 = pymupdf.Rect(old_cell)  # old cell coordinates
        assert c0 in c1  # always: old contained in new
        assert abs(c1 - c0) < 0.2  # difference must be small
| 38 | |
| 39 | |
def test_table2():
    """Check the header properties of both tables on the page.

    Each table is expected to contain its own header: the header is not
    external and its cells are those of the first table row.
    """
    doc = pymupdf.open(filename)
    first_page = doc[0]
    # Unpacking also enforces that exactly two tables are found.
    tab1, tab2 = first_page.find_tables().tables
    for table in (tab1, tab2):
        header = table.header
        assert header.external == False
        assert header.cells == table.rows[0].cells
| 50 | |
| 51 | |
def test_2812():
    """Table detection and extraction must not depend on page rotation.

    Builds a 4-page PDF with page rotations 0, 90, 180 and 270 degrees.
    Every page shows the same 8x5 table.  Each page must yield exactly one
    table, and all four tables must deliver the same content.
    """
    margins = (72, 72, -72, -72)

    def fill_page(page, rows, cols, rotation, label):
        """Draw a rows x cols grid on *page* and write a label into each cell."""
        grid = pymupdf.make_table(page.rect + margins, rows=rows, cols=cols)
        # First pass: draw all cell borders.
        for r in range(rows):
            for c in range(cols):
                page.draw_rect(grid[r][c])
        # Second pass: insert the unique cell texts.  The "rotate" argument
        # is only passed for rotated pages.
        text_args = {"align": pymupdf.TEXT_ALIGN_CENTER}
        if rotation:
            text_args["rotate"] = rotation
        for r in range(rows):
            for c in range(cols):
                page.insert_textbox(
                    grid[r][c],
                    label(r, c, rows, cols),
                    **text_args,
                )
        if rotation:
            page.set_rotation(rotation)
        page.clean_contents()

    landscape = {"width": 842, "height": 595}
    # One entry per page:
    # (new_page arguments, grid rows, grid cols, rotation, label function).
    # The label functions map physical grid positions (i, j) to the cell
    # name that the cell must show once the page rotation is applied.
    page_specs = (
        (landscape, 8, 5, 0, lambda i, j, rows, cols: f"cell[{i}][{j}]"),
        ({}, 5, 8, 90, lambda i, j, rows, cols: f"cell[{j}][{rows-i-1}]"),
        (
            landscape,
            8,
            5,
            180,
            lambda i, j, rows, cols: f"cell[{rows-i-1}][{cols-j-1}]",
        ),
        ({}, 5, 8, 270, lambda i, j, rows, cols: f"cell[{cols-j-1}][{i}]"),
    )

    doc = pymupdf.open()
    for page_args, rows, cols, rotation, label in page_specs:
        fill_page(doc.new_page(**page_args), rows, cols, rotation, label)

    pdfdata = doc.tobytes()
    # doc.ez_save("test-2812.pdf")
    doc.close()

    # -------------------------------------------------------------------------
    # Test PDF prepared. Extract the table on each page and
    # ensure identical extracted table data.
    # -------------------------------------------------------------------------
    doc = pymupdf.open("pdf", pdfdata)
    extracts = []
    for page in doc:
        found = page.find_tables()
        assert len(found.tables) == 1
        table = found[0]
        buffer = io.StringIO()
        pprint(table.extract(), stream=buffer)
        extracts.append(buffer.getvalue())
        assert table.row_count == 8
        assert table.col_count == 5

    reference = extracts[0]
    for other in extracts[1:]:
        assert other == reference
| 162 | |
| 163 | |
def test_2979():
    """Regression test for fixes #2979 and #3001.

    2979: identical cell count for each row
    3001: no change of global glyph heights
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_2979.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables()[0]  # extract the table

    # Collect the distinct row lengths (number of cells per row).
    row_lengths = {len(row) for row in table.extract()}

    # test 2979: all rows have the same number of cells
    assert len(row_lengths) == 1

    # test 3001: the global small-glyph-heights setting is still off
    assert (
        pymupdf.TOOLS.set_small_glyph_heights() is False
    ), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"

    # Warnings are only expected for MuPDF 1.26.0 or later.
    warnings_text = pymupdf.TOOLS.mupdf_warnings()
    if pymupdf.mupdf_version_tuple < (1, 26, 0):
        assert not warnings_text
    else:
        assert (
            warnings_text
            == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
        )
| 194 | |
| 195 | |
def test_3062():
    """Regression test for the fix of #3062.

    After table extraction, a rotated page should behave and look
    as before: running the detection a second time on a freshly loaded
    page object must deliver the same cells.
    """
    if platform.python_implementation() == 'GraalVM':
        print(f'test_3062(): Not running because slow on GraalVM.')
        return

    pdf_path = os.path.join(scriptdir, "resources", "test_3062.pdf")
    doc = pymupdf.open(pdf_path)

    # First run of the table finder.
    page = doc[0]
    first_table = page.find_tables()[0]
    first_cells = first_table.cells

    # Drop the page reference and load the page again before the second run.
    page = None
    page = doc[0]
    second_table = page.find_tables()[0]
    second_cells = second_table.cells

    assert second_cells == first_cells
| 215 | |
| 216 | |
def test_strict_lines():
    """Confirm that ignoring borderless rectangles improves table detection.

    The "lines_strict" strategy must find a table with fewer rows and
    fewer columns than the default strategy.
    """
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]

    default_tab = page.find_tables()[0]
    strict_tab = page.find_tables(strategy="lines_strict")[0]

    assert strict_tab.row_count < default_tab.row_count
    assert strict_tab.col_count < default_tab.col_count
| 227 | |
| 228 | |
def test_add_lines():
    """Test parameter add_lines of the table finder.

    Without help no table is found on the page; with three user-supplied
    vertical lines a table with 4 columns and 5 rows must be detected.
    """
    if platform.python_implementation() == 'GraalVM':
        print(f'test_add_lines(): Not running because breaks later tests on GraalVM.')
        return

    pdf_path = os.path.join(scriptdir, "resources", "small-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    assert page.find_tables().tables == []

    # Three additional vertical lines, each given as (start point, end point)
    # and running from y=200 to y=300.  They should add 3 column borders.
    x_positions = (238.9949951171875, 334.5559997558594, 433.1809997558594)
    more_lines = [((x, 200.0), (x, 300.0)) for x in x_positions]

    table = page.find_tables(add_lines=more_lines)[0]
    assert table.col_count == 4
    assert table.row_count == 5
| 250 | |
| 251 | |
def test_3148():
    """Ensure correct text extraction of rotated text.

    A 4x3 table is filled with texts written at rotations 0, 90, 180 and
    270 degrees.  Every extracted cell (with line breaks replaced by
    spaces) must equal one of the inserted texts.
    """
    doc = pymupdf.open()
    page = doc.new_page()
    table_rect = pymupdf.Rect(100, 100, 300, 300)
    degrees = (0, 90, 180, 270)
    text = (
        "rotation 0 degrees",
        "rotation 90 degrees",
        "rotation 180 degrees",
        "rotation 270 degrees",
    )
    inset = (2, 2, -2, -2)  # added to each cell rectangle before inserting text
    grid = pymupdf.make_table(table_rect, cols=3, rows=4)
    for col in range(3):
        for row in range(4):
            cell = grid[row][col]
            page.draw_rect(cell)
            # cycle through the four rotations across the grid
            choice = (col + row) % 4
            page.insert_textbox(cell + inset, text[choice], rotate=degrees[choice])
    # doc.save("multi-degree.pdf")
    table = page.find_tables()[0]
    for row_items in table.extract():
        for item in row_items:
            assert item.replace("\n", " ") in text
| 278 | |
| 279 | |
def test_3179():
    """Test correct separation of multiple tables on a page.

    The first page of the test file must yield exactly three tables.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_3179.pdf")
    doc = pymupdf.open(pdf_path)
    first_page = doc[0]
    found = first_page.find_tables()
    assert len(found.tables) == 3
| 287 | |
| 288 | |
def test_battery_file():
    """Test that non-table suspects are correctly ignored.

    Earlier versions erroneously tried to identify table headers
    where no table existed at all.  The page must yield zero tables.
    """
    pdf_path = os.path.join(scriptdir, "resources", "battery-file-22.pdf")
    doc = pymupdf.open(pdf_path)
    first_page = doc[0]
    found = first_page.find_tables()
    assert len(found.tables) == 0
| 300 | |
| 301 | |
def test_markdown():
    """Confirm correct markdown output of a table.

    The expected text depends on the MuPDF version: before 1.26.3 the
    second column is expected with strikeout markup (~~...~~), from
    1.26.3 on without it.
    """
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables(strategy="lines_strict")[0]

    if pymupdf.mupdf_version_tuple >= (1, 26, 3):
        md_expected = (
            "|Header1|Header2|Header3|\n"
            "|---|---|---|\n"
            "|Col11<br>Col12|Col21<br>Col22|Col31<br>Col32<br>Col33|\n"
            "|Col13|Col23|Col34<br>Col35|\n"
            "|Col14|Col24|Col36|\n"
            "|Col15|Col25<br>Col26||\n\n"
        )
    else:
        # Same value the dedented, left-stripped triple-quoted text yields.
        md_expected = (
            "|Header1|Header2|Header3|\n"
            "|---|---|---|\n"
            "|Col11<br>Col12|~~Col21~~<br>~~Col22~~|Col31<br>Col32<br>Col33|\n"
            "|Col13|~~Col23~~|Col34<br>Col35|\n"
            "|Col14|~~Col24~~|Col36|\n"
            "|Col15|~~Col25~~<br>~~Col26~~||\n\n"
        )

    md = table.to_markdown()
    assert md == md_expected, f'Incorrect md:\n{textwrap.indent(md, " ")}'
| 331 | |
| 332 | |
def test_paths_param():
    """Confirm acceptance of a user-supplied vector graphics list.

    Passing an empty list as "paths" must result in no table being found.
    """
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    first_page = doc[0]
    found = first_page.find_tables(paths=[])  # will cause all tables to be missed
    assert found.tables == []
| 340 | |
| 341 | |
def test_boxes_param():
    """Confirm acceptance of a user-supplied list of boxes.

    The page's vector graphics are withheld (paths=[]); instead, boxes
    derived from the first drawing cluster and from word positions are
    passed via "add_boxes".  The table found must have the expected content.
    """
    pdf_path = os.path.join(scriptdir, "resources", "small-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    drawings = page.get_drawings()
    outer = page.cluster_drawings(drawings=drawings)[0]
    words = page.get_text("words")

    boxes = [outer]

    # One box per header word "min" / "max" / "avg": like the outer box, but
    # with its right edge 5 units left of the word's first coordinate.
    for word in words:
        if word[4] in ("min", "max", "avg"):
            box = +outer  # unary plus: presumably yields a fresh copy
            box.x1 = word[0] - 5
            boxes.append(box)

    # One box per distinct rounded value of word item 3: like the outer box,
    # but with its bottom edge at that value.
    bottoms = sorted({round(word[3]) for word in words})
    for bottom in bottoms[:-1]:  # skip last one to avoid empty row
        box = +outer
        box.y1 = bottom
        boxes.append(box)

    found = page.find_tables(paths=[], add_boxes=boxes)
    table = found.tables[0]
    expected = [
        ["Boiling Points °C", "min", "max", "avg"],
        ["Noble gases", "-269", "-62", "-170.5"],
        ["Nonmetals", "-253", "4827", "414.1"],
        ["Metalloids", "335", "3900", "741.5"],
        ["Metals", "357", ">5000", "2755.9"],
    ]
    assert table.extract() == expected
| 372 | |
| 373 | |
def test_dotted_grid():
    """Confirm dotted lines are detected as gridlines.

    The page must yield exactly three tables with the expected
    (row count, column count) dimensions.
    """
    filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    assert len(tabs.tables) == 3  # must be 3 tables
    t0, t1, t2 = tabs  # extract them
    # Check that they have the expected dimensions.
    # The tuple must be parenthesised: "assert a, b == c" only checks "a"
    # and uses "b == c" as the failure message, so it never compares anything.
    assert (t0.row_count, t0.col_count) == (11, 12)
    assert (t1.row_count, t1.col_count) == (25, 11)
    assert (t2.row_count, t2.col_count) == (1, 10)
| 386 | |
| 387 | |
def test_4017():
    """Check extraction of the last two tables on page 0 of test_4017.pdf.

    All tables found are printed for diagnostic purposes.  The expected
    values capture the output as of 2024-11-29, which the test's author
    marked as currently incorrect for these last two tables.
    """
    # Locate the fixture like the other tests in this module do.
    path = os.path.join(scriptdir, "resources", "test_4017.pdf")
    with pymupdf.open(path) as document:
        page = document[0]

        tables = page.find_tables(add_lines=None)
        print(f"{len(tables.tables)=}.")
        for i, table in enumerate(tables):
            print(f"## {i=}.")
            for row in table.extract():
                print(f" {row}")

        # 2024-11-29: expect current incorrect output for last two tables.

        expected_a = [
            ["Class A/B Overcollateralization", "131.44%", ">=", "122.60%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Class D Overcollateralization", "112.24%", ">=", "106.40%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Event of Default", "156.08%", ">=", "102.50%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Class A/B Interest Coverage", "N/A", ">=", "120.00%", "", "N/A"],
            [None, None, None, None, None, "N/A"],
            ["Class D Interest Coverage", "N/A", ">=", "105.00%", "", "N/A"],
        ]
        assert tables[-2].extract() == expected_a

        expected_b = [
            [
                "Moody's Maximum Rating Factor Test",
                "2,577",
                "<=",
                "3,250",
                "",
                "PASS",
                "2,581",
            ],
            [None, None, None, None, None, "PASS", None],
            [
                "Minimum Floating Spread",
                "3.5006%",
                ">=",
                "2.0000%",
                "",
                "PASS",
                "3.4871%",
            ],
            [None, None, None, None, None, "PASS", None],
            [
                "Minimum Weighted Average S&P Recovery\nRate Test",
                "40.50%",
                ">=",
                "40.00%",
                "",
                "PASS",
                "40.40%",
            ],
            [None, None, None, None, None, "PASS", None],
            ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
        ]
        assert tables[-1].extract() == expected_b
| 451 | |
| 452 | |
def test_md_styles():
    """Test markdown output of a table with MD-styled cells.

    The expected text contains bold, italic, monospaced and strikeout
    markup as well as <br> tags inside cells.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables()[0]
    # Expected markdown, one source line per table row.
    expected = (
        "|Column 1|Column 2|Column 3|\n"
        "|---|---|---|\n"
        "|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n"
        "|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n"
        "|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n"
        "|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"
    )
    assert table.to_markdown() == expected
