comparison tests/test_tables.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children a6bc019ac0b2
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 import os
2 import io
3 from pprint import pprint
4 import textwrap
5 import pickle
6 import platform
7
8 import pymupdf
9
# Absolute directory of this test module; fixture files live in "resources".
scriptdir = os.path.abspath(os.path.dirname(__file__))
# The PDF and its pickled reference data share one base name.
_chinese_tables = os.path.join(scriptdir, "resources", "chinese-tables")
filename = _chinese_tables + ".pdf"
pickle_file = _chinese_tables + ".pickle"
13
14
def test_table1():
    """Compare pickled tables with those of the current run.

    The cell contents of both tables on the first page must equal the
    pickled reference exactly.  Cell rectangles may differ slightly from
    the reference, but each old rectangle must be contained in its new
    counterpart.
    """
    # The pickle is a fixture shipped with the test suite.  The context
    # manager guarantees that the file handle is closed again.
    with open(pickle_file, "rb") as pickle_in:
        old_data = pickle.load(pickle_in)  # previously saved data

    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    cells = tabs[0].cells + tabs[1].cells  # all table cell tuples on page
    extracts = [tabs[0].extract(), tabs[1].extract()]  # all table cell content

    # Compare cell contents
    assert old_data["extracts"] == extracts  # same cell contents

    # Compare cell coordinates.
    # Cell rectangles may get somewhat larger due to more cautious border
    # computations, but any differences must be small.
    old_cells = old_data["cells"][0] + old_data["cells"][1]
    assert len(cells) == len(old_cells)
    for new_cell, old_cell in zip(cells, old_cells):
        c1 = pymupdf.Rect(new_cell)  # new cell coordinates
        c0 = pymupdf.Rect(old_cell)  # old cell coordinates
        assert c0 in c1  # always: old contained in new
        assert abs(c1 - c0) < 0.2  # difference must be small
38
39
def test_table2():
    """Check the header properties of both tables on the first page."""
    doc = pymupdf.open(filename)
    page = doc[0]
    # Unpacking also enforces that exactly two tables were found.
    first, second = page.find_tables().tables
    for table in (first, second):
        header = table.header
        # the header is part of the table itself ...
        assert header.external == False
        # ... and is identical with the first table row
        assert header.cells == table.rows[0].cells
50
51
def _add_table_page(doc, rotation, rows, cols, page_size, logical_index):
    """Append a page with a rows x cols grid and a unique text in each cell.

    Args:
        doc: the document receiving the new page.
        rotation: page rotation in degrees; also used to rotate the text.
            For 0 neither a text rotation nor a page rotation is requested.
        rows, cols: grid dimensions as drawn on the unrotated page.
        page_size: keyword arguments passed on to ``doc.new_page``.
        logical_index: callable ``(i, j, rows, cols) -> (row, col)`` mapping
            a physical grid position to the index shown as cell text.
    """
    page = doc.new_page(**page_size)
    rect = page.rect + (72, 72, -72, -72)
    cells = pymupdf.make_table(rect, rows=rows, cols=cols)

    # Draw the complete grid first, then insert the texts.
    for i in range(rows):
        for j in range(cols):
            page.draw_rect(cells[i][j])

    text_options = {"rotate": rotation} if rotation else {}
    for i in range(rows):
        for j in range(cols):
            row, col = logical_index(i, j, rows, cols)
            page.insert_textbox(
                cells[i][j],
                f"cell[{row}][{col}]",
                align=pymupdf.TEXT_ALIGN_CENTER,
                **text_options,
            )

    if rotation:
        page.set_rotation(rotation)
    page.clean_contents()


def test_2812():
    """Ensure table detection and extraction independent from page rotation.

    Make 4 pages with rotations 0, 90, 180 and 270 degrees respectively.
    Each page shows the same 8x5 table.
    We will check that each table is detected and delivers the same content.
    """
    landscape = {"width": 842, "height": 595}
    portrait = {}  # use the default page size of new_page()
    # One entry per page: rotation, rows, cols, page size and the mapping
    # from the physical grid position to the logical cell index.
    layouts = (
        (0, 8, 5, landscape, lambda i, j, rows, cols: (i, j)),
        (90, 5, 8, portrait, lambda i, j, rows, cols: (j, rows - i - 1)),
        (
            180,
            8,
            5,
            landscape,
            lambda i, j, rows, cols: (rows - i - 1, cols - j - 1),
        ),
        (270, 5, 8, portrait, lambda i, j, rows, cols: (cols - j - 1, i)),
    )

    doc = pymupdf.open()
    for rotation, rows, cols, page_size, logical_index in layouts:
        _add_table_page(doc, rotation, rows, cols, page_size, logical_index)
    pdfdata = doc.tobytes()
    doc.close()

    # -------------------------------------------------------------------------
    # Test PDF prepared. Extract table on each page and
    # ensure identical extracted table data.
    # -------------------------------------------------------------------------
    doc = pymupdf.open("pdf", pdfdata)
    extracts = []
    for page in doc:
        tabs = page.find_tables()
        assert len(tabs.tables) == 1
        tab = tabs[0]
        fp = io.StringIO()
        pprint(tab.extract(), stream=fp)
        extracts.append(fp.getvalue())
        assert tab.row_count == 8
        assert tab.col_count == 5

    # every rotated page must deliver the content of the unrotated one
    e0 = extracts[0]
    for e in extracts[1:]:
        assert e == e0
162
163
def test_2979():
    """Regression checks for the fixes of #2979 and #3001.

    2979: identical cell count for each row
    3001: no change of global glyph heights
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_2979.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables()[0]  # extract the table

    # test 2979: collect the distinct row lengths - there must be only one
    row_sizes = {len(row) for row in table.extract()}
    assert len(row_sizes) == 1

    # test 3001
    assert (
        pymupdf.TOOLS.set_small_glyph_heights() is False
    ), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"

    # The expected warnings depend on the MuPDF version in use.
    warning_text = pymupdf.TOOLS.mupdf_warnings()
    if pymupdf.mupdf_version_tuple < (1, 26, 0):
        assert not warning_text
    else:
        assert (
            warning_text
            == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
        )
194
195
def test_3062():
    """Tests the fix for #3062.

    Finding tables a second time on a re-read page must deliver the same
    cells as the first run.
    """
    if platform.python_implementation() == 'GraalVM':
        print('test_3062(): Not running because slow on GraalVM.')
        return

    pdf_path = os.path.join(scriptdir, "resources", "test_3062.pdf")
    doc = pymupdf.open(pdf_path)

    # first run
    page = doc[0]
    first_table = page.find_tables()[0]
    first_cells = first_table.cells

    # second run on a re-read page
    page = None  # presumably forces a fresh page object - TODO confirm
    page = doc[0]
    second_table = page.find_tables()[0]
    second_cells = second_table.cells

    assert second_cells == first_cells
215
216
def test_strict_lines():
    """Check that strategy "lines_strict" yields a smaller table.

    The table found with the strict strategy must have fewer rows and
    fewer columns than the one found with the default strategy.
    """
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]

    default_table = page.find_tables()[0]
    strict_table = page.find_tables(strategy="lines_strict")[0]

    assert strict_table.row_count < default_table.row_count
    assert strict_table.col_count < default_table.col_count
227
228
def test_add_lines():
    """Check table recognition with parameter add_lines.

    Without extra lines no table is found on the page; with three
    additional vertical lines a 5x4 table must be detected.
    """
    if platform.python_implementation() == 'GraalVM':
        print('test_add_lines(): Not running because breaks later tests on GraalVM.')
        return

    pdf_path = os.path.join(scriptdir, "resources", "small-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    assert page.find_tables().tables == []

    # x-coordinates of three vertical lines, each running from y=200 to y=300
    separators = (238.9949951171875, 334.5559997558594, 433.1809997558594)
    more_lines = [((x, 200.0), (x, 300.0)) for x in separators]

    # these 3 additional vertical lines should deliver 4 columns
    table = page.find_tables(add_lines=more_lines)[0]
    assert table.col_count == 4
    assert table.row_count == 5
250
251
def test_3148():
    """Ensure correct text extraction of rotated text.

    A 4x3 grid is filled with texts written at 0, 90, 180 and 270 degrees.
    Every extracted cell text must equal one of the inserted texts.
    """
    doc = pymupdf.open()
    page = doc.new_page()
    text = (
        "rotation 0 degrees",
        "rotation 90 degrees",
        "rotation 180 degrees",
        "rotation 270 degrees",
    )
    degrees = (0, 90, 180, 270)
    shrink = (2, 2, -2, -2)  # added to each cell rectangle before writing
    grid = pymupdf.make_table(pymupdf.Rect(100, 100, 300, 300), cols=3, rows=4)

    for col in range(3):
        for row in range(4):
            cell = grid[row][col]
            page.draw_rect(cell)
            pick = (col + row) % 4  # cycle through the four rotations
            page.insert_textbox(cell + shrink, text[pick], rotate=degrees[pick])

    table = page.find_tables()[0]
    for row_texts in table.extract():
        for cell_text in row_texts:
            # line breaks are replaced by spaces before comparing
            assert cell_text.replace("\n", " ") in text
278
279
def test_3179():
    """Check that the three tables on the page are found separately."""
    pdf_path = os.path.join(scriptdir, "resources", "test_3179.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    finder = page.find_tables()
    assert len(finder.tables) == 3
287
288
def test_battery_file():
    """Check that non-table suspects are ignored.

    Earlier versions erroneously tried to identify table headers
    where there existed no table at all.
    """
    pdf_path = os.path.join(scriptdir, "resources", "battery-file-22.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    finder = page.find_tables()
    assert len(finder.tables) == 0
300
301
def test_markdown():
    """Check the markdown output of a table found with strict lines.

    The expected text depends on the MuPDF version: before 1.26.3 the
    cells of the second column are expected with strikeout markup.
    """
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables(strategy="lines_strict")[0]

    if pymupdf.mupdf_version_tuple < (1, 26, 3):
        expected_rows = [
            "|Header1|Header2|Header3|",
            "|---|---|---|",
            "|Col11<br>Col12|~~Col21~~<br>~~Col22~~|Col31<br>Col32<br>Col33|",
            "|Col13|~~Col23~~|Col34<br>Col35|",
            "|Col14|~~Col24~~|Col36|",
            "|Col15|~~Col25~~<br>~~Col26~~||",
        ]
    else:
        expected_rows = [
            "|Header1|Header2|Header3|",
            "|---|---|---|",
            "|Col11<br>Col12|Col21<br>Col22|Col31<br>Col32<br>Col33|",
            "|Col13|Col23|Col34<br>Col35|",
            "|Col14|Col24|Col36|",
            "|Col15|Col25<br>Col26||",
        ]
    # one line per row; the text ends with an additional empty line
    md_expected = "\n".join(expected_rows) + "\n\n"

    md = table.to_markdown()
    assert md == md_expected, f'Incorrect md:\n{textwrap.indent(md, " ")}'
331
332
def test_paths_param():
    """Check that a caller-supplied vector graphics list is accepted."""
    pdf_path = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    # an empty list of paths must cause all tables to be missed
    finder = page.find_tables(paths=[])
    assert finder.tables == []
340
341
def test_boxes_param():
    """Check that a caller-supplied list of boxes is accepted.

    No vector graphics are passed to the table finder.  The grid is
    described by rectangles derived from the first drawing cluster and
    from word positions.
    """
    pdf_path = os.path.join(scriptdir, "resources", "small-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    drawings = page.get_drawings()
    outer = page.cluster_drawings(drawings=drawings)[0]
    boxes = [outer]
    words = page.get_text("words")

    # One box per column header: a copy of the outer box whose right edge
    # lies a little left of the header word.
    for word in words:
        if word[4] in ("min", "max", "avg"):
            column_box = +outer
            column_box.x1 = word[0] - 5
            boxes.append(column_box)

    # One box per distinct rounded word bottom: a copy of the outer box
    # ending at that y-coordinate.
    bottoms = sorted({round(word[3]) for word in words})
    for bottom in bottoms[:-1]:  # skip last one to avoid empty row
        row_box = +outer
        row_box.y1 = bottom
        boxes.append(row_box)

    finder = page.find_tables(paths=[], add_boxes=boxes)
    table = finder.tables[0]
    assert table.extract() == [
        ["Boiling Points °C", "min", "max", "avg"],
        ["Noble gases", "-269", "-62", "-170.5"],
        ["Nonmetals", "-253", "4827", "414.1"],
        ["Metalloids", "335", "3900", "741.5"],
        ["Metals", "357", ">5000", "2755.9"],
    ]
372
373
def test_dotted_grid():
    """Confirm dotted lines are detected as gridlines.

    The page must deliver three tables with the expected row and column
    counts.
    """
    filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    tabs = page.find_tables()
    assert len(tabs.tables) == 3  # must be 3 tables
    t0, t1, t2 = tabs  # extract them
    # Check that they have expected dimensions.
    # The tuples must be parenthesized: "assert a, b == c" only checks "a"
    # and uses the comparison as the failure message.
    assert (t0.row_count, t0.col_count) == (11, 12)
    assert (t1.row_count, t1.col_count) == (25, 11)
    assert (t2.row_count, t2.col_count) == (1, 10)
386
387
def test_4017():
    """Check the extracted content of the last two tables of test_4017.pdf.

    All tables of the first page are printed for diagnostics.  The
    expected values record the output as of 2024-11-29, which is known
    to be incorrect for these two tables.
    """
    path = os.path.normpath(f"{__file__}/../../tests/resources/test_4017.pdf")
    with pymupdf.open(path) as document:
        page = document[0]

        tables = page.find_tables(add_lines=None)
        print(f"{len(tables.tables)=}.")
        for i, table in enumerate(tables):
            print(f"## {i=}.")
            for row in table.extract():
                print(f" {row}")

        # 2024-11-29: expect current incorrect output for last two tables.

        expected_a = [
            ["Class A/B Overcollateralization", "131.44%", ">=", "122.60%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Class D Overcollateralization", "112.24%", ">=", "106.40%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Event of Default", "156.08%", ">=", "102.50%", "", "PASS"],
            [None, None, None, None, None, "PASS"],
            ["Class A/B Interest Coverage", "N/A", ">=", "120.00%", "", "N/A"],
            [None, None, None, None, None, "N/A"],
            ["Class D Interest Coverage", "N/A", ">=", "105.00%", "", "N/A"],
        ]
        assert tables[-2].extract() == expected_a

        expected_b = [
            [
                "Moody's Maximum Rating Factor Test",
                "2,577",
                "<=",
                "3,250",
                "",
                "PASS",
                "2,581",
            ],
            [None, None, None, None, None, "PASS", None],
            [
                "Minimum Floating Spread",
                "3.5006%",
                ">=",
                "2.0000%",
                "",
                "PASS",
                "3.4871%",
            ],
            [None, None, None, None, None, "PASS", None],
            [
                "Minimum Weighted Average S&P Recovery\nRate Test",
                "40.50%",
                ">=",
                "40.00%",
                "",
                "PASS",
                "40.40%",
            ],
            [None, None, None, None, None, "PASS", None],
            ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
        ]
        assert tables[-1].extract() == expected_b
451
452
def test_md_styles():
    """Check the markdown output of a table with MD-styled cells."""
    pdf_path = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
    doc = pymupdf.open(pdf_path)
    page = doc[0]
    table = page.find_tables()[0]
    # one literal per markdown line; the text ends with an empty line
    expected = (
        "|Column 1|Column 2|Column 3|\n"
        "|---|---|---|\n"
        "|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n"
        "|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n"
        "|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n"
        "|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"
    )
    assert table.to_markdown() == expected