comparison src/table.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 """
2 Copyright (C) 2023 Artifex Software, Inc.
3
4 This file is part of PyMuPDF.
5
6 PyMuPDF is free software: you can redistribute it and/or modify it under the
7 terms of the GNU Affero General Public License as published by the Free
8 Software Foundation, either version 3 of the License, or (at your option)
9 any later version.
10
11 PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 details.
15
16 You should have received a copy of the GNU Affero General Public License
17 along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
18
19 Alternative licensing terms are available from the licensor.
20 For commercial licensing, see <https://www.artifex.com/> or contact
21 Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
22 CA 94129, USA, for further information.
23
24 ---------------------------------------------------------------------
25 Portions of this code have been ported from pdfplumber, see
26 https://pypi.org/project/pdfplumber/.
27
28 The ported code is under the following MIT license:
29
30 ---------------------------------------------------------------------
31 The MIT License (MIT)
32
33 Copyright (c) 2015, Jeremy Singer-Vine
34
35 Permission is hereby granted, free of charge, to any person obtaining a copy
36 of this software and associated documentation files (the "Software"), to deal
37 in the Software without restriction, including without limitation the rights
38 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
39 copies of the Software, and to permit persons to whom the Software is
40 furnished to do so, subject to the following conditions:
41
42 The above copyright notice and this permission notice shall be included in all
43 copies or substantial portions of the Software.
44
45 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
48 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
50 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
51 SOFTWARE.
52 ---------------------------------------------------------------------
53 Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt
54 ---------------------------------------------------------------------
55
56 The porting mainly pertains to files "table.py" and relevant parts of
57 "utils/text.py" within pdfplumber's repository on Github.
58 With respect to "text.py", we have removed functions or features that are not
59 used by table processing. Examples are:
60
61 * the text search function
62 * simple text extraction
63 * text extraction by lines
64
The original pdfplumber code neither detects nor identifies table headers.
66 This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'.
67 This is implemented as new class TableHeader with the properties:
68 * bbox: A tuple for the header's bbox
69 * cells: A tuple for each bbox of a column header
70 * names: A list of strings with column header text
71 * external: A bool indicating whether the header is outside the table cells.
72
73 """
74
75 import inspect
76 import itertools
77 import string
78 import html
79 from collections.abc import Sequence
80 from dataclasses import dataclass
81 from operator import itemgetter
82 import weakref
83
84 # -------------------------------------------------------------------
85 # Start of PyMuPDF interface code
86 # -------------------------------------------------------------------
87 from . import (
88 Rect,
89 Matrix,
90 TEXTFLAGS_TEXT,
91 TEXT_FONT_BOLD,
92 TEXT_FONT_ITALIC,
93 TEXT_FONT_MONOSPACED,
94 TEXT_FONT_SUPERSCRIPT,
95 TEXT_COLLECT_STYLES,
96 TOOLS,
97 EMPTY_RECT,
98 sRGB_to_pdf,
99 Point,
100 message,
101 mupdf,
102 )
103
# Module-level state and flag constants of the PyMuPDF interface code.
EDGES = []  # vector graphics from PyMuPDF
CHARS = []  # text characters from PyMuPDF
TEXTPAGE = None  # starts unset; NOTE(review): presumably assigned by table-finding code outside this section
# Bit masks tested against span["char_flags"] in extract_cells()
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
# Extraction flag combination that extract_cells() documents as required
# for the TextPage it receives.
FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES

white_spaces = set(string.whitespace)  # for checking white space only cells
112
113
def extract_cells(textpage, cell, markdown=False):
    """Return the text located inside the rect-like 'cell'.

    The text is returned either as plain text or decorated with Markdown
    styling. Markdown output only works correctly if the TextPage was
    created with extraction flag bit TEXT_COLLECT_STYLES set.

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell, stripped of
        leading and trailing white space.
    """

    def misses_cell(box):
        # True if 'box' has no overlap with the cell at all.
        return bool(
            box[0] > cell[2]
            or box[2] < cell[0]
            or box[1] > cell[3]
            or box[3] < cell[1]
        )

    def mostly_inside(char_bbox):
        # A character counts only if more than 50% of its bbox is in the cell.
        char_rect = Rect(char_bbox)
        return abs(char_rect & cell) > 0.5 * abs(char_rect)

    line_break = "<br>" if markdown else "\n"
    text = ""

    for block in textpage.extractRAWDICT()["blocks"]:
        # only text blocks (type 0) that touch the cell are of interest
        if block["type"] != 0 or misses_cell(block["bbox"]):
            continue

        for line in block["lines"]:
            if misses_cell(line["bbox"]):
                continue

            if text:  # anything collected so far: this is a new cell line
                text += line_break

            # strikeout detection only works with horizontal text
            horizontal = line["dir"] in ((0, 1), (1, 0))

            for span in line["spans"]:
                if misses_cell(span["bbox"]):
                    continue

                span_text = "".join(
                    char["c"] for char in span["chars"] if mostly_inside(char["bbox"])
                )
                if not span_text:
                    continue  # nothing of this span is inside the cell

                if not markdown:  # plain text: no styling needed
                    text += span_text
                    continue

                # Collect the styling markers in opening order; the closing
                # sequence is the same markers in reverse order.
                char_flags = span["char_flags"]
                markers = []
                if horizontal and char_flags & TEXT_STRIKEOUT:
                    markers.append("~~")
                if char_flags & TEXT_BOLD:
                    markers.append("**")
                if span["flags"] & TEXT_FONT_ITALIC:
                    markers.append("_")
                if span["flags"] & TEXT_FONT_MONOSPACED:
                    markers.append("`")
                prefix = "".join(markers)
                suffix = "".join(reversed(markers))

                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                if suffix and text.endswith(suffix):
                    # same styling as the preceding span: move the closing
                    # markers behind the new text
                    text = text[: -len(suffix)] + span_text + suffix
                elif span_text.strip():
                    text += prefix + span_text + suffix
                else:
                    # white space only: do not wrap it in styling markers
                    text += " "

    return text.strip()
213
214
215 # -------------------------------------------------------------------
216 # End of PyMuPDF interface code
217 # -------------------------------------------------------------------
218
219
class UnsetFloat(float):
    """Float subclass without any added behavior.

    The module creates one instance of it, ``UNSET = UnsetFloat(0)``.
    NOTE(review): presumably used so that an "unset" setting can be told
    apart from a genuine 0 by type - confirm against the settings code.
    """

    pass
222
223
# Names of table settings.
# NOTE(review): by its name this lists the settings that must not be
# negative; the validating code is outside this section - confirm.
NON_NEGATIVE_SETTINGS = [
    "snap_tolerance",
    "snap_x_tolerance",
    "snap_y_tolerance",
    "join_tolerance",
    "join_x_tolerance",
    "join_y_tolerance",
    "edge_min_length",
    "min_words_vertical",
    "min_words_horizontal",
    "intersection_tolerance",
    "intersection_x_tolerance",
    "intersection_y_tolerance",
]


# Recognized strategy names (consumer code is outside this section).
TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"]
UNSET = UnsetFloat(0)  # marker value of the dedicated float subclass
DEFAULT_SNAP_TOLERANCE = 3  # default for snap_edges()
DEFAULT_JOIN_TOLERANCE = 3  # default for join_edge_group()
DEFAULT_MIN_WORDS_VERTICAL = 3
DEFAULT_MIN_WORDS_HORIZONTAL = 1  # default word threshold of words_to_edges_h()
DEFAULT_X_TOLERANCE = 3  # default for WordExtractor and collate_line()
DEFAULT_Y_TOLERANCE = 3  # default for WordExtractor and WordMap.to_textmap()
DEFAULT_X_DENSITY = 7.25  # default x_density of WordMap.to_textmap()
DEFAULT_Y_DENSITY = 13  # default y_density of WordMap.to_textmap()
# Returns the tuple (x0, top, x1, bottom) of a dict-like object.
bbox_getter = itemgetter("x0", "top", "x1", "bottom")
251
252
# Maps single-code-point typographic ligatures (Unicode block "Alphabetic
# Presentation Forms") to their plain-letter expansions. The keys are
# written as escapes so that text normalization of this source file cannot
# silently turn them into ordinary letter sequences, which would make the
# table a no-op.
LIGATURES = {
    "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
    "\ufb06": "st",  # LATIN SMALL LIGATURE ST
    "\ufb05": "st",  # LATIN SMALL LIGATURE LONG S T
}
262
263
def to_list(collection) -> list:
    """Return 'collection' as a list.

    A list is handed back unchanged (same object). A non-sequence object
    offering a ``to_dict`` method is converted via ``to_dict("records")``.
    Everything else is passed to ``list()``.
    """
    if isinstance(collection, list):
        return collection

    is_sequence = isinstance(collection, Sequence)
    if not is_sequence and hasattr(collection, "to_dict"):
        return collection.to_dict("records")  # pragma: nocover

    return list(collection)
274
275
class TextMap:
    """
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).
    """

    def __init__(self, tuples=None) -> None:
        """Store the (text, char) tuples and build the plain string.

        Args:
            tuples: A list of (character-text, char-object-or-None) items.
                If omitted or None, an empty map is created.
        """
        # The default of None must not be iterated: treat it as "no items".
        self.tuples = [] if tuples is None else tuples
        self.as_string = "".join(map(itemgetter(0), self.tuples))

    def match_to_dict(
        self,
        m,
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> dict:
        """Describe a regex match on `as_string` in terms of the chars.

        Args:
            m: A match object; its start/end positions for `main_group`
                are used as indices into `self.tuples`.
            main_group: The match group delivering text and position.
            return_groups: If True, add item "groups" (`m.groups()`).
            return_chars: If True, add item "chars", the char objects
                covered by the match (layout whitespace excluded).

        Returns:
            A dict with keys "text", "x0", "top", "x1", "bottom" and
            optionally "groups" and "chars".
        """
        subset = self.tuples[m.start(main_group) : m.end(main_group)]
        # layout-implied whitespace has no char object: skip it
        chars = [c for (text, c) in subset if c is not None]
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }

        if return_groups:
            result["groups"] = m.groups()

        if return_chars:
            result["chars"] = chars

        return result
312
313
class WordMap:
    """
    A WordMap maps words->chars.

    It wraps a list of (word, chars) tuples, where `word` is a dict
    describing the word and `chars` is the list of its char objects.
    """

    def __init__(self, tuples) -> None:
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width=0,
        layout_height=0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        x_density=DEFAULT_X_DENSITY,
        y_density=DEFAULT_Y_DENSITY,
        x_shift=0,
        y_shift=0,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
        structural layout of the text on the page(s), using the following approach:

        - Sort the words by (doctop, x0) if not already sorted.

        - Calculate the initial doctop for the starting page.

        - Cluster the words by doctop (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, calculate the distance between that doctop and the
          initial doctop, in points, minus `y_shift`. Divide that distance by
          `y_density` to calculate the minimum number of newlines that should come
          before this cluster. Append that number of newlines *minus* the number of
          newlines already appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster. Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic
          `layout_height`.

        Note: This approach currently works best for horizontal, left-to-right
        text, but will display all words regardless of orientation. There is room
        for improvement in better supporting right-to-left text, as well as
        vertical text.

        Raises:
            ValueError: If `layout` is set and both the point-based and the
                character-based value of the layout width (or height) are given.
        """
        _textmap = []

        # no words: return an empty TextMap
        if not len(self.tuples):
            return TextMap(_textmap)

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            # Width / height may be given in characters or in points, not
            # both. Point values are converted using the densities.
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        num_newlines = 0  # newlines emitted so far

        words_sorted_doctop = (
            self.tuples
            if presorted or use_text_flow
            else sorted(self.tuples, key=lambda x: float(x[0]["doctop"]))
        )

        # doctop minus top of the first word is the doctop of its page start
        first_word = words_sorted_doctop[0][0]
        doctop_start = first_word["doctop"] - first_word["top"]

        for i, ws in enumerate(
            cluster_objects(
                words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
            )
        ):
            y_dist = (
                (ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density
                if layout
                else 0
            )
            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(i > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            # Note: this inner loop re-uses the name 'i'; the outer value is
            # no longer needed at this point and is re-assigned by enumerate().
            for i in range(num_newlines_prepend):
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            line_len = 0  # characters emitted in the current line

            line_words_sorted_x0 = (
                ws
                if presorted or use_text_flow
                else sorted(ws, key=lambda x: float(x[0]["x0"]))
            )

            for word, chars in line_words_sorted_x0:
                x_dist = (word["x0"] - x_shift) / x_density if layout else 0
                # one separating space unless the line is still empty, or as
                # many as needed to reach the imputed horizontal position
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    # a ligature may expand to several letters, all of which
                    # map to the same char object
                    letters = expansions.get(c["text"], c["text"])
                    for letter in letters:
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for i in range(num_newlines_append):
                if i > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

            # Remove terminal newline
            if _textmap[-1] == ("\n", None):
                _textmap = _textmap[:-1]

        return TextMap(_textmap)
473
474
class WordExtractor:
    """Group char objects into words.

    The chars are dict-like objects with (at least) the keys "text", "x0",
    "x1", "top", "bottom", "doctop", "upright" and "matrix".
    """

    def __init__(
        self,
        x_tolerance=DEFAULT_X_TOLERANCE,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        keep_blank_chars: bool = False,
        use_text_flow=False,
        horizontal_ltr=True,  # Should words be read left-to-right?
        vertical_ttb=False,  # Should vertical words be read top-to-bottom?
        extra_attrs=None,
        split_at_punctuation=False,
        expand_ligatures=True,
    ):
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb
        self.extra_attrs = [] if extra_attrs is None else extra_attrs

        # True selects all of string.punctuation, a string selects its own
        # characters, any falsy value disables splitting.
        # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        self.split_at_punctuation = (
            string.punctuation
            if split_at_punctuation is True
            else (split_at_punctuation or "")
        )

        self.expansions = LIGATURES if expand_ligatures else {}

    def merge_chars(self, ordered_chars: list):
        """Build the word dict for the chars of one word.

        Args:
            ordered_chars: Non-empty list of the word's chars in reading order.

        Returns:
            A dict with keys "text", "x0", "x1", "top", "doctop", "bottom",
            "upright", "direction", "rotation" plus one item for each of
            `self.extra_attrs` (taken from the first char).
        """
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        doctop_adj = ordered_chars[0]["doctop"] - ordered_chars[0]["top"]
        upright = ordered_chars[0]["upright"]
        direction = 1 if (self.horizontal_ltr if upright else self.vertical_ttb) else -1

        matrix = ordered_chars[0]["matrix"]

        rotation = 0
        if not upright and matrix[1] < 0:
            # Must stay a list: reversed() alone returns a one-shot iterator
            # which cannot be indexed for the extra attributes below.
            ordered_chars = list(reversed(ordered_chars))
            rotation = 270

        if matrix[0] < 0 and matrix[3] < 0:
            rotation = 180
        elif matrix[1] > 0:
            rotation = 90

        word = {
            "text": "".join(
                self.expansions.get(c["text"], c["text"]) for c in ordered_chars
            ),
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_adj,
            "bottom": bottom,
            "upright": upright,
            "direction": direction,
            "rotation": rotation,
        }

        for key in self.extra_attrs:
            word[key] = ordered_chars[0][key]

        return word

    def char_begins_new_word(
        self,
        prev_char,
        curr_char,
    ) -> bool:
        """This method takes several factors into account to determine if
        `curr_char` represents the beginning of a new word:

        - Whether the text is "upright" (i.e., non-rotated)
        - Whether the user has specified that horizontal text runs
          left-to-right (default) or right-to-left, as represented by
          self.horizontal_ltr
        - Whether the user has specified that vertical text runs
          top-to-bottom or bottom-to-top, as represented by
          self.vertical_ttb
        - The x0, top, x1, and bottom attributes of prev_char and
          curr_char
        - The self.x_tolerance and self.y_tolerance settings. Note: In
          this case, x/y refer to those directions for non-rotated text.
          For vertical text, they are flipped. A more accurate terminology
          might be "*intra*line character distance tolerance" and
          "*inter*line character distance tolerance"

        An important note: The *intra*line distance is measured from the
        *end* of the previous character to the *beginning* of the current
        character, while the *inter*line distance is measured from the
        *top* of the previous character to the *top* of the next
        character. The reasons for this are partly repository-historical,
        and partly logical, as successive text lines' bounding boxes often
        overlap slightly (and we don't want that overlap to be interpreted
        as the two lines being the same line).

        The upright-ness of the character determines the attributes to
        compare, while horizontal_ltr/vertical_ttb determine the direction
        of the comparison.
        """

        # Note: Due to the grouping step earlier in the process,
        # curr_char["upright"] will always equal prev_char["upright"].
        if curr_char["upright"]:
            x = self.x_tolerance
            y = self.y_tolerance
            ay = prev_char["top"]
            cy = curr_char["top"]
            if self.horizontal_ltr:
                ax = prev_char["x0"]
                bx = prev_char["x1"]
                cx = curr_char["x0"]
            else:
                # mirror the coordinates so the same comparisons apply
                ax = -prev_char["x1"]
                bx = -prev_char["x0"]
                cx = -curr_char["x1"]

        else:
            x = self.y_tolerance
            y = self.x_tolerance
            ay = prev_char["x0"]
            cy = curr_char["x0"]
            if self.vertical_ttb:
                ax = prev_char["top"]
                bx = prev_char["bottom"]
                cx = curr_char["top"]
            else:
                # mirror the coordinates so the same comparisons apply
                ax = -prev_char["bottom"]
                bx = -prev_char["top"]
                cx = -curr_char["bottom"]

        return bool(
            # Intraline test
            (cx < ax)
            or (cx > bx + x)
            # Interline test
            or (cy > ay + y)
        )

    def iter_chars_to_words(self, ordered_chars):
        """Yield lists of chars, each list making up one word.

        Args:
            ordered_chars: Iterable of chars in reading order.
        """
        current_word: list = []

        def start_next_word(new_char=None):
            # Emit the word collected so far (if any) and start a new one,
            # optionally beginning with 'new_char'.
            nonlocal current_word

            if current_word:
                yield current_word

            current_word = [] if new_char is None else [new_char]

        for char in ordered_chars:
            text = char["text"]

            if not self.keep_blank_chars and text.isspace():
                # white space ends the current word and is dropped
                yield from start_next_word(None)

            elif text in self.split_at_punctuation:
                # punctuation ends the current word and is a word by itself
                yield from start_next_word(char)
                yield from start_next_word(None)

            elif current_word and self.char_begins_new_word(current_word[-1], char):
                yield from start_next_word(char)

            else:
                current_word.append(char)

        # Finally, after all chars processed
        if current_word:
            yield current_word

    def iter_sort_chars(self, chars):
        """Yield the chars sorted into reading order.

        Upright chars come first. Within each group, chars are clustered
        into lines (using `self.y_tolerance`) and sorted within the line.
        """

        def upright_key(x) -> int:
            return -int(x["upright"])

        for upright_cluster in cluster_objects(list(chars), upright_key, 0):
            upright = upright_cluster[0]["upright"]
            cluster_key = "doctop" if upright else "x0"

            # Cluster by line
            subclusters = cluster_objects(
                upright_cluster, itemgetter(cluster_key), self.y_tolerance
            )

            for sc in subclusters:
                # Sort within line
                sort_key = "x0" if upright else "doctop"
                to_yield = sorted(sc, key=itemgetter(sort_key))

                # Reverse order if necessary
                if not (self.horizontal_ltr if upright else self.vertical_ttb):
                    yield from reversed(to_yield)
                else:
                    yield from to_yield

    def iter_extract_tuples(self, chars):
        """Yield (word, word_chars) tuples for the given chars.

        Chars are taken in the given order if `self.use_text_flow` is set,
        otherwise they are sorted first. Consecutive chars only end up in
        the same word if they agree in "upright" and all extra attributes.
        """
        ordered_chars = chars if self.use_text_flow else self.iter_sort_chars(chars)

        grouping_key = itemgetter("upright", *self.extra_attrs)
        grouped_chars = itertools.groupby(ordered_chars, grouping_key)

        for keyvals, char_group in grouped_chars:
            for word_chars in self.iter_chars_to_words(char_group):
                yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars) -> WordMap:
        """Return the extracted (word, chars) tuples as a WordMap."""
        return WordMap(list(self.iter_extract_tuples(chars)))

    def extract_words(self, chars: list) -> list:
        """Return the list of word dicts extracted from the chars."""
        return [word for word, word_chars in self.iter_extract_tuples(chars)]
688
689
def extract_words(chars: list, **kwargs) -> list:
    """Return the words found in 'chars'.

    All keyword arguments are passed on to the WordExtractor constructor.
    """
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars)
692
693
# Parameter names accepted by WordMap.to_textmap and by the WordExtractor
# constructor. chars_to_textmap() and extract_text() use them to pick the
# matching items out of one common set of keyword arguments.
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
696
697
def chars_to_textmap(chars: list, **kwargs) -> TextMap:
    """Convert chars to a TextMap.

    The keyword arguments are distributed to the WordExtractor constructor
    and to WordMap.to_textmap according to their parameter names.
    Argument "presorted" is always forced to True.
    """
    kwargs["presorted"] = True

    extractor_args = {
        name: kwargs[name] for name in WORD_EXTRACTOR_KWARGS if name in kwargs
    }
    textmap_args = {name: kwargs[name] for name in TEXTMAP_KWARGS if name in kwargs}

    wordmap = WordExtractor(**extractor_args).extract_wordmap(chars)
    return wordmap.to_textmap(**textmap_args)
710
711
def extract_text(chars: list, **kwargs) -> str:
    """Return the text of the given chars as a string.

    With a truthy keyword argument "layout", the text is generated via
    chars_to_textmap(). Otherwise the chars are grouped into words, which
    are joined depending on the rotation of the first word:

    * 90 / 270: words are sorted by position and joined by spaces.
    * else: words are clustered into lines by "doctop" (using "y_tolerance")
      and lines are joined by line breaks. For rotation 180, the resulting
      string is reversed and line breaks are replaced by spaces.
    """
    chars = to_list(chars)
    if len(chars) == 0:
        return ""

    if kwargs.get("layout"):
        return chars_to_textmap(chars, **kwargs).as_string

    y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
    extractor_args = {
        name: kwargs[name] for name in WORD_EXTRACTOR_KWARGS if name in kwargs
    }
    words = WordExtractor(**extractor_args).extract_words(chars)

    # rotation cannot change within a cell
    rotation = words[0]["rotation"] if words else 0

    if rotation in (90, 270):
        if rotation == 90:
            words.sort(key=lambda w: (w["x1"], -w["top"]))
        else:
            words.sort(key=lambda w: (-w["x1"], w["top"]))
        return " ".join([w["text"] for w in words])

    rows = cluster_objects(words, itemgetter("doctop"), y_tolerance)
    text = "\n".join(" ".join(word["text"] for word in row) for row in rows)
    if rotation == 180:  # needs extra treatment
        text = "".join([(" " if c == "\n" else c) for c in reversed(text)])

    return text
743
744
def collate_line(
    line_chars: list,
    tolerance=DEFAULT_X_TOLERANCE,
) -> str:
    """Join the chars of one line to a string, ordered by "x0".

    A space is inserted wherever a char starts more than 'tolerance' to the
    right of the end ("x1") of its predecessor.
    """
    pieces = []
    prev_right = None
    for ch in sorted(line_chars, key=itemgetter("x0")):
        if prev_right is not None and ch["x0"] > prev_right + tolerance:
            pieces.append(" ")
        prev_right = ch["x1"]
        pieces.append(ch["text"])
    return "".join(pieces)
757
758
def dedupe_chars(chars: list, tolerance=1) -> list:
    """
    Removes duplicate chars — those sharing the same text, fontname, size,
    and positioning (within `tolerance`) as other characters in the set.

    The remaining chars are returned in their original order.
    """
    same_glyph = itemgetter("fontname", "size", "upright", "text")
    position = itemgetter("doctop", "x0")

    survivors = []
    ordered = sorted(chars, key=same_glyph)
    for _, candidates in itertools.groupby(ordered, key=same_glyph):
        rows = cluster_objects(list(candidates), itemgetter("doctop"), tolerance)
        for row in rows:
            for spot in cluster_objects(row, itemgetter("x0"), tolerance):
                # all chars of one spot are duplicates: keep the first by position
                survivors.append(sorted(spot, key=position)[0])

    return sorted(survivors, key=chars.index)
780
781
def line_to_edge(line):
    """Return a dict copy of 'line' with an added "orientation" item.

    The orientation is "h" if "top" equals "bottom", else "v".
    """
    is_horizontal = line["top"] == line["bottom"]
    return dict(line, orientation="h" if is_horizontal else "v")
786
787
def rect_to_edges(rect) -> list:
    """Return the four edges of 'rect' in the order top, bottom, left, right.

    Each edge is a dict copy of 'rect' with "object_type" set to
    "rect_edge", an "orientation" of "h" or "v", and the coordinate items
    collapsed onto the respective side.
    """
    top_edge = dict(
        rect,
        object_type="rect_edge",
        height=0,
        y0=rect["y1"],
        bottom=rect["top"],
        orientation="h",
    )
    bottom_edge = dict(
        rect,
        object_type="rect_edge",
        height=0,
        y1=rect["y0"],
        top=rect["top"] + rect["height"],
        doctop=rect["doctop"] + rect["height"],
        orientation="h",
    )
    left_edge = dict(
        rect,
        object_type="rect_edge",
        width=0,
        x1=rect["x0"],
        orientation="v",
    )
    right_edge = dict(
        rect,
        object_type="rect_edge",
        width=0,
        x0=rect["x1"],
        orientation="v",
    )
    return [top_edge, bottom_edge, left_edge, right_edge]
826
827
def curve_to_edges(curve) -> list:
    """Return one edge dict for each pair of consecutive points of 'curve'.

    The orientation of an edge is "v" if both points share the x-coordinate,
    "h" if they share the y-coordinate and None otherwise.
    """
    points = curve["pts"]
    edges = []
    for start, stop in zip(points, points[1:]):
        upper = min(start[1], stop[1])
        if start[0] == stop[0]:
            orientation = "v"
        elif start[1] == stop[1]:
            orientation = "h"
        else:
            orientation = None
        edges.append(
            {
                "object_type": "curve_edge",
                "x0": min(start[0], stop[0]),
                "x1": max(start[0], stop[0]),
                "top": upper,
                "doctop": upper + (curve["doctop"] - curve["top"]),
                "bottom": max(start[1], stop[1]),
                "width": abs(start[0] - stop[0]),
                "height": abs(start[1] - stop[1]),
                "orientation": orientation,
            }
        )
    return edges
844
845
def obj_to_edges(obj) -> list:
    """Return the list of edges represented by 'obj'.

    Objects that already are edges ("_edge" in their "object_type") are
    returned as is, lines are converted to one edge, and objects of type
    "rect" or "curve" are split into their edges. Any other object type
    raises a KeyError.
    """
    kind = obj["object_type"]
    if "_edge" in kind:
        return [obj]
    if kind == "line":
        return [line_to_edge(obj)]
    converters = {"rect": rect_to_edges, "curve": curve_to_edges}
    return converters[kind](obj)
854
855
def filter_edges(
    edges,
    orientation=None,
    edge_type=None,
    min_length=1,
) -> list:
    """Return the edges matching all of the given criteria.

    Args:
        edges: Iterable of edge dicts.
        orientation: "v", "h" or None (do not filter by orientation).
        edge_type: Required "object_type", or None for any type.
        min_length: Minimum "height" (vertical edges) or "width" (all
            other edges).

    Raises:
        ValueError: If 'orientation' is not one of "v", "h", None.
    """
    if orientation not in ("v", "h", None):
        raise ValueError("Orientation must be 'v' or 'h'")

    def keep(edge) -> bool:
        edge_orientation = edge["orientation"]
        length_key = "height" if edge_orientation == "v" else "width"
        if edge_type is not None and not (edge["object_type"] == edge_type):
            return False
        if not (orientation is None or edge_orientation == orientation):
            return False
        return bool(edge[length_key] >= min_length)

    return [edge for edge in edges if keep(edge)]
872
873
def cluster_list(xs, tolerance=0) -> list:
    """Split the sorted values of 'xs' into lists of neighboring values.

    A value joins the cluster of its predecessor if it exceeds it by no
    more than 'tolerance'. With a tolerance of 0 or less than two values,
    every value forms a cluster of its own.
    """
    ordered = sorted(xs)
    if tolerance == 0 or len(xs) < 2:
        return [[value] for value in ordered]

    clusters = [[ordered[0]]]
    for previous, current in zip(ordered, ordered[1:]):
        if current <= (previous + tolerance):
            clusters[-1].append(current)
        else:
            clusters.append([current])
    return clusters
892
893
def make_cluster_dict(values, tolerance) -> dict:
    """Map each distinct value to the index of the cluster it belongs to.

    The clusters are those computed by cluster_list() for the set of values.
    """
    clusters = cluster_list(list(set(values)), tolerance)
    return {
        value: index for index, members in enumerate(clusters) for value in members
    }
902
903
def cluster_objects(xs, key_fn, tolerance) -> list:
    """Group the objects of 'xs' by clustering their key values.

    Args:
        xs: The objects. They are iterated twice.
        key_fn: A callable returning an object's key value. Anything else
            is used as an item key (wrapped in itemgetter).
        tolerance: Passed on to make_cluster_dict().

    Returns:
        A list of lists of objects, ordered by ascending cluster index.
    """
    key_of = key_fn if callable(key_fn) else itemgetter(key_fn)
    cluster_ids = make_cluster_dict(map(key_of, xs), tolerance)

    by_cluster = itemgetter(1)
    tagged = [(item, cluster_ids.get(key_of(item))) for item in xs]
    tagged.sort(key=by_cluster)  # stable: keeps input order within a cluster

    return [
        [item for item, _ in members]
        for _, members in itertools.groupby(tagged, key=by_cluster)
    ]
918
919
def move_object(obj, axis: str, value):
    """Return a copy of 'obj' shifted by 'value' along the given axis.

    Args:
        obj: Dict-like object; the result is created with the same class.
        axis: "h" shifts "x0" and "x1". "v" shifts "top" and "bottom" and,
            if present, "doctop" (same direction) and "y0" / "y1" (opposite
            direction).
        value: The shift amount.
    """
    assert axis in ("h", "v")
    if axis == "h":
        shifted = [(name, obj[name] + value) for name in ("x0", "x1")]
    elif axis == "v":
        shifted = [(name, obj[name] + value) for name in ("top", "bottom")]
        if "doctop" in obj:
            shifted.append(("doctop", obj["doctop"] + value))
        if "y0" in obj:
            shifted.extend((name, obj[name] - value) for name in ("y0", "y1"))
    # later items win, so the shifted values replace the original ones
    return obj.__class__(tuple(obj.items()) + tuple(shifted))
940
941
def snap_objects(objs, attr: str, tolerance) -> list:
    """Move objects with similar 'attr' values to their common average.

    The objects are clustered by 'attr' (one of "x0", "x1", "top",
    "bottom") using 'tolerance'; every object is then shifted such that
    its 'attr' equals the average of its cluster.
    """
    axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
    value_of = itemgetter(attr)

    snapped = []
    for cluster in cluster_objects(list(objs), value_of, tolerance):
        target = sum(map(value_of, cluster)) / len(cluster)
        snapped.extend(move_object(obj, axis, target - obj[attr]) for obj in cluster)
    return snapped
952
953
def snap_edges(
    edges,
    x_tolerance=DEFAULT_SNAP_TOLERANCE,
    y_tolerance=DEFAULT_SNAP_TOLERANCE,
):
    """
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average.

    Vertical edges are snapped by "x0" using `x_tolerance`, horizontal
    edges by "top" using `y_tolerance`. The result lists the vertical
    edges first.
    """
    vertical, horizontal = [], []
    buckets = {"v": vertical, "h": horizontal}
    for edge in edges:
        buckets[edge["orientation"]].append(edge)

    return snap_objects(vertical, "x0", x_tolerance) + snap_objects(
        horizontal, "top", y_tolerance
    )
970
971
def resize_object(obj, key: str, value):
    """Return a copy of 'obj' with one of its sides moved to 'value'.

    Args:
        obj: Dict-like object; the result is created with the same class.
        key: The side to change: "x0", "x1", "top" or "bottom".
        value: The new coordinate of that side.

    Dependent items are adjusted as well: "width" for "x0" / "x1";
    "doctop", "height" and (if present) "y1" for "top"; "height" and
    (if present) "y0" for "bottom".
    """
    assert key in ("x0", "x1", "top", "bottom")
    delta = value - obj[key]
    changes = [(key, value)]

    if key == "x0":
        assert value <= obj["x1"]
        changes.append(("width", obj["x1"] - value))
    elif key == "x1":
        assert value >= obj["x0"]
        changes.append(("width", value - obj["x0"]))
    elif key == "top":
        assert value <= obj["bottom"]
        changes += [
            ("doctop", obj["doctop"] + delta),
            ("height", obj["height"] - delta),
        ]
        if "y1" in obj:
            changes.append(("y1", obj["y1"] - delta))
    elif key == "bottom":
        assert value >= obj["top"]
        changes.append(("height", obj["height"] + delta))
        if "y0" in obj:
            changes.append(("y0", obj["y0"] - delta))

    # later items win, so the changed values replace the original ones
    return obj.__class__(tuple(obj.items()) + tuple(changes))
997
998
def join_edge_group(edges, orientation: str, tolerance=DEFAULT_JOIN_TOLERANCE):
    """
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.

    Raises:
        ValueError: If 'orientation' is neither "h" nor "v".
    """
    if orientation not in ("h", "v"):
        raise ValueError("Orientation must be 'v' or 'h'")
    start_key, end_key = ("x0", "x1") if orientation == "h" else ("top", "bottom")

    ordered = sorted(edges, key=itemgetter(start_key))
    merged = [ordered[0]]
    for edge in ordered[1:]:
        tail = merged[-1]
        within_reach = edge[start_key] <= (tail[end_key] + tolerance)
        if not within_reach:
            # Edge is separate from previous edges
            merged.append(edge)
            continue
        if edge[end_key] > tail[end_key]:
            # Extend current edge to new extremity
            merged[-1] = resize_object(tail, end_key, edge[end_key])

    return merged
1024
1025
def merge_edges(
    edges,
    snap_x_tolerance,
    snap_y_tolerance,
    join_x_tolerance,
    join_y_tolerance,
):
    """
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.

    Snapping is only done if at least one of the snap tolerances is
    positive. Horizontal edges with equal "top" are joined using
    `join_x_tolerance`, other edges with equal "x0" using `join_y_tolerance`.
    """

    def line_id(edge):
        # identifies the infinite line an edge is located on
        if edge["orientation"] == "h":
            return ("h", edge["top"])
        return ("v", edge["x0"])

    if snap_x_tolerance > 0 or snap_y_tolerance > 0:
        edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)

    merged = []
    ordered = sorted(edges, key=line_id)
    for group_id, members in itertools.groupby(ordered, key=line_id):
        orientation = group_id[0]
        tolerance = join_x_tolerance if orientation == "h" else join_y_tolerance
        merged.extend(join_edge_group(members, orientation, tolerance))
    return merged
1057
1058
def bbox_to_rect(bbox) -> dict:
    """
    Convert a bounding box into a rectangle dict.

    The first four items of `bbox` are stored, in this order, under the
    keys "x0", "top", "x1" and "bottom".
    """
    rect = {}
    for index, key in enumerate(("x0", "top", "x1", "bottom")):
        rect[key] = bbox[index]
    return rect
1065
1066
def objects_to_rect(objects) -> dict:
    """
    Return the smallest rectangle containing all given objects.

    The result is a dict with the keys "x0", "top", "x1" and "bottom",
    built from the bounding box computed by `objects_to_bbox`.
    """
    enclosing_bbox = objects_to_bbox(objects)
    return bbox_to_rect(enclosing_bbox)
1074
1075
def merge_bboxes(bboxes):
    """
    Return the smallest bounding box containing all given bounding boxes.

    `bboxes` is an iterable of 4-item boxes (x0, top, x1, bottom). The
    result is the tuple (min x0, min top, max x1, max bottom).
    """
    # Transpose the boxes into four coordinate columns.
    lefts, tops, rights, bottoms = zip(*bboxes)
    smallest = (min(lefts), min(tops))
    largest = (max(rights), max(bottoms))
    return smallest + largest
1083
1084
def objects_to_bbox(objects):
    """
    Return the smallest bounding box containing all given objects.

    Each object's bounding box is obtained with `bbox_getter`; the boxes
    are then combined by `merge_bboxes`.
    """
    return merge_bboxes(bbox_getter(obj) for obj in objects)
1091
1092
def words_to_edges_h(words, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL):
    """
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.

    Words are clustered by their "top" value; every cluster with at least
    `word_threshold` members yields two horizontal edges, one at the top
    and one at the bottom of the cluster rectangle. All edges span from
    the smallest x0 to the largest x1 of the qualifying clusters.
    Returns an empty list if no cluster qualifies.
    """
    rects = [
        objects_to_rect(cluster)
        for cluster in cluster_objects(words, itemgetter("top"), 1)
        if len(cluster) >= word_threshold
    ]
    if not rects:
        return []

    min_x0 = min(r["x0"] for r in rects)
    max_x1 = max(r["x1"] for r in rects)

    def h_line(y):
        # Zero-height horizontal edge across the full detected width.
        return {
            "x0": min_x0,
            "x1": max_x1,
            "top": y,
            "bottom": y,
            "width": max_x1 - min_x0,
            "orientation": "h",
        }

    edges = []
    for r in rects:
        # Top of text
        edges.append(h_line(r["top"]))
        # The 'bottom' line of each row is added as well. Some of these
        # duplicate the next row's 'top' line, but this is what catches
        # the last row of every table.
        edges.append(h_line(r["bottom"]))
    return edges
1132
1133
def get_bbox_overlap(a, b):
    """
    Return the intersection of two bounding boxes, or None.

    Both arguments are 4-item boxes (left, top, right, bottom). The
    intersection is returned as a tuple of the same layout if its width
    and height are both non-negative and not both zero. Boxes that merely
    share an edge therefore count as overlapping, boxes that only share
    a corner do not.
    """
    (a_x0, a_y0, a_x1, a_y1), (b_x0, b_y0, b_x1, b_y1) = a, b
    left, top = max(a_x0, b_x0), max(a_y0, b_y0)
    right, bottom = min(a_x1, b_x1), min(a_y1, b_y1)
    width, height = right - left, bottom - top
    overlapping = height >= 0 and width >= 0 and height + width > 0
    return (left, top, right, bottom) if overlapping else None
1147
1148
def words_to_edges_v(words, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.

    Words are clustered three times: by "x0", by "x1" and by horizontal
    center. Clusters with at least `word_threshold` words are turned into
    bounding boxes, largest cluster first, and a box is dropped if it
    overlaps one that was kept earlier. Each remaining box yields a
    vertical edge at its x0; one more edge is added at the largest x1.
    All edges run from the smallest top to the largest bottom of the
    remaining boxes. Returns an empty list if no box remains.
    """

    def center_x(word):
        return float(word["x0"] + word["x1"]) / 2

    # Words sharing the same left, right, or center coordinate
    clusters = (
        cluster_objects(words, itemgetter("x0"), 1)
        + cluster_objects(words, itemgetter("x1"), 1)
        + cluster_objects(words, center_x, 1)
    )

    # Largest clusters first; ties keep their original order.
    ranked = sorted(clusters, key=len, reverse=True)
    bboxes = [
        objects_to_bbox(cluster)
        for cluster in ranked
        if len(cluster) >= word_threshold
    ]

    # Keep a bbox only if it overlaps none of the bboxes kept so far
    condensed_bboxes = []
    for candidate in bboxes:
        if not any(get_bbox_overlap(candidate, kept) for kept in condensed_bboxes):
            condensed_bboxes.append(candidate)

    if not condensed_bboxes:
        return []

    sorted_rects = sorted(
        (bbox_to_rect(box) for box in condensed_bboxes), key=itemgetter("x0")
    )

    max_x1 = max(r["x1"] for r in sorted_rects)
    min_top = min(r["top"] for r in sorted_rects)
    max_bottom = max(r["bottom"] for r in sorted_rects)

    def v_line(x):
        # Zero-width vertical edge over the full detected height.
        return {
            "x0": x,
            "x1": x,
            "top": min_top,
            "bottom": max_bottom,
            "height": max_bottom - min_top,
            "orientation": "v",
        }

    edges = [v_line(r["x0"]) for r in sorted_rects]
    edges.append(v_line(max_x1))
    return edges
1208
1209
def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1) -> dict:
    """
    Given a list of edges, return the points at which they intersect
    within `x_tolerance` / `y_tolerance` pixels.

    Args:
        edges: list of edge dicts with an "orientation" of "v" or "h".
            Vertical edges are read via "x0", "top" and "bottom",
            horizontal edges via "x0", "x1" and "top".
        x_tolerance: allowed horizontal slack when testing whether a
            vertical edge lies within a horizontal edge's x-extent.
        y_tolerance: allowed vertical slack when testing whether a
            horizontal edge lies within a vertical edge's y-extent.

    Returns:
        A dict mapping each vertex (v["x0"], h["top"]) to a dict
        {"v": [...], "h": [...]} listing the edges meeting at that vertex.
    """
    intersections = {}
    # Sort each orientation once; the horizontal order does not depend on
    # the vertical edge being examined.
    v_edges = sorted(
        (e for e in edges if e["orientation"] == "v"),
        key=itemgetter("x0", "top"),
    )
    h_edges = sorted(
        (e for e in edges if e["orientation"] == "h"),
        key=itemgetter("top", "x0"),
    )
    for v in v_edges:
        for h in h_edges:
            if (
                (v["top"] <= (h["top"] + y_tolerance))
                and (v["bottom"] >= (h["top"] - y_tolerance))
                and (v["x0"] >= (h["x0"] - x_tolerance))
                and (v["x0"] <= (h["x1"] + x_tolerance))
            ):
                vertex = (v["x0"], h["top"])
                if vertex not in intersections:
                    intersections[vertex] = {"v": [], "h": []}
                intersections[vertex]["v"].append(v)
                intersections[vertex]["h"].append(h)
    return intersections
1233
1234
def obj_to_bbox(obj):
    """
    Return the bounding box of `obj`, as produced by `bbox_getter`.
    """
    bbox = bbox_getter(obj)
    return bbox
1240
1241
def intersections_to_cells(intersections):
    """
    Return all rectangular "cells" described by the given intersections.

    `intersections` is a dictionary with (x0, top) tuples as keys. Each
    value is a dict with the keys "v" and "h", holding the lists of
    vertical and horizontal edge objects that touch the point.

    For every point (in sorted order) at most one cell is reported: the
    first one found whose four corners are intersections and whose sides
    are each backed by an edge common to both end points. Cells are
    returned as (x0, top, x1, bottom) tuples.
    """

    def edge_connects(p1, p2) -> bool:
        def shares_edge(kind) -> bool:
            # True if both points touch at least one identical edge bbox.
            first = {obj_to_bbox(e) for e in intersections[p1][kind]}
            second = {obj_to_bbox(e) for e in intersections[p2][kind]}
            return bool(first & second)

        # Same x: the points must share a vertical edge
        if p1[0] == p2[0] and shares_edge("v"):
            return True
        # Same y: the points must share a horizontal edge
        if p1[1] == p2[1] and shares_edge("h"):
            return True
        return False

    points = sorted(intersections.keys())
    n_points = len(points)

    def find_smallest_cell(points, i: int):
        # The last point has no candidates below or right of it.
        if i == n_points - 1:
            return None
        pt = points[i]
        later = points[i + 1 :]
        # All later points directly below and directly right of pt
        below = [p for p in later if p[0] == pt[0]]
        right = [p for p in later if p[1] == pt[1]]
        for below_pt in below:
            if edge_connects(pt, below_pt):
                for right_pt in right:
                    if edge_connects(pt, right_pt):
                        bottom_right = (right_pt[0], below_pt[1])
                        if (
                            (bottom_right in intersections)
                            and edge_connects(bottom_right, right_pt)
                            and edge_connects(bottom_right, below_pt)
                        ):
                            return (pt[0], pt[1], bottom_right[0], bottom_right[1])
        return None

    cells = []
    for i in range(len(points)):
        cell = find_smallest_cell(points, i)
        if cell:
            cells.append(cell)
    return cells
1302
1303
def cells_to_tables(page, cells) -> list:
    """
    Group cell bounding boxes (`cells`) into tables.

    A cell joins the table currently being built if it shares at least
    one corner with a cell already in it. When a full pass over the
    remaining cells adds nothing, the current table is closed and a new
    one is started.

    PyMuPDF modification: tables are then dropped if their cells have
    fewer than two distinct x0 or x1 values, or if the text in the
    table's joined rectangle consists only of `white_spaces` characters.

    Returns the remaining tables (lists of cells), sorted by their
    topmost-then-leftmost cell coordinate.
    """

    def bbox_to_corners(bbox) -> tuple:
        x0, top, x1, bottom = bbox
        return ((x0, top), (x0, bottom), (x1, top), (x1, bottom))

    remaining_cells = list(cells)

    # Corners and cells of the table currently being assembled
    current_corners = set()
    current_cells = []

    tables = []
    while remaining_cells:
        initial_cell_count = len(current_cells)
        for cell in list(remaining_cells):
            cell_corners = bbox_to_corners(cell)
            # The first cell of a table is taken unconditionally; any
            # later one must touch the group on at least one corner.
            if not current_cells or any(
                corner in current_corners for corner in cell_corners
            ):
                current_corners.update(cell_corners)
                current_cells.append(cell)
                remaining_cells.remove(cell)

        # If this pass did not add any cell, the group is complete:
        # store it and start a new one.
        if len(current_cells) == initial_cell_count:
            tables.append(list(current_cells))
            current_corners.clear()
            current_cells.clear()

    # Once the list of cells is exhausted, store a group that has not
    # been stored yet.
    if current_cells:
        tables.append(list(current_cells))

    # PyMuPDF modification:
    # Remove tables without text or having only 1 column
    for idx in reversed(range(len(tables))):
        joined_rect = EMPTY_RECT()
        x1_vals = set()
        x0_vals = set()
        for c in tables[idx]:
            joined_rect |= c
            x1_vals.add(c[2])
            x0_vals.add(c[0])
        if (
            len(x1_vals) < 2
            or len(x0_vals) < 2
            or white_spaces.issuperset(
                page.get_textbox(
                    joined_rect,
                    textpage=TEXTPAGE,
                )
            )
        ):
            del tables[idx]

    # Sort the tables top-to-bottom-left-to-right based on the value of the
    # topmost-and-then-leftmost coordinate of a table.
    def table_origin(table):
        return min((c[1], c[0]) for c in table)

    return sorted(tables, key=table_origin)
1384
1385
class CellGroup:
    """A group of cells together with the bounding box enclosing them.

    `cells` is stored as given and may contain falsy placeholders (such
    as None) for missing cells; these are ignored when computing `bbox`.
    """

    def __init__(self, cells):
        self.cells = cells
        present = [cell for cell in cells if cell]
        self.bbox = (
            min(cell[0] for cell in present),
            min(cell[1] for cell in present),
            max(cell[2] for cell in present),
            max(cell[3] for cell in present),
        )
1395
1396
class TableRow(CellGroup):
    """One table row; adds nothing to the `cells` and `bbox` of CellGroup."""

    pass
1399
1400
class TableHeader:
    """PyMuPDF extension containing the identified table header.

    Attributes:
        bbox: bounding box of the header.
        cells: list of header cell bounding boxes.
        names: list of column names.
        external: the value passed as `above`.
    """

    def __init__(self, bbox, cells, names, above):
        self.bbox, self.cells = bbox, cells
        self.names, self.external = names, above
1409
1410
class Table:
    """A table on a page, defined by the bounding boxes of its cells.

    Attributes:
        page: the page object the table belongs to.
        cells: list of cell bounding boxes (x0, top, x1, bottom).
        header: TableHeader computed by `_get_header` (PyMuPDF extension),
            or None if the table has no rows.
    """

    def __init__(self, page, cells):
        self.page = page
        self.cells = cells
        self.header = self._get_header()  # PyMuPDF extension

    @property
    def bbox(self):
        """Smallest (x0, top, x1, bottom) tuple enclosing all cells."""
        c = self.cells
        return (
            min(map(itemgetter(0), c)),
            min(map(itemgetter(1), c)),
            max(map(itemgetter(2), c)),
            max(map(itemgetter(3), c)),
        )

    @property
    def rows(self) -> list:
        """List of TableRow objects, ordered top to bottom.

        Cells are grouped by their top coordinate. Every row has one entry
        per distinct cell x0 value of the table; the entry is None where
        the row has no cell starting at that x0.
        """
        _sorted = sorted(self.cells, key=itemgetter(1, 0))
        xs = list(sorted(set(map(itemgetter(0), self.cells))))
        rows = []
        for _, row_cells in itertools.groupby(_sorted, itemgetter(1)):
            xdict = {cell[0]: cell for cell in row_cells}
            row = TableRow([xdict.get(x) for x in xs])
            rows.append(row)
        return rows

    @property
    def row_count(self) -> int:  # PyMuPDF extension
        """Number of rows."""
        return len(self.rows)

    @property
    def col_count(self) -> int:  # PyMuPDF extension
        """Largest number of cell entries found in any row."""
        return max([len(r.cells) for r in self.rows])

    def extract(self, **kwargs) -> list:
        """Return the table text as a list of rows, each a list of cells.

        A cell entry is None where the row has no cell, "" where the cell
        contains no characters, and otherwise the result of
        `extract_text` for the characters of the global CHARS list whose
        midpoint lies inside the cell. `kwargs` are passed on to
        `extract_text` after x_shift / y_shift (and, if "layout" is
        given, layout_width / layout_height) have been set per cell.
        """
        chars = CHARS
        table_arr = []

        def char_in_bbox(char, bbox) -> bool:
            # A character belongs to a bbox if its midpoint is inside it
            # (left / top inclusive, right / bottom exclusive).
            v_mid = (char["top"] + char["bottom"]) / 2
            h_mid = (char["x0"] + char["x1"]) / 2
            x0, top, x1, bottom = bbox
            return bool(
                (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
            )

        for row in self.rows:
            arr = []
            # Pre-select the characters of this row to shorten cell checks
            row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]

            for cell in row.cells:
                if cell is None:
                    cell_text = None
                else:
                    cell_chars = [
                        char for char in row_chars if char_in_bbox(char, cell)
                    ]

                    if len(cell_chars):
                        kwargs["x_shift"] = cell[0]
                        kwargs["y_shift"] = cell[1]
                        if "layout" in kwargs:
                            kwargs["layout_width"] = cell[2] - cell[0]
                            kwargs["layout_height"] = cell[3] - cell[1]
                        cell_text = extract_text(cell_chars, **kwargs)
                    else:
                        cell_text = ""
                arr.append(cell_text)
            table_arr.append(arr)

        return table_arr

    def to_markdown(self, clean=False, fill_empty=True):
        """Output table content as a string in Github-markdown format.

        If "clean" then markdown syntax is removed from cell content.
        If "fill_empty" then cell content None is replaced by the values
        above (columns) or left (rows) in an effort to approximate row and
        columns spans.

        """
        output = "|"
        rows = self.row_count
        cols = self.col_count

        # cell coordinates
        cell_boxes = [[c for c in r.cells] for r in self.rows]

        # cell text strings
        cells = [[None for i in range(cols)] for j in range(rows)]
        for i, row in enumerate(cell_boxes):
            for j, cell in enumerate(row):
                if cell is not None:
                    cells[i][j] = extract_cells(
                        TEXTPAGE, cell_boxes[i][j], markdown=True
                    )

        if fill_empty:  # fill "None" cells where possible

            # for rows, copy content from left to right
            for j in range(rows):
                for i in range(cols - 1):
                    if cells[j][i + 1] is None:
                        cells[j][i + 1] = cells[j][i]

            # for columns, copy top to bottom
            for i in range(cols):
                for j in range(rows - 1):
                    if cells[j + 1][i] is None:
                        cells[j + 1][i] = cells[j][i]

        # generate header string and MD separator
        for i, name in enumerate(self.header.names):
            if not name:  # generate a name if empty
                name = f"Col{i+1}"
            name = name.replace("\n", "<br>")  # use HTML line breaks
            if clean:  # remove sensitive syntax
                name = html.escape(name.replace("-", "&#45;"))
            output += name + "|"

        output += "\n"
        # insert GitHub header line separator
        output += "|" + "|".join("---" for i in range(cols)) + "|\n"

        # skip first row in details if header is part of the table
        j = 0 if self.header.external else 1

        # iterate over detail rows
        for row in cells[j:]:
            line = "|"
            for i, cell in enumerate(row):
                # replace None cells with empty string
                if cell is None:
                    cell = ""
                if clean:  # remove sensitive syntax
                    cell = html.escape(cell.replace("-", "&#45;"))
                line += cell + "|"
            line += "\n"
            output += line
        return output + "\n"

    def to_pandas(self, **kwargs):
        """Return a pandas DataFrame version of the table.

        Column names are taken from the header. Empty names become
        "Col<i>" (0-based); if names are not unique, every name that is
        not of that generated form is prefixed with "<i>-". The header
        object itself is left unchanged. `kwargs` is accepted but unused.

        Raises:
            ModuleNotFoundError: if pandas is not installed.
        """
        try:
            import pandas as pd
        except ModuleNotFoundError:
            message("Package 'pandas' is not installed")
            raise

        pd_dict = {}
        extract = self.extract()
        hdr = self.header
        # Work on a copy: the adjustments below are only meant for the
        # DataFrame and must not alter the stored header names.
        names = list(hdr.names)
        hdr_len = len(names)
        # ensure uniqueness of column names
        for i in range(hdr_len):
            name = names[i]
            if not name:
                names[i] = f"Col{i}"
        if hdr_len != len(set(names)):
            for i in range(hdr_len):
                name = names[i]
                if name != f"Col{i}":
                    names[i] = f"{i}-{name}"

        if not hdr.external:  # header is part of 'extract'
            extract = extract[1:]

        for i in range(hdr_len):
            key = names[i]
            value = []
            for j in range(len(extract)):
                value.append(extract[j][i])
            pd_dict[key] = value

        return pd.DataFrame(pd_dict)

    def _get_header(self, y_tolerance=3):
        """Identify the table header.

        *** PyMuPDF extension. ***

        Starting from the first line above the table upwards, check if it
        qualifies to be part of the table header.

        Criteria include:
        * A one-line table never has an extra header.
        * Column borders must not intersect any word. If this happens, all
          text of this line and above of it is ignored.
        * No excess inter-line distance: If a line further up has a distance
          of more than 1.5 times of its font size, it will be ignored and
          all lines above of it.
        * Must have same text properties.
        * Starting with the top table line, a bold text property cannot change
          back to non-bold.

        If not all criteria are met (or there is no text above the table),
        the first table row is assumed to be the header.

        Returns a TableHeader, or None if the table has no rows.
        """
        page = self.page
        y_delta = y_tolerance

        def top_row_bg_color(self):
            """
            Compare top row background color with color of same-sized bbox
            above. If different, return True indicating that the original
            table top row is already the header.
            """
            bbox0 = Rect(self.rows[0].bbox)
            bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height)  # area above
            top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
            top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
            if top_color0 != top_colort:
                return True  # top row is header
            return False

        def row_has_bold(bbox):
            """Check if a row contains some bold text.

            If e.g. true for the top row, then it will be used as (internal)
            column header row if any of the following is true:
            * the previous (above) text line has no bold span
            * the second table row text has no bold span

            Returns True if any spans are bold else False.
            """
            blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
            spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]

            return any(s["flags"] & TEXT_FONT_BOLD for s in spans)

        try:
            row = self.rows[0]
            cells = row.cells
            bbox = Rect(row.bbox)
        except IndexError:  # this table has no rows
            return None

        # return this if we determine that the top row is the header
        header_top_row = TableHeader(bbox, cells, self.extract()[0], False)

        # 1-line tables have no extra header
        if len(self.rows) < 2:
            return header_top_row

        # 1-column tables have no extra header
        if len(cells) < 2:
            return header_top_row

        # assume top row is the header if second row is empty
        row2 = self.rows[1]  # second row
        if all(c is None for c in row2.cells):  # no valid cell bboxes in row2
            return header_top_row

        # Special check: is top row bold?
        top_row_bold = row_has_bold(bbox)

        # assume top row is header if it is bold and the 2nd row
        # contains no bold text
        if top_row_bold and not row_has_bold(row2.bbox):
            return header_top_row

        if top_row_bg_color(self):
            # if area above top row has a different background color,
            # then top row is already the header
            return header_top_row

        # column coordinates (x1 values) in top row
        col_x = [c[2] if c is not None else None for c in cells[:-1]]

        # clip = page area above the table
        # We will inspect this area for text qualifying as column header.
        clip = +bbox  # take row 0 bbox
        clip.y0 = 0  # start at top of page
        clip.y1 = bbox.y0  # end at top of table

        blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
        # non-empty, non-superscript spans above table, sorted descending by y1
        spans = sorted(
            [
                s
                for b in blocks
                for l in b["lines"]
                for s in l["spans"]
                if not (
                    white_spaces.issuperset(s["text"])
                    or s["flags"] & TEXT_FONT_SUPERSCRIPT
                )
            ],
            key=lambda s: s["bbox"][3],
            reverse=True,
        )

        select = []  # y1 coordinates above, sorted descending
        line_heights = []  # line heights above, sorted descending
        line_bolds = []  # bold indicator per line above, same sorting

        # walk through the spans and fill above 3 lists
        for i in range(len(spans)):
            s = spans[i]
            y1 = s["bbox"][3]  # span bottom
            h = y1 - s["bbox"][1]  # span bbox height
            bold = s["flags"] & TEXT_FONT_BOLD

            # use first item to start the lists
            if i == 0:
                select.append(y1)
                line_heights.append(h)
                line_bolds.append(bold)
                continue

            # get previous items from the 3 lists
            y0 = select[-1]
            h0 = line_heights[-1]
            bold0 = line_bolds[-1]

            if bold0 and not bold:
                break  # stop if switching from bold to non-bold

            # if fitting in height of previous span, modify bbox
            if y0 - y1 <= y_delta or abs((y0 - h0) - s["bbox"][1]) <= y_delta:
                s["bbox"] = (s["bbox"][0], y0 - h0, s["bbox"][2], y0)
                spans[i] = s
                if bold:
                    line_bolds[-1] = bold
                continue
            elif y0 - y1 > 1.5 * h0:
                break  # stop if distance to previous line too large
            select.append(y1)
            line_heights.append(h)
            line_bolds.append(bold)

        if select == []:  # nothing above the table?
            return header_top_row

        select = select[:5]  # accept up to 5 lines for an external header

        # assume top row as header if text above is too far away
        if bbox.y0 - select[0] >= line_heights[0]:
            return header_top_row

        # accept top row as header if bold, but line above is not
        if top_row_bold and not line_bolds[0]:
            return header_top_row

        if spans == []:  # nothing left above the table, return top row
            return header_top_row

        # re-compute clip above table
        nclip = EMPTY_RECT()
        for s in [s for s in spans if s["bbox"][3] >= select[-1]]:
            nclip |= s["bbox"]
        if not nclip.is_empty:
            clip = nclip

        clip.y1 = bbox.y0  # make sure we still include every word above

        # Confirm that no word in clip is intersecting a column separator
        word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)]
        word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True)

        select = []

        # exclude lines with words that intersect a column border
        for top in word_tops:
            intersecting = [
                (x, r)
                for x in col_x
                if x is not None
                for r in word_rects
                if r[1] == top and r[0] < x and r[2] > x
            ]
            if intersecting == []:
                select.append(top)
            else:  # detected a word crossing a column border
                break

        if select == []:  # nothing left over: return first row
            return header_top_row

        hdr_bbox = +clip  # compute the header cells
        hdr_bbox.y0 = select[-1]  # hdr_bbox top is smallest top coord of words
        hdr_cells = [
            (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None
            for c in cells
        ]

        # adjust left/right of header bbox
        hdr_bbox.x0 = self.bbox[0]
        hdr_bbox.x1 = self.bbox[2]

        # column names: no line breaks, no excess spaces.
        # split() / join() turns every whitespace run (including line
        # breaks) into one space and strips both ends.
        hdr_names = [
            " ".join(page.get_textbox(c).split()) if c is not None else ""
            for c in hdr_cells
        ]
        return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)
1814
1815
@dataclass
class TableSettings:
    """Settings controlling table detection.

    Fields left at UNSET (the per-axis snap, join and intersection
    tolerances) are replaced in `__post_init__` by the value of their
    general counterpart (`snap_tolerance`, `join_tolerance`,
    `intersection_tolerance`).
    """

    vertical_strategy: str = "lines"
    horizontal_strategy: str = "lines"
    explicit_vertical_lines: list = None
    explicit_horizontal_lines: list = None
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE
    snap_x_tolerance: float = UNSET
    snap_y_tolerance: float = UNSET
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE
    join_x_tolerance: float = UNSET
    join_y_tolerance: float = UNSET
    edge_min_length: float = 3
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL
    intersection_tolerance: float = 3
    intersection_x_tolerance: float = UNSET
    intersection_y_tolerance: float = UNSET
    text_settings: dict = None

    def __post_init__(self) -> "TableSettings":
        """Validate the settings and fill in derived values.

        * Every setting named in NON_NEGATIVE_SETTINGS must not be
          negative (None is treated as 0).
        * Both strategies must be members of TABLE_STRATEGIES.
        * `text_settings` defaults to an empty dict. For backwards
          compatibility, a "tolerance" entry in it supplies the value for
          a missing "x_tolerance" / "y_tolerance" (3 if absent) and is
          then removed.
        * Per-axis tolerances still at UNSET take the value of the
          corresponding general tolerance.

        :returns: self (the instance is modified in place).
        :raises ValueError: for a negative setting or an unknown strategy.
        """

        for setting in NON_NEGATIVE_SETTINGS:
            if (getattr(self, setting) or 0) < 0:
                raise ValueError(f"Table setting '{setting}' cannot be negative")

        for orientation in ["horizontal", "vertical"]:
            strategy = getattr(self, orientation + "_strategy")
            if strategy not in TABLE_STRATEGIES:
                # A blank separates the text from the list of choices.
                raise ValueError(
                    f"{orientation}_strategy must be one of "
                    f'{{{",".join(TABLE_STRATEGIES)}}}'
                )

        if self.text_settings is None:
            self.text_settings = {}

        # This next section is for backwards compatibility
        for attr in ["x_tolerance", "y_tolerance"]:
            if attr not in self.text_settings:
                self.text_settings[attr] = self.text_settings.get("tolerance", 3)

        if "tolerance" in self.text_settings:
            del self.text_settings["tolerance"]
        # End of that section

        for attr, fallback in [
            ("snap_x_tolerance", "snap_tolerance"),
            ("snap_y_tolerance", "snap_tolerance"),
            ("join_x_tolerance", "join_tolerance"),
            ("join_y_tolerance", "join_tolerance"),
            ("intersection_x_tolerance", "intersection_tolerance"),
            ("intersection_y_tolerance", "intersection_tolerance"),
        ]:
            if getattr(self, attr) is UNSET:
                setattr(self, attr, getattr(self, fallback))

        return self

    @classmethod
    def resolve(cls, settings=None):
        """Return a TableSettings instance for `settings`.

        * None: an instance with default values.
        * An instance of this class: returned as is.
        * dict: keys starting with "text_" go (without that prefix) into
          `text_settings`; all other keys are passed to the constructor.

        :raises ValueError: if `settings` is of any other type.
        """
        if settings is None:
            return cls()
        elif isinstance(settings, cls):
            return settings
        elif isinstance(settings, dict):
            core_settings = {}
            text_settings = {}
            for k, v in settings.items():
                if k[:5] == "text_":
                    text_settings[k[5:]] = v
                else:
                    core_settings[k] = v
            core_settings["text_settings"] = text_settings
            return cls(**core_settings)
        else:
            raise ValueError(f"Cannot resolve settings: {settings}")
1907
1908
class TableFinder:
    """
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16

    Attributes:
        page: weak proxy of the page.
        settings: resolved TableSettings.
        edges: edges returned by `get_edges`.
        intersections: result of `edges_to_intersections`.
        cells: result of `intersections_to_cells`.
        tables: list of Table objects, one per cell group returned by
            `cells_to_tables`.
    """

    def __init__(self, page, settings=None):
        self.page = weakref.proxy(page)
        self.settings = TableSettings.resolve(settings)
        self.edges = self.get_edges()
        self.intersections = edges_to_intersections(
            self.edges,
            self.settings.intersection_x_tolerance,
            self.settings.intersection_y_tolerance,
        )
        self.cells = intersections_to_cells(self.intersections)
        self.tables = [
            Table(self.page, cell_group)
            for cell_group in cells_to_tables(self.page, self.cells)
        ]

    def get_edges(self) -> list:
        """Collect, merge and filter the edges used for table detection.

        Per orientation, base edges are chosen by the strategy: from the
        global EDGES list ("lines", "lines_strict"), from words derived
        from the global CHARS list ("text"), or none ("explicit").
        Explicit lines from the settings are always added. The edges are
        then passed to `merge_edges`, and `filter_edges` is applied with
        `edge_min_length`.

        Raises:
            ValueError: if a strategy is "explicit" but fewer than two
                explicit lines are given for that orientation.
        """
        settings = self.settings

        for orientation in ["vertical", "horizontal"]:
            strategy = getattr(settings, orientation + "_strategy")
            if strategy == "explicit":
                lines = getattr(settings, "explicit_" + orientation + "_lines")
                # The setting defaults to None, which must be reported
                # like a too short list (len(None) would raise TypeError).
                if lines is None or len(lines) < 2:
                    raise ValueError(
                        f"If {orientation}_strategy == 'explicit', "
                        f"explicit_{orientation}_lines "
                        f"must be specified as a list/tuple of two or more "
                        f"floats/ints."
                    )

        v_strat = settings.vertical_strategy
        h_strat = settings.horizontal_strategy

        if v_strat == "text" or h_strat == "text":
            words = extract_words(CHARS, **(settings.text_settings or {}))
        else:
            words = []

        v_explicit = []
        for desc in settings.explicit_vertical_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "v":
                        v_explicit.append(e)
            else:
                # A plain value is an x coordinate: the line spans the
                # full page height.
                v_explicit.append(
                    {
                        "x0": desc,
                        "x1": desc,
                        "top": self.page.rect[1],
                        "bottom": self.page.rect[3],
                        "height": self.page.rect[3] - self.page.rect[1],
                        "orientation": "v",
                    }
                )

        if v_strat == "lines":
            v_base = filter_edges(EDGES, "v")
        elif v_strat == "lines_strict":
            v_base = filter_edges(EDGES, "v", edge_type="line")
        elif v_strat == "text":
            v_base = words_to_edges_v(words, word_threshold=settings.min_words_vertical)
        else:  # "explicit" or anything else: no base edges
            v_base = []

        v = v_base + v_explicit

        h_explicit = []
        for desc in settings.explicit_horizontal_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "h":
                        h_explicit.append(e)
            else:
                # A plain value is a y coordinate: the line spans the
                # full page width.
                h_explicit.append(
                    {
                        "x0": self.page.rect[0],
                        "x1": self.page.rect[2],
                        "width": self.page.rect[2] - self.page.rect[0],
                        "top": desc,
                        "bottom": desc,
                        "orientation": "h",
                    }
                )

        if h_strat == "lines":
            h_base = filter_edges(EDGES, "h")
        elif h_strat == "lines_strict":
            h_base = filter_edges(EDGES, "h", edge_type="line")
        elif h_strat == "text":
            h_base = words_to_edges_h(
                words, word_threshold=settings.min_words_horizontal
            )
        else:  # "explicit" or anything else: no base edges
            h_base = []

        h = h_base + h_explicit

        edges = list(v) + list(h)

        edges = merge_edges(
            edges,
            snap_x_tolerance=settings.snap_x_tolerance,
            snap_y_tolerance=settings.snap_y_tolerance,
            join_x_tolerance=settings.join_x_tolerance,
            join_y_tolerance=settings.join_y_tolerance,
        )

        return filter_edges(edges, min_length=settings.edge_min_length)

    def __getitem__(self, i):
        """Return table number `i`.

        Negative indices count from the end and wrap around modulo the
        number of tables.

        Raises:
            IndexError: if there are no tables, or `i` is not smaller
                than the number of tables.
        """
        tcount = len(self.tables)
        # Without tables no index is valid. This also keeps a negative
        # index from being incremented by 0 forever.
        if tcount == 0 or i >= tcount:
            raise IndexError("table not on page")
        # For negative i this equals adding tcount until i is >= 0.
        return self.tables[i % tcount]
2041
2042
2043 """
2044 Start of PyMuPDF interface code.
2045 The following functions are executed when "page.find_tables()" is called.
2046
2047 * make_chars: Fills the CHARS list with text character information extracted
2048 via "rawdict" text extraction. Items in CHARS are formatted
2049 as expected by the table code.
2050 * make_edges: Fills the EDGES list with vector graphic information extracted
2051 via "get_drawings". Items in EDGES are formatted as expected
2052 by the table code.
2053
The lists CHARS and EDGES replace the document access that pdfplumber
(or, respectively, pdfminer) would otherwise perform.
The table code has been modified to use these lists instead of accessing
page information itself.
2058 """
2059
2060
2061 # -----------------------------------------------------------------------------
2062 # Extract all page characters to fill the CHARS list
2063 # -----------------------------------------------------------------------------
def make_chars(page, clip=None):
    """Extract text as "rawdict" to fill CHARS.

    Creates a text page for `page` (restricted to `clip`, using FLAGS),
    stores it in the global TEXTPAGE, and appends one dict per character
    to the global CHARS list. Spans within a line and characters within
    a span are processed in ascending order of their bbox x0.
    """
    global TEXTPAGE
    page_number = page.number + 1
    page_height = page.rect.height
    ctm = page.transformation_matrix
    TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
    blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
    doctop_base = page_height * page.number

    def left_edge(item):
        # Sort key: left coordinate of the item's bbox
        return item["bbox"][0]

    for block in blocks:
        for line in block["lines"]:
            # line["dir"] = (cosine, sine) of the writing angle
            cosine = round(line["dir"][0], 4)
            sine = round(line["dir"][1], 4)
            matrix = Matrix(cosine, -sine, sine, cosine, 0, 0)
            upright = sine == 0
            for span in sorted(line["spans"], key=left_edge):
                fontname = span["font"]
                fontsize = span["size"]
                color = sRGB_to_pdf(span["color"])
                for char in sorted(span["chars"], key=left_edge):
                    bbox = Rect(char["bbox"])
                    bbox_ctm = bbox * ctm
                    origin = Point(char["origin"]) * ctm
                    # The line's matrix is re-used for every character;
                    # only the translation part is set to the origin.
                    matrix.e = origin.x
                    matrix.f = origin.y
                    width = bbox.x1 - bbox.x0
                    height = bbox.y1 - bbox.y0
                    CHARS.append(
                        {
                            "adv": width if upright else height,
                            "bottom": bbox.y1,
                            "doctop": bbox.y0 + doctop_base,
                            "fontname": fontname,
                            "height": height,
                            "matrix": tuple(matrix),
                            "ncs": "DeviceRGB",
                            "non_stroking_color": color,
                            "non_stroking_pattern": None,
                            "object_type": "char",
                            "page_number": page_number,
                            "size": fontsize if upright else height,
                            "stroking_color": color,
                            "stroking_pattern": None,
                            "text": char["c"],
                            "top": bbox.y0,
                            "upright": upright,
                            "width": width,
                            "x0": bbox.x0,
                            "x1": bbox.x1,
                            "y0": bbox_ctm.y0,
                            "y1": bbox_ctm.y1,
                        }
                    )
2118
2119
2120 # ------------------------------------------------------------------------
2121 # Extract all page vector graphics to fill the EDGES list.
2122 # We are ignoring Bézier curves completely and are converting everything
2123 # else to lines.
2124 # ------------------------------------------------------------------------
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
    """Fill the global EDGES list with axis-parallel lines of the page.

    Lines are derived from the "l" (line), "re" (rectangle) and "qu" (quad)
    items of the page's vector graphics; all other item types are ignored.
    In addition, border lines are added for every bounding box of joined
    neighboring graphics that contains text, and for user-supplied lines
    and rectangles.

    Args:
        page: the page to inspect.
        clip: rect-like restricting the area of interest. If None, the page
            rectangle is used (width / height swapped for rotations 90 and
            270).
        tset: table settings object. Read attributes: snap_x_tolerance,
            snap_y_tolerance, edge_min_length, vertical_strategy and
            horizontal_strategy.
        paths: optional list of drawing dictionaries to use instead of
            calling page.get_drawings(). The dictionaries' item lists are
            not modified.
        add_lines: optional tuple / list of point pairs to add as lines.
        add_boxes: optional tuple / list of rect-likes whose four borders
            are added as lines.
    """
    snap_x = tset.snap_x_tolerance
    snap_y = tset.snap_y_tolerance
    min_length = tset.edge_min_length
    lines_strict = (
        tset.vertical_strategy == "lines_strict"
        or tset.horizontal_strategy == "lines_strict"
    )
    page_height = page.rect.height
    doctop_basis = page.number * page_height
    page_number = page.number + 1
    prect = page.rect
    if page.rotation in (90, 270):
        w, h = prect.br
        prect = Rect(0, 0, h, w)
    if clip is not None:
        clip = Rect(clip)
    else:
        clip = prect

    def are_neighbors(r1, r2):
        """Detect whether r1, r2 are neighbors.

        Defined as:
        The minimum distance between points of r1 and points of r2 is not
        larger than some delta.

        This check supports empty rect-likes and thus also lines.

        Note:
        This type of check is MUCH faster than native Rect containment checks.
        """
        if (  # check if x-coordinates of r1 are within those of r2
            r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
            or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
        ) and (  # ... same for y-coordinates
            r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
            or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
        ):
            return True

        # same check with r1 / r2 exchanging their roles (this is necessary!)
        if (
            r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
            or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
        ) and (
            r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
            or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
        ):
            return True
        return False

    def clean_graphics(npaths=None):
        """Detect and join rectangles of "connected" vector graphics.

        Returns the list of joined rectangles that contain text and the
        list of paths relevant for table detection.
        """
        if npaths is None:
            allpaths = page.get_drawings()
        else:  # accept passed-in vector graphics
            allpaths = npaths[:]  # paths relevant for table detection
        paths = []
        for p in allpaths:
            # If only looking at lines, we ignore fill-only paths,
            # except simulated lines (i.e. small width or height).
            if (
                lines_strict
                and p["type"] == "f"
                and p["rect"].width > snap_x
                and p["rect"].height > snap_y
            ):
                continue
            paths.append(p)

        # start with all vector graphics rectangles
        prects = sorted({p["rect"] for p in paths}, key=lambda r: (r.y1, r.x0))
        new_rects = []  # the final list of joined rectangles
        # ----------------------------------------------------------------
        # Strategy: Join rectangles that "almost touch" each other.
        # Extend first rectangle with any other that is a "neighbor".
        # Then move it to the final list and continue with the rest.
        # ----------------------------------------------------------------
        while prects:  # the algorithm will empty this list
            prect0 = prects[0]  # first rectangle
            repeat = True
            while repeat:  # this loop extends first rect in list
                repeat = False  # set to true again if some other rect touches
                for i in range(len(prects) - 1, 0, -1):  # run backwards
                    if are_neighbors(prect0, prects[i]):  # close enough to rect 0?
                        prect0 |= prects[i].tl  # extend rect 0
                        prect0 |= prects[i].br  # extend rect 0
                        del prects[i]  # delete this rect
                        repeat = True  # keep checking the rest

            # move rect 0 over to result list if there is some text in it
            if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
                # contains text, so accept it as a table bbox candidate
                new_rects.append(prect0)
            del prects[0]  # remove from rect list

        return new_rects, paths

    bboxes, paths = clean_graphics(npaths=paths)

    def is_parallel(p1, p2):
        """Check if line is roughly axis-parallel."""
        return abs(p1.x - p2.x) <= snap_x or abs(p1.y - p2.y) <= snap_y

    def make_line(p, p1, p2, clip):
        """Given 2 points, make a line dictionary for table detection.

        Returns an empty dictionary if the line is not axis-parallel, lies
        outside of clip, or degenerates to a point after clipping.
        """
        if not is_parallel(p1, p2):  # only accepting axis-parallel lines
            return {}
        # compute the extremal values
        x0 = min(p1.x, p2.x)
        x1 = max(p1.x, p2.x)
        y0 = min(p1.y, p2.y)
        y1 = max(p1.y, p2.y)

        # check for outside clip
        if x0 > clip.x1 or x1 < clip.x0 or y0 > clip.y1 or y1 < clip.y0:
            return {}

        if x0 < clip.x0:
            x0 = clip.x0  # adjust to clip boundary

        if x1 > clip.x1:
            x1 = clip.x1  # adjust to clip boundary

        if y0 < clip.y0:
            y0 = clip.y0  # adjust to clip boundary

        if y1 > clip.y1:
            y1 = clip.y1  # adjust to clip boundary

        width = x1 - x0  # from adjusted values
        height = y1 - y0  # from adjusted values
        if width == height == 0:
            return {}  # nothing left to deal with
        return {
            "x0": x0,
            "y0": page_height - y0,
            "x1": x1,
            "y1": page_height - y1,
            "width": width,
            "height": height,
            "pts": [(x0, y0), (x1, y1)],
            "linewidth": p["width"],
            "stroke": True,
            "fill": False,
            "evenodd": False,
            "stroking_color": p["color"] if p["color"] else p["fill"],
            "non_stroking_color": None,
            "object_type": "line",
            "page_number": page_number,
            "stroking_pattern": None,
            "non_stroking_pattern": None,
            "top": y0,
            "bottom": y1,
            "doctop": y0 + doctop_basis,
        }

    def add_edge(p, p1, p2):
        """Append the edge for line p1 -> p2 to EDGES if the line is usable."""
        line_dict = make_line(p, p1, p2, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

    for p in paths:
        # Work on a copy: the path dictionaries may have been passed in by
        # the caller and must not grow with every invocation.
        items = list(p["items"])

        # if 'closePath', add a line from last to first point
        if (
            p["closePath"]
            and items  # nothing to close in an empty path
            and items[0][0] == "l"
            and items[-1][0] == "l"
        ):
            items.append(("l", items[-1][2], items[0][1]))

        for item in items:
            kind = item[0]
            if kind not in ("l", "re", "qu"):
                continue  # ignore anything else

            if kind == "l":  # a line
                p1, p2 = item[1:]
                add_edge(p, p1, p2)

            elif kind == "re":
                # A rectangle: decompose into 4 lines, but filter out
                # the ones that simulate a line
                rect = item[1].normalize()  # normalize the rectangle

                if (
                    rect.width <= min_length and rect.width < rect.height
                ):  # simulates a vertical line
                    # Middle value for x. No abs() here: it would mirror
                    # rectangles with negative coordinates into the
                    # positive range.
                    x = (rect.x0 + rect.x1) / 2
                    add_edge(p, Point(x, rect.y0), Point(x, rect.y1))
                    continue

                if (
                    rect.height <= min_length and rect.height < rect.width
                ):  # simulates a horizontal line
                    y = (rect.y0 + rect.y1) / 2  # middle value for y
                    add_edge(p, Point(rect.x0, y), Point(rect.x1, y))
                    continue

                add_edge(p, rect.tl, rect.bl)
                add_edge(p, rect.bl, rect.br)
                add_edge(p, rect.br, rect.tr)
                add_edge(p, rect.tr, rect.tl)

            else:  # must be a quad
                # we convert it into (up to) 4 lines
                ul, ur, ll, lr = item[1]
                add_edge(p, ul, ll)
                add_edge(p, ll, lr)
                add_edge(p, lr, ur)
                add_edge(p, ur, ul)

    # pseudo path providing the properties make_line reads from a path
    path = {"color": (0, 0, 0), "fill": None, "width": 1}
    for bbox in bboxes:  # add the border lines for all enveloping bboxes
        add_edge(path, bbox.tl, bbox.tr)
        add_edge(path, bbox.bl, bbox.br)
        add_edge(path, bbox.tl, bbox.bl)
        add_edge(path, bbox.tr, bbox.br)

    if add_lines is not None:  # add user-specified lines
        assert isinstance(add_lines, (tuple, list))
    else:
        add_lines = []
    for p1, p2 in add_lines:
        add_edge(path, Point(p1), Point(p2))

    if add_boxes is not None:  # add user-specified rectangles
        assert isinstance(add_boxes, (tuple, list))
    else:
        add_boxes = []
    for box in add_boxes:
        r = Rect(box)
        add_edge(path, r.tl, r.bl)
        add_edge(path, r.bl, r.br)
        add_edge(path, r.br, r.tr)
        add_edge(path, r.tr, r.tl)
2413
2414
def page_rotation_set0(page):
    """Set the page rotation to zero and de-rotate its content.

    Table detection needs an unrotated page. A transformation command is
    inserted into the page's contents, the mediabox is adjusted for
    rotations 90 and 270, and the rotation is set to 0.

    Returns:
        A tuple (page, xref, rot, mediabox): the re-loaded page, the xref
        returned by the contents insertion, the previous rotation and the
        previous mediabox. These values allow reverting the changes.
    """
    original_mediabox = page.mediabox  # handed back for later restore
    rotation = page.rotation  # contains normalized rotation value
    # Second fetch, modified below. NOTE(review): presumably an object
    # separate from original_mediabox - confirm.
    box = page.mediabox

    # translation to apply before the de-rotation
    if rotation == 90:
        # horizontal shift
        shift = Matrix(1, 0, 0, 1, box.y1 - box.x1 - box.x0 - box.y0, 0)
    elif rotation == 270:
        # vertical shift
        shift = Matrix(1, 0, 0, 1, 0, box.x1 - box.y1 - box.y0 - box.x0)
    else:
        shift = Matrix(1, 0, 0, 1, -2 * box.x0, -2 * box.y0)

    # shift first, then de-rotate
    derotate = shift * page.derotation_matrix
    command = b"%g %g %g %g %g %g cm " % tuple(derotate)
    xref = TOOLS._insert_contents(page, command, 0)

    if rotation in (90, 270):
        # exchange x- and y-coordinates of the mediabox
        x0, y0, x1, y1 = box
        box.x0 = y0
        box.y0 = x0
        box.x1 = y1
        box.y1 = x1
        page.set_mediabox(box)

    page.set_rotation(0)

    # load the page again so that the changes become effective
    fresh_page = page.parent[page.number]
    return fresh_page, xref, rotation, original_mediabox
2457
2458
def page_rotation_reset(page, xref, rot, mediabox):
    """Restore the rotation and mediabox the page had originally.

    Meant to be called before tables are returned. The stream of ``xref``
    (the inserted de-rotation command) is overwritten with a single space,
    then the given mediabox and rotation are set again.

    Returns:
        The re-loaded page.
    """
    owner = page.parent  # document of the page
    owner.update_stream(xref, b" ")  # neutralize the de-rotation matrix
    page.set_mediabox(mediabox)  # back to the old mediabox
    page.set_rotation(rot)  # back to the old rotation
    return owner[page.number]  # page with refreshed information
2470
2471
def find_tables(
    page,
    clip=None,
    vertical_strategy: str = "lines",
    horizontal_strategy: str = "lines",
    vertical_lines: list = None,
    horizontal_lines: list = None,
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
    snap_x_tolerance: float = None,
    snap_y_tolerance: float = None,
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
    join_x_tolerance: float = None,
    join_y_tolerance: float = None,
    edge_min_length: float = 3,
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: float = 3,
    intersection_x_tolerance: float = None,
    intersection_y_tolerance: float = None,
    text_tolerance=3,
    text_x_tolerance=3,
    text_y_tolerance=3,
    strategy=None,  # offer abbreviation
    add_lines=None,  # user-specified lines
    add_boxes=None,  # user-specified rectangles
    paths=None,  # accept vector graphics as parameter
):
    """Find the tables on a page.

    The global lists CHARS and EDGES are reset and then filled from the
    page's text and vector graphics. For the duration of the call, small
    glyph heights are switched on and a rotated page is temporarily
    de-rotated. Both changes are reverted before returning, also when an
    exception is raised while detecting tables.

    Args:
        page: the page to search.
        clip: optional area restricting text and graphics extraction.
        strategy: if given, used for both vertical_strategy and
            horizontal_strategy.
        add_lines, add_boxes, paths: passed on to make_edges.
        All remaining arguments are handed to TableSettings.resolve. The
        directional x / y tolerances are passed as UNSET when they are
        None.

    Returns:
        The TableFinder object created for the page.
    """
    global CHARS, EDGES
    CHARS = []
    EDGES = []
    old_small = bool(TOOLS.set_small_glyph_heights())  # save old value
    TOOLS.set_small_glyph_heights(True)  # we need minimum bboxes
    old_xref = old_rot = old_mediabox = None

    try:
        if page.rotation != 0:
            page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)

        if snap_x_tolerance is None:
            snap_x_tolerance = UNSET
        if snap_y_tolerance is None:
            snap_y_tolerance = UNSET
        if join_x_tolerance is None:
            join_x_tolerance = UNSET
        if join_y_tolerance is None:
            join_y_tolerance = UNSET
        if intersection_x_tolerance is None:
            intersection_x_tolerance = UNSET
        if intersection_y_tolerance is None:
            intersection_y_tolerance = UNSET
        if strategy is not None:
            vertical_strategy = strategy
            horizontal_strategy = strategy

        settings = {
            "vertical_strategy": vertical_strategy,
            "horizontal_strategy": horizontal_strategy,
            "explicit_vertical_lines": vertical_lines,
            "explicit_horizontal_lines": horizontal_lines,
            "snap_tolerance": snap_tolerance,
            "snap_x_tolerance": snap_x_tolerance,
            "snap_y_tolerance": snap_y_tolerance,
            "join_tolerance": join_tolerance,
            "join_x_tolerance": join_x_tolerance,
            "join_y_tolerance": join_y_tolerance,
            "edge_min_length": edge_min_length,
            "min_words_vertical": min_words_vertical,
            "min_words_horizontal": min_words_horizontal,
            "intersection_tolerance": intersection_tolerance,
            "intersection_x_tolerance": intersection_x_tolerance,
            "intersection_y_tolerance": intersection_y_tolerance,
            "text_tolerance": text_tolerance,
            "text_x_tolerance": text_x_tolerance,
            "text_y_tolerance": text_y_tolerance,
        }
        tset = TableSettings.resolve(settings=settings)
        page.table_settings = tset

        make_chars(page, clip=clip)  # create character list of page
        make_edges(
            page,
            clip=clip,
            tset=tset,
            paths=paths,
            add_lines=add_lines,
            add_boxes=add_boxes,
        )  # create lines and curves
        tables = TableFinder(page, settings=tset)
    finally:
        # Always undo the temporary changes, so that a failure during
        # detection does not leave the global setting or the page modified.
        TOOLS.set_small_glyph_heights(old_small)
        if old_xref is not None:
            page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
    return tables
2554 paths=paths,
2555 add_lines=add_lines,
2556 add_boxes=add_boxes,
2557 ) # create lines and curves
2558 tables = TableFinder(page, settings=tset)
2559
2560 TOOLS.set_small_glyph_heights(old_small)
2561 if old_xref is not None:
2562 page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
2563 return tables