Python2/PyMuPDF: src/table.py comparison

comparison src/table.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:37:51 +0200
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1d09e1dec1d9
+"""
+Copyright (C) 2023 Artifex Software, Inc.
+This file is part of PyMuPDF.
+PyMuPDF is free software: you can redistribute it and/or modify it under the
+terms of the GNU Affero General Public License as published by the Free
+Software Foundation, either version 3 of the License, or (at your option)
+any later version.
+PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+details.
+You should have received a copy of the GNU Affero General Public License
+along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+Alternative licensing terms are available from the licensor.
+For commercial licensing, see <https://www.artifex.com/> or contact
+Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+CA 94129, USA, for further information.
+---------------------------------------------------------------------
+Portions of this code have been ported from pdfplumber, see
+https://pypi.org/project/pdfplumber/.
+The ported code is under the following MIT license:
+---------------------------------------------------------------------
+The MIT License (MIT)
+Copyright (c) 2015, Jeremy Singer-Vine
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+---------------------------------------------------------------------
+Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt
+---------------------------------------------------------------------
+The porting mainly pertains to files "table.py" and relevant parts of
+"utils/text.py" within pdfplumber's repository on Github.
+With respect to "text.py", we have removed functions or features that are not
+used by table processing. Examples are:
+* the text search function
+* simple text extraction
+* text extraction by lines
+The original pdfplumber code neither detects nor identifies table headers.
+This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'.
+This is implemented as new class TableHeader with the properties:
+* bbox: A tuple for the header's bbox
+* cells: A tuple for each bbox of a column header
+* names: A list of strings with column header text
+* external: A bool indicating whether the header is outside the table cells.
+"""
+import inspect
+import itertools
+import string
+import html
+from collections.abc import Sequence
+from dataclasses import dataclass
+from operator import itemgetter
+import weakref
+# -------------------------------------------------------------------
+# Start of PyMuPDF interface code
+# -------------------------------------------------------------------
+from . import (
+Rect,
+Matrix,
+TEXTFLAGS_TEXT,
+TEXT_FONT_BOLD,
+TEXT_FONT_ITALIC,
+TEXT_FONT_MONOSPACED,
+TEXT_FONT_SUPERSCRIPT,
+TEXT_COLLECT_STYLES,
+TOOLS,
+EMPTY_RECT,
+sRGB_to_pdf,
+Point,
+message,
+mupdf,
+)
# Module-level containers and flag values used by the table code.
EDGES = []  # vector graphics from PyMuPDF
CHARS = []  # text characters from PyMuPDF
TEXTPAGE = None  # starts unset; presumably assigned a TextPage elsewhere -- TODO confirm
# Per-character style bits, taken from the MuPDF binding.
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
# Text extraction flags: the plain-text flag set plus style collection.
FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES
white_spaces = set(string.whitespace)  # for checking white space only cells
def extract_cells(textpage, cell, markdown=False):
    """Return the text located inside a rect-like 'cell'.

    The text is returned either as plain text or with Markdown styling.
    Markdown styling relies on the per-character style information that is
    only present when the TextPage was created with TEXT_COLLECT_STYLES.

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell, stripped of leading
        and trailing white space.
    """

    def outside(box):
        """True if 'box' has no overlap with the cell at all."""
        return (
            box[0] > cell[2]
            or box[2] < cell[0]
            or box[1] > cell[3]
            or box[3] < cell[1]
        )

    line_break = "<br>" if markdown else "\n"
    text = ""
    for block in textpage.extractRAWDICT()["blocks"]:
        # only text blocks that touch the cell are of interest
        if block["type"] != 0 or outside(block["bbox"]):
            continue

        for line in block["lines"]:
            if outside(line["bbox"]):
                continue

            if text:  # something was extracted before: start a new line
                text += line_break

            # strikeout detection only works with horizontal text
            direction = line["dir"]
            horizontal = direction == (0, 1) or direction == (1, 0)

            for span in line["spans"]:
                if outside(span["bbox"]):
                    continue

                # keep only characters whose bbox lies more than 50% in the cell
                kept = []
                for char in span["chars"]:
                    char_rect = Rect(char["bbox"])
                    if abs(char_rect & cell) > 0.5 * abs(char_rect):
                        kept.append(char["c"])
                span_text = "".join(kept)

                if not span_text:
                    continue  # nothing of this span is inside the cell

                if not markdown:  # plain text: no styling needed
                    text += span_text
                    continue

                # Collect the style markers in opening order. The closing
                # sequence is the same markers in reverse order.
                char_flags = span["char_flags"]
                font_flags = span["flags"]
                markers = []
                if horizontal and char_flags & TEXT_STRIKEOUT:
                    markers.append("~~")
                if char_flags & TEXT_BOLD:
                    markers.append("**")
                if font_flags & TEXT_FONT_ITALIC:
                    markers.append("_")
                if font_flags & TEXT_FONT_MONOSPACED:
                    markers.append("`")
                prefix = "".join(markers)
                suffix = "".join(reversed(markers))

                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                suffix_len = len(suffix)
                if suffix_len and text.endswith(suffix):
                    # same styling as the preceding span: move the closing
                    # markers behind the new text
                    text = text[:-suffix_len] + span_text + suffix
                elif not span_text.strip():
                    text += " "
                else:
                    text += prefix + span_text + suffix

    return text.strip()
+# -------------------------------------------------------------------
+# End of PyMuPDF interface code
+# -------------------------------------------------------------------
class UnsetFloat(float):
    """A float subclass that adds no behavior of its own.

    Its only purpose is to give a value a distinguishable type.
    """
# Names of table settings grouped as "non negative"
# (presumably validated elsewhere in this file -- TODO confirm).
NON_NEGATIVE_SETTINGS = [
    "snap_tolerance",
    "snap_x_tolerance",
    "snap_y_tolerance",
    "join_tolerance",
    "join_x_tolerance",
    "join_y_tolerance",
    "edge_min_length",
    "min_words_vertical",
    "min_words_horizontal",
    "intersection_tolerance",
    "intersection_x_tolerance",
    "intersection_y_tolerance",
]
# The recognized table detection strategy names.
TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"]
# Marker value of type UnsetFloat (numerically 0).
UNSET = UnsetFloat(0)
# Default values used as parameter defaults by functions in this file.
DEFAULT_SNAP_TOLERANCE = 3
DEFAULT_JOIN_TOLERANCE = 3
DEFAULT_MIN_WORDS_VERTICAL = 3
DEFAULT_MIN_WORDS_HORIZONTAL = 1
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13
# Returns the tuple (x0, top, x1, bottom) of a dict-like object.
bbox_getter = itemgetter("x0", "top", "x1", "bottom")
# Ligature characters and the plain letter sequences replacing them.
LIGATURES = {
    "ﬀ": "ff",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬆ": "st",
    "ﬅ": "st",
}
def to_list(collection) -> list:
    """Return 'collection' as a list.

    A list is returned unchanged (same object). Any other sequence is copied
    into a new list. A non-sequence object offering a 'to_dict' method is
    converted via to_dict("records"). Everything else is passed to list().
    """
    if isinstance(collection, list):
        return collection
    if not isinstance(collection, Sequence) and hasattr(collection, "to_dict"):
        return collection.to_dict("records")  # pragma: nocover
    return list(collection)
class TextMap:
    """
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).

    Attributes:
        tuples: the sequence of (character, char-object-or-None) pairs.
        as_string: all characters of 'tuples' joined into one string.
    """

    def __init__(self, tuples=None) -> None:
        # The default None used to crash in the join below: treat it as
        # "no characters" instead.
        self.tuples = [] if tuples is None else tuples
        self.as_string = "".join(map(itemgetter(0), self.tuples))

    def match_to_dict(
        self,
        m,
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> dict:
        """Describe a regex match on 'as_string' as a dictionary.

        Args:
            m: match object whose positions refer to 'as_string'.
            main_group: the match group that defines text and bbox.
            return_groups: if True, add item "groups" (m.groups()).
            return_chars: if True, add item "chars" (matched char objects).

        Returns:
            A dict with keys "text", "x0", "top", "x1", "bottom" and
            optionally "groups" and "chars".
        """
        subset = self.tuples[m.start(main_group) : m.end(main_group)]
        # layout-implied white space has no char object
        chars = [c for (text, c) in subset if c is not None]
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }

        if return_groups:
            result["groups"] = m.groups()

        if return_chars:
            result["chars"] = chars

        return result
class WordMap:
    """
    A WordMap maps words->chars.
    """

    def __init__(self, tuples) -> None:
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width=0,
        layout_height=0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        x_density=DEFAULT_X_DENSITY,
        y_density=DEFAULT_Y_DENSITY,
        x_shift=0,
        y_shift=0,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Convert the (word, chars) tuples of this WordMap to a TextMap, i.e.
        a list of (char-text, char) tuples. With `layout` set, white space is
        inserted to mimic the structural layout of the text on the page:

        - Sort the words by doctop if not already sorted.
        - Calculate the initial doctop for the starting page.
        - Cluster the words by doctop (taking `y_tolerance` into account), and
          iterate through the clusters.
        - For each cluster, take the distance between its doctop and the
          initial doctop, in points, minus `y_shift`, and divide it by
          `y_density`. This is the minimum number of newlines that should
          come before the cluster. Append that number *minus* the number of
          newlines already appended, with a minimum of one (zero for the
          first cluster).
        - For each word of a cluster, divide its x0, minus `x_shift`, by
          `x_density` to get the minimum number of characters that should
          come before the word. Append that number of spaces *minus* the
          number of characters already on the line, with a minimum of one
          (zero at the start of a line). Then append the word's text.
        - At the end of each line, add spaces if necessary to mimic
          `layout_width`.
        - Finally, add newlines to the end if necessary to mimic
          `layout_height`.

        Note: This approach currently works best for horizontal, left-to-right
        text, but will display all words regardless of orientation.

        Raises:
            ValueError: if a layout dimension is given both in points and in
                characters.
        """
        textmap = []

        if not len(self.tuples):
            return TextMap(textmap)

        expansions = LIGATURES if expand_ligatures else {}
        space = (" ", None)
        newline = ("\n", None)

        if layout:
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [space] * layout_width_chars
        else:
            blank_line = []

        keep_order = presorted or use_text_flow

        def doctop_of(item):
            return float(item[0]["doctop"])

        if keep_order:
            words_by_doctop = self.tuples
        else:
            words_by_doctop = sorted(self.tuples, key=doctop_of)

        first_word = words_by_doctop[0][0]
        doctop_start = first_word["doctop"] - first_word["top"]

        newlines_so_far = 0
        line_clusters = cluster_objects(words_by_doctop, doctop_of, y_tolerance)
        for line_no, line_words in enumerate(line_clusters):
            if layout:
                y_dist = (
                    line_words[0][0]["doctop"] - (doctop_start + y_shift)
                ) / y_density
            else:
                y_dist = 0

            newlines_needed = max(
                # At least one newline, unless this is the first line
                int(line_no > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - newlines_so_far,
            )

            for _ in range(newlines_needed):
                # an empty line gets filled up with blanks first
                if not len(textmap) or textmap[-1][0] == "\n":
                    textmap.extend(blank_line)
                textmap.append(newline)

            newlines_so_far += newlines_needed

            if keep_order:
                words_in_line = line_words
            else:
                words_in_line = sorted(line_words, key=lambda item: float(item[0]["x0"]))

            line_len = 0
            for word, chars in words_in_line:
                x_dist = (word["x0"] - x_shift) / x_density if layout else 0
                spaces_needed = max(min(1, line_len), round(x_dist) - line_len)
                textmap.extend([space] * spaces_needed)
                line_len += spaces_needed

                for char in chars:
                    for letter in expansions.get(char["text"], char["text"]):
                        textmap.append((letter, char))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                textmap.extend([space] * (layout_width_chars - line_len))

        # Append blank lines at end of text
        if layout:
            trailing_newlines = layout_height_chars - (newlines_so_far + 1)
            for n in range(trailing_newlines):
                if n > 0:
                    textmap.extend(blank_line)
                textmap.append(newline)

            # Remove terminal newline
            if textmap[-1] == newline:
                textmap = textmap[:-1]

        return TextMap(textmap)
class WordExtractor:
    """Group character dictionaries into words.

    Args:
        x_tolerance: maximum gap between consecutive characters of a word,
            measured along the text line.
        y_tolerance: maximum offset between characters of the same line,
            measured across the text line.
        keep_blank_chars: if True, white space characters do not end a word.
        use_text_flow: if True, characters are processed in the given order
            instead of being sorted by position.
        horizontal_ltr: read upright words left-to-right.
        vertical_ttb: read non-upright words top-to-bottom.
        extra_attrs: character keys that must be equal within a word and
            are copied to the word.
        split_at_punctuation: True to split at string.punctuation, or a
            string with the characters to split at.
        expand_ligatures: replace ligature characters by their letters.
    """

    def __init__(
        self,
        x_tolerance=DEFAULT_X_TOLERANCE,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        keep_blank_chars: bool = False,
        use_text_flow=False,
        horizontal_ltr=True,  # Should words be read left-to-right?
        vertical_ttb=False,  # Should vertical words be read top-to-bottom?
        extra_attrs=None,
        split_at_punctuation=False,
        expand_ligatures=True,
    ):
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb
        self.extra_attrs = [] if extra_attrs is None else extra_attrs

        # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        self.split_at_punctuation = (
            string.punctuation
            if split_at_punctuation is True
            else (split_at_punctuation or "")
        )

        self.expansions = LIGATURES if expand_ligatures else {}

    def merge_chars(self, ordered_chars: list):
        """Build the word dictionary for the characters of one word.

        Args:
            ordered_chars: non-empty list of the word's character dicts.

        Returns:
            A dict with keys "text", "x0", "x1", "top", "doctop", "bottom",
            "upright", "direction", "rotation" plus one item per name in
            'extra_attrs' (taken from the first character).
        """
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        first_char = ordered_chars[0]
        doctop_adj = first_char["doctop"] - first_char["top"]
        upright = first_char["upright"]
        direction = 1 if (self.horizontal_ltr if upright else self.vertical_ttb) else -1

        matrix = first_char["matrix"]

        rotation = 0
        if not upright and matrix[1] < 0:
            # Use a real list: the bare reversed() iterator of the previous
            # code could not be indexed when extra attributes were copied.
            ordered_chars = list(reversed(ordered_chars))
            rotation = 270

        if matrix[0] < 0 and matrix[3] < 0:
            rotation = 180
        elif matrix[1] > 0:
            rotation = 90

        word = {
            "text": "".join(
                self.expansions.get(c["text"], c["text"]) for c in ordered_chars
            ),
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_adj,
            "bottom": bottom,
            "upright": upright,
            "direction": direction,
            "rotation": rotation,
        }

        # extra attributes always come from the first character as passed in
        for key in self.extra_attrs:
            word[key] = first_char[key]

        return word

    def char_begins_new_word(
        self,
        prev_char,
        curr_char,
    ) -> bool:
        """Decide whether `curr_char` starts a new word after `prev_char`.

        The following factors are taken into account:

        - Whether the text is "upright" (i.e., non-rotated)
        - Whether horizontal text runs left-to-right or right-to-left, as
          represented by self.horizontal_ltr
        - Whether vertical text runs top-to-bottom or bottom-to-top, as
          represented by self.vertical_ttb
        - The x0, top, x1, and bottom attributes of prev_char and curr_char
        - The self.x_tolerance and self.y_tolerance settings. Note: In
          this case, x/y refer to those directions for non-rotated text.
          For vertical text, they are flipped. A more accurate terminology
          might be "*intra*line character distance tolerance" and
          "*inter*line character distance tolerance"

        An important note: The *intra*line distance is measured from the
        *end* of the previous character to the *beginning* of the current
        character, while the *inter*line distance is measured from the
        *top* of the previous character to the *top* of the next
        character. Successive text lines' bounding boxes often overlap
        slightly, and that overlap must not be interpreted as the two lines
        being the same line.

        The upright-ness of the character determines the attributes to
        compare, while horizontal_ltr/vertical_ttb determine the direction
        of the comparison.
        """
        # Note: Due to the grouping step earlier in the process,
        # curr_char["upright"] will always equal prev_char["upright"].
        if curr_char["upright"]:
            x = self.x_tolerance
            y = self.y_tolerance
            ay = prev_char["top"]
            cy = curr_char["top"]
            if self.horizontal_ltr:
                ax = prev_char["x0"]
                bx = prev_char["x1"]
                cx = curr_char["x0"]
            else:
                ax = -prev_char["x1"]
                bx = -prev_char["x0"]
                cx = -curr_char["x1"]
        else:
            x = self.y_tolerance
            y = self.x_tolerance
            ay = prev_char["x0"]
            cy = curr_char["x0"]
            if self.vertical_ttb:
                ax = prev_char["top"]
                bx = prev_char["bottom"]
                cx = curr_char["top"]
            else:
                ax = -prev_char["bottom"]
                bx = -prev_char["top"]
                cx = -curr_char["bottom"]

        return bool(
            # Intraline test
            (cx < ax)
            or (cx > bx + x)
            # Interline test
            or (cy > ay + y)
        )

    def iter_chars_to_words(self, ordered_chars):
        """Yield lists of characters, each list making up one word."""
        current_word: list = []

        def start_next_word(new_char=None):
            """Emit the pending word (if any) and start a new one."""
            nonlocal current_word

            if current_word:
                yield current_word

            current_word = [] if new_char is None else [new_char]

        for char in ordered_chars:
            text = char["text"]

            if not self.keep_blank_chars and text.isspace():
                yield from start_next_word(None)

            elif text in self.split_at_punctuation:
                # the punctuation character becomes a word of its own
                yield from start_next_word(char)
                yield from start_next_word(None)

            elif current_word and self.char_begins_new_word(current_word[-1], char):
                yield from start_next_word(char)

            else:
                current_word.append(char)

        # Finally, after all chars processed
        if current_word:
            yield current_word

    def iter_sort_chars(self, chars):
        """Yield the characters in reading order.

        Upright characters come first. They are clustered into lines by
        "doctop" and sorted by "x0" within a line. Non-upright characters
        are clustered by "x0" and sorted by "doctop". A line is yielded in
        reverse if the respective reading direction flag is off.
        """

        def upright_key(x) -> int:
            return -int(x["upright"])

        for upright_cluster in cluster_objects(list(chars), upright_key, 0):
            upright = upright_cluster[0]["upright"]
            cluster_key = "doctop" if upright else "x0"
            sort_key = "x0" if upright else "doctop"
            forward = self.horizontal_ltr if upright else self.vertical_ttb

            # Cluster by line
            subclusters = cluster_objects(
                upright_cluster, itemgetter(cluster_key), self.y_tolerance
            )

            for sc in subclusters:
                # Sort within line
                to_yield = sorted(sc, key=itemgetter(sort_key))

                # Reverse order if necessary
                if not forward:
                    yield from reversed(to_yield)
                else:
                    yield from to_yield

    def iter_extract_tuples(self, chars):
        """Yield (word, word_chars) tuples for the given characters."""
        ordered_chars = chars if self.use_text_flow else self.iter_sort_chars(chars)

        # a word never spans characters differing in one of these values
        grouping_key = itemgetter("upright", *self.extra_attrs)
        grouped_chars = itertools.groupby(ordered_chars, grouping_key)

        for keyvals, char_group in grouped_chars:
            for word_chars in self.iter_chars_to_words(char_group):
                yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars) -> WordMap:
        """Return the WordMap of the given characters."""
        return WordMap(list(self.iter_extract_tuples(chars)))

    def extract_words(self, chars: list) -> list:
        """Return the list of word dictionaries of the given characters."""
        return [word for word, word_chars in self.iter_extract_tuples(chars)]
def extract_words(chars: list, **kwargs) -> list:
    """Return the words made up by 'chars'.

    All keyword arguments are passed on to the WordExtractor constructor.
    """
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars)
# Parameter names accepted by WordMap.to_textmap and by the WordExtractor
# constructor. Used to distribute mixed keyword arguments to the right callee.
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
def chars_to_textmap(chars: list, **kwargs) -> TextMap:
    """Return the TextMap for the given characters.

    The keyword arguments may mix WordExtractor and WordMap.to_textmap
    parameters: each callee only receives the names it knows. The value of
    "presorted" is always forced to True.
    """
    kwargs["presorted"] = True

    def pick(names):
        """Select those keyword arguments whose name is in 'names'."""
        return {name: kwargs[name] for name in names if name in kwargs}

    extractor = WordExtractor(**pick(WORD_EXTRACTOR_KWARGS))
    wordmap = extractor.extract_wordmap(chars)
    return wordmap.to_textmap(**pick(TEXTMAP_KWARGS))
def extract_text(chars: list, **kwargs) -> str:
    """Return the text made up by the given characters.

    With keyword "layout" set, the text is taken from the layout-mimicking
    TextMap. Otherwise words are extracted and joined: words of one line by
    a space, lines by a line break. Text rotated by 90 or 270 degrees is
    returned as one line of words. Text rotated by 180 degrees is reversed
    character-wise with line breaks replaced by spaces.
    """
    chars = to_list(chars)
    if not len(chars):
        return ""

    if kwargs.get("layout"):
        return chars_to_textmap(chars, **kwargs).as_string

    y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
    extractor_args = {k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
    words = WordExtractor(**extractor_args).extract_words(chars)

    # rotation cannot change within a cell, so look at the first word only
    rotation = words[0]["rotation"] if words else 0

    if rotation in (90, 270):
        if rotation == 90:
            words.sort(key=lambda w: (w["x1"], -w["top"]))
        else:
            words.sort(key=lambda w: (-w["x1"], w["top"]))
        return " ".join([w["text"] for w in words])

    word_lines = cluster_objects(words, itemgetter("doctop"), y_tolerance)
    lines = "\n".join(" ".join(word["text"] for word in line) for line in word_lines)
    if rotation == 180:  # needs extra treatment
        lines = "".join([(c if c != "\n" else " ") for c in reversed(lines)])
    return lines
def collate_line(
    line_chars: list,
    tolerance=DEFAULT_X_TOLERANCE,
) -> str:
    """Join the characters of one line into a string, sorted by "x0".

    A space is inserted wherever a character starts more than 'tolerance'
    to the right of the end ("x1") of its predecessor.
    """
    pieces = []
    prev_x1 = None
    for char in sorted(line_chars, key=itemgetter("x0")):
        gap_found = prev_x1 is not None and char["x0"] > prev_x1 + tolerance
        if gap_found:
            pieces.append(" ")
        prev_x1 = char["x1"]
        pieces.append(char["text"])
    return "".join(pieces)
def dedupe_chars(chars: list, tolerance=1) -> list:
    """
    Return 'chars' without duplicates. Characters count as duplicates if
    they share text, fontname, size and upright-ness and are positioned
    within `tolerance` of each other. The result keeps the order of 'chars'.
    """
    style_key = itemgetter("fontname", "size", "upright", "text")
    pos_key = itemgetter("doctop", "x0")

    survivors = []
    same_style_groups = itertools.groupby(sorted(chars, key=style_key), key=style_key)
    for _, group in same_style_groups:
        rows = cluster_objects(list(group), itemgetter("doctop"), tolerance)
        for row in rows:
            for spot in cluster_objects(row, itemgetter("x0"), tolerance):
                # keep the top-left-most character of each position cluster
                survivors.append(min(spot, key=pos_key))

    return sorted(survivors, key=chars.index)
def line_to_edge(line):
    """Return a copy of 'line' with an added "orientation" item.

    The orientation is "h" if top and bottom are equal, else "v".
    """
    orientation = "h" if line["top"] == line["bottom"] else "v"
    return dict(line, orientation=orientation)
def rect_to_edges(rect) -> list:
    """Return the four edges [top, bottom, left, right] of a rectangle.

    Each edge is a copy of 'rect' with "object_type" set to "rect_edge",
    an "orientation" item and the coordinates collapsed to that side.
    """

    def make_edge(**changes):
        return {**dict(rect), "object_type": "rect_edge", **changes}

    top = make_edge(
        height=0,
        y0=rect["y1"],
        bottom=rect["top"],
        orientation="h",
    )
    bottom = make_edge(
        height=0,
        y1=rect["y0"],
        top=rect["top"] + rect["height"],
        doctop=rect["doctop"] + rect["height"],
        orientation="h",
    )
    left = make_edge(width=0, x1=rect["x0"], orientation="v")
    right = make_edge(width=0, x0=rect["x1"], orientation="v")
    return [top, bottom, left, right]
def curve_to_edges(curve) -> list:
    """Return one edge per pair of consecutive points of a curve.

    The orientation of an edge is "v" for equal x-coordinates, "h" for
    equal y-coordinates and None otherwise.
    """
    points = curve["pts"]
    edges = []
    for start, end in zip(points, points[1:]):
        (sx, sy), (ex, ey) = (start[0], start[1]), (end[0], end[1])
        if sx == ex:
            orientation = "v"
        elif sy == ey:
            orientation = "h"
        else:
            orientation = None
        upper = min(sy, ey)
        edges.append(
            {
                "object_type": "curve_edge",
                "x0": min(sx, ex),
                "x1": max(sx, ex),
                "top": upper,
                "doctop": upper + (curve["doctop"] - curve["top"]),
                "bottom": max(sy, ey),
                "width": abs(sx - ex),
                "height": abs(sy - ey),
                "orientation": orientation,
            }
        )
    return edges
def obj_to_edges(obj) -> list:
    """Return the list of edges represented by an object.

    An object that is an edge already is returned as single list item. Lines,
    rectangles and curves are converted. Any other "object_type" raises a
    KeyError.
    """
    kind = obj["object_type"]
    if "_edge" in kind:
        return [obj]
    if kind == "line":
        return [line_to_edge(obj)]
    converters = {"rect": rect_to_edges, "curve": curve_to_edges}
    return converters[kind](obj)
def filter_edges(
    edges,
    orientation=None,
    edge_type=None,
    min_length=1,
) -> list:
    """Return the edges that satisfy all given criteria.

    Args:
        edges: iterable of edge dictionaries.
        orientation: "v" or "h" to select by orientation, None for any.
        edge_type: required "object_type" value, None for any.
        min_length: minimum "height" (vertical edges) or "width" (others).

    Raises:
        ValueError: for an invalid 'orientation'.
    """
    if orientation not in ("v", "h", None):
        raise ValueError("Orientation must be 'v' or 'h'")

    selected = []
    for edge in edges:
        length_key = "height" if edge["orientation"] == "v" else "width"
        if edge_type is not None and edge["object_type"] != edge_type:
            continue
        if orientation is not None and edge["orientation"] != orientation:
            continue
        if edge[length_key] >= min_length:
            selected.append(edge)
    return selected
def cluster_list(xs, tolerance=0) -> list:
    """Split sorted values into groups of neighbors.

    A value joins the current group if it exceeds its predecessor by no
    more than 'tolerance'. With a zero tolerance or fewer than two values,
    every value forms a group of its own.

    Returns:
        A list of lists, covering all values in ascending order.
    """
    ordered = sorted(xs)
    if tolerance == 0 or len(xs) < 2:
        return [[value] for value in ordered]

    clusters = [[ordered[0]]]
    for value in ordered[1:]:
        predecessor = clusters[-1][-1]
        if value <= predecessor + tolerance:
            clusters[-1].append(value)
        else:
            clusters.append([value])
    return clusters
def make_cluster_dict(values, tolerance) -> dict:
    """Map each distinct value to the index of the cluster it belongs to."""
    clusters = cluster_list(list(set(values)), tolerance)
    return {
        value: index
        for index, members in enumerate(clusters)
        for value in members
    }
def cluster_objects(xs, key_fn, tolerance) -> list:
    """Group objects whose key values are close to each other.

    Args:
        xs: iterable of objects.
        key_fn: callable returning the value to cluster by, or a key that
            is looked up in every object.
        tolerance: maximum distance between neighboring values of a cluster.

    Returns:
        A list of lists of objects, ordered by ascending cluster value.
        Objects keep their relative order within a cluster.
    """
    if not callable(key_fn):
        key_fn = itemgetter(key_fn)

    # 'xs' is walked twice below: materialize it, because a one-shot
    # iterator would be exhausted after the first pass and yield no clusters.
    xs = list(xs)

    cluster_dict = make_cluster_dict(map(key_fn, xs), tolerance)

    get_0, get_1 = itemgetter(0), itemgetter(1)

    # pair every object with its cluster index, then group by that index
    cluster_tuples = sorted(((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1)
    grouped = itertools.groupby(cluster_tuples, key=get_1)

    return [list(map(get_0, v)) for k, v in grouped]
def move_object(obj, axis: str, value):
    """Return a copy of 'obj' shifted by 'value' along an axis.

    Axis "h" shifts "x0" and "x1". Axis "v" shifts "top", "bottom" and, if
    present, "doctop" by 'value', and "y0" / "y1" by minus 'value'.
    """
    assert axis in ("h", "v")
    if axis == "h":
        changes = [
            ("x0", obj["x0"] + value),
            ("x1", obj["x1"] + value),
        ]
    else:
        changes = [
            ("top", obj["top"] + value),
            ("bottom", obj["bottom"] + value),
        ]
        if "doctop" in obj:
            changes.append(("doctop", obj["doctop"] + value))
        if "y0" in obj:
            # y0 / y1 move in the opposite direction
            changes.append(("y0", obj["y0"] - value))
            changes.append(("y1", obj["y1"] - value))

    # later items override the original ones
    return obj.__class__(tuple(obj.items()) + tuple(changes))
def snap_objects(objs, attr: str, tolerance) -> list:
    """Align objects having nearly the same value of 'attr'.

    Objects are clustered by 'attr' within 'tolerance'. Every object is then
    moved such that its 'attr' equals the average of its cluster.

    Returns:
        The list of moved objects, cluster by cluster.
    """
    axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
    snapped = []
    for cluster in cluster_objects(list(objs), itemgetter(attr), tolerance):
        average = sum(obj[attr] for obj in cluster) / len(cluster)
        snapped.extend(move_object(obj, axis, average - obj[attr]) for obj in cluster)
    return snapped
def snap_edges(
    edges,
    x_tolerance=DEFAULT_SNAP_TOLERANCE,
    y_tolerance=DEFAULT_SNAP_TOLERANCE,
):
    """
    Snap edges lying within the tolerances of one another to their
    positional average: vertical edges by "x0", horizontal edges by "top".
    Returns the snapped vertical edges followed by the horizontal ones.
    """
    vertical, horizontal = [], []
    buckets = {"v": vertical, "h": horizontal}
    for edge in edges:
        buckets[edge["orientation"]].append(edge)

    return snap_objects(vertical, "x0", x_tolerance) + snap_objects(
        horizontal, "top", y_tolerance
    )
def resize_object(obj, key: str, value):
    """Return a copy of 'obj' with one of its sides moved to 'value'.

    Args:
        obj: dict-like object with bbox and size items.
        key: the side to change: "x0", "x1", "top" or "bottom".
        value: the new coordinate of that side.

    Dependent items ("width", "height", "doctop", "y0", "y1") are adjusted
    accordingly.
    """
    assert key in ("x0", "x1", "top", "bottom")
    diff = value - obj[key]
    changes = [(key, value)]

    if key == "x0":
        assert value <= obj["x1"]
        changes.append(("width", obj["x1"] - value))
    elif key == "x1":
        assert value >= obj["x0"]
        changes.append(("width", value - obj["x0"]))
    elif key == "top":
        assert value <= obj["bottom"]
        changes += [
            ("doctop", obj["doctop"] + diff),
            ("height", obj["height"] - diff),
        ]
        if "y1" in obj:
            changes.append(("y1", obj["y1"] - diff))
    elif key == "bottom":
        assert value >= obj["top"]
        changes.append(("height", obj["height"] + diff))
        if "y0" in obj:
            changes.append(("y0", obj["y0"] - diff))

    # later items override the original ones
    return obj.__class__(tuple(obj.items()) + tuple(changes))
def join_edge_group(edges, orientation: str, tolerance=DEFAULT_JOIN_TOLERANCE):
    """
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.

    Args:
        edges: iterable of edge dictionaries with the same orientation.
        orientation: "h" or "v".
        tolerance: maximum gap between two edges to be joined.

    Returns:
        The list of joined edges, sorted by their start coordinate. An
        empty input yields an empty list.

    Raises:
        ValueError: for an invalid 'orientation'.
    """
    if orientation == "h":
        min_prop, max_prop = "x0", "x1"
    elif orientation == "v":
        min_prop, max_prop = "top", "bottom"
    else:
        raise ValueError("Orientation must be 'v' or 'h'")

    sorted_edges = sorted(edges, key=itemgetter(min_prop))
    if not sorted_edges:  # nothing to join
        return []

    joined = [sorted_edges[0]]
    for e in sorted_edges[1:]:
        last = joined[-1]
        if e[min_prop] <= (last[max_prop] + tolerance):
            if e[max_prop] > last[max_prop]:
                # Extend current edge to new extremity
                joined[-1] = resize_object(last, max_prop, e[max_prop])
        else:
            # Edge is separate from previous edges
            joined.append(e)

    return joined
def merge_edges(
    edges,
    snap_x_tolerance,
    snap_y_tolerance,
    join_x_tolerance,
    join_y_tolerance,
):
    """
    Merge a list of edges into a more "seamless" list: first snap them
    (only if one of the snap tolerances is positive), then join the edges
    lying on the same line.
    """

    def line_of(edge):
        """Identify the infinite line an edge is located on."""
        if edge["orientation"] == "h":
            return ("h", edge["top"])
        return ("v", edge["x0"])

    if snap_x_tolerance > 0 or snap_y_tolerance > 0:
        edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)

    merged = []
    by_line = itertools.groupby(sorted(edges, key=line_of), key=line_of)
    for (orientation, _position), group in by_line:
        # horizontal edges are joined along x, vertical ones along y
        tolerance = join_x_tolerance if orientation == "h" else join_y_tolerance
        merged.extend(join_edge_group(group, orientation, tolerance))
    return merged
def bbox_to_rect(bbox) -> dict:
    """
    Convert a bbox sequence (x0, top, x1, bottom) to a rectangle, i.e. a
    dict with the keys "x0", "top", "x1" and "bottom".
    """
    x0, top, x1, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    return {"x0": x0, "top": top, "x1": x1, "bottom": bottom}
def objects_to_rect(objects) -> dict:
    """
    Return the smallest rectangle (a dict with "x0", "top", "x1" and
    "bottom" keys) containing all objects of the given iterable.
    """
    enclosing_bbox = objects_to_bbox(objects)
    return bbox_to_rect(enclosing_bbox)
def merge_bboxes(bboxes):
    """
    Return the smallest bounding box (x0, top, x1, bottom) that contains
    all bounding boxes of the given iterable.
    """
    lefts, tops, rights, bottoms = zip(*bboxes)
    left, upper = min(lefts), min(tops)
    right, lower = max(rights), max(bottoms)
    return (left, upper, right, lower)
def objects_to_bbox(objects):
    """
    Return the smallest bounding box containing all objects of the given
    iterable.
    """
    bboxes = map(bbox_getter, objects)
    return merge_bboxes(bboxes)
def words_to_edges_h(words, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL):
    """
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.
    """
    rows = cluster_objects(words, itemgetter("top"), 1)
    rects = [objects_to_rect(row) for row in rows if len(row) >= word_threshold]
    if not rects:
        return []

    # all lines span the joint horizontal extent of the rows
    min_x0 = min(r["x0"] for r in rects)
    max_x1 = max(r["x1"] for r in rects)
    width = max_x1 - min_x0

    edges = []
    for r in rects:
        # One line at the top of the text and one at its bottom. The bottom
        # line is often redundant with the top line of the next row, but it
        # catches the last row of every table.
        for y in (r["top"], r["bottom"]):
            edges.append(
                {
                    "x0": min_x0,
                    "x1": max_x1,
                    "top": y,
                    "bottom": y,
                    "width": width,
                    "orientation": "h",
                }
            )
    return edges
def get_bbox_overlap(a, b):
    """Return the intersection of two bboxes, or None if there is none.

    Boxes sharing part of a side yield a degenerate bbox (zero width or
    height). Boxes only touching in a single point yield None.
    """
    left = max(a[0], b[0])
    top = max(a[1], b[1])
    right = min(a[2], b[2])
    bottom = min(a[3], b[3])
    width = right - left
    height = bottom - top
    if width < 0 or height < 0 or height + width <= 0:
        return None
    return (left, top, right, bottom)
def words_to_edges_v(words, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.
    """

    def center_of(word):
        return float(word["x0"] + word["x1"]) / 2

    # Words sharing the same left, right, or center coordinate
    clusters = (
        cluster_objects(words, itemgetter("x0"), 1)
        + cluster_objects(words, itemgetter("x1"), 1)
        + cluster_objects(words, center_of, 1)
    )

    # Largest clusters first; drop those with too few words
    clusters = sorted(clusters, key=len, reverse=True)
    bboxes = [objects_to_bbox(c) for c in clusters if len(c) >= word_threshold]

    # Keep a bbox only if it does not overlap one that was kept before
    condensed_bboxes = []
    for bbox in bboxes:
        if not any(get_bbox_overlap(bbox, kept) for kept in condensed_bboxes):
            condensed_bboxes.append(bbox)

    if not condensed_bboxes:
        return []

    rects = sorted(map(bbox_to_rect, condensed_bboxes), key=itemgetter("x0"))
    max_x1 = max(r["x1"] for r in rects)
    min_top = min(r["top"] for r in rects)
    max_bottom = max(r["bottom"] for r in rects)

    def vertical_edge(x):
        return {
            "x0": x,
            "x1": x,
            "top": min_top,
            "bottom": max_bottom,
            "height": max_bottom - min_top,
            "orientation": "v",
        }

    # one line at the left of every rect plus one closing line at the right
    edges = [vertical_edge(r["x0"]) for r in rects]
    edges.append(vertical_edge(max_x1))
    return edges
def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1) -> dict:
    """Return the points at which vertical and horizontal edges intersect.

    Args:
        edges: list of edge dicts with keys "orientation" ("v" or "h"),
            "x0", "x1", "top" and "bottom".
        x_tolerance: allowed horizontal gap between a vertical edge and the
            end points of a horizontal edge.
        y_tolerance: allowed vertical gap between a horizontal edge and the
            end points of a vertical edge.

    Returns:
        A dict keyed by vertex ``(v["x0"], h["top"])``. Each value is a dict
        ``{"v": [...], "h": [...]}`` listing the edges meeting at that
        vertex.
    """
    # Sort each orientation exactly once; the horizontal order does not
    # depend on the vertical edge currently being inspected.
    v_edges = sorted(
        (e for e in edges if e["orientation"] == "v"),
        key=itemgetter("x0", "top"),
    )
    h_edges = sorted(
        (e for e in edges if e["orientation"] == "h"),
        key=itemgetter("top", "x0"),
    )

    intersections = {}
    for v in v_edges:
        for h in h_edges:
            if (
                (v["top"] <= (h["top"] + y_tolerance))
                and (v["bottom"] >= (h["top"] - y_tolerance))
                and (v["x0"] >= (h["x0"] - x_tolerance))
                and (v["x0"] <= (h["x1"] + x_tolerance))
            ):
                vertex = (v["x0"], h["top"])
                hit = intersections.setdefault(vertex, {"v": [], "h": []})
                hit["v"].append(v)
                hit["h"].append(h)
    return intersections
def obj_to_bbox(obj):
    """Return the bounding box of `obj` as delivered by ``bbox_getter``."""
    bbox = bbox_getter(obj)
    return bbox
def intersections_to_cells(intersections):
    """Return all rectangular "cells" described by the intersection points.

    `intersections` is a dictionary with (x0, top) tuples as keys. Each
    value is a dict with the lists "v" and "h" of edge objects touching that
    point. For every point, the smallest cell having that point as its
    top-left corner is looked up; points without such a cell contribute
    nothing.

    Returns a list of ``(x0, top, x1, bottom)`` tuples.
    """

    def shared_edge(p, q, kind) -> bool:
        """True if points p and q have at least one `kind` edge in common."""
        p_boxes = set(map(obj_to_bbox, intersections[p][kind]))
        q_boxes = set(map(obj_to_bbox, intersections[q][kind]))
        return bool(p_boxes & q_boxes)

    def edge_connects(p1, p2) -> bool:
        # same x: the points must share a vertical edge
        if p1[0] == p2[0] and shared_edge(p1, p2, "v"):
            return True
        # same y: the points must share a horizontal edge
        if p1[1] == p2[1] and shared_edge(p1, p2, "h"):
            return True
        return False

    points = sorted(intersections)
    last = len(points) - 1

    def find_smallest_cell(points, i: int):
        if i == last:  # the final point cannot be a top-left corner
            return None
        pt = points[i]
        later = points[i + 1 :]
        # candidates directly below and directly to the right
        below = [q for q in later if q[0] == pt[0]]
        right = [q for q in later if q[1] == pt[1]]
        for below_pt in below:
            if not edge_connects(pt, below_pt):
                continue
            for right_pt in right:
                if not edge_connects(pt, right_pt):
                    continue
                corner = (right_pt[0], below_pt[1])
                if (
                    corner in intersections
                    and edge_connects(corner, right_pt)
                    and edge_connects(corner, below_pt)
                ):
                    return (pt[0], pt[1], corner[0], corner[1])
        return None

    cells = []
    for i in range(len(points)):
        cell = find_smallest_cell(points, i)
        if cell:
            cells.append(cell)
    return cells
def cells_to_tables(page, cells) -> list:
    """Group cell bboxes into contiguous tables.

    Cells sharing at least one corner with a cell already in the current
    group are added to that group. When a pass over the remaining cells
    adds nothing, the group is stored as a table and a new group is begun.

    PyMuPDF modification: a table is dropped if its cells have fewer than
    two distinct x0 values or fewer than two distinct x1 values, or if the
    text inside its joined bbox consists only of ``white_spaces`` characters.

    Returns the tables (lists of cell bboxes), sorted by their
    topmost-then-leftmost cell coordinate.
    """

    def corners_of(bbox) -> tuple:
        x0, top, x1, bottom = bbox
        return ((x0, top), (x0, bottom), (x1, top), (x1, bottom))

    pending = list(cells)
    group = []  # cells of the table currently being assembled
    group_corners = set()  # all corners of those cells
    tables = []

    while pending:
        size_before = len(group)
        # iterate over a snapshot, because `pending` shrinks in the loop
        for cell in list(pending):
            corners = corners_of(cell)
            # an empty group accepts any cell, otherwise a corner must match
            if not group or any(c in group_corners for c in corners):
                group_corners.update(corners)
                group.append(cell)
                pending.remove(cell)

        # a pass that added nothing completes the current group
        if len(group) == size_before:
            tables.append(list(group))
            group_corners.clear()
            group.clear()

    # all cells are consumed: store a group that is still open
    if group:
        tables.append(list(group))

    # PyMuPDF modification:
    # remove tables without text or having only 1 column
    for idx in range(len(tables) - 1, -1, -1):
        table = tables[idx]
        joined = EMPTY_RECT()
        for cell in table:
            joined |= cell
        lefts = {cell[0] for cell in table}
        rights = {cell[2] for cell in table}
        if (
            len(rights) < 2
            or len(lefts) < 2
            or white_spaces.issuperset(page.get_textbox(joined, textpage=TEXTPAGE))
        ):
            del tables[idx]

    # top-to-bottom, then left-to-right
    return sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
class CellGroup:
    """A collection of cell bboxes together with their joint bounding box.

    `cells` may contain None entries. They are kept in ``self.cells`` but
    ignored when ``self.bbox`` is computed.
    """

    def __init__(self, cells):
        self.cells = cells
        present = [c for c in cells if c]  # skip missing cells
        self.bbox = (
            min(c[0] for c in present),
            min(c[1] for c in present),
            max(c[2] for c in present),
            max(c[3] for c in present),
        )
class TableRow(CellGroup):
    """A CellGroup holding the cells of one table row."""
class TableHeader:
    """PyMuPDF extension containing the identified table header.

    Attributes:
        bbox: bounding box of the header.
        cells: list of header cell bboxes.
        names: list of column names.
        external: value passed as `above`.
    """

    def __init__(self, bbox, cells, names, above):
        self.bbox, self.cells = bbox, cells
        self.names, self.external = names, above
+class Table:
def __init__(self, page, cells):
    """Store the page and the cell bboxes, then identify the header."""
    self.page, self.cells = page, cells
    # must come last: header detection reads self.page and the rows
    self.header = self._get_header()  # PyMuPDF extension
@property
def bbox(self):
    """Bounding box ``(x0, top, x1, bottom)`` enclosing all table cells."""
    cells = self.cells
    left = min(c[0] for c in cells)
    top = min(c[1] for c in cells)
    right = max(c[2] for c in cells)
    bottom = max(c[3] for c in cells)
    return (left, top, right, bottom)
@property
def rows(self) -> list:
    """Return the table cells arranged as a list of TableRow objects.

    Cells are grouped by their top coordinate. Every row has one entry per
    distinct x0 value found in the table; a column position without a cell
    in that row is None.
    """
    by_position = sorted(self.cells, key=itemgetter(1, 0))
    columns = sorted({cell[0] for cell in self.cells})
    result = []
    for _, group in itertools.groupby(by_position, itemgetter(1)):
        by_x0 = {cell[0]: cell for cell in group}
        result.append(TableRow([by_x0.get(x) for x in columns]))
    return result
@property
def row_count(self) -> int:  # PyMuPDF extension
    """Number of table rows."""
    table_rows = self.rows
    return len(table_rows)
@property
def col_count(self) -> int:  # PyMuPDF extension
    """Number of table columns, i.e. the largest cell count of any row.

    Returns 0 for a table without rows instead of raising ValueError.
    """
    return max((len(r.cells) for r in self.rows), default=0)
def extract(self, **kwargs) -> list:
    """Return the table text as a list of rows, each a list of cell texts.

    A character belongs to a cell if the midpoint of its bbox lies inside
    the cell bbox (left/top inclusive, right/bottom exclusive). A missing
    cell (None) yields None, a cell without characters yields "".
    `kwargs` are passed on to ``extract_text``, extended by "x_shift" and
    "y_shift" and, if "layout" is among them, by "layout_width" and
    "layout_height" of the current cell.
    """

    def char_in_bbox(char, bbox) -> bool:
        x0, top, x1, bottom = bbox
        h_mid = (char["x0"] + char["x1"]) / 2
        v_mid = (char["top"] + char["bottom"]) / 2
        return bool(x0 <= h_mid < x1 and top <= v_mid < bottom)

    def text_of(cell, row_chars):
        if cell is None:
            return None
        cell_chars = [ch for ch in row_chars if char_in_bbox(ch, cell)]
        if not cell_chars:
            return ""
        kwargs["x_shift"] = cell[0]
        kwargs["y_shift"] = cell[1]
        if "layout" in kwargs:
            kwargs["layout_width"] = cell[2] - cell[0]
            kwargs["layout_height"] = cell[3] - cell[1]
        return extract_text(cell_chars, **kwargs)

    all_chars = CHARS
    table_arr = []
    for row in self.rows:
        # pre-select the characters of this row to speed up the cell checks
        row_chars = [ch for ch in all_chars if char_in_bbox(ch, row.bbox)]
        table_arr.append([text_of(cell, row_chars) for cell in row.cells])
    return table_arr
def to_markdown(self, clean=False, fill_empty=True):
    """Output table content as a string in Github-markdown format.

    Args:
        clean: if true, HTML special characters in column names and cell
            content are escaped and hyphens are written as "&#45;".
        fill_empty: if true, cell content None is replaced by the values
            left (rows) or above (columns) in an effort to approximate row
            and column spans.

    Returns:
        The markdown text: header line, separator line, one line per detail
        row and a final empty line.
    """

    def sanitize(text):
        # Escape first, encode hyphens afterwards. The reverse order would
        # escape the "&" of the entity and produce "&amp;#45;".
        return html.escape(text).replace("-", "&#45;")

    rows = self.row_count
    cols = self.col_count

    # cell coordinates
    cell_boxes = [list(r.cells) for r in self.rows]

    # cell text strings
    cells = [[None] * cols for _ in range(rows)]
    for i, row in enumerate(cell_boxes):
        for j, cell in enumerate(row):
            if cell is not None:
                cells[i][j] = extract_cells(TEXTPAGE, cell, markdown=True)

    if fill_empty:  # fill "None" cells where possible
        # within each row, copy content from left to right
        for row in cells:
            for j in range(cols - 1):
                if row[j + 1] is None:
                    row[j + 1] = row[j]
        # within each column, copy content from top to bottom
        for j in range(cols):
            for i in range(rows - 1):
                if cells[i + 1][j] is None:
                    cells[i + 1][j] = cells[i][j]

    # header line
    names = []
    for i, name in enumerate(self.header.names):
        if not name:  # generate a name if empty
            name = f"Col{i+1}"
        name = name.replace("\n", "<br>")  # use HTML line breaks
        if clean:  # remove sensitive syntax
            name = sanitize(name)
        names.append(name)
    parts = ["|" + "".join(name + "|" for name in names) + "\n"]

    # GitHub header line separator
    parts.append("|" + "|".join("---" for _ in range(cols)) + "|\n")

    # skip first row in details if header is part of the table
    first_detail = 0 if self.header.external else 1
    for row in cells[first_detail:]:
        texts = []
        for cell in row:
            if cell is None:  # replace None cells with empty string
                cell = ""
            if clean:  # remove sensitive syntax
                cell = sanitize(cell)
            texts.append(cell)
        parts.append("|" + "".join(text + "|" for text in texts) + "\n")

    return "".join(parts) + "\n"
def to_pandas(self, **kwargs):
    """Return a pandas DataFrame version of the table.

    Column names are taken from the table header. Empty names become
    "Col<i>"; if names are not unique, every name other than "Col<i>" is
    prefixed with its column index. The header object itself is left
    unchanged. If the header is part of the table, the first extracted row
    is not repeated as data.

    `kwargs` is accepted but not used by this method.

    Raises:
        ModuleNotFoundError: if pandas is not installed.
    """
    try:
        import pandas as pd
    except ModuleNotFoundError:
        message("Package 'pandas' is not installed")
        raise

    extract = self.extract()
    hdr = self.header

    # Work on a copy: adjusting the names must not alter the table header.
    names = list(hdr.names)

    # ensure uniqueness of column names
    for i, name in enumerate(names):
        if not name:
            names[i] = f"Col{i}"
    if len(names) != len(set(names)):
        names = [
            name if name == f"Col{i}" else f"{i}-{name}"
            for i, name in enumerate(names)
        ]

    if not hdr.external:  # header is part of 'extract'
        extract = extract[1:]

    pd_dict = {name: [row[i] for row in extract] for i, name in enumerate(names)}
    return pd.DataFrame(pd_dict)
def _get_header(self, y_tolerance=3):
    """Identify the table header.

    *** PyMuPDF extension. ***

    Starting from the first line above the table upwards, check if it
    qualifies to be part of the table header.

    Criteria include:
    * A one-line table never has an extra header.
    * Column borders must not intersect any word. If this happens, all
      text of this line and above of it is ignored.
    * No excess inter-line distance: If a line further up has a distance
      of more than 1.5 times of its font size, it will be ignored and
      all lines above of it.
    * Must have same text properties.
    * Starting with the top table line, a bold text property cannot change
      back to non-bold.

    If not all criteria are met (or there is no text above the table),
    the first table row is assumed to be the header.

    Args:
        y_tolerance: maximum vertical distance for treating two spans
            above the table as belonging to the same text line.

    Returns:
        A TableHeader object. Its ``external`` attribute is False if the
        top table row serves as header and True if the header was built
        from text above the table. None if the table has no rows.
    """
    page = self.page
    y_delta = y_tolerance

    def top_row_bg_color(self):
        """
        Compare top row background color with color of same-sized bbox
        above. If different, return True indicating that the original
        table top row is already the header.
        """
        bbox0 = Rect(self.rows[0].bbox)
        bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height)  # area above
        # item [1] of color_topusage() is compared as the dominant color
        top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
        top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
        if top_color0 != top_colort:
            return True  # top row is header
        return False

    def row_has_bold(bbox):
        """Check if a row contains some bold text.

        If e.g. true for the top row, then it will be used as (internal)
        column header row if any of the following is true:
        * the previous (above) text line has no bold span
        * the second table row text has no bold span

        Returns True if any spans are bold else False.
        """
        blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
        spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]

        return any(s["flags"] & TEXT_FONT_BOLD for s in spans)

    try:
        row = self.rows[0]
        cells = row.cells
        bbox = Rect(row.bbox)
    except IndexError:  # this table has no rows
        return None

    # return this if we determine that the top row is the header
    header_top_row = TableHeader(bbox, cells, self.extract()[0], False)

    # 1-line tables have no extra header
    if len(self.rows) < 2:
        return header_top_row

    # 1-column tables have no extra header
    if len(cells) < 2:
        return header_top_row

    # assume top row is the header if second row is empty
    row2 = self.rows[1]  # second row
    if all(c is None for c in row2.cells):  # no valid cell bboxes in row2
        return header_top_row

    # Special check: is top row bold?
    top_row_bold = row_has_bold(bbox)

    # assume top row is header if it is bold and the 2nd row
    # contains no bold span
    if top_row_bold and not row_has_bold(row2.bbox):
        return header_top_row

    if top_row_bg_color(self):
        # if area above top row has a different background color,
        # then top row is already the header
        return header_top_row

    # column coordinates (x1 values) in top row; the right border of the
    # last column is not needed
    col_x = [c[2] if c is not None else None for c in cells[:-1]]

    # clip = page area above the table
    # We will inspect this area for text qualifying as column header.
    clip = +bbox  # take a copy of the row 0 bbox
    clip.y0 = 0  # start at top of page
    clip.y1 = bbox.y0  # end at top of table

    blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
    # non-empty, non-superscript spans above table, sorted descending by y1
    spans = sorted(
        [
            s
            for b in blocks
            for l in b["lines"]
            for s in l["spans"]
            if not (
                white_spaces.issuperset(s["text"])
                or s["flags"] & TEXT_FONT_SUPERSCRIPT
            )
        ],
        key=lambda s: s["bbox"][3],
        reverse=True,
    )

    select = []  # y1 coordinates above, sorted descending
    line_heights = []  # line heights above, sorted descending
    line_bolds = []  # bold indicator per line above, same sorting

    # walk through the spans and fill above 3 lists
    for i in range(len(spans)):
        s = spans[i]
        y1 = s["bbox"][3]  # span bottom
        h = y1 - s["bbox"][1]  # span bbox height
        bold = s["flags"] & TEXT_FONT_BOLD

        # use first item to start the lists
        if i == 0:
            select.append(y1)
            line_heights.append(h)
            line_bolds.append(bold)
            continue

        # get previous items from the 3 lists
        y0 = select[-1]
        h0 = line_heights[-1]
        bold0 = line_bolds[-1]

        if bold0 and not bold:
            break  # stop if switching from bold to non-bold

        # if fitting in height of previous span, modify bbox:
        # the span is treated as part of the previous line
        if y0 - y1 <= y_delta or abs((y0 - h0) - s["bbox"][1]) <= y_delta:
            s["bbox"] = (s["bbox"][0], y0 - h0, s["bbox"][2], y0)
            spans[i] = s
            if bold:
                line_bolds[-1] = bold
            continue
        elif y0 - y1 > 1.5 * h0:
            break  # stop if distance to previous line too large
        # the span starts a new line further up
        select.append(y1)
        line_heights.append(h)
        line_bolds.append(bold)

    if select == []:  # nothing above the table?
        return header_top_row

    select = select[:5]  # accept up to 5 lines for an external header

    # assume top row as header if text above is too far away
    if bbox.y0 - select[0] >= line_heights[0]:
        return header_top_row

    # accept top row as header if bold, but line above is not
    if top_row_bold and not line_bolds[0]:
        return header_top_row

    if spans == []:  # nothing left above the table, return top row
        return header_top_row

    # re-compute clip above table: join the spans of the selected lines
    nclip = EMPTY_RECT()
    for s in [s for s in spans if s["bbox"][3] >= select[-1]]:
        nclip |= s["bbox"]
    if not nclip.is_empty:
        clip = nclip

    clip.y1 = bbox.y0  # make sure we still include every word above

    # Confirm that no word in clip is intersecting a column separator
    word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)]
    # distinct top coordinates of the words, nearest to the table first
    word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True)

    select = []  # from here on: accepted word top coordinates

    # exclude lines with words that intersect a column border
    for top in word_tops:
        intersecting = [
            (x, r)
            for x in col_x
            if x is not None
            for r in word_rects
            if r[1] == top and r[0] < x and r[2] > x
        ]
        if intersecting == []:
            select.append(top)
        else:  # detected a word crossing a column border
            break

    if select == []:  # nothing left over: return first row
        return header_top_row

    hdr_bbox = +clip  # compute the header cells
    hdr_bbox.y0 = select[-1]  # hdr_bbox top is smallest top coord of words
    # header cells use the x-coordinates of the top row cells
    hdr_cells = [
        (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None
        for c in cells
    ]

    # adjust left/right of header bbox
    hdr_bbox.x0 = self.bbox[0]
    hdr_bbox.x1 = self.bbox[2]

    # column names: no line breaks, no excess spaces
    hdr_names = [
        (
            page.get_textbox(c).replace("\n", " ").replace("  ", " ").strip()
            if c is not None
            else ""
        )
        for c in hdr_cells
    ]
    return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)
@dataclass
class TableSettings:
    """Settings controlling the table detection.

    The ``*_x_tolerance`` / ``*_y_tolerance`` fields default to UNSET and
    are then replaced by the value of the corresponding general tolerance
    field (see ``__post_init__``).
    """

    vertical_strategy: str = "lines"
    horizontal_strategy: str = "lines"
    explicit_vertical_lines: list = None
    explicit_horizontal_lines: list = None
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE
    snap_x_tolerance: float = UNSET
    snap_y_tolerance: float = UNSET
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE
    join_x_tolerance: float = UNSET
    join_y_tolerance: float = UNSET
    edge_min_length: float = 3
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL
    intersection_tolerance: float = 3
    intersection_x_tolerance: float = UNSET
    intersection_y_tolerance: float = UNSET
    text_settings: dict = None

    def __post_init__(self) -> "TableSettings":
        """Validate the settings and fill in derived values.

        * Every setting listed in NON_NEGATIVE_SETTINGS must not be negative.
        * Both strategies must be members of TABLE_STRATEGIES.
        * ``text_settings`` becomes a dict. Missing "x_tolerance" and
          "y_tolerance" entries are taken from a "tolerance" entry (default
          3), which is then removed (backwards compatibility).
        * Directional tolerances that are UNSET receive the value of their
          general counterpart.

        TODO: Can be further used to validate that the values are of the
        correct type.

        :returns: this object.
        :raises ValueError: for a negative setting or an unknown strategy.
        """
        for setting in NON_NEGATIVE_SETTINGS:
            if (getattr(self, setting) or 0) < 0:
                raise ValueError(f"Table setting '{setting}' cannot be negative")

        for orientation in ["horizontal", "vertical"]:
            strategy = getattr(self, orientation + "_strategy")
            if strategy not in TABLE_STRATEGIES:
                # note the blank after "of": the two literals are concatenated
                raise ValueError(
                    f"{orientation}_strategy must be one of "
                    f'{{{",".join(TABLE_STRATEGIES)}}}'
                )

        if self.text_settings is None:
            self.text_settings = {}

        # This next section is for backwards compatibility
        for attr in ["x_tolerance", "y_tolerance"]:
            if attr not in self.text_settings:
                self.text_settings[attr] = self.text_settings.get("tolerance", 3)
        if "tolerance" in self.text_settings:
            del self.text_settings["tolerance"]
        # End of that section

        for attr, fallback in [
            ("snap_x_tolerance", "snap_tolerance"),
            ("snap_y_tolerance", "snap_tolerance"),
            ("join_x_tolerance", "join_tolerance"),
            ("join_y_tolerance", "join_tolerance"),
            ("intersection_x_tolerance", "intersection_tolerance"),
            ("intersection_y_tolerance", "intersection_tolerance"),
        ]:
            if getattr(self, attr) is UNSET:
                setattr(self, attr, getattr(self, fallback))

        return self

    @classmethod
    def resolve(cls, settings=None):
        """Return a TableSettings object for `settings`.

        * None: an object with all defaults.
        * a TableSettings object: returned as is.
        * a dict: keys starting with "text_" go (without that prefix) into
          ``text_settings``, all other keys are used as field values.

        :raises ValueError: for any other type of `settings`.
        """
        if settings is None:
            return cls()
        if isinstance(settings, cls):
            return settings
        if isinstance(settings, dict):
            core_settings = {}
            text_settings = {}
            for k, v in settings.items():
                if k[:5] == "text_":
                    text_settings[k[5:]] = v
                else:
                    core_settings[k] = v
            core_settings["text_settings"] = text_settings
            return cls(**core_settings)
        raise ValueError(f"Cannot resolve settings: {settings}")
class TableFinder:
    """
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16
    """

    def __init__(self, page, settings=None):
        """Run the detection: edges -> intersections -> cells -> tables."""
        self.page = weakref.proxy(page)
        self.settings = TableSettings.resolve(settings)
        self.edges = self.get_edges()
        self.intersections = edges_to_intersections(
            self.edges,
            self.settings.intersection_x_tolerance,
            self.settings.intersection_y_tolerance,
        )
        self.cells = intersections_to_cells(self.intersections)
        self.tables = [
            Table(self.page, cell_group)
            for cell_group in cells_to_tables(self.page, self.cells)
        ]

    def get_edges(self) -> list:
        """Return the edges to be used for table detection.

        Depending on the strategies in the settings, edges are taken from
        the EDGES list ("lines", "lines_strict"), derived from words
        ("text") and / or taken from the explicitly specified lines. They
        are merged and finally filtered by minimum length.

        Raises:
            ValueError: if a strategy is "explicit" but fewer than two
                explicit lines are specified (including none at all).
        """
        settings = self.settings

        for orientation in ["vertical", "horizontal"]:
            strategy = getattr(settings, orientation + "_strategy")
            if strategy == "explicit":
                lines = getattr(settings, "explicit_" + orientation + "_lines")
                # the default None must give the same error as a short list
                if lines is None or len(lines) < 2:
                    raise ValueError(
                        f"If {orientation}_strategy == 'explicit', "
                        f"explicit_{orientation}_lines "
                        f"must be specified as a list/tuple of two or more "
                        f"floats/ints."
                    )

        v_strat = settings.vertical_strategy
        h_strat = settings.horizontal_strategy

        # words are only needed by the "text" strategies
        if v_strat == "text" or h_strat == "text":
            words = extract_words(CHARS, **(settings.text_settings or {}))
        else:
            words = []

        # explicit vertical lines: edge-like dicts or plain x-coordinates
        v_explicit = []
        for desc in settings.explicit_vertical_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "v":
                        v_explicit.append(e)
            else:
                # a coordinate becomes a line across the full page height
                v_explicit.append(
                    {
                        "x0": desc,
                        "x1": desc,
                        "top": self.page.rect[1],
                        "bottom": self.page.rect[3],
                        "height": self.page.rect[3] - self.page.rect[1],
                        "orientation": "v",
                    }
                )

        if v_strat == "lines":
            v_base = filter_edges(EDGES, "v")
        elif v_strat == "lines_strict":
            v_base = filter_edges(EDGES, "v", edge_type="line")
        elif v_strat == "text":
            v_base = words_to_edges_v(words, word_threshold=settings.min_words_vertical)
        else:  # "explicit" or anything else: only explicit lines count
            v_base = []

        v = v_base + v_explicit

        # explicit horizontal lines: edge-like dicts or plain y-coordinates
        h_explicit = []
        for desc in settings.explicit_horizontal_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "h":
                        h_explicit.append(e)
            else:
                # a coordinate becomes a line across the full page width
                h_explicit.append(
                    {
                        "x0": self.page.rect[0],
                        "x1": self.page.rect[2],
                        "width": self.page.rect[2] - self.page.rect[0],
                        "top": desc,
                        "bottom": desc,
                        "orientation": "h",
                    }
                )

        if h_strat == "lines":
            h_base = filter_edges(EDGES, "h")
        elif h_strat == "lines_strict":
            h_base = filter_edges(EDGES, "h", edge_type="line")
        elif h_strat == "text":
            h_base = words_to_edges_h(
                words, word_threshold=settings.min_words_horizontal
            )
        else:  # "explicit" or anything else: only explicit lines count
            h_base = []

        h = h_base + h_explicit

        edges = list(v) + list(h)

        edges = merge_edges(
            edges,
            snap_x_tolerance=settings.snap_x_tolerance,
            snap_y_tolerance=settings.snap_y_tolerance,
            join_x_tolerance=settings.join_x_tolerance,
            join_y_tolerance=settings.join_y_tolerance,
        )

        return filter_edges(edges, min_length=settings.edge_min_length)

    def __getitem__(self, i):
        """Return table number `i`.

        Negative values count from the end; values below ``-len(tables)``
        wrap around repeatedly.

        Raises:
            IndexError: if `i` is not smaller than the number of tables, or
                if there are no tables at all.
        """
        tcount = len(self.tables)
        # Without tables no index is valid. Wrapping a negative index would
        # never terminate in this case.
        if i >= tcount or tcount == 0:
            raise IndexError("table not on page")
        if i < 0:
            i %= tcount  # same result as repeatedly adding tcount
        return self.tables[i]
+"""
+Start of PyMuPDF interface code.
+The following functions are executed when "page.find_tables()" is called.
+* make_chars: Fills the CHARS list with text character information extracted
+via "rawdict" text extraction. Items in CHARS are formatted
+as expected by the table code.
+* make_edges: Fills the EDGES list with vector graphic information extracted
+via "get_drawings". Items in EDGES are formatted as expected
+by the table code.
+The lists CHARS and EDGES replace the document access that pdfplumber
+(or pdfminer, respectively) would otherwise perform.
+The table code has been modified to use these lists instead of accessing
+page information themselves.
+"""
+# -----------------------------------------------------------------------------
+# Extract all page characters to fill the CHARS list
+# -----------------------------------------------------------------------------
def make_chars(page, clip=None):
    """Extract text as "rawdict" to fill CHARS.

    Creates a textpage for `page` (restricted to `clip`), stores it in the
    global TEXTPAGE and appends one dictionary per character to CHARS.
    Spans of a line and characters of a span are processed in ascending
    order of their bbox x0 coordinate.
    """
    global TEXTPAGE
    page_number = page.number + 1
    page_height = page.rect.height
    ctm = page.transformation_matrix
    TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
    blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
    doctop_base = page_height * page.number

    def left_border(item):
        return item["bbox"][0]

    for block in blocks:
        for line in block["lines"]:
            # line direction = (cosine, sine) of the writing angle
            cosine, sine = (round(v, 4) for v in line["dir"][:2])
            matrix = Matrix(cosine, -sine, sine, cosine, 0, 0)
            upright = sine == 0
            for span in sorted(line["spans"], key=left_border):
                fontname = span["font"]
                fontsize = span["size"]
                color = sRGB_to_pdf(span["color"])
                for char in sorted(span["chars"], key=left_border):
                    bbox = Rect(char["bbox"])
                    bbox_ctm = bbox * ctm
                    origin = Point(char["origin"]) * ctm
                    # the matrix translation is the transformed char origin
                    matrix.e = origin.x
                    matrix.f = origin.y
                    char_width = bbox.x1 - bbox.x0
                    char_height = bbox.y1 - bbox.y0
                    CHARS.append(
                        {
                            "adv": char_width if upright else char_height,
                            "bottom": bbox.y1,
                            "doctop": bbox.y0 + doctop_base,
                            "fontname": fontname,
                            "height": char_height,
                            "matrix": tuple(matrix),
                            "ncs": "DeviceRGB",
                            "non_stroking_color": color,
                            "non_stroking_pattern": None,
                            "object_type": "char",
                            "page_number": page_number,
                            "size": fontsize if upright else char_height,
                            "stroking_color": color,
                            "stroking_pattern": None,
                            "text": char["c"],
                            "top": bbox.y0,
                            "upright": upright,
                            "width": char_width,
                            "x0": bbox.x0,
                            "x1": bbox.x1,
                            "y0": bbox_ctm.y0,
                            "y1": bbox_ctm.y1,
                        }
                    )
+# ------------------------------------------------------------------------
+# Extract all page vector graphics to fill the EDGES list.
+# We are ignoring Bézier curves completely and are converting everything
+# else to lines.
+# ------------------------------------------------------------------------
+def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
+snap_x = tset.snap_x_tolerance
+snap_y = tset.snap_y_tolerance
+min_length = tset.edge_min_length
+lines_strict = (
+tset.vertical_strategy == "lines_strict"
+or tset.horizontal_strategy == "lines_strict"
+)
+page_height = page.rect.height
+doctop_basis = page.number * page_height
+page_number = page.number + 1
+prect = page.rect
+if page.rotation in (90, 270):
+w, h = prect.br
+prect = Rect(0, 0, h, w)
+if clip is not None:
+clip = Rect(clip)
+else:
+clip = prect
def are_neighbors(r1, r2):
    """Detect whether r1, r2 are neighbors.

    Defined as:
    The minimum distance between points of r1 and points of r2 is not
    larger than some delta (snap_x horizontally, snap_y vertically).

    This check supports empty rect-likes and thus also lines.

    Note:
    This type of check is MUCH faster than native Rect containment checks.
    """

    def touches(a, b):
        """True if an x- and a y-coordinate of `a` fall into extended `b`."""
        x_ok = (
            b.x0 - snap_x <= a.x0 <= b.x1 + snap_x
            or b.x0 - snap_x <= a.x1 <= b.x1 + snap_x
        )
        y_ok = (
            b.y0 - snap_y <= a.y0 <= b.y1 + snap_y
            or b.y0 - snap_y <= a.y1 <= b.y1 + snap_y
        )
        return x_ok and y_ok

    # the check is not symmetric, so both directions are necessary
    return touches(r1, r2) or touches(r2, r1)
def clean_graphics(npaths=None):
    """Detect and join rectangles of "connected" vector graphics.

    Args:
        npaths: optional list of vector graphics (path dictionaries). If
            None, the paths are taken from page.get_drawings().

    Returns:
        A tuple (new_rects, paths): the list of joined rectangles that
        contain some text, and the list of paths kept for table detection.
    """
    if npaths is None:
        allpaths = page.get_drawings()
    else:  # accept passed-in vector graphics
        allpaths = npaths[:]  # work on a copy of the passed-in list

    paths = []  # paths relevant for table detection
    for p in allpaths:
        # If only looking at lines, we ignore fill-only paths,
        # except simulated lines (i.e. small width or height).
        if (
            lines_strict
            and p["type"] == "f"
            and p["rect"].width > snap_x
            and p["rect"].height > snap_y
        ):
            continue
        paths.append(p)

    # start with all vector graphics rectangles
    prects = sorted(set([p["rect"] for p in paths]), key=lambda r: (r.y1, r.x0))
    new_rects = []  # the final list of joined rectangles
    # ----------------------------------------------------------------
    # Strategy: Join rectangles that "almost touch" each other.
    # Extend first rectangle with any other that is a "neighbor".
    # Then move it to the final list and continue with the rest.
    # ----------------------------------------------------------------
    while prects:  # the algorithm will empty this list
        prect0 = prects[0]  # copy of first rectangle (performance reasons!)
        repeat = True
        while repeat:  # this loop extends first rect in list
            repeat = False  # set to true again if some other rect touches
            # index 0 is skipped: it is the rectangle being extended
            for i in range(len(prects) - 1, 0, -1):  # run backwards
                if are_neighbors(prect0, prects[i]):  # close enough to rect 0?
                    prect0 |= prects[i].tl  # extend rect 0
                    prect0 |= prects[i].br  # extend rect 0
                    del prects[i]  # delete this rect
                    repeat = True  # keep checking the rest

        # move rect 0 over to result list if there is some text in it
        if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
            # contains text, so accept it as a table bbox candidate
            new_rects.append(prect0)
        del prects[0]  # remove from rect list

    return new_rects, paths
+bboxes, paths = clean_graphics(npaths=paths)
def is_parallel(p1, p2):
    """Check if the line from p1 to p2 is roughly axis-parallel.

    True if the x-coordinates differ by at most snap_x or the
    y-coordinates differ by at most snap_y.
    """
    return abs(p1.x - p2.x) <= snap_x or abs(p1.y - p2.y) <= snap_y
def make_line(p, p1, p2, clip):
    """Given 2 points, make a line dictionary for table detection.

    `p` is the path dictionary delivering line width and color. The line is
    cut off at the borders of `clip`. Returns an empty dictionary if the
    line is not axis-parallel, lies outside `clip`, or has no extent left.
    """
    if not is_parallel(p1, p2):  # only accepting axis-parallel lines
        return {}

    # extremal values of the two points
    x0, x1 = min(p1.x, p2.x), max(p1.x, p2.x)
    y0, y1 = min(p1.y, p2.y), max(p1.y, p2.y)

    # completely outside of clip?
    if x0 > clip.x1 or x1 < clip.x0 or y0 > clip.y1 or y1 < clip.y0:
        return {}

    # adjust to the clip boundaries
    x0 = max(x0, clip.x0)
    x1 = min(x1, clip.x1)
    y0 = max(y0, clip.y0)
    y1 = min(y1, clip.y1)

    # computed from the adjusted values
    width = x1 - x0
    height = y1 - y0
    if width == height == 0:
        return {}  # nothing left to deal with

    return {
        "x0": x0,
        "y0": page_height - y0,
        "x1": x1,
        "y1": page_height - y1,
        "width": width,
        "height": height,
        "pts": [(x0, y0), (x1, y1)],
        "linewidth": p["width"],
        "stroke": True,
        "fill": False,
        "evenodd": False,
        # fall back to the fill color if the path has no stroke color
        "stroking_color": p["color"] if p["color"] else p["fill"],
        "non_stroking_color": None,
        "object_type": "line",
        "page_number": page_number,
        "stroking_pattern": None,
        "non_stroking_pattern": None,
        "top": y0,
        "bottom": y1,
        "doctop": y0 + doctop_basis,
    }
+for p in paths:
+items = p["items"]  # items in this path
+# if 'closePath', add a line from last to first point
+if p["closePath"] and items[0][0] == "l" and items[-1][0] == "l":
+items.append(("l", items[-1][2], items[0][1]))
+for i in items:
+if i[0] not in ("l", "re", "qu"):
+continue  # ignore anything else
+if i[0] == "l":  # a line
+p1, p2 = i[1:]
+line_dict = make_line(p, p1, p2, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+elif i[0] == "re":
+# A rectangle: decompose into 4 lines, but filter out
+# the ones that simulate a line
+rect = i[1].normalize()  # normalize the rectangle
+if (
+rect.width <= min_length and rect.width < rect.height
+):  # simulates a vertical line
+x = abs(rect.x1 + rect.x0) / 2  # take middle value for x
+p1 = Point(x, rect.y0)
+p2 = Point(x, rect.y1)
+line_dict = make_line(p, p1, p2, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+continue
+if (
+rect.height <= min_length and rect.height < rect.width
+):  # simulates a horizontal line
+y = abs(rect.y1 + rect.y0) / 2  # take middle value for y
+p1 = Point(rect.x0, y)
+p2 = Point(rect.x1, y)
+line_dict = make_line(p, p1, p2, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+continue
+line_dict = make_line(p, rect.tl, rect.bl, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(p, rect.bl, rect.br, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(p, rect.br, rect.tr, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(p, rect.tr, rect.tl, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+else:  # must be a quad
+# we convert it into (up to) 4 lines
+ul, ur, ll, lr = i[1]
+line_dict = make_line(p, ul, ll, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(p, ll, lr, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(p, lr, ur, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(p, ur, ul, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+path = {"color": (0, 0, 0), "fill": None, "width": 1}
+for bbox in bboxes:  # add the border lines for all enveloping bboxes
+line_dict = make_line(path, bbox.tl, bbox.tr, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(path, bbox.bl, bbox.br, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(path, bbox.tl, bbox.bl, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(path, bbox.tr, bbox.br, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+if add_lines is not None:  # add user-specified lines
+assert isinstance(add_lines, (tuple, list))
+else:
+add_lines = []
+for p1, p2 in add_lines:
+p1 = Point(p1)
+p2 = Point(p2)
+line_dict = make_line(path, p1, p2, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+if add_boxes is not None:  # add user-specified rectangles
+assert isinstance(add_boxes, (tuple, list))
+else:
+add_boxes = []
+for box in add_boxes:
+r = Rect(box)
+line_dict = make_line(path, r.tl, r.bl, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(path, r.bl, r.br, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(path, r.br, r.tr, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
+line_dict = make_line(path, r.tr, r.tl, clip)
+if line_dict:
+EDGES.append(line_to_edge(line_dict))
def page_rotation_set0(page):
    """Bring a rotated page to rotation zero.

    Table detection requires an unrotated page. This function inserts a
    ``cm`` operator (a shift combined with the page's derotation matrix)
    into the page contents, swaps the mediabox x/y coordinates for 90 and
    270 degree rotations, and sets the rotation to 0.

    Returns:
        A tuple ``(page, xref, rot, mediabox)``: the reloaded page, the
        value returned by ``TOOLS._insert_contents``, and the rotation and
        mediabox read on entry. These are the values needed for reverting
        these changes.
    """
    original_mediabox = page.mediabox  # returned for later restoration
    box = page.mediabox  # working rectangle, modified below for 90/270
    rot = page.rotation  # contains normalized rotation value

    # Translation that is applied ahead of the derotation matrix.
    if rot == 90:
        # shift content horizontally
        shift_x, shift_y = box.y1 - box.x1 - box.x0 - box.y0, 0
    elif rot == 270:
        # shift content vertically
        shift_x, shift_y = 0, box.x1 - box.y1 - box.y0 - box.x0
    else:
        shift_x, shift_y = -2 * box.x0, -2 * box.y0
    shift = Matrix(1, 0, 0, 1, shift_x, shift_y)

    # Combine shift and derotation and emit them as a PDF "cm" command.
    derotate = shift * page.derotation_matrix
    command = b"%g %g %g %g %g %g cm " % tuple(derotate)
    # NOTE(review): the trailing 0 presumably requests insertion ahead of
    # the existing contents - confirm against TOOLS._insert_contents.
    xref = TOOLS._insert_contents(page, command, 0)

    if rot in (90, 270):
        # swap x- and y-coordinates of the mediabox
        left, top, right, bottom = box
        box.x0, box.y0, box.x1, box.y1 = top, left, bottom, right
        page.set_mediabox(box)

    page.set_rotation(0)

    # Reload the page from its document so the changes take effect.
    return page.parent[page.number], xref, rot, original_mediabox
def page_rotation_reset(page, xref, rot, mediabox):
    """Put a derotated page back into its original state.

    Intended to be called before the tables are returned.

    Args:
        page: the page whose rotation was nullified.
        xref: xref of the stream that holds the de-rotation matrix.
        rot: rotation value to restore.
        mediabox: mediabox to restore.

    Returns:
        The page, reloaded from its parent document.
    """
    document = page.parent
    # Overwrite the de-rotation stream with a single blank.
    document.update_stream(xref, b" ")
    # Restore the old geometry: mediabox first, then rotation.
    page.set_mediabox(mediabox)
    page.set_rotation(rot)
    # Reload so the returned object reflects the restored values.
    return document[page.number]
def find_tables(
    page,
    clip=None,
    vertical_strategy: str = "lines",
    horizontal_strategy: str = "lines",
    vertical_lines: list = None,
    horizontal_lines: list = None,
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
    snap_x_tolerance: float = None,
    snap_y_tolerance: float = None,
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
    join_x_tolerance: float = None,
    join_y_tolerance: float = None,
    edge_min_length: float = 3,
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: float = 3,
    intersection_x_tolerance: float = None,
    intersection_y_tolerance: float = None,
    text_tolerance=3,
    text_x_tolerance=3,
    text_y_tolerance=3,
    strategy=None,  # offer abbreviation
    add_lines=None,  # user-specified lines
    add_boxes=None,  # user-specified rectangles
    paths=None,  # accept vector graphics as parameter
):
    """Detect tables on a page and return them as a ``TableFinder``.

    The function resets the module-level ``CHARS`` and ``EDGES`` lists,
    temporarily enables small glyph heights and, if the page is rotated,
    temporarily derotates it. It then resolves the settings, fills the
    character and edge lists and constructs the ``TableFinder``.

    The small-glyph-heights setting and the page rotation are restored
    before returning, also when an exception is raised on the way.

    Args:
        page: the page to search.
        clip: passed on to ``make_chars`` and ``make_edges``.
        strategy: if not None, overrides both ``vertical_strategy`` and
            ``horizontal_strategy``.
        vertical_lines, horizontal_lines: stored in the settings as
            ``explicit_vertical_lines`` / ``explicit_horizontal_lines``.
        snap_x_tolerance, snap_y_tolerance, join_x_tolerance,
        join_y_tolerance, intersection_x_tolerance,
        intersection_y_tolerance: a value of None is replaced by ``UNSET``
            before the settings are resolved.
        add_lines, add_boxes, paths: passed on to ``make_edges``.
        All remaining parameters are placed into the settings dictionary
        under their own names and handed to ``TableSettings.resolve``.

    Returns:
        The ``TableFinder`` object created for the page.
    """
    global CHARS, EDGES
    CHARS = []
    EDGES = []
    old_small = bool(TOOLS.set_small_glyph_heights())  # save old value
    TOOLS.set_small_glyph_heights(True)  # we need minimum bboxes

    # Stays None unless the page actually had to be derotated.
    old_xref, old_rot, old_mediabox = None, None, None
    try:
        if page.rotation != 0:
            page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)

        if snap_x_tolerance is None:
            snap_x_tolerance = UNSET
        if snap_y_tolerance is None:
            snap_y_tolerance = UNSET
        if join_x_tolerance is None:
            join_x_tolerance = UNSET
        if join_y_tolerance is None:
            join_y_tolerance = UNSET
        if intersection_x_tolerance is None:
            intersection_x_tolerance = UNSET
        if intersection_y_tolerance is None:
            intersection_y_tolerance = UNSET

        if strategy is not None:  # abbreviation: one value for both
            vertical_strategy = strategy
            horizontal_strategy = strategy

        settings = {
            "vertical_strategy": vertical_strategy,
            "horizontal_strategy": horizontal_strategy,
            "explicit_vertical_lines": vertical_lines,
            "explicit_horizontal_lines": horizontal_lines,
            "snap_tolerance": snap_tolerance,
            "snap_x_tolerance": snap_x_tolerance,
            "snap_y_tolerance": snap_y_tolerance,
            "join_tolerance": join_tolerance,
            "join_x_tolerance": join_x_tolerance,
            "join_y_tolerance": join_y_tolerance,
            "edge_min_length": edge_min_length,
            "min_words_vertical": min_words_vertical,
            "min_words_horizontal": min_words_horizontal,
            "intersection_tolerance": intersection_tolerance,
            "intersection_x_tolerance": intersection_x_tolerance,
            "intersection_y_tolerance": intersection_y_tolerance,
            "text_tolerance": text_tolerance,
            "text_x_tolerance": text_x_tolerance,
            "text_y_tolerance": text_y_tolerance,
        }
        tset = TableSettings.resolve(settings=settings)
        page.table_settings = tset

        make_chars(page, clip=clip)  # create character list of page
        make_edges(
            page,
            clip=clip,
            tset=tset,
            paths=paths,
            add_lines=add_lines,
            add_boxes=add_boxes,
        )  # create lines and curves
        tables = TableFinder(page, settings=tset)
    finally:
        # Always undo the temporary changes, so a failure above neither
        # leaves the glyph-height setting forced on nor the page derotated.
        TOOLS.set_small_glyph_heights(old_small)
        if old_xref is not None:
            page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
    return tables

Mercurial > hgrepos > Python2 > PyMuPDF

comparison src/table.py @ 1:1d09e1dec1d9 upstream