Mercurial > hgrepos > Python2 > PyMuPDF
comparison src/table.py @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 1:1d09e1dec1d9 |
|---|---|
| 1 """ | |
| 2 Copyright (C) 2023 Artifex Software, Inc. | |
| 3 | |
| 4 This file is part of PyMuPDF. | |
| 5 | |
| 6 PyMuPDF is free software: you can redistribute it and/or modify it under the | |
| 7 terms of the GNU Affero General Public License as published by the Free | |
| 8 Software Foundation, either version 3 of the License, or (at your option) | |
| 9 any later version. | |
| 10 | |
| 11 PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 12 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 13 FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 14 details. | |
| 15 | |
| 16 You should have received a copy of the GNU Affero General Public License | |
| 17 along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 18 | |
| 19 Alternative licensing terms are available from the licensor. | |
| 20 For commercial licensing, see <https://www.artifex.com/> or contact | |
| 21 Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 22 CA 94129, USA, for further information. | |
| 23 | |
| 24 --------------------------------------------------------------------- | |
| 25 Portions of this code have been ported from pdfplumber, see | |
| 26 https://pypi.org/project/pdfplumber/. | |
| 27 | |
| 28 The ported code is under the following MIT license: | |
| 29 | |
| 30 --------------------------------------------------------------------- | |
| 31 The MIT License (MIT) | |
| 32 | |
| 33 Copyright (c) 2015, Jeremy Singer-Vine | |
| 34 | |
| 35 Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 36 of this software and associated documentation files (the "Software"), to deal | |
| 37 in the Software without restriction, including without limitation the rights | |
| 38 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 39 copies of the Software, and to permit persons to whom the Software is | |
| 40 furnished to do so, subject to the following conditions: | |
| 41 | |
| 42 The above copyright notice and this permission notice shall be included in all | |
| 43 copies or substantial portions of the Software. | |
| 44 | |
| 45 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 46 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 47 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 48 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 49 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 50 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 51 SOFTWARE. | |
| 52 --------------------------------------------------------------------- | |
| 53 Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt | |
| 54 --------------------------------------------------------------------- | |
| 55 | |
| 56 The porting mainly pertains to files "table.py" and relevant parts of | |
| 57 "utils/text.py" within pdfplumber's repository on Github. | |
| 58 With respect to "text.py", we have removed functions or features that are not | |
| 59 used by table processing. Examples are: | |
| 60 | |
| 61 * the text search function | |
| 62 * simple text extraction | |
| 63 * text extraction by lines | |
| 64 | |
| 65 Original pdfplumber code neither detects nor identifies table headers. | |
| 66 This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'. | |
| 67 This is implemented as new class TableHeader with the properties: | |
| 68 * bbox: A tuple for the header's bbox | |
| 69 * cells: A tuple for each bbox of a column header | |
| 70 * names: A list of strings with column header text | |
| 71 * external: A bool indicating whether the header is outside the table cells. | |
| 72 | |
| 73 """ | |
| 74 | |
| 75 import inspect | |
| 76 import itertools | |
| 77 import string | |
| 78 import html | |
| 79 from collections.abc import Sequence | |
| 80 from dataclasses import dataclass | |
| 81 from operator import itemgetter | |
| 82 import weakref | |
| 83 | |
| 84 # ------------------------------------------------------------------- | |
| 85 # Start of PyMuPDF interface code | |
| 86 # ------------------------------------------------------------------- | |
| 87 from . import ( | |
| 88 Rect, | |
| 89 Matrix, | |
| 90 TEXTFLAGS_TEXT, | |
| 91 TEXT_FONT_BOLD, | |
| 92 TEXT_FONT_ITALIC, | |
| 93 TEXT_FONT_MONOSPACED, | |
| 94 TEXT_FONT_SUPERSCRIPT, | |
| 95 TEXT_COLLECT_STYLES, | |
| 96 TOOLS, | |
| 97 EMPTY_RECT, | |
| 98 sRGB_to_pdf, | |
| 99 Point, | |
| 100 message, | |
| 101 mupdf, | |
| 102 ) | |
| 103 | |
# Module-level state shared with the table detection code. These are
# presumably (re)populated per page before detection runs — the code that
# fills them is not in this chunk; verify against the page hooks.
EDGES = []  # vector graphics from PyMuPDF
CHARS = []  # text characters from PyMuPDF
TEXTPAGE = None  # the page's TextPage object
TEXT_BOLD = mupdf.FZ_STEXT_BOLD  # char_flags bit: bold text
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT  # char_flags bit: struck-out text
FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES  # extraction flags (styles needed for MD)

white_spaces = set(string.whitespace)  # for checking white space only cells
| 112 | |
| 113 | |
def extract_cells(textpage, cell, markdown=False):
    """Extract text from a rect-like 'cell' as plain or MD style text.

    This function should ultimately be used to extract text from a table cell.
    Markdown output will only work correctly if extraction flag bit
    TEXT_COLLECT_STYLES is set.

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell.
    """
    text = ""
    # Walk blocks -> lines -> spans -> chars, discarding every item whose
    # bbox lies entirely outside the cell rectangle.
    for block in textpage.extractRAWDICT()["blocks"]:
        if block["type"] != 0:  # not a text block
            continue
        block_bbox = block["bbox"]
        if (
            0  # dummy first operand keeps the 'or' lines aligned
            or block_bbox[0] > cell[2]
            or block_bbox[2] < cell[0]
            or block_bbox[1] > cell[3]
            or block_bbox[3] < cell[1]
        ):
            continue  # skip block outside cell
        for line in block["lines"]:
            lbbox = line["bbox"]
            if (
                0
                or lbbox[0] > cell[2]
                or lbbox[2] < cell[0]
                or lbbox[1] > cell[3]
                or lbbox[3] < cell[1]
            ):
                continue  # skip line outside cell

            if text:  # must be a new line in the cell
                text += "<br>" if markdown else "\n"

            # strikeout detection only works with horizontal text
            horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)

            for span in line["spans"]:
                sbbox = span["bbox"]
                if (
                    0
                    or sbbox[0] > cell[2]
                    or sbbox[2] < cell[0]
                    or sbbox[1] > cell[3]
                    or sbbox[3] < cell[1]
                ):
                    continue  # skip spans outside cell

                # only include chars with more than 50% bbox overlap
                span_text = ""
                for char in span["chars"]:
                    bbox = Rect(char["bbox"])
                    if abs(bbox & cell) > 0.5 * abs(bbox):
                        span_text += char["c"]

                if not span_text:
                    continue  # skip empty span

                if not markdown:  # no MD styling
                    text += span_text
                    continue

                # Build the Markdown markup pair enclosing this span's text.
                prefix = ""
                suffix = ""
                if horizontal and span["char_flags"] & TEXT_STRIKEOUT:
                    prefix += "~~"
                    suffix = "~~" + suffix
                if span["char_flags"] & TEXT_BOLD:
                    prefix += "**"
                    suffix = "**" + suffix
                if span["flags"] & TEXT_FONT_ITALIC:
                    prefix += "_"
                    suffix = "_" + suffix
                if span["flags"] & TEXT_FONT_MONOSPACED:
                    prefix += "`"
                    suffix = "`" + suffix

                # NOTE(review): presumably strips trailing blanks that would
                # break MD styling; intent of the len > 2 threshold is not
                # evident from this chunk — TODO confirm.
                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                # if span continues previous styling: extend cell text
                if (ls := len(suffix)) and text.endswith(suffix):
                    text = text[:-ls] + span_text + suffix
                else:  # append the span with new styling
                    if not span_text.strip():
                        text += " "  # whitespace-only span: plain blank, no markup
                    else:
                        text += prefix + span_text + suffix

    return text.strip()
| 213 | |
| 214 | |
| 215 # ------------------------------------------------------------------- | |
| 216 # End of PyMuPDF interface code | |
| 217 # ------------------------------------------------------------------- | |
| 218 | |
| 219 | |
class UnsetFloat(float):
    """Float subclass used as a distinguishable "not set" sentinel (see UNSET)."""
| 222 | |
| 223 | |
# Names of table settings whose values must not be negative — presumably
# enforced by validation code elsewhere in this module (not in this chunk).
NON_NEGATIVE_SETTINGS = [
    "snap_tolerance",
    "snap_x_tolerance",
    "snap_y_tolerance",
    "join_tolerance",
    "join_x_tolerance",
    "join_y_tolerance",
    "edge_min_length",
    "min_words_vertical",
    "min_words_horizontal",
    "intersection_tolerance",
    "intersection_x_tolerance",
    "intersection_y_tolerance",
]
| 238 | |
| 239 | |
# Supported detection strategies for table rows/columns.
TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"]
UNSET = UnsetFloat(0)  # sentinel: distinguishes "not set" from an explicit 0
DEFAULT_SNAP_TOLERANCE = 3
DEFAULT_JOIN_TOLERANCE = 3
DEFAULT_MIN_WORDS_VERTICAL = 3
DEFAULT_MIN_WORDS_HORIZONTAL = 1
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13
# Extracts (x0, top, x1, bottom) from a char/word/edge dict in one call.
bbox_getter = itemgetter("x0", "top", "x1", "bottom")
| 251 | |
| 252 | |
# Typographic ligature code points mapped to their ASCII expansions.
# NOTE(review): the rendered source showed ASCII identity mappings such as
# "ff": "ff" (a no-op), i.e. the Unicode ligature keys had been normalized
# away; restored the intended U+FB00..U+FB06 keys (cf. upstream pdfplumber).
LIGATURES = {
    "\ufb00": "ff",   # LATIN SMALL LIGATURE FF
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    "\ufb01": "fi",   # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",   # LATIN SMALL LIGATURE FL
    "\ufb06": "st",   # LATIN SMALL LIGATURE ST
    "\ufb05": "st",   # LATIN SMALL LIGATURE LONG S T
}
| 262 | |
| 263 | |
def to_list(collection) -> list:
    """Return `collection` as a list; actual lists pass through unchanged."""
    if isinstance(collection, list):
        return collection
    if isinstance(collection, Sequence):
        return list(collection)
    if hasattr(collection, "to_dict"):
        # e.g. a pandas DataFrame: one dict per row
        return collection.to_dict("records")  # pragma: nocover
    # any other iterable (generators, sets, ...)
    return list(collection)
| 274 | |
| 275 | |
class TextMap:
    """
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).
    """

    def __init__(self, tuples=None) -> None:
        """Build the map from (character, char-dict-or-None) tuples.

        Fix: `tuples` defaults to None but was iterated unconditionally,
        so `TextMap()` raised TypeError. Treat None as an empty map
        (matches upstream pdfplumber).
        """
        self.tuples = tuples or []
        self.as_string = "".join(map(itemgetter(0), self.tuples))

    def match_to_dict(
        self,
        m,
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> dict:
        """Convert a regex match over `as_string` into a result dict.

        Args:
            m: a re.Match produced by searching `self.as_string`.
            main_group: index of the match group delimiting the hit.
            return_groups: include `m.groups()` under key "groups".
            return_chars: include the contributing char dicts under "chars".

        Returns:
            Dict with the matched text, its bbox coordinates and optional
            groups/chars.
        """
        subset = self.tuples[m.start(main_group) : m.end(main_group)]
        chars = [c for (text, c) in subset if c is not None]
        # objects_to_bbox is defined elsewhere in this module
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }

        if return_groups:
            result["groups"] = m.groups()

        if return_chars:
            result["chars"] = chars

        return result
| 312 | |
| 313 | |
class WordMap:
    """
    A WordMap maps words->chars.

    `tuples` is a list of (word-dict, [char-dict, ...]) pairs, as produced
    by WordExtractor.extract_wordmap().
    """

    def __init__(self, tuples) -> None:
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width=0,
        layout_height=0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        x_density=DEFAULT_X_DENSITY,
        y_density=DEFAULT_Y_DENSITY,
        x_shift=0,
        y_shift=0,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
        structural layout of the text on the page(s), using the following approach:

        - Sort the words by (doctop, x0) if not already sorted.

        - Calculate the initial doctop for the starting page.

        - Cluster the words by doctop (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, calculate the distance between that doctop and the
          initial doctop, in points, minus `y_shift`. Divide that distance by
          `y_density` to calculate the minimum number of newlines that should come
          before this cluster. Append that number of newlines *minus* the number of
          newlines already appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster. Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic to
          `layout_height`.

        Note: This approach currently works best for horizontal, left-to-right
        text, but will display all words regardless of orientation. There is room
        for improvement in better supporting right-to-left text, as well as
        vertical text.
        """
        _textmap = []

        if not len(self.tuples):
            return TextMap(_textmap)

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            # Width/height may be given in points (converted via density) or
            # directly in characters — but not both.
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        num_newlines = 0  # newlines emitted so far

        words_sorted_doctop = (
            self.tuples
            if presorted or use_text_flow
            else sorted(self.tuples, key=lambda x: float(x[0]["doctop"]))
        )

        # doctop of the top of the starting page
        first_word = words_sorted_doctop[0][0]
        doctop_start = first_word["doctop"] - first_word["top"]

        # Each cluster is one output text line.
        for i, ws in enumerate(
            cluster_objects(
                words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
            )
        ):
            y_dist = (
                (ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density
                if layout
                else 0
            )
            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(i > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            for i in range(num_newlines_prepend):
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            line_len = 0  # characters emitted on the current line

            line_words_sorted_x0 = (
                ws
                if presorted or use_text_flow
                else sorted(ws, key=lambda x: float(x[0]["x0"]))
            )

            for word, chars in line_words_sorted_x0:
                x_dist = (word["x0"] - x_shift) / x_density if layout else 0
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    # A ligature char may expand to several letters, all
                    # mapped back to the same source char.
                    letters = expansions.get(c["text"], c["text"])
                    for letter in letters:
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for i in range(num_newlines_append):
                if i > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

            # Remove terminal newline
            if _textmap[-1] == ("\n", None):
                _textmap = _textmap[:-1]

        return TextMap(_textmap)
| 473 | |
| 474 | |
class WordExtractor:
    """Groups ordered char dicts into word dicts (ported from pdfplumber)."""

    def __init__(
        self,
        x_tolerance=DEFAULT_X_TOLERANCE,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        keep_blank_chars: bool = False,
        use_text_flow=False,
        horizontal_ltr=True,  # Should words be read left-to-right?
        vertical_ttb=False,  # Should vertical words be read top-to-bottom?
        extra_attrs=None,
        split_at_punctuation=False,
        expand_ligatures=True,
    ):
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb
        # Additional char attributes that must match within one word and are
        # copied onto the word dict.
        self.extra_attrs = [] if extra_attrs is None else extra_attrs

        # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        # True selects the full punctuation set; a string selects a custom
        # set; False/None/"" disables splitting at punctuation.
        self.split_at_punctuation = (
            string.punctuation
            if split_at_punctuation is True
            else (split_at_punctuation or "")
        )

        self.expansions = LIGATURES if expand_ligatures else {}

    def merge_chars(self, ordered_chars: list):
        """Merge an ordered run of chars into a single word dict."""
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        doctop_adj = ordered_chars[0]["doctop"] - ordered_chars[0]["top"]
        upright = ordered_chars[0]["upright"]
        direction = 1 if (self.horizontal_ltr if upright else self.vertical_ttb) else -1

        # NOTE(review): rotation is derived from the first char's matrix;
        # assumes all chars of a word share it — TODO confirm.
        matrix = ordered_chars[0]["matrix"]

        rotation = 0
        if not upright and matrix[1] < 0:
            ordered_chars = reversed(ordered_chars)
            rotation = 270

        if matrix[0] < 0 and matrix[3] < 0:
            rotation = 180
        elif matrix[1] > 0:
            rotation = 90

        word = {
            "text": "".join(
                self.expansions.get(c["text"], c["text"]) for c in ordered_chars
            ),
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_adj,
            "bottom": bottom,
            "upright": upright,
            "direction": direction,
            "rotation": rotation,
        }

        for key in self.extra_attrs:
            word[key] = ordered_chars[0][key]

        return word

    def char_begins_new_word(
        self,
        prev_char,
        curr_char,
    ) -> bool:
        """This method takes several factors into account to determine if
        `curr_char` represents the beginning of a new word:

        - Whether the text is "upright" (i.e., non-rotated)
        - Whether the user has specified that horizontal text runs
          left-to-right (default) or right-to-left, as represented by
          self.horizontal_ltr
        - Whether the user has specified that vertical text the text runs
          top-to-bottom (default) or bottom-to-top, as represented by
          self.vertical_ttb
        - The x0, top, x1, and bottom attributes of prev_char and
          curr_char
        - The self.x_tolerance and self.y_tolerance settings. Note: In
          this case, x/y refer to those directions for non-rotated text.
          For vertical text, they are flipped. A more accurate terminology
          might be "*intra*line character distance tolerance" and
          "*inter*line character distance tolerance"

        An important note: The *intra*line distance is measured from the
        *end* of the previous character to the *beginning* of the current
        character, while the *inter*line distance is measured from the
        *top* of the previous character to the *top* of the next
        character. The reasons for this are partly repository-historical,
        and partly logical, as successive text lines' bounding boxes often
        overlap slightly (and we don't want that overlap to be interpreted
        as the two lines being the same line).

        The upright-ness of the character determines the attributes to
        compare, while horizontal_ltr/vertical_ttb determine the direction
        of the comparison.
        """

        # Note: Due to the grouping step earlier in the process,
        # curr_char["upright"] will always equal prev_char["upright"].
        if curr_char["upright"]:
            x = self.x_tolerance
            y = self.y_tolerance
            ay = prev_char["top"]
            cy = curr_char["top"]
            if self.horizontal_ltr:
                ax = prev_char["x0"]
                bx = prev_char["x1"]
                cx = curr_char["x0"]
            else:
                # RTL: negate x-coordinates so the same comparisons apply
                ax = -prev_char["x1"]
                bx = -prev_char["x0"]
                cx = -curr_char["x1"]

        else:
            # Rotated text: swap the roles of the x/y tolerances and axes
            x = self.y_tolerance
            y = self.x_tolerance
            ay = prev_char["x0"]
            cy = curr_char["x0"]
            if self.vertical_ttb:
                ax = prev_char["top"]
                bx = prev_char["bottom"]
                cx = curr_char["top"]
            else:
                ax = -prev_char["bottom"]
                bx = -prev_char["top"]
                cx = -curr_char["bottom"]

        return bool(
            # Intraline test
            (cx < ax)
            or (cx > bx + x)
            # Interline test
            or (cy > ay + y)
        )

    def iter_chars_to_words(self, ordered_chars):
        """Yield lists of chars, one list per word."""
        current_word: list = []

        def start_next_word(new_char=None):
            # Emits the word collected so far (if any) and starts a new one,
            # optionally seeded with `new_char`.
            nonlocal current_word

            if current_word:
                yield current_word

            current_word = [] if new_char is None else [new_char]

        for char in ordered_chars:
            text = char["text"]

            if not self.keep_blank_chars and text.isspace():
                # Whitespace terminates the current word and is dropped.
                yield from start_next_word(None)

            elif text in self.split_at_punctuation:
                # Punctuation becomes a single-char word of its own.
                yield from start_next_word(char)
                yield from start_next_word(None)

            elif current_word and self.char_begins_new_word(current_word[-1], char):
                yield from start_next_word(char)

            else:
                current_word.append(char)

        # Finally, after all chars processed
        if current_word:
            yield current_word

    def iter_sort_chars(self, chars):
        """Yield chars in reading order: upright first, line by line."""
        def upright_key(x) -> int:
            return -int(x["upright"])

        for upright_cluster in cluster_objects(list(chars), upright_key, 0):
            upright = upright_cluster[0]["upright"]
            cluster_key = "doctop" if upright else "x0"

            # Cluster by line
            subclusters = cluster_objects(
                upright_cluster, itemgetter(cluster_key), self.y_tolerance
            )

            for sc in subclusters:
                # Sort within line
                sort_key = "x0" if upright else "doctop"
                to_yield = sorted(sc, key=itemgetter(sort_key))

                # Reverse order if necessary
                if not (self.horizontal_ltr if upright else self.vertical_ttb):
                    yield from reversed(to_yield)
                else:
                    yield from to_yield

    def iter_extract_tuples(self, chars):
        """Yield (word-dict, [char-dict, ...]) tuples for `chars`."""
        ordered_chars = chars if self.use_text_flow else self.iter_sort_chars(chars)

        # Chars may only merge into one word if they agree on upright-ness
        # and on every extra attribute.
        grouping_key = itemgetter("upright", *self.extra_attrs)
        grouped_chars = itertools.groupby(ordered_chars, grouping_key)

        for keyvals, char_group in grouped_chars:
            for word_chars in self.iter_chars_to_words(char_group):
                yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars) -> WordMap:
        """Return a WordMap covering all words found in `chars`."""
        return WordMap(list(self.iter_extract_tuples(chars)))

    def extract_words(self, chars: list) -> list:
        """Return only the word dicts (without their char lists)."""
        words = list(word for word, word_chars in self.iter_extract_tuples(chars))
        return words
| 688 | |
| 689 | |
def extract_words(chars: list, **kwargs) -> list:
    """Module-level convenience wrapper: extract word dicts from `chars`.

    All keyword arguments are forwarded to WordExtractor unchanged.
    """
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars)
| 692 | |
| 693 | |
# Keyword names accepted by WordMap.to_textmap() and WordExtractor(),
# obtained via introspection so keyword filtering stays in sync with the
# actual signatures.
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
| 696 | |
| 697 | |
def chars_to_textmap(chars: list, **kwargs) -> TextMap:
    """Build a TextMap from `chars`, splitting kwargs between the word
    extractor and the textmap builder. `chars` are treated as presorted."""
    kwargs["presorted"] = True

    extractor_kwargs = {k: v for k, v in kwargs.items() if k in WORD_EXTRACTOR_KWARGS}
    textmap_kwargs = {k: v for k, v in kwargs.items() if k in TEXTMAP_KWARGS}

    wordmap = WordExtractor(**extractor_kwargs).extract_wordmap(chars)
    return wordmap.to_textmap(**textmap_kwargs)
| 710 | |
| 711 | |
def extract_text(chars: list, **kwargs) -> str:
    """Assemble plain text from a list of char dicts.

    With kwargs["layout"] truthy, delegates to the layout-mimicking TextMap
    machinery; otherwise words are extracted and joined, honoring the cell's
    text rotation (constant within a cell).
    """
    chars = to_list(chars)
    if not chars:
        return ""

    if kwargs.get("layout"):
        return chars_to_textmap(chars, **kwargs).as_string

    y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
    extractor = WordExtractor(
        **{k: v for k, v in kwargs.items() if k in WORD_EXTRACTOR_KWARGS}
    )
    words = extractor.extract_words(chars)
    rotation = words[0]["rotation"] if words else 0  # rotation cannot change within a cell

    if rotation in (90, 270):
        # Sort into reading order for sideways text, then join with blanks.
        sign = 1 if rotation == 90 else -1
        words.sort(key=lambda w: (sign * w["x1"], -sign * w["top"]))
        return " ".join(w["text"] for w in words)

    clustered = cluster_objects(words, itemgetter("doctop"), y_tolerance)
    result = "\n".join(" ".join(w["text"] for w in line) for line in clustered)
    if rotation == 180:  # needs extra treatment: reverse stream, newlines -> blanks
        result = result[::-1].replace("\n", " ")
    return result
| 743 | |
| 744 | |
def collate_line(
    line_chars: list,
    tolerance=DEFAULT_X_TOLERANCE,
) -> str:
    """Join the chars of one text line left to right, inserting a blank
    wherever the horizontal gap to the previous char exceeds `tolerance`."""
    pieces = []
    prev_right = None
    for ch in sorted(line_chars, key=itemgetter("x0")):
        if prev_right is not None and ch["x0"] > prev_right + tolerance:
            pieces.append(" ")
        pieces.append(ch["text"])
        prev_right = ch["x1"]
    return "".join(pieces)
| 757 | |
| 758 | |
def dedupe_chars(chars: list, tolerance=1) -> list:
    """
    Removes duplicate chars — those sharing the same text, fontname, size,
    and positioning (within `tolerance`) as other characters in the set.
    """
    ident_key = itemgetter("fontname", "size", "upright", "text")
    pos_key = itemgetter("doctop", "x0")

    def unique_chars():
        # Group visually identical chars, cluster each group by position,
        # and keep only the top-left representative of every cluster.
        for _, group in itertools.groupby(sorted(chars, key=ident_key), key=ident_key):
            for row in cluster_objects(list(group), itemgetter("doctop"), tolerance):
                for col in cluster_objects(row, itemgetter("x0"), tolerance):
                    yield min(col, key=pos_key)

    # Restore the original input ordering.
    return sorted(unique_chars(), key=chars.index)
| 780 | |
| 781 | |
def line_to_edge(line):
    """Return a copy of `line` tagged with its orientation ("h" or "v")."""
    # A horizontal line has identical top and bottom coordinates.
    is_horizontal = line["top"] == line["bottom"]
    return dict(line, orientation="h" if is_horizontal else "v")
| 786 | |
| 787 | |
def rect_to_edges(rect) -> list:
    """Decompose a rect object into its four border edges.

    Each edge is a copy of `rect` collapsed onto one side (height or width
    set to 0) and tagged with object_type "rect_edge" and its orientation.
    Returns [top, bottom, left, right].
    """
    top = dict(
        rect,
        object_type="rect_edge",
        height=0,
        y0=rect["y1"],
        bottom=rect["top"],
        orientation="h",
    )
    bottom = dict(
        rect,
        object_type="rect_edge",
        height=0,
        y1=rect["y0"],
        top=rect["top"] + rect["height"],
        doctop=rect["doctop"] + rect["height"],
        orientation="h",
    )
    left = dict(
        rect,
        object_type="rect_edge",
        width=0,
        x1=rect["x0"],
        orientation="v",
    )
    right = dict(
        rect,
        object_type="rect_edge",
        width=0,
        x0=rect["x1"],
        orientation="v",
    )
    return [top, bottom, left, right]
| 826 | |
| 827 | |
def curve_to_edges(curve) -> list:
    """Convert every consecutive point pair of a curve into an edge dict.

    Orientation is "v"/"h" for axis-parallel segments, else None.
    """
    doctop_offset = curve["doctop"] - curve["top"]
    edges = []
    for p, q in zip(curve["pts"], curve["pts"][1:]):
        if p[0] == q[0]:
            orientation = "v"
        elif p[1] == q[1]:
            orientation = "h"
        else:
            orientation = None
        edges.append(
            {
                "object_type": "curve_edge",
                "x0": min(p[0], q[0]),
                "x1": max(p[0], q[0]),
                "top": min(p[1], q[1]),
                "doctop": min(p[1], q[1]) + doctop_offset,
                "bottom": max(p[1], q[1]),
                "width": abs(p[0] - q[0]),
                "height": abs(p[1] - q[1]),
                "orientation": orientation,
            }
        )
    return edges
| 844 | |
| 845 | |
def obj_to_edges(obj) -> list:
    """Normalize a supported object (edge, line, rect, curve) to a list of edges."""
    kind = obj["object_type"]
    if "_edge" in kind:
        return [obj]  # already an edge
    if kind == "line":
        return [line_to_edge(obj)]
    # rects and curves expand to several edges; any other type raises
    # KeyError, exactly as before
    converters = {"rect": rect_to_edges, "curve": curve_to_edges}
    return converters[kind](obj)
| 854 | |
| 855 | |
def filter_edges(
    edges,
    orientation=None,
    edge_type=None,
    min_length=1,
) -> list:
    """Select edges by orientation, object type and minimum extent.

    Passing orientation=None / edge_type=None disables that criterion.
    """
    if orientation not in ("v", "h", None):
        raise ValueError("Orientation must be 'v' or 'h'")

    def keep(edge) -> bool:
        # A vertical edge's length is its height; a horizontal one's, its width.
        length = edge["height"] if edge["orientation"] == "v" else edge["width"]
        if length < min_length:
            return False
        if edge_type is not None and edge["object_type"] != edge_type:
            return False
        return orientation is None or edge["orientation"] == orientation

    return [e for e in edges if keep(e)]
| 872 | |
| 873 | |
def cluster_list(xs, tolerance=0) -> list:
    """Group sorted values into clusters of within-`tolerance` neighbors."""
    ordered = sorted(xs)
    # trivial cases: zero tolerance or fewer than two values -> singletons
    if tolerance == 0 or len(ordered) < 2:
        return [[value] for value in ordered]
    groups = [[ordered[0]]]
    previous = ordered[0]
    for value in ordered[1:]:
        if value <= previous + tolerance:
            groups[-1].append(value)
        else:
            groups.append([value])
        previous = value
    return groups
| 892 | |
| 893 | |
def make_cluster_dict(values, tolerance) -> dict:
    """Map each distinct value to the index of the cluster it belongs to."""
    mapping = {}
    # de-duplicate before clustering so each value appears exactly once
    for index, group in enumerate(cluster_list(list(set(values)), tolerance)):
        for member in group:
            mapping[member] = index
    return mapping
| 902 | |
| 903 | |
def cluster_objects(xs, key_fn, tolerance) -> list:
    """Cluster objects whose key values lie within `tolerance` of each other."""
    if not callable(key_fn):
        # a plain key name was passed; turn it into an accessor
        key_fn = itemgetter(key_fn)

    cluster_dict = make_cluster_dict(map(key_fn, xs), tolerance)

    # tag each object with its cluster id; stable sort keeps input order
    tagged = sorted(((cluster_dict.get(key_fn(x)), x) for x in xs), key=itemgetter(0))
    return [
        [obj for _, obj in members]
        for _, members in itertools.groupby(tagged, key=itemgetter(0))
    ]
| 918 | |
| 919 | |
def move_object(obj, axis: str, value):
    """Return a copy of `obj` shifted by `value` along the given axis."""
    assert axis in ("h", "v")
    if axis == "h":
        updates = [("x0", obj["x0"] + value), ("x1", obj["x1"] + value)]
    else:
        updates = [("top", obj["top"] + value), ("bottom", obj["bottom"] + value)]
        if "doctop" in obj:
            updates.append(("doctop", obj["doctop"] + value))
        if "y0" in obj:
            # y-coordinates grow upwards, hence the inverted sign
            updates.append(("y0", obj["y0"] - value))
            updates.append(("y1", obj["y1"] - value))
    # later items win when building the new mapping, replacing moved keys
    return obj.__class__(tuple(obj.items()) + tuple(updates))
| 940 | |
| 941 | |
def snap_objects(objs, attr: str, tolerance) -> list:
    """Align objects so near-equal `attr` values snap to their cluster mean."""
    axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
    getter = itemgetter(attr)
    snapped = []
    for cluster in cluster_objects(list(objs), getter, tolerance):
        target = sum(map(getter, cluster)) / len(cluster)
        snapped.extend(move_object(obj, axis, target - getter(obj)) for obj in cluster)
    return snapped
| 952 | |
| 953 | |
def snap_edges(
    edges,
    x_tolerance=DEFAULT_SNAP_TOLERANCE,
    y_tolerance=DEFAULT_SNAP_TOLERANCE,
):
    """
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average.
    """
    buckets = {"v": [], "h": []}  # KeyError for any other orientation
    for edge in edges:
        buckets[edge["orientation"]].append(edge)

    return snap_objects(buckets["v"], "x0", x_tolerance) + snap_objects(
        buckets["h"], "top", y_tolerance
    )
| 970 | |
| 971 | |
def resize_object(obj, key: str, value):
    """Return a copy of `obj` with one side moved to `value`, sizes updated."""
    assert key in ("x0", "x1", "top", "bottom")
    diff = value - obj[key]
    updates = [(key, value)]
    if key == "x0":
        assert value <= obj["x1"]
        updates.append(("width", obj["x1"] - value))
    elif key == "x1":
        assert value >= obj["x0"]
        updates.append(("width", value - obj["x0"]))
    elif key == "top":
        assert value <= obj["bottom"]
        updates.append(("doctop", obj["doctop"] + diff))
        updates.append(("height", obj["height"] - diff))
        if "y1" in obj:
            # y-axis points upwards: moving top down reduces y1
            updates.append(("y1", obj["y1"] - diff))
    else:  # key == "bottom"
        assert value >= obj["top"]
        updates.append(("height", obj["height"] + diff))
        if "y0" in obj:
            updates.append(("y0", obj["y0"] - diff))
    return obj.__class__(tuple(obj.items()) + tuple(updates))
| 997 | |
| 998 | |
def join_edge_group(edges, orientation: str, tolerance=DEFAULT_JOIN_TOLERANCE):
    """
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.
    """
    if orientation == "h":
        lo_key, hi_key = "x0", "x1"
    elif orientation == "v":
        lo_key, hi_key = "top", "bottom"
    else:
        raise ValueError("Orientation must be 'v' or 'h'")

    ordered = sorted(edges, key=itemgetter(lo_key))
    merged = [ordered[0]]
    for edge in ordered[1:]:
        prev = merged[-1]
        if edge[lo_key] > prev[hi_key] + tolerance:
            # gap too wide: this edge starts a new segment
            merged.append(edge)
        elif edge[hi_key] > prev[hi_key]:
            # close enough and reaching further: stretch the previous segment
            merged[-1] = resize_object(prev, hi_key, edge[hi_key])
        # otherwise the edge is fully absorbed by the previous one
    return merged
| 1024 | |
| 1025 | |
def merge_edges(
    edges,
    snap_x_tolerance,
    snap_y_tolerance,
    join_x_tolerance,
    join_y_tolerance,
):
    """
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.
    """

    def group_key(edge):
        # edges lying on the same infinite line share this key
        if edge["orientation"] == "h":
            return ("h", edge["top"])
        return ("v", edge["x0"])

    if snap_x_tolerance > 0 or snap_y_tolerance > 0:
        edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)

    merged = []
    for key, members in itertools.groupby(sorted(edges, key=group_key), key=group_key):
        tolerance = join_x_tolerance if key[0] == "h" else join_y_tolerance
        merged.extend(join_edge_group(members, key[0], tolerance))
    return merged
| 1057 | |
| 1058 | |
def bbox_to_rect(bbox) -> dict:
    """
    Return the rectangle (i.e a dict with keys "x0", "top", "x1",
    "bottom") for an object.
    """
    keys = ("x0", "top", "x1", "bottom")
    return {name: bbox[i] for i, name in enumerate(keys)}
| 1065 | |
| 1066 | |
def objects_to_rect(objects) -> dict:
    """
    Given an iterable of objects, return the smallest rectangle (i.e. a
    dict with "x0", "top", "x1", and "bottom" keys) that contains them
    all.
    """
    enclosing_bbox = objects_to_bbox(objects)
    return bbox_to_rect(enclosing_bbox)
| 1074 | |
| 1075 | |
def merge_bboxes(bboxes):
    """
    Given an iterable of bounding boxes, return the smallest bounding box
    that contains them all.
    """
    lefts, tops, rights, bottoms = zip(*bboxes)
    return min(lefts), min(tops), max(rights), max(bottoms)
| 1083 | |
| 1084 | |
def objects_to_bbox(objects):
    """
    Given an iterable of objects, return the smallest bounding box that
    contains them all.
    """
    all_bboxes = [bbox_getter(obj) for obj in objects]
    return merge_bboxes(all_bboxes)
| 1091 | |
| 1092 | |
def words_to_edges_h(words, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL):
    """
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.
    """
    clusters = cluster_objects(words, itemgetter("top"), 1)
    rects = [
        objects_to_rect(cluster)
        for cluster in clusters
        if len(cluster) >= word_threshold
    ]
    if not rects:
        return []

    min_x0 = min(r["x0"] for r in rects)
    max_x1 = max(r["x1"] for r in rects)
    full_width = max_x1 - min_x0

    def h_edge(y):
        # a full-width horizontal edge at vertical position y
        return {
            "x0": min_x0,
            "x1": max_x1,
            "top": y,
            "bottom": y,
            "width": full_width,
            "orientation": "h",
        }

    edges = []
    for r in rects:
        # one edge at the top of the text line ...
        edges.append(h_edge(r["top"]))
        # ... and one at its bottom. This is redundant with the next row's
        # top line in most cases, but it catches the last row of every table.
        edges.append(h_edge(r["bottom"]))
    return edges
| 1132 | |
| 1133 | |
def get_bbox_overlap(a, b):
    """Return the intersection of bboxes `a` and `b`, or None if disjoint."""
    left = max(a[0], b[0])
    top = max(a[1], b[1])
    right = min(a[2], b[2])
    bottom = min(a[3], b[3])
    # degenerate (zero-width or zero-height) overlaps count,
    # but a single shared corner point does not
    if right < left or bottom < top or (right - left) + (bottom - top) <= 0:
        return None
    return (left, top, right, bottom)
| 1147 | |
| 1148 | |
def words_to_edges_v(words, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.
    """
    # cluster words sharing (roughly) the same left, right or center x-coord
    by_x0 = cluster_objects(words, itemgetter("x0"), 1)
    by_x1 = cluster_objects(words, itemgetter("x1"), 1)
    by_center = cluster_objects(words, lambda w: float(w["x0"] + w["x1"]) / 2, 1)

    # keep only the alignments shared by enough words, biggest clusters first
    candidates = sorted(by_x0 + by_x1 + by_center, key=len, reverse=True)
    bboxes = [
        objects_to_bbox(cluster)
        for cluster in candidates
        if len(cluster) >= word_threshold
    ]

    # drop any bbox overlapping one that was accepted earlier
    kept = []
    for bbox in bboxes:
        if not any(get_bbox_overlap(bbox, other) for other in kept):
            kept.append(bbox)

    if not kept:
        return []

    rects = sorted((bbox_to_rect(b) for b in kept), key=itemgetter("x0"))

    max_x1 = max(r["x1"] for r in rects)
    min_top = min(r["top"] for r in rects)
    max_bottom = max(r["bottom"] for r in rects)
    height = max_bottom - min_top

    def v_edge(x):
        # a vertical edge spanning the full height at horizontal position x
        return {
            "x0": x,
            "x1": x,
            "top": min_top,
            "bottom": max_bottom,
            "height": height,
            "orientation": "v",
        }

    # one edge per left border, plus a final edge at the rightmost extent
    return [v_edge(r["x0"]) for r in rects] + [v_edge(max_x1)]
| 1208 | |
| 1209 | |
def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1) -> dict:
    """
    Given a list of edges, return the points at which they intersect
    within `tolerance` pixels.
    """
    v_edges = [e for e in edges if e["orientation"] == "v"]
    h_edges = [e for e in edges if e["orientation"] == "h"]

    intersections = {}
    for v in sorted(v_edges, key=itemgetter("x0", "top")):
        for h in sorted(h_edges, key=itemgetter("top", "x0")):
            # the vertical edge must span the horizontal's y-position and
            # the horizontal edge must span the vertical's x-position
            crosses_y = (
                v["top"] <= h["top"] + y_tolerance
                and v["bottom"] >= h["top"] - y_tolerance
            )
            crosses_x = (
                v["x0"] >= h["x0"] - x_tolerance and v["x0"] <= h["x1"] + x_tolerance
            )
            if not (crosses_y and crosses_x):
                continue
            vertex = (v["x0"], h["top"])
            entry = intersections.setdefault(vertex, {"v": [], "h": []})
            entry["v"].append(v)
            entry["h"].append(h)
    return intersections
| 1233 | |
| 1234 | |
def obj_to_bbox(obj):
    """Return the bounding box of `obj` via the shared module-level accessor."""
    bbox = bbox_getter(obj)
    return bbox
| 1240 | |
| 1241 | |
def intersections_to_cells(intersections):
    """
    Given a list of points (`intersections`), return all rectangular "cells"
    that those points describe.

    `intersections` should be a dictionary with (x0, top) tuples as keys,
    and a list of edge objects as values. The edge objects should correspond
    to the edges that touch the intersection.
    """

    def edge_connects(p1, p2) -> bool:
        # True if p1 and p2 lie on at least one common edge (i.e. are
        # directly connected by a drawn line segment).
        def edges_to_set(edges):
            # represent edges by their bboxes so they become hashable
            return set(map(obj_to_bbox, edges))

        if p1[0] == p2[0]:
            # same x-coordinate: look for a shared vertical edge
            common = edges_to_set(intersections[p1]["v"]).intersection(
                edges_to_set(intersections[p2]["v"])
            )
            if len(common):
                return True

        if p1[1] == p2[1]:
            # same y-coordinate: look for a shared horizontal edge
            common = edges_to_set(intersections[p1]["h"]).intersection(
                edges_to_set(intersections[p2]["h"])
            )
            if len(common):
                return True
        return False

    # points sorted by (x0, top): top-left corners come before points
    # below or to the right of them
    points = list(sorted(intersections.keys()))
    n_points = len(points)

    def find_smallest_cell(points, i: int):
        # Treat points[i] as a candidate top-left corner and return the
        # smallest rectangle (x0, top, x1, bottom) whose four corners are
        # all pairwise connected by edges, or None if there is none.
        if i == n_points - 1:
            return None
        pt = points[i]
        rest = points[i + 1 :]
        # Get all the points directly below and directly right
        below = [x for x in rest if x[0] == pt[0]]
        right = [x for x in rest if x[1] == pt[1]]
        for below_pt in below:
            if not edge_connects(pt, below_pt):
                continue

            for right_pt in right:
                if not edge_connects(pt, right_pt):
                    continue

                bottom_right = (right_pt[0], below_pt[1])

                if (
                    (bottom_right in intersections)
                    and edge_connects(bottom_right, right_pt)
                    and edge_connects(bottom_right, below_pt)
                ):
                    return (pt[0], pt[1], bottom_right[0], bottom_right[1])
        return None

    # one candidate cell per point; drop the None results
    cell_gen = (find_smallest_cell(points, i) for i in range(len(points)))
    return list(filter(None, cell_gen))
| 1302 | |
| 1303 | |
def cells_to_tables(page, cells) -> list:
    """
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).

    A "table" here is a plain list of cell bboxes. Cells are considered to
    belong to the same table if they (transitively) share a corner point.
    """

    def bbox_to_corners(bbox) -> tuple:
        # the four corner points of a cell bbox
        x0, top, x1, bottom = bbox
        return ((x0, top), (x0, bottom), (x1, top), (x1, bottom))

    remaining_cells = list(cells)

    # Iterate through the cells found above, and assign them
    # to contiguous tables

    current_corners = set()
    current_cells = []

    tables = []
    while len(remaining_cells):
        initial_cell_count = len(current_cells)
        # iterate over a copy: we remove from remaining_cells while looping
        for cell in list(remaining_cells):
            cell_corners = bbox_to_corners(cell)
            # If we're just starting a table ...
            if len(current_cells) == 0:
                # ... immediately assign it to the empty group
                current_corners |= set(cell_corners)
                current_cells.append(cell)
                remaining_cells.remove(cell)
            else:
                # How many corners does this table share with the current group?
                corner_count = sum(c in current_corners for c in cell_corners)

                # If touching on at least one corner...
                if corner_count > 0:
                    # ... assign it to the current group
                    current_corners |= set(cell_corners)
                    current_cells.append(cell)
                    remaining_cells.remove(cell)

        # If this iteration did not find any more cells to append...
        if len(current_cells) == initial_cell_count:
            # ... start a new cell group
            tables.append(list(current_cells))
            current_corners.clear()
            current_cells.clear()

    # Once we have exhausting the list of cells ...

    # ... and we have a cell group that has not been stored
    if len(current_cells):
        # ... store it.
        tables.append(list(current_cells))

    # PyMuPDF modification:
    # Remove tables without text or having only 1 column
    for i in range(len(tables) - 1, -1, -1):  # backwards: we delete items
        r = EMPTY_RECT()
        x1_vals = set()  # distinct right borders
        x0_vals = set()  # distinct left borders
        for c in tables[i]:
            r |= c  # r grows to the union of all cell rects
            x1_vals.add(c[2])
            x0_vals.add(c[0])
        if (
            len(x1_vals) < 2  # fewer than 2 distinct right borders, or ...
            or len(x0_vals) < 2  # ... left borders: only one column, or ...
            or white_spaces.issuperset(  # ... table text is whitespace only
                page.get_textbox(
                    r,
                    textpage=TEXTPAGE,
                )
            )
        ):
            del tables[i]

    # Sort the tables top-to-bottom-left-to-right based on the value of the
    # topmost-and-then-leftmost coordinate of a table.
    _sorted = sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
    return _sorted
| 1384 | |
| 1385 | |
class CellGroup:
    """A list of cell bboxes plus the bbox enclosing all of them."""

    def __init__(self, cells):
        self.cells = cells
        present = [c for c in cells if c]  # skip missing (None) cells
        self.bbox = (
            min(c[0] for c in present),
            min(c[1] for c in present),
            max(c[2] for c in present),
            max(c[3] for c in present),
        )
| 1395 | |
| 1396 | |
class TableRow(CellGroup):
    """A single table row; all cells/bbox handling inherited from CellGroup."""

    pass
| 1399 | |
| 1400 | |
class TableHeader:
    """PyMuPDF extension containing the identified table header."""

    def __init__(self, bbox, cells, names, above):
        self.external = above  # True: header lies above the table body
        self.names = names  # column name strings
        self.cells = cells  # header cell bboxes
        self.bbox = bbox  # overall header bbox
| 1409 | |
| 1410 | |
| 1411 class Table: | |
    def __init__(self, page, cells):
        """Create the table from its page and its list of cell bboxes."""
        self.page = page  # owning page object
        self.cells = cells  # list of (x0, top, x1, bottom) cell bboxes
        self.header = self._get_header()  # PyMuPDF extension
| 1416 | |
| 1417 @property | |
| 1418 def bbox(self): | |
| 1419 c = self.cells | |
| 1420 return ( | |
| 1421 min(map(itemgetter(0), c)), | |
| 1422 min(map(itemgetter(1), c)), | |
| 1423 max(map(itemgetter(2), c)), | |
| 1424 max(map(itemgetter(3), c)), | |
| 1425 ) | |
| 1426 | |
| 1427 @property | |
| 1428 def rows(self) -> list: | |
| 1429 _sorted = sorted(self.cells, key=itemgetter(1, 0)) | |
| 1430 xs = list(sorted(set(map(itemgetter(0), self.cells)))) | |
| 1431 rows = [] | |
| 1432 for y, row_cells in itertools.groupby(_sorted, itemgetter(1)): | |
| 1433 xdict = {cell[0]: cell for cell in row_cells} | |
| 1434 row = TableRow([xdict.get(x) for x in xs]) | |
| 1435 rows.append(row) | |
| 1436 return rows | |
| 1437 | |
    @property
    def row_count(self) -> int:  # PyMuPDF extension
        """Number of rows in the table."""
        return len(self.rows)
| 1441 | |
| 1442 @property | |
| 1443 def col_count(self) -> int: # PyMuPDF extension | |
| 1444 return max([len(r.cells) for r in self.rows]) | |
| 1445 | |
    def extract(self, **kwargs) -> list:
        """Return the table content as a list of rows of cell text strings.

        ``None`` stands for a missing cell, ``""`` for a cell without text.
        Keyword arguments are passed through to ``extract_text``.
        """
        chars = CHARS  # module-global list of character dicts
        table_arr = []

        def char_in_bbox(char, bbox) -> bool:
            # a char belongs to a bbox if its midpoint lies inside it
            v_mid = (char["top"] + char["bottom"]) / 2
            h_mid = (char["x0"] + char["x1"]) / 2
            x0, top, x1, bottom = bbox
            return bool(
                (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
            )

        for row in self.rows:
            arr = []
            # restrict candidate chars to those inside the row bbox first
            row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]

            for cell in row.cells:
                if cell is None:
                    cell_text = None
                else:
                    cell_chars = [
                        char for char in row_chars if char_in_bbox(char, cell)
                    ]

                    if len(cell_chars):
                        # make extracted coordinates cell-relative
                        kwargs["x_shift"] = cell[0]
                        kwargs["y_shift"] = cell[1]
                        if "layout" in kwargs:
                            kwargs["layout_width"] = cell[2] - cell[0]
                            kwargs["layout_height"] = cell[3] - cell[1]
                        cell_text = extract_text(cell_chars, **kwargs)
                    else:
                        cell_text = ""
                arr.append(cell_text)
            table_arr.append(arr)

        return table_arr
| 1483 | |
| 1484 def to_markdown(self, clean=False, fill_empty=True): | |
| 1485 """Output table content as a string in Github-markdown format. | |
| 1486 | |
| 1487 If "clean" then markdown syntax is removed from cell content. | |
| 1488 If "fill_empty" then cell content None is replaced by the values | |
| 1489 above (columns) or left (rows) in an effort to approximate row and | |
| 1490 columns spans. | |
| 1491 | |
| 1492 """ | |
| 1493 output = "|" | |
| 1494 rows = self.row_count | |
| 1495 cols = self.col_count | |
| 1496 | |
| 1497 # cell coordinates | |
| 1498 cell_boxes = [[c for c in r.cells] for r in self.rows] | |
| 1499 | |
| 1500 # cell text strings | |
| 1501 cells = [[None for i in range(cols)] for j in range(rows)] | |
| 1502 for i, row in enumerate(cell_boxes): | |
| 1503 for j, cell in enumerate(row): | |
| 1504 if cell is not None: | |
| 1505 cells[i][j] = extract_cells( | |
| 1506 TEXTPAGE, cell_boxes[i][j], markdown=True | |
| 1507 ) | |
| 1508 | |
| 1509 if fill_empty: # fill "None" cells where possible | |
| 1510 | |
| 1511 # for rows, copy content from left to right | |
| 1512 for j in range(rows): | |
| 1513 for i in range(cols - 1): | |
| 1514 if cells[j][i + 1] is None: | |
| 1515 cells[j][i + 1] = cells[j][i] | |
| 1516 | |
| 1517 # for columns, copy top to bottom | |
| 1518 for i in range(cols): | |
| 1519 for j in range(rows - 1): | |
| 1520 if cells[j + 1][i] is None: | |
| 1521 cells[j + 1][i] = cells[j][i] | |
| 1522 | |
| 1523 # generate header string and MD separator | |
| 1524 for i, name in enumerate(self.header.names): | |
| 1525 if not name: # generate a name if empty | |
| 1526 name = f"Col{i+1}" | |
| 1527 name = name.replace("\n", "<br>") # use HTML line breaks | |
| 1528 if clean: # remove sensitive syntax | |
| 1529 name = html.escape(name.replace("-", "-")) | |
| 1530 output += name + "|" | |
| 1531 | |
| 1532 output += "\n" | |
| 1533 # insert GitHub header line separator | |
| 1534 output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n" | |
| 1535 | |
| 1536 # skip first row in details if header is part of the table | |
| 1537 j = 0 if self.header.external else 1 | |
| 1538 | |
| 1539 # iterate over detail rows | |
| 1540 for row in cells[j:]: | |
| 1541 line = "|" | |
| 1542 for i, cell in enumerate(row): | |
| 1543 # replace None cells with empty string | |
| 1544 # use HTML line break tag | |
| 1545 if cell is None: | |
| 1546 cell = "" | |
| 1547 if clean: # remove sensitive syntax | |
| 1548 cell = html.escape(cell.replace("-", "-")) | |
| 1549 line += cell + "|" | |
| 1550 line += "\n" | |
| 1551 output += line | |
| 1552 return output + "\n" | |
| 1553 | |
    def to_pandas(self, **kwargs):
        """Return a pandas DataFrame version of the table."""
        try:
            import pandas as pd
        except ModuleNotFoundError:
            message("Package 'pandas' is not installed")
            raise

        pd_dict = {}
        extract = self.extract()
        hdr = self.header
        names = self.header.names  # NOTE: this list is modified in place below
        hdr_len = len(names)
        # ensure uniqueness of column names
        for i in range(hdr_len):
            name = names[i]
            if not name:
                # generate a name for empty header cells
                names[i] = f"Col{i}"
        if hdr_len != len(set(names)):
            # duplicates remain: prefix names with their column index
            for i in range(hdr_len):
                name = names[i]
                if name != f"Col{i}":
                    names[i] = f"{i}-{name}"

        if not hdr.external:  # header is part of 'extract'
            extract = extract[1:]

        # build one column (list of the cell texts) per header name
        for i in range(hdr_len):
            key = names[i]
            value = []
            for j in range(len(extract)):
                value.append(extract[j][i])
            pd_dict[key] = value

        return pd.DataFrame(pd_dict)
| 1589 | |
| 1590 def _get_header(self, y_tolerance=3): | |
| 1591 """Identify the table header. | |
| 1592 | |
| 1593 *** PyMuPDF extension. *** | |
| 1594 | |
| 1595 Starting from the first line above the table upwards, check if it | |
| 1596 qualifies to be part of the table header. | |
| 1597 | |
| 1598 Criteria include: | |
| 1599 * A one-line table never has an extra header. | |
| 1600 * Column borders must not intersect any word. If this happens, all | |
| 1601 text of this line and above of it is ignored. | |
| 1602 * No excess inter-line distance: If a line further up has a distance | |
| 1603 of more than 1.5 times of its font size, it will be ignored and | |
| 1604 all lines above of it. | |
| 1605 * Must have same text properties. | |
| 1606 * Starting with the top table line, a bold text property cannot change | |
| 1607 back to non-bold. | |
| 1608 | |
| 1609 If not all criteria are met (or there is no text above the table), | |
| 1610 the first table row is assumed to be the header. | |
| 1611 """ | |
| 1612 page = self.page | |
| 1613 y_delta = y_tolerance | |
| 1614 | |
| 1615 def top_row_bg_color(self): | |
| 1616 """ | |
| 1617 Compare top row background color with color of same-sized bbox | |
| 1618 above. If different, return True indicating that the original | |
| 1619 table top row is already the header. | |
| 1620 """ | |
| 1621 bbox0 = Rect(self.rows[0].bbox) | |
| 1622 bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above | |
| 1623 top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1] | |
| 1624 top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1] | |
| 1625 if top_color0 != top_colort: | |
| 1626 return True # top row is header | |
| 1627 return False | |
| 1628 | |
| 1629 def row_has_bold(bbox): | |
| 1630 """Check if a row contains some bold text. | |
| 1631 | |
| 1632 If e.g. true for the top row, then it will be used as (internal) | |
| 1633 column header row if any of the following is true: | |
| 1634 * the previous (above) text line has no bold span | |
| 1635 * the second table row text has no bold span | |
| 1636 | |
| 1637 Returns True if any spans are bold else False. | |
| 1638 """ | |
| 1639 blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"] | |
| 1640 spans = [s for b in blocks for l in b["lines"] for s in l["spans"]] | |
| 1641 | |
| 1642 return any(s["flags"] & TEXT_FONT_BOLD for s in spans) | |
| 1643 | |
| 1644 try: | |
| 1645 row = self.rows[0] | |
| 1646 cells = row.cells | |
| 1647 bbox = Rect(row.bbox) | |
| 1648 except IndexError: # this table has no rows | |
| 1649 return None | |
| 1650 | |
| 1651 # return this if we determine that the top row is the header | |
| 1652 header_top_row = TableHeader(bbox, cells, self.extract()[0], False) | |
| 1653 | |
| 1654 # 1-line tables have no extra header | |
| 1655 if len(self.rows) < 2: | |
| 1656 return header_top_row | |
| 1657 | |
| 1658 # 1-column tables have no extra header | |
| 1659 if len(cells) < 2: | |
| 1660 return header_top_row | |
| 1661 | |
| 1662 # assume top row is the header if second row is empty | |
| 1663 row2 = self.rows[1] # second row | |
| 1664 if all(c is None for c in row2.cells): # no valid cell bboxes in row2 | |
| 1665 return header_top_row | |
| 1666 | |
| 1667 # Special check: is top row bold? | |
| 1668 top_row_bold = row_has_bold(bbox) | |
| 1669 | |
| 1670 # assume top row is header if it is bold and any cell | |
| 1671 # of 2nd row is non-bold | |
| 1672 if top_row_bold and not row_has_bold(row2.bbox): | |
| 1673 return header_top_row | |
| 1674 | |
| 1675 if top_row_bg_color(self): | |
| 1676 # if area above top row has a different background color, | |
| 1677 # then top row is already the header | |
| 1678 return header_top_row | |
| 1679 | |
| 1680 # column coordinates (x1 values) in top row | |
| 1681 col_x = [c[2] if c is not None else None for c in cells[:-1]] | |
| 1682 | |
| 1683 # clip = page area above the table | |
| 1684 # We will inspect this area for text qualifying as column header. | |
| 1685 clip = +bbox # take row 0 bbox | |
| 1686 clip.y0 = 0 # start at top of page | |
| 1687 clip.y1 = bbox.y0 # end at top of table | |
| 1688 | |
| 1689 blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"] | |
| 1690 # non-empty, non-superscript spans above table, sorted descending by y1 | |
| 1691 spans = sorted( | |
| 1692 [ | |
| 1693 s | |
| 1694 for b in blocks | |
| 1695 for l in b["lines"] | |
| 1696 for s in l["spans"] | |
| 1697 if not ( | |
| 1698 white_spaces.issuperset(s["text"]) | |
| 1699 or s["flags"] & TEXT_FONT_SUPERSCRIPT | |
| 1700 ) | |
| 1701 ], | |
| 1702 key=lambda s: s["bbox"][3], | |
| 1703 reverse=True, | |
| 1704 ) | |
| 1705 | |
| 1706 select = [] # y1 coordinates above, sorted descending | |
| 1707 line_heights = [] # line heights above, sorted descending | |
| 1708 line_bolds = [] # bold indicator per line above, same sorting | |
| 1709 | |
| 1710 # walk through the spans and fill above 3 lists | |
| 1711 for i in range(len(spans)): | |
| 1712 s = spans[i] | |
| 1713 y1 = s["bbox"][3] # span bottom | |
| 1714 h = y1 - s["bbox"][1] # span bbox height | |
| 1715 bold = s["flags"] & TEXT_FONT_BOLD | |
| 1716 | |
| 1717 # use first item to start the lists | |
| 1718 if i == 0: | |
| 1719 select.append(y1) | |
| 1720 line_heights.append(h) | |
| 1721 line_bolds.append(bold) | |
| 1722 continue | |
| 1723 | |
| 1724 # get previous items from the 3 lists | |
| 1725 y0 = select[-1] | |
| 1726 h0 = line_heights[-1] | |
| 1727 bold0 = line_bolds[-1] | |
| 1728 | |
| 1729 if bold0 and not bold: | |
| 1730 break # stop if switching from bold to non-bold | |
| 1731 | |
| 1732 # if fitting in height of previous span, modify bbox | |
| 1733 if y0 - y1 <= y_delta or abs((y0 - h0) - s["bbox"][1]) <= y_delta: | |
| 1734 s["bbox"] = (s["bbox"][0], y0 - h0, s["bbox"][2], y0) | |
| 1735 spans[i] = s | |
| 1736 if bold: | |
| 1737 line_bolds[-1] = bold | |
| 1738 continue | |
| 1739 elif y0 - y1 > 1.5 * h0: | |
| 1740 break # stop if distance to previous line too large | |
| 1741 select.append(y1) | |
| 1742 line_heights.append(h) | |
| 1743 line_bolds.append(bold) | |
| 1744 | |
| 1745 if select == []: # nothing above the table? | |
| 1746 return header_top_row | |
| 1747 | |
| 1748 select = select[:5] # accept up to 5 lines for an external header | |
| 1749 | |
| 1750 # assume top row as header if text above is too far away | |
| 1751 if bbox.y0 - select[0] >= line_heights[0]: | |
| 1752 return header_top_row | |
| 1753 | |
| 1754 # accept top row as header if bold, but line above is not | |
| 1755 if top_row_bold and not line_bolds[0]: | |
| 1756 return header_top_row | |
| 1757 | |
| 1758 if spans == []: # nothing left above the table, return top row | |
| 1759 return header_top_row | |
| 1760 | |
| 1761 # re-compute clip above table | |
| 1762 nclip = EMPTY_RECT() | |
| 1763 for s in [s for s in spans if s["bbox"][3] >= select[-1]]: | |
| 1764 nclip |= s["bbox"] | |
| 1765 if not nclip.is_empty: | |
| 1766 clip = nclip | |
| 1767 | |
| 1768 clip.y1 = bbox.y0 # make sure we still include every word above | |
| 1769 | |
| 1770 # Confirm that no word in clip is intersecting a column separator | |
| 1771 word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)] | |
| 1772 word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True) | |
| 1773 | |
| 1774 select = [] | |
| 1775 | |
| 1776 # exclude lines with words that intersect a column border | |
| 1777 for top in word_tops: | |
| 1778 intersecting = [ | |
| 1779 (x, r) | |
| 1780 for x in col_x | |
| 1781 if x is not None | |
| 1782 for r in word_rects | |
| 1783 if r[1] == top and r[0] < x and r[2] > x | |
| 1784 ] | |
| 1785 if intersecting == []: | |
| 1786 select.append(top) | |
| 1787 else: # detected a word crossing a column border | |
| 1788 break | |
| 1789 | |
| 1790 if select == []: # nothing left over: return first row | |
| 1791 return header_top_row | |
| 1792 | |
| 1793 hdr_bbox = +clip # compute the header cells | |
| 1794 hdr_bbox.y0 = select[-1] # hdr_bbox top is smallest top coord of words | |
| 1795 hdr_cells = [ | |
| 1796 (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None | |
| 1797 for c in cells | |
| 1798 ] | |
| 1799 | |
| 1800 # adjust left/right of header bbox | |
| 1801 hdr_bbox.x0 = self.bbox[0] | |
| 1802 hdr_bbox.x1 = self.bbox[2] | |
| 1803 | |
| 1804 # column names: no line breaks, no excess spaces | |
| 1805 hdr_names = [ | |
| 1806 ( | |
| 1807 page.get_textbox(c).replace("\n", " ").replace(" ", " ").strip() | |
| 1808 if c is not None | |
| 1809 else "" | |
| 1810 ) | |
| 1811 for c in hdr_cells | |
| 1812 ] | |
| 1813 return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True) | |
| 1814 | |
| 1815 | |
@dataclass
class TableSettings:
    vertical_strategy: str = "lines"
    horizontal_strategy: str = "lines"
    explicit_vertical_lines: list = None
    explicit_horizontal_lines: list = None
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE
    snap_x_tolerance: float = UNSET
    snap_y_tolerance: float = UNSET
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE
    join_x_tolerance: float = UNSET
    join_y_tolerance: float = UNSET
    edge_min_length: float = 3
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL
    intersection_tolerance: float = 3
    intersection_x_tolerance: float = UNSET
    intersection_y_tolerance: float = UNSET
    text_settings: dict = None

    def __post_init__(self) -> "TableSettings":
        """Validate and normalize user-provided table settings.

        Ensures tolerances are non-negative and strategies are known,
        fills ``text_settings`` defaults (including the legacy
        ``tolerance`` key), and resolves the x/y-specific tolerances
        that were left at the UNSET sentinel to their generic fallback.

        :raises ValueError: on a negative tolerance or unknown strategy.
        """
        # no tolerance may be negative (None counts as 0 here)
        for name in NON_NEGATIVE_SETTINGS:
            if (getattr(self, name) or 0) < 0:
                raise ValueError(f"Table setting '{name}' cannot be negative")

        # both strategies must be one of the known choices
        for axis in ("horizontal", "vertical"):
            chosen = getattr(self, axis + "_strategy")
            if chosen not in TABLE_STRATEGIES:
                raise ValueError(
                    f'{axis}_strategy must be one of{{{",".join(TABLE_STRATEGIES)}}}'
                )

        if self.text_settings is None:
            self.text_settings = {}

        # Backwards compatibility: a single "tolerance" key seeds both
        # axis-specific values, then disappears.
        for key in ("x_tolerance", "y_tolerance"):
            self.text_settings.setdefault(key, self.text_settings.get("tolerance", 3))
        self.text_settings.pop("tolerance", None)

        # resolve UNSET x/y tolerances to their generic counterpart
        for specific, generic in (
            ("snap_x_tolerance", "snap_tolerance"),
            ("snap_y_tolerance", "snap_tolerance"),
            ("join_x_tolerance", "join_tolerance"),
            ("join_y_tolerance", "join_tolerance"),
            ("intersection_x_tolerance", "intersection_tolerance"),
            ("intersection_y_tolerance", "intersection_tolerance"),
        ):
            if getattr(self, specific) is UNSET:
                setattr(self, specific, getattr(self, generic))

        return self

    @classmethod
    def resolve(cls, settings=None):
        """Return a TableSettings instance from None, an instance, or a dict.

        Dict keys prefixed with "text_" are collected into the
        ``text_settings`` sub-dictionary (prefix stripped).

        :raises ValueError: if *settings* is of an unsupported type.
        """
        if settings is None:
            return cls()
        if isinstance(settings, cls):
            return settings
        if not isinstance(settings, dict):
            raise ValueError(f"Cannot resolve settings: {settings}")
        core = {}
        text = {}
        for key, value in settings.items():
            if key.startswith("text_"):
                text[key[5:]] = value
            else:
                core[key] = value
        core["text_settings"] = text
        return cls(**core)
| 1907 | |
| 1908 | |
class TableFinder:
    """
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16
    """

    def __init__(self, page, settings=None):
        """Detect tables on *page* using resolved *settings*.

        Populates self.edges, self.intersections, self.cells and
        finally self.tables (a list of Table objects).
        """
        # weakref proxy avoids a reference cycle page <-> finder
        self.page = weakref.proxy(page)
        self.settings = TableSettings.resolve(settings)
        self.edges = self.get_edges()
        self.intersections = edges_to_intersections(
            self.edges,
            self.settings.intersection_x_tolerance,
            self.settings.intersection_y_tolerance,
        )
        self.cells = intersections_to_cells(self.intersections)
        self.tables = [
            Table(self.page, cell_group)
            for cell_group in cells_to_tables(self.page, self.cells)
        ]

    def get_edges(self) -> list:
        """Collect candidate edges per the vertical/horizontal strategies.

        Combines strategy-derived base edges with user-specified explicit
        lines, then snaps/joins them and drops edges shorter than
        ``edge_min_length``.

        :raises ValueError: when an "explicit" strategy is selected but
            fewer than two explicit lines were supplied.
        """
        settings = self.settings

        # validate: "explicit" strategies need at least 2 coordinates
        for orientation in ["vertical", "horizontal"]:
            strategy = getattr(settings, orientation + "_strategy")
            if strategy == "explicit":
                lines = getattr(settings, "explicit_" + orientation + "_lines")
                # also reject None here - previously this raised a
                # confusing TypeError from len(None)
                if lines is None or len(lines) < 2:
                    raise ValueError(
                        f"If {orientation}_strategy == 'explicit', "
                        f"explicit_{orientation}_lines "
                        f"must be specified as a list/tuple of two or more "
                        f"floats/ints."
                    )

        v_strat = settings.vertical_strategy
        h_strat = settings.horizontal_strategy

        # words are only needed for the "text" strategies
        if v_strat == "text" or h_strat == "text":
            words = extract_words(CHARS, **(settings.text_settings or {}))
        else:
            words = []

        # explicit vertical lines: either edge dicts or plain x-coordinates
        v_explicit = []
        for desc in settings.explicit_vertical_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "v":
                        v_explicit.append(e)
            else:  # a number: full-height vertical line at this x
                v_explicit.append(
                    {
                        "x0": desc,
                        "x1": desc,
                        "top": self.page.rect[1],
                        "bottom": self.page.rect[3],
                        "height": self.page.rect[3] - self.page.rect[1],
                        "orientation": "v",
                    }
                )

        if v_strat == "lines":
            v_base = filter_edges(EDGES, "v")
        elif v_strat == "lines_strict":
            v_base = filter_edges(EDGES, "v", edge_type="line")
        elif v_strat == "text":
            v_base = words_to_edges_v(words, word_threshold=settings.min_words_vertical)
        elif v_strat == "explicit":
            v_base = []
        else:
            v_base = []

        v = v_base + v_explicit

        # explicit horizontal lines: either edge dicts or plain y-coordinates
        h_explicit = []
        for desc in settings.explicit_horizontal_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "h":
                        h_explicit.append(e)
            else:  # a number: full-width horizontal line at this y
                h_explicit.append(
                    {
                        "x0": self.page.rect[0],
                        "x1": self.page.rect[2],
                        "width": self.page.rect[2] - self.page.rect[0],
                        "top": desc,
                        "bottom": desc,
                        "orientation": "h",
                    }
                )

        if h_strat == "lines":
            h_base = filter_edges(EDGES, "h")
        elif h_strat == "lines_strict":
            h_base = filter_edges(EDGES, "h", edge_type="line")
        elif h_strat == "text":
            h_base = words_to_edges_h(
                words, word_threshold=settings.min_words_horizontal
            )
        elif h_strat == "explicit":
            h_base = []
        else:
            h_base = []

        h = h_base + h_explicit

        edges = list(v) + list(h)

        edges = merge_edges(
            edges,
            snap_x_tolerance=settings.snap_x_tolerance,
            snap_y_tolerance=settings.snap_y_tolerance,
            join_x_tolerance=settings.join_x_tolerance,
            join_y_tolerance=settings.join_y_tolerance,
        )

        return filter_edges(edges, min_length=settings.edge_min_length)

    def __getitem__(self, i):
        """Return table number *i*, supporting negative (wrapping) indices.

        :raises IndexError: if no table with that index exists.
        """
        tcount = len(self.tables)
        if i >= tcount:
            raise IndexError("table not on page")
        if i < 0:
            if tcount == 0:
                # previously 'while i < 0: i += tcount' looped forever here
                raise IndexError("table not on page")
            # same result as repeatedly adding tcount until i >= 0
            i %= tcount
        return self.tables[i]
| 2041 | |
| 2042 | |
| 2043 """ | |
| 2044 Start of PyMuPDF interface code. | |
| 2045 The following functions are executed when "page.find_tables()" is called. | |
| 2046 | |
| 2047 * make_chars: Fills the CHARS list with text character information extracted | |
| 2048 via "rawdict" text extraction. Items in CHARS are formatted | |
| 2049 as expected by the table code. | |
| 2050 * make_edges: Fills the EDGES list with vector graphic information extracted | |
| 2051 via "get_drawings". Items in EDGES are formatted as expected | |
| 2052 by the table code. | |
| 2053 | |
The lists CHARS and EDGES replace the corresponding document accesses
of pdfplumber and pdfminer, respectively.
The table code has been modified to use these lists instead of accessing
the page information itself.
| 2058 """ | |
| 2059 | |
| 2060 | |
| 2061 # ----------------------------------------------------------------------------- | |
| 2062 # Extract all page characters to fill the CHARS list | |
| 2063 # ----------------------------------------------------------------------------- | |
def make_chars(page, clip=None):
    """Extract text as "rawdict" to fill CHARS.

    Walks blocks / lines / spans / characters of the page's raw text
    extraction and appends one pdfplumber-style char dictionary per
    character to the module-level CHARS list. Also (re-)creates the
    module-level TEXTPAGE used elsewhere for text lookups.

    :param page: the page to process (rotation is assumed to be 0).
    :param clip: optional rect-like restricting text extraction.
    """
    global TEXTPAGE
    page_number = page.number + 1  # pdfplumber page numbers are 1-based
    page_height = page.rect.height
    ctm = page.transformation_matrix  # maps MuPDF coords to PDF coords
    TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
    blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
    # "doctop" values are page-top offsets accumulated over all pages
    doctop_base = page_height * page.number
    for block in blocks:
        for line in block["lines"]:
            ldir = line["dir"]  # = (cosine, sine) of angle
            ldir = (round(ldir[0], 4), round(ldir[1], 4))
            # rotation part of the char matrix; e/f are filled per char
            matrix = Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0)
            # "upright" means the line is written horizontally
            if ldir[1] == 0:
                upright = True
            else:
                upright = False
            # process spans and chars in left-to-right order
            for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
                fontname = span["font"]
                fontsize = span["size"]
                color = sRGB_to_pdf(span["color"])
                for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
                    bbox = Rect(char["bbox"])
                    bbox_ctm = bbox * ctm  # bbox in PDF coordinates
                    origin = Point(char["origin"]) * ctm
                    matrix.e = origin.x  # complete the char matrix with
                    matrix.f = origin.y  # the (transformed) char origin
                    text = char["c"]
                    char_dict = {
                        # advance along the writing direction
                        "adv": bbox.x1 - bbox.x0 if upright else bbox.y1 - bbox.y0,
                        "bottom": bbox.y1,
                        "doctop": bbox.y0 + doctop_base,
                        "fontname": fontname,
                        "height": bbox.y1 - bbox.y0,
                        "matrix": tuple(matrix),  # snapshot - matrix is reused
                        "ncs": "DeviceRGB",
                        "non_stroking_color": color,
                        "non_stroking_pattern": None,
                        "object_type": "char",
                        "page_number": page_number,
                        # non-upright text: use bbox height as size
                        "size": fontsize if upright else bbox.y1 - bbox.y0,
                        "stroking_color": color,
                        "stroking_pattern": None,
                        "text": text,
                        "top": bbox.y0,
                        "upright": upright,
                        "width": bbox.x1 - bbox.x0,
                        "x0": bbox.x0,
                        "x1": bbox.x1,
                        # NOTE(review): y0/y1 use the CTM-transformed bbox
                        # while top/bottom use the untransformed one -
                        # presumably intentional (pdfplumber's PDF-space
                        # y-axis), but confirm against the consuming code.
                        "y0": bbox_ctm.y0,
                        "y1": bbox_ctm.y1,
                    }
                    CHARS.append(char_dict)
| 2118 | |
| 2119 | |
| 2120 # ------------------------------------------------------------------------ | |
| 2121 # Extract all page vector graphics to fill the EDGES list. | |
| 2122 # We are ignoring Bézier curves completely and are converting everything | |
| 2123 # else to lines. | |
| 2124 # ------------------------------------------------------------------------ | |
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
    """Fill the EDGES list from the page's vector graphics.

    Bézier curves are ignored; lines, rectangles and quads are reduced
    to axis-parallel line dictionaries (as expected by the table code)
    and appended to the module-level EDGES list. In addition, "connected"
    graphics are joined into enveloping rectangles whose borders are
    also added as edges, plus any user-specified lines and boxes.

    :param page: the page to process (rotation is assumed to be 0).
    :param clip: optional rect-like restricting the area of interest.
    :param tset: a resolved TableSettings instance (tolerances are read
        from it).
    :param paths: optional pre-extracted vector graphics in
        "page.get_drawings()" format; extracted from the page if None.
    :param add_lines: optional sequence of point pairs (user lines).
    :param add_boxes: optional sequence of rect-likes (user boxes).
    """
    snap_x = tset.snap_x_tolerance
    snap_y = tset.snap_y_tolerance
    min_length = tset.edge_min_length
    # "lines_strict" on either axis filters out most fill-only paths below
    lines_strict = (
        tset.vertical_strategy == "lines_strict"
        or tset.horizontal_strategy == "lines_strict"
    )
    page_height = page.rect.height
    # cumulative offset of this page's top in the whole document
    doctop_basis = page.number * page_height
    page_number = page.number + 1  # pdfplumber page numbers are 1-based
    prect = page.rect
    if page.rotation in (90, 270):  # these rotations swap width and height
        w, h = prect.br
        prect = Rect(0, 0, h, w)
    if clip is not None:
        clip = Rect(clip)
    else:
        clip = prect  # default: full page

    def are_neighbors(r1, r2):
        """Detect whether r1, r2 are neighbors.

        Defined as:
        The minimum distance between points of r1 and points of r2 is not
        larger than some delta.

        This check supports empty rect-likes and thus also lines.

        Note:
            This type of check is MUCH faster than native Rect containment checks.
        """
        if (  # check if x-coordinates of r1 are within those of r2
            r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
            or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
        ) and (  # ... same for y-coordinates
            r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
            or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
        ):
            return True

        # same check with r1 / r2 exchanging their roles (this is necessary!)
        if (
            r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
            or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
        ) and (
            r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
            or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
        ):
            return True
        return False

    def clean_graphics(npaths=None):
        """Detect and join rectangles of "connected" vector graphics.

        Returns (new_rects, paths): the joined candidate table bboxes
        that contain text, and the filtered list of relevant paths.
        """
        if npaths is None:
            allpaths = page.get_drawings()
        else:  # accept passed-in vector graphics
            allpaths = npaths[:]  # paths relevant for table detection
        paths = []
        for p in allpaths:
            # If only looking at lines, we ignore fill-only paths,
            # except simulated lines (i.e. small width or height).
            if (
                lines_strict
                and p["type"] == "f"
                and p["rect"].width > snap_x
                and p["rect"].height > snap_y
            ):
                continue
            paths.append(p)

        # start with all vector graphics rectangles
        prects = sorted(set([p["rect"] for p in paths]), key=lambda r: (r.y1, r.x0))
        new_rects = []  # the final list of joined rectangles
        # ----------------------------------------------------------------
        # Strategy: Join rectangles that "almost touch" each other.
        # Extend first rectangle with any other that is a "neighbor".
        # Then move it to the final list and continue with the rest.
        # ----------------------------------------------------------------
        while prects:  # the algorithm will empty this list
            prect0 = prects[0]  # copy of first rectangle (performance reasons!)
            repeat = True
            while repeat:  # this loop extends first rect in list
                repeat = False  # set to true again if some other rect touches
                for i in range(len(prects) - 1, 0, -1):  # run backwards
                    if are_neighbors(prect0, prects[i]):  # close enough to rect 0?
                        prect0 |= prects[i].tl  # extend rect 0
                        prect0 |= prects[i].br  # extend rect 0
                        del prects[i]  # delete this rect
                        repeat = True  # keep checking the rest

            # move rect 0 over to result list if there is some text in it
            if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
                # contains text, so accept it as a table bbox candidate
                new_rects.append(prect0)
            del prects[0]  # remove from rect list

        return new_rects, paths

    bboxes, paths = clean_graphics(npaths=paths)

    def is_parallel(p1, p2):
        """Check if line is roughly axis-parallel."""
        if abs(p1.x - p2.x) <= snap_x or abs(p1.y - p2.y) <= snap_y:
            return True
        return False

    def make_line(p, p1, p2, clip):
        """Given 2 points, make a line dictionary for table detection.

        Returns {} for non-axis-parallel lines, lines fully outside
        *clip*, or lines degenerating to a point after clipping.
        """
        if not is_parallel(p1, p2):  # only accepting axis-parallel lines
            return {}
        # compute the extremal values
        x0 = min(p1.x, p2.x)
        x1 = max(p1.x, p2.x)
        y0 = min(p1.y, p2.y)
        y1 = max(p1.y, p2.y)

        # check for outside clip
        if x0 > clip.x1 or x1 < clip.x0 or y0 > clip.y1 or y1 < clip.y0:
            return {}

        if x0 < clip.x0:
            x0 = clip.x0  # adjust to clip boundary

        if x1 > clip.x1:
            x1 = clip.x1  # adjust to clip boundary

        if y0 < clip.y0:
            y0 = clip.y0  # adjust to clip boundary

        if y1 > clip.y1:
            y1 = clip.y1  # adjust to clip boundary

        width = x1 - x0  # from adjusted values
        height = y1 - y0  # from adjusted values
        if width == height == 0:
            return {}  # nothing left to deal with
        line_dict = {
            "x0": x0,
            # y0/y1 are flipped to the bottom-up PDF coordinate system
            "y0": page_height - y0,
            "x1": x1,
            "y1": page_height - y1,
            "width": width,
            "height": height,
            "pts": [(x0, y0), (x1, y1)],
            "linewidth": p["width"],
            "stroke": True,
            "fill": False,
            "evenodd": False,
            "stroking_color": p["color"] if p["color"] else p["fill"],
            "non_stroking_color": None,
            "object_type": "line",
            "page_number": page_number,
            "stroking_pattern": None,
            "non_stroking_pattern": None,
            "top": y0,
            "bottom": y1,
            "doctop": y0 + doctop_basis,
        }
        return line_dict

    for p in paths:
        items = p["items"]  # items in this path

        # if 'closePath', add a line from last to first point
        # (items[-1][2] is the last line's end, items[0][1] the first start)
        if p["closePath"] and items[0][0] == "l" and items[-1][0] == "l":
            items.append(("l", items[-1][2], items[0][1]))

        for i in items:
            if i[0] not in ("l", "re", "qu"):
                continue  # ignore anything else

            if i[0] == "l":  # a line
                p1, p2 = i[1:]
                line_dict = make_line(p, p1, p2, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

            elif i[0] == "re":
                # A rectangle: decompose into 4 lines, but filter out
                # the ones that simulate a line
                rect = i[1].normalize()  # normalize the rectangle

                if (
                    rect.width <= min_length and rect.width < rect.height
                ):  # simulates a vertical line
                    # NOTE(review): abs() assumes non-negative coordinates;
                    # for a rect with negative x this is not the midpoint -
                    # confirm intent.
                    x = abs(rect.x1 + rect.x0) / 2  # take middle value for x
                    p1 = Point(x, rect.y0)
                    p2 = Point(x, rect.y1)
                    line_dict = make_line(p, p1, p2, clip)
                    if line_dict:
                        EDGES.append(line_to_edge(line_dict))
                    continue

                if (
                    rect.height <= min_length and rect.height < rect.width
                ):  # simulates a horizontal line
                    # NOTE(review): same abs() caveat as above for negative y
                    y = abs(rect.y1 + rect.y0) / 2  # take middle value for y
                    p1 = Point(rect.x0, y)
                    p2 = Point(rect.x1, y)
                    line_dict = make_line(p, p1, p2, clip)
                    if line_dict:
                        EDGES.append(line_to_edge(line_dict))
                    continue

                # a true rectangle: add all 4 border lines
                line_dict = make_line(p, rect.tl, rect.bl, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.bl, rect.br, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.br, rect.tr, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.tr, rect.tl, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

            else:  # must be a quad
                # we convert it into (up to) 4 lines
                ul, ur, ll, lr = i[1]

                line_dict = make_line(p, ul, ll, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, ll, lr, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, lr, ur, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, ur, ul, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

    # synthetic path dict used for all lines not taken from real graphics
    path = {"color": (0, 0, 0), "fill": None, "width": 1}
    for bbox in bboxes:  # add the border lines for all enveloping bboxes
        line_dict = make_line(path, bbox.tl, bbox.tr, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.bl, bbox.br, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.tl, bbox.bl, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.tr, bbox.br, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

    if add_lines is not None:  # add user-specified lines
        assert isinstance(add_lines, (tuple, list))
    else:
        add_lines = []
    for p1, p2 in add_lines:
        p1 = Point(p1)
        p2 = Point(p2)
        line_dict = make_line(path, p1, p2, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

    if add_boxes is not None:  # add user-specified rectangles
        assert isinstance(add_boxes, (tuple, list))
    else:
        add_boxes = []
    for box in add_boxes:  # each box contributes its 4 border lines
        r = Rect(box)
        line_dict = make_line(path, r.tl, r.bl, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))
        line_dict = make_line(path, r.bl, r.br, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))
        line_dict = make_line(path, r.br, r.tr, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))
        line_dict = make_line(path, r.tr, r.tl, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))
| 2413 | |
| 2414 | |
def page_rotation_set0(page):
    """Nullify page rotation.

    To correctly detect tables, page rotation must be zero. This
    de-rotates the page content and returns everything needed to undo
    the change later via page_rotation_reset().

    :returns: (refreshed page, xref of inserted content, old rotation,
        old mediabox).
    """
    old_mediabox = page.mediabox  # keep an untouched copy for restoring
    rot = page.rotation  # contains normalized rotation value
    mb = page.mediabox  # working copy - will be modified below

    # choose a shift so the de-rotated content lands inside the
    # (possibly swapped) mediabox
    if rot == 90:
        # before derotation, shift content horizontally
        shift = Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0)
    elif rot == 270:
        # before derotation, shift content vertically
        shift = Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0)
    else:
        shift = Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0)

    # prepend shift and derotation as a new leading content stream
    matrix = shift * page.derotation_matrix
    cmd = b"%g %g %g %g %g %g cm " % tuple(matrix)
    xref = TOOLS._insert_contents(page, cmd, 0)

    # a 90/270 degree rotation swaps x- and y-coordinates
    if rot in (90, 270):
        mb.x0, mb.y0, mb.x1, mb.y1 = mb.y0, mb.x0, mb.y1, mb.x1
        page.set_mediabox(mb)

    page.set_rotation(0)

    # re-read the page so all changes take effect
    page = page.parent[page.number]
    return page, xref, rot, old_mediabox
| 2457 | |
| 2458 | |
def page_rotation_reset(page, xref, rot, mediabox):
    """Reset page rotation to original values.

    Undoes the changes made by page_rotation_set0() - to be used before
    tables are returned to the caller.

    :returns: the refreshed page object.
    """
    doc = page.parent  # owning document
    doc.update_stream(xref, b" ")  # blank out the de-rotation matrix
    page.set_mediabox(mediabox)  # restore the original mediabox
    page.set_rotation(rot)  # restore the original rotation
    return doc[page.number]  # re-read the page to apply the changes
| 2470 | |
| 2471 | |
def find_tables(
    page,
    clip=None,
    vertical_strategy: str = "lines",
    horizontal_strategy: str = "lines",
    vertical_lines: list = None,
    horizontal_lines: list = None,
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
    snap_x_tolerance: float = None,
    snap_y_tolerance: float = None,
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
    join_x_tolerance: float = None,
    join_y_tolerance: float = None,
    edge_min_length: float = 3,
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: float = 3,
    intersection_x_tolerance: float = None,
    intersection_y_tolerance: float = None,
    text_tolerance=3,
    text_x_tolerance=3,
    text_y_tolerance=3,
    strategy=None,  # offer abbreviation
    add_lines=None,  # user-specified lines
    add_boxes=None,  # user-specified rectangles
    paths=None,  # accept vector graphics as parameter
):
    """Find tables on a page and return a TableFinder.

    Entry point of the table detection machinery: resets the global
    CHARS / EDGES lists, temporarily de-rotates the page and switches
    to minimal glyph heights, runs the detection and restores all
    temporary changes afterwards - also when detection raises.

    :param page: the page to search for tables.
    :param clip: optional rect-like restricting the search area.
    :param strategy: shortcut setting both vertical and horizontal
        strategies at once.
    :returns: a TableFinder (its ".tables" holds the detected tables).
    """
    global CHARS, EDGES
    CHARS = []
    EDGES = []
    old_small = bool(TOOLS.set_small_glyph_heights())  # save old value
    TOOLS.set_small_glyph_heights(True)  # we need minimum bboxes
    if page.rotation != 0:
        page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)
    else:
        old_xref, old_rot, old_mediabox = None, None, None

    try:
        # map None to the UNSET sentinel so TableSettings resolves the
        # x/y-specific tolerances from their generic counterparts
        if snap_x_tolerance is None:
            snap_x_tolerance = UNSET
        if snap_y_tolerance is None:
            snap_y_tolerance = UNSET
        if join_x_tolerance is None:
            join_x_tolerance = UNSET
        if join_y_tolerance is None:
            join_y_tolerance = UNSET
        if intersection_x_tolerance is None:
            intersection_x_tolerance = UNSET
        if intersection_y_tolerance is None:
            intersection_y_tolerance = UNSET
        if strategy is not None:  # the abbreviation overrides both axes
            vertical_strategy = strategy
            horizontal_strategy = strategy

        settings = {
            "vertical_strategy": vertical_strategy,
            "horizontal_strategy": horizontal_strategy,
            "explicit_vertical_lines": vertical_lines,
            "explicit_horizontal_lines": horizontal_lines,
            "snap_tolerance": snap_tolerance,
            "snap_x_tolerance": snap_x_tolerance,
            "snap_y_tolerance": snap_y_tolerance,
            "join_tolerance": join_tolerance,
            "join_x_tolerance": join_x_tolerance,
            "join_y_tolerance": join_y_tolerance,
            "edge_min_length": edge_min_length,
            "min_words_vertical": min_words_vertical,
            "min_words_horizontal": min_words_horizontal,
            "intersection_tolerance": intersection_tolerance,
            "intersection_x_tolerance": intersection_x_tolerance,
            "intersection_y_tolerance": intersection_y_tolerance,
            "text_tolerance": text_tolerance,
            "text_x_tolerance": text_x_tolerance,
            "text_y_tolerance": text_y_tolerance,
        }
        tset = TableSettings.resolve(settings=settings)
        page.table_settings = tset

        make_chars(page, clip=clip)  # create character list of page
        make_edges(
            page,
            clip=clip,
            tset=tset,
            paths=paths,
            add_lines=add_lines,
            add_boxes=add_boxes,
        )  # create lines and curves
        tables = TableFinder(page, settings=tset)
    finally:
        # Restore global tool state and page rotation even when table
        # detection raised - previously an exception left the document
        # de-rotated and small glyph heights switched on.
        TOOLS.set_small_glyph_heights(old_small)
        if old_xref is not None:
            page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
    return tables
