Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/tablefind.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tablefind.h | |
| 3 // Description: Helper classes to find tables from ColPartitions. | |
| 4 // Author: Faisal Shafait (faisal.shafait@dfki.de) | |
| 5 // | |
| 6 // (C) Copyright 2009, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_TEXTORD_TABLEFIND_H_ | |
| 20 #define TESSERACT_TEXTORD_TABLEFIND_H_ | |
| 21 | |
| 22 #include "colpartitiongrid.h" | |
| 23 #include "elst.h" | |
| 24 #include "rect.h" | |
| 25 | |
| 26 namespace tesseract { | |
| 27 | |
| 28 // Possible types for a column segment. | |
| 29 enum ColSegType { COL_UNKNOWN, COL_TEXT, COL_TABLE, COL_MIXED, COL_COUNT }; | |
| 30 | |
| 31 class ColPartitionSet; | |
| 32 | |
| 33 // ColSegment holds rectangular blocks that represent segmentation of a page | |
| 34 // into regions containing single column text/table. | |
| 35 class ColSegment; | |
| 36 ELISTIZEH(ColSegment) | |
| 37 CLISTIZEH(ColSegment) | |
| 38 | |
| 39 class ColSegment : public ELIST_LINK { | |
| 40 public: | |
| 41 ColSegment(); | |
| 42 ~ColSegment() = default; | |
| 43 | |
| 44 // Simple accessors and mutators | |
| 45 const TBOX &bounding_box() const { | |
| 46 return bounding_box_; | |
| 47 } | |
| 48 | |
| 49 void set_top(int y) { | |
| 50 bounding_box_.set_top(y); | |
| 51 } | |
| 52 | |
| 53 void set_bottom(int y) { | |
| 54 bounding_box_.set_bottom(y); | |
| 55 } | |
| 56 | |
| 57 void set_left(int x) { | |
| 58 bounding_box_.set_left(x); | |
| 59 } | |
| 60 | |
| 61 void set_right(int x) { | |
| 62 bounding_box_.set_right(x); | |
| 63 } | |
| 64 | |
| 65 void set_bounding_box(const TBOX &other) { | |
| 66 bounding_box_ = other; | |
| 67 } | |
| 68 | |
| 69 int get_num_table_cells() const { | |
| 70 return num_table_cells_; | |
| 71 } | |
| 72 | |
| 73 // set the number of table colpartitions covered by the bounding_box_ | |
| 74 void set_num_table_cells(int n) { | |
| 75 num_table_cells_ = n; | |
| 76 } | |
| 77 | |
| 78 int get_num_text_cells() const { | |
| 79 return num_text_cells_; | |
| 80 } | |
| 81 | |
| 82 // set the number of text colpartitions covered by the bounding_box_ | |
| 83 void set_num_text_cells(int n) { | |
| 84 num_text_cells_ = n; | |
| 85 } | |
| 86 | |
| 87 ColSegType type() const { | |
| 88 return type_; | |
| 89 } | |
| 90 | |
| 91 // set the type of the block based on the ratio of table to text | |
| 92 // colpartitions covered by it. | |
| 93 void set_type(); | |
| 94 | |
| 95 // Provides a color for BBGrid to draw the rectangle. | |
| 96 ScrollView::Color BoxColor() const; | |
| 97 | |
| 98 // Insert a rectangle into bounding_box_ | |
| 99 void InsertBox(const TBOX &other); | |
| 100 | |
| 101 private: | |
| 102 TBOX bounding_box_; // bounding box | |
| 103 int num_table_cells_; | |
| 104 int num_text_cells_; | |
| 105 ColSegType type_; | |
| 106 }; | |
| 107 | |
| 108 // Typedef BBGrid of ColSegments | |
| 109 using ColSegmentGrid = BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT>; | |
| 110 using ColSegmentGridSearch = | |
| 111 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>; | |
| 112 | |
| 113 // TableFinder is a utility class to find a set of tables given a set of | |
| 114 // ColPartitions and Columns. The TableFinder will mark candidate ColPartitions | |
| 115 // based on research in "Table Detection in Heterogeneous Documents". | |
| 116 // Usage flow is as follows: | |
| 117 // TableFinder finder; | |
| 118 // finder.InsertCleanPartitions(/* grid info */) | |
| 119 // finder.LocateTables(/* ColPartitions and Columns */); | |
| 120 // finder.Update TODO(nbeato) | |
| 121 class TESS_API TableFinder { | |
| 122 public: | |
| 123 // Constructor is simple initializations | |
| 124 TableFinder(); | |
| 125 ~TableFinder(); | |
| 126 | |
| 127 // Set the resolution of the connected components in ppi. | |
| 128 void set_resolution(int resolution) { | |
| 129 resolution_ = resolution; | |
| 130 } | |
| 131 // Change the reading order. Initially it is left to right. | |
| 132 void set_left_to_right_language(bool order); | |
| 133 | |
| 134 // Initialize | |
| 135 void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right); | |
| 136 | |
| 137 // Copy cleaned partitions from ColumnFinder's part_grid_ to this | |
| 138 // clean_part_grid_ and insert dot-like noise into period_grid_. | |
| 139 // It resizes the grids in this object to the dimensions of grid. | |
| 140 void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block); | |
| 141 | |
| 142 // High level function to perform table detection | |
| 143 // Finds tables and updates the grid object with new partitions for the | |
| 144 // tables. The columns and width callbacks are used to merge tables. | |
| 145 // The reskew argument is only used to write the tables to the out.png | |
| 146 // if that feature is enabled. | |
| 147 void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, | |
| 148 WidthCallback width_cb, const FCOORD &reskew); | |
| 149 | |
| 150 protected: | |
| 151 // Access for the grid dimensions. | |
| 152 // The results will not be correct until InsertCleanPartitions | |
| 153 // has been called. The values are taken from the grid passed as an argument | |
| 154 // to that function. | |
| 155 int gridsize() const; | |
| 156 int gridwidth() const; | |
| 157 int gridheight() const; | |
| 158 const ICOORD &bleft() const; | |
| 159 const ICOORD &tright() const; | |
| 160 | |
| 161 // Makes a window for debugging, see BBGrid | |
| 162 ScrollView *MakeWindow(int x, int y, const char *window_name); | |
| 163 | |
| 164 //////// Functions to insert objects from the grid into the table finder. | |
| 165 //////// In all cases, ownership is transferred to the table finder. | |
| 166 // Inserts text into the table finder. | |
| 167 void InsertTextPartition(ColPartition *part); | |
| 168 void InsertFragmentedTextPartition(ColPartition *part); | |
| 169 void InsertLeaderPartition(ColPartition *part); | |
| 170 void InsertRulingPartition(ColPartition *part); | |
| 171 void InsertImagePartition(ColPartition *part); | |
| 172 void SplitAndInsertFragmentedTextPartition(ColPartition *part); | |
| 173 bool AllowTextPartition(const ColPartition &part) const; | |
| 174 bool AllowBlob(const BLOBNBOX &blob) const; | |
| 175 | |
| 176 //////// Functions that manipulate ColPartitions in the part_grid_ ///// | |
| 177 //////// to find tables. | |
| 178 //////// | |
| 179 | |
| 180 // Utility function to move segments to col_seg_grid | |
| 181 // Note: Move includes ownership, | |
| 182 // so segments will be be owned by col_seg_grid | |
| 183 void MoveColSegmentsToGrid(ColSegment_LIST *segments, | |
| 184 ColSegmentGrid *col_seg_grid); | |
| 185 | |
| 186 //////// Set up code to run during table detection to correctly | |
| 187 //////// initialize variables on column partitions that are used later. | |
| 188 //////// | |
| 189 | |
| 190 // Initialize the grid and partitions | |
| 191 void InitializePartitions(ColPartitionSet **all_columns); | |
| 192 | |
| 193 // Set left, right and top, bottom spacings of each colpartition. | |
| 194 // Left/right spacings are w.r.t the column boundaries | |
| 195 // Top/bottom spacings are w.r.t. previous and next colpartitions | |
| 196 static void SetPartitionSpacings(ColPartitionGrid *grid, | |
| 197 ColPartitionSet **all_columns); | |
| 198 | |
| 199 // Set spacing and closest neighbors above and below a given colpartition. | |
| 200 void SetVerticalSpacing(ColPartition *part); | |
| 201 | |
| 202 // Set global spacing estimates. This function is dependent on the | |
| 203 // partition spacings. So make sure SetPartitionSpacings is called | |
| 204 // on the same grid before this. | |
| 205 void SetGlobalSpacings(ColPartitionGrid *grid); | |
| 206 // Access to the global median xheight. The xheight is the height | |
| 207 // of a lowercase 'x' character on the page. This can be viewed as the | |
| 208 // average height of a lowercase letter in a textline. As a result | |
| 209 // it is used to make assumptions about spacing between words and | |
| 210 // table cells. | |
| 211 void set_global_median_xheight(int xheight); | |
| 212 // Access to the global median blob width. The width is useful | |
| 213 // when deciding if a partition is noise. | |
| 214 void set_global_median_blob_width(int width); | |
| 215 // Access to the global median ledding. The ledding is the distance between | |
| 216 // two adjacent text lines. This value can be used to get a rough estimate | |
| 217 // for the amount of space between two lines of text. As a result, it | |
| 218 // is used to calculate appropriate spacing between adjacent rows of text. | |
| 219 void set_global_median_ledding(int ledding); | |
| 220 | |
| 221 // Updates the nearest neighbors for each ColPartition in clean_part_grid_. | |
| 222 // The neighbors are most likely SingletonPartner calls after the neighbors | |
| 223 // are assigned. This is hear until it is decided to remove the | |
| 224 // nearest_neighbor code in ColPartition | |
| 225 void FindNeighbors(); | |
| 226 | |
| 227 //////// Functions to mark candidate column partitions as tables. | |
| 228 //////// Tables are marked as described in | |
| 229 //////// Table Detection in Heterogeneous Documents (2010, Shafait & Smith) | |
| 230 //////// | |
| 231 | |
| 232 // High level function to mark partitions as table rows/cells. | |
| 233 // When this function is done, the column partitions in clean_part_grid_ | |
| 234 // should mostly be marked as tables. | |
| 235 void MarkTablePartitions(); | |
| 236 // Marks partitions given a local view of a single partition | |
| 237 void MarkPartitionsUsingLocalInformation(); | |
| 238 /////// Heuristics for local marking | |
| 239 // Check if the partition has at least one large gap between words or no | |
| 240 // significant gap at all | |
| 241 // TODO(nbeato): Make const, prevented because blobnbox array access | |
| 242 bool HasWideOrNoInterWordGap(ColPartition *part) const; | |
| 243 // Checks if a partition is adjacent to leaders on the page | |
| 244 bool HasLeaderAdjacent(const ColPartition &part); | |
| 245 // Filter individual text partitions marked as table partitions | |
| 246 // consisting of paragraph endings, small section headings, and | |
| 247 // headers and footers. | |
| 248 void FilterFalseAlarms(); | |
| 249 void FilterParagraphEndings(); | |
| 250 void FilterHeaderAndFooter(); | |
| 251 // Mark all ColPartitions as table cells that have a table cell above | |
| 252 // and below them | |
| 253 void SmoothTablePartitionRuns(); | |
| 254 | |
| 255 //////// Functions to create bounding boxes (ColSegment) objects for | |
| 256 //////// the columns on the page. The columns are not necessarily | |
| 257 //////// vertical lines, meaning if tab stops strongly suggests that | |
| 258 //////// a column changes horizontal position, as in the case below, | |
| 259 //////// The ColSegment objects will respect that after processing. | |
| 260 //////// | |
| 261 //////// _____________ | |
| 262 //////// Ex. | | | | |
| 263 //////// |_____|______| 5 boxes: 2 on this line | |
| 264 //////// | | | | 3 on this line | |
| 265 //////// |___|____|___| | |
| 266 //////// | |
| 267 | |
| 268 // Get Column segments from best_columns_ | |
| 269 void GetColumnBlocks(ColPartitionSet **columns, | |
| 270 ColSegment_LIST *col_segments); | |
| 271 | |
| 272 // Group Column segments into consecutive single column regions. | |
| 273 void GroupColumnBlocks(ColSegment_LIST *current_segments, | |
| 274 ColSegment_LIST *col_segments); | |
| 275 | |
| 276 // Check if two boxes are consecutive within the same column | |
| 277 bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2); | |
| 278 | |
| 279 // Set the ratio of candidate table partitions in each column | |
| 280 void SetColumnsType(ColSegment_LIST *col_segments); | |
| 281 | |
| 282 // Merge Column Blocks that were split due to the presence of a table | |
| 283 void GridMergeColumnBlocks(); | |
| 284 | |
| 285 //////// Functions to turn marked ColPartitions into candidate tables | |
| 286 //////// using a modified T-Recs++ algorithm described in | |
| 287 //////// Applying The T-Recs Table Recognition System | |
| 288 //////// To The Business Letter Domain (2001, Kieninger & Dengel) | |
| 289 //////// | |
| 290 | |
| 291 // Merge partititons cells into table columns | |
| 292 // Differs from paper by just looking at marked table partitions | |
| 293 // instead of similarity metric. | |
| 294 // Modified section 4.1 of paper. | |
| 295 void GetTableColumns(ColSegment_LIST *table_columns); | |
| 296 | |
| 297 // Finds regions within a column that potentially contain a table. | |
| 298 // Ie, the table columns from GetTableColumns are turned into boxes | |
| 299 // that span the entire page column (using ColumnBlocks found in | |
| 300 // earlier functions) in the x direction and the min/max extent of | |
| 301 // overlapping table columns in the y direction. | |
| 302 // Section 4.2 of paper. | |
| 303 void GetTableRegions(ColSegment_LIST *table_columns, | |
| 304 ColSegment_LIST *table_regions); | |
| 305 | |
| 306 //////// Functions to "patch up" found tables | |
| 307 //////// | |
| 308 | |
| 309 // Merge table regions corresponding to tables spanning multiple columns | |
| 310 void GridMergeTableRegions(); | |
| 311 bool BelongToOneTable(const TBOX &box1, const TBOX &box2); | |
| 312 | |
| 313 // Adjust table boundaries by building a tight bounding box around all | |
| 314 // ColPartitions contained in it. | |
| 315 void AdjustTableBoundaries(); | |
| 316 | |
| 317 // Grows a table to include partitions that are partially covered | |
| 318 // by the table. This includes lines and text. It does not include | |
| 319 // noise or images. | |
| 320 // On entry, result_box is the minimum size of the result. The results of the | |
| 321 // function will union the actual result with result_box. | |
| 322 void GrowTableBox(const TBOX &table_box, TBOX *result_box); | |
| 323 // Grow a table by increasing the size of the box to include | |
| 324 // partitions with significant overlap with the table. | |
| 325 void GrowTableToIncludePartials(const TBOX &table_box, | |
| 326 const TBOX &search_range, TBOX *result_box); | |
| 327 // Grow a table by expanding to the extents of significantly | |
| 328 // overlapping lines. | |
| 329 void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, | |
| 330 TBOX *result_box); | |
| 331 // Checks whether the horizontal line belong to the table by looking at the | |
| 332 // side spacing of extra ColPartitions that will be included in the table | |
| 333 // due to expansion | |
| 334 bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box); | |
| 335 | |
| 336 // Look for isolated column headers above the given table box and | |
| 337 // include them in the table | |
| 338 void IncludeLeftOutColumnHeaders(TBOX *table_box); | |
| 339 | |
| 340 // Remove false alarms consisting of a single column | |
| 341 void DeleteSingleColumnTables(); | |
| 342 | |
| 343 // Return true if at least one gap larger than the global x-height | |
| 344 // exists in the horizontal projection | |
| 345 bool GapInXProjection(int *xprojection, int length); | |
| 346 | |
| 347 //////// Recognize the tables. | |
| 348 //////// | |
| 349 // This function will run the table recognizer and try to find better | |
| 350 // bounding boxes. The structures of the tables never leave this function | |
| 351 // right now. It just tries to prune and merge tables based on info it | |
| 352 // has available. | |
| 353 void RecognizeTables(); | |
| 354 | |
| 355 //////// Debugging functions. Render different structures to GUI | |
| 356 //////// for visual debugging / intuition. | |
| 357 //////// | |
| 358 | |
| 359 // Displays Colpartitions marked as table row. Overlays them on top of | |
| 360 // part_grid_. | |
| 361 void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, | |
| 362 ScrollView::Color color); | |
| 363 | |
| 364 // Displays the colpartitions using a new coloring on an existing window. | |
| 365 // Note: This method is only for debug purpose during development and | |
| 366 // would not be part of checked in code | |
| 367 void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, | |
| 368 ScrollView::Color text_color, | |
| 369 ScrollView::Color table_color); | |
| 370 void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, | |
| 371 ScrollView::Color default_color); | |
| 372 void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, | |
| 373 ScrollView::Color default_color); | |
| 374 | |
| 375 // Merge all colpartitions in table regions to make them a single | |
| 376 // colpartition and revert types of isolated table cells not | |
| 377 // assigned to any table to their original types. | |
| 378 void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, | |
| 379 const WidthCallback &width_cb); | |
| 380 | |
| 381 ///////////////////////////////////////////////// | |
| 382 // Useful objects used during table find process. | |
| 383 ///////////////////////////////////////////////// | |
| 384 // Resolution of the connected components in ppi. | |
| 385 int resolution_; | |
| 386 // Estimate of median x-height over the page | |
| 387 int global_median_xheight_; | |
| 388 // Estimate of the median blob width on the page | |
| 389 int global_median_blob_width_; | |
| 390 // Estimate of median leading on the page | |
| 391 int global_median_ledding_; | |
| 392 // Grid to hold cleaned colpartitions after removing all | |
| 393 // colpartitions that consist of only noise blobs, and removing | |
| 394 // noise blobs from remaining colpartitions. | |
| 395 ColPartitionGrid clean_part_grid_; | |
| 396 // Grid contains the leaders and ruling lines. | |
| 397 ColPartitionGrid leader_and_ruling_grid_; | |
| 398 // Grid contains the broken down column partitions. It can be thought | |
| 399 // of as a "word" grid. However, it usually doesn't break apart text lines. | |
| 400 // It does break apart table data (most of the time). | |
| 401 ColPartitionGrid fragmented_text_grid_; | |
| 402 // Grid of page column blocks | |
| 403 ColSegmentGrid col_seg_grid_; | |
| 404 // Grid of detected tables | |
| 405 ColSegmentGrid table_grid_; | |
| 406 // The reading order of text. Defaults to true, for languages such as English. | |
| 407 bool left_to_right_language_; | |
| 408 }; | |
| 409 | |
| 410 } // namespace tesseract. | |
| 411 | |
| 412 #endif // TESSERACT_TEXTORD_TABLEFIND_H_ |
