Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/tablefind.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tablefind.cpp | |
| 3 // Description: Helper classes to find tables from ColPartitions. | |
| 4 // Author: Faisal Shafait (faisal.shafait@dfki.de) | |
| 5 // | |
| 6 // (C) Copyright 2009, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifdef HAVE_CONFIG_H | |
| 20 # include "config_auto.h" | |
| 21 #endif | |
| 22 | |
| 23 #include <algorithm> | |
| 24 #include <cmath> | |
| 25 #include <utility> | |
| 26 #include "tablefind.h" | |
| 27 | |
| 28 #include <allheaders.h> | |
| 29 | |
| 30 #include "colpartitionset.h" | |
| 31 #include "tablerecog.h" | |
| 32 | |
| 33 namespace tesseract { | |
| 34 | |
| 35 // These numbers are used to calculate the global median stats. | |
| 36 // They just set an upper bound on the stats objects. | |
| 37 // Maximum vertical spacing between neighbor partitions. | |
| 38 const int kMaxVerticalSpacing = 500; | |
| 39 // Maximum width of a blob in a partition. | |
| 40 const int kMaxBlobWidth = 500; | |
| 41 | |
| 42 // Minimum whitespace size to split a partition (measured as a multiple | |
| 43 // of a partition's median width). | |
| 44 const double kSplitPartitionSize = 2.0; | |
| 45 // To insert text, the partition must satisfy these size constraints | |
| 46 // in AllowTextPartition(). The idea is to filter noise partitions | |
| 47 // determined by the size compared to the global medians. | |
| 48 // TODO(nbeato): Need to find good numbers again. | |
| 49 const double kAllowTextHeight = 0.5; | |
| 50 const double kAllowTextWidth = 0.6; | |
| 51 const double kAllowTextArea = 0.8; | |
| 52 // The same thing applies to blobs (to filter noise). | |
| 53 // TODO(nbeato): These numbers are a shot in the dark... | |
| 54 // height and width are 0.5 * gridsize() in colfind.cpp | |
| 55 // area is a rough guess for the size of a period. | |
| 56 const double kAllowBlobHeight = 0.3; | |
| 57 const double kAllowBlobWidth = 0.4; | |
| 58 const double kAllowBlobArea = 0.05; | |
| 59 | |
| 60 // Minimum number of components in a text partition. A partition having fewer | |
| 61 // components than that is more likely a data partition and is a candidate | |
| 62 // table cell. | |
| 63 const int kMinBoxesInTextPartition = 10; | |
| 64 | |
| 65 // Maximum number of components that a data partition can have | |
| 66 const int kMaxBoxesInDataPartition = 20; | |
| 67 | |
| 68 // Maximum allowed gap in a text partitions as a multiple of its median size. | |
| 69 const double kMaxGapInTextPartition = 4.0; | |
| 70 | |
| 71 // Minimum value that the maximum gap in a text partition should have as a | |
| 72 // factor of its median size. | |
| 73 const double kMinMaxGapInTextPartition = 0.5; | |
| 74 | |
| 75 // The amount of overlap that is "normal" for adjacent blobs in a text | |
| 76 // partition. This is used to calculate gap between overlapping blobs. | |
| 77 const double kMaxBlobOverlapFactor = 4.0; | |
| 78 | |
| 79 // Maximum x-height a table partition can have as a multiple of global | |
| 80 // median x-height | |
| 81 const double kMaxTableCellXheight = 2.0; | |
| 82 | |
| 83 // Maximum line spacing between a table column header and column contents | |
| 84 // for merging the two (as a multiple of the partition's median_height). | |
| 85 const int kMaxColumnHeaderDistance = 4; | |
| 86 | |
| 87 // Minimum ratio of num_table_partitions to num_text_partitions in a column | |
| 88 // block to be called it a table column | |
| 89 const double kTableColumnThreshold = 3.0; | |
| 90 | |
| 91 // Search for horizontal ruling lines within the vertical margin as a | |
| 92 // multiple of grid size | |
| 93 // const int kRulingVerticalMargin = 3; | |
| 94 | |
| 95 // Minimum overlap that a colpartition must have with a table region | |
| 96 // to become part of that table | |
| 97 const double kMinOverlapWithTable = 0.6; | |
| 98 | |
| 99 // Maximum side space (distance from column boundary) that a typical | |
| 100 // text-line in flowing text should have as a multiple of its x-height | |
| 101 // (Median size). | |
| 102 const int kSideSpaceMargin = 10; | |
| 103 | |
| 104 // Fraction of the peak of x-projection of a table region to set the | |
| 105 // threshold for the x-projection histogram | |
| 106 const double kSmallTableProjectionThreshold = 0.35; | |
| 107 const double kLargeTableProjectionThreshold = 0.45; | |
| 108 // Minimum number of rows required to look for more rows in the projection. | |
| 109 const int kLargeTableRowCount = 6; | |
| 110 | |
| 111 // Minimum number of rows in a table | |
| 112 const int kMinRowsInTable = 3; | |
| 113 | |
| 114 // The amount of padding (multiplied by global_median_xheight_ during use) | |
| 115 // that is vertically added to the search adjacent leader search during | |
| 116 // ColPartition marking. | |
| 117 const int kAdjacentLeaderSearchPadding = 2; | |
| 118 | |
| 119 // Used when filtering false positives. When finding the last line | |
| 120 // of a paragraph (typically left-aligned), the previous line should have | |
| 121 // its center to the right of the last line by this scaled amount. | |
| 122 const double kParagraphEndingPreviousLineRatio = 1.3; | |
| 123 | |
| 124 // The maximum amount of whitespace allowed left of a paragraph ending. | |
| 125 // Do not filter a ColPartition with more than this space left of it. | |
| 126 const double kMaxParagraphEndingLeftSpaceMultiple = 3.0; | |
| 127 | |
| 128 // Used when filtering false positives. The last line of a paragraph | |
| 129 // should be preceded by a line that is predominantly text. This is the | |
| 130 // ratio of text to whitespace (to the right of the text) that is required | |
| 131 // for the previous line to be a text. | |
| 132 const double kMinParagraphEndingTextToWhitespaceRatio = 3.0; | |
| 133 | |
| 134 // When counting table columns, this is the required gap between two columns | |
| 135 // (it is multiplied by global_median_xheight_). | |
| 136 const double kMaxXProjectionGapFactor = 2.0; | |
| 137 | |
| 138 // Used for similarity in partitions using stroke width. Values copied | |
| 139 // from ColFind.cpp in Ray's CL. | |
| 140 const double kStrokeWidthFractionalTolerance = 0.25; | |
| 141 const double kStrokeWidthConstantTolerance = 2.0; | |
| 142 | |
| 143 #ifndef GRAPHICS_DISABLED | |
| 144 static BOOL_VAR(textord_show_tables, false, "Show table regions (ScrollView)"); | |
| 145 static BOOL_VAR(textord_tablefind_show_mark, false, | |
| 146 "Debug table marking steps in detail (ScrollView)"); | |
| 147 static BOOL_VAR(textord_tablefind_show_stats, false, | |
| 148 "Show page stats used in table finding (ScrollView)"); | |
| 149 #endif | |
| 150 static BOOL_VAR(textord_tablefind_recognize_tables, false, | |
| 151 "Enables the table recognizer for table layout and filtering."); | |
| 152 | |
| 153 // Templated helper function used to create destructor callbacks for the | |
| 154 // BBGrid::ClearGridData() method. | |
| 155 template <typename T> | |
| 156 void DeleteObject(T *object) { | |
| 157 delete object; | |
| 158 } | |
| 159 | |
| 160 TableFinder::TableFinder() | |
| 161 : resolution_(0), | |
| 162 global_median_xheight_(0), | |
| 163 global_median_blob_width_(0), | |
| 164 global_median_ledding_(0), | |
| 165 left_to_right_language_(true) {} | |
| 166 | |
| 167 TableFinder::~TableFinder() { | |
| 168 // ColPartitions and ColSegments created by this class for storage in grids | |
| 169 // need to be deleted explicitly. | |
| 170 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>); | |
| 171 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>); | |
| 172 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>); | |
| 173 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>); | |
| 174 table_grid_.ClearGridData(&DeleteObject<ColSegment>); | |
| 175 } | |
| 176 | |
| 177 void TableFinder::set_left_to_right_language(bool order) { | |
| 178 left_to_right_language_ = order; | |
| 179 } | |
| 180 | |
| 181 void TableFinder::Init(int grid_size, const ICOORD &bottom_left, | |
| 182 const ICOORD &top_right) { | |
| 183 // Initialize clean partitions list and grid | |
| 184 clean_part_grid_.Init(grid_size, bottom_left, top_right); | |
| 185 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right); | |
| 186 fragmented_text_grid_.Init(grid_size, bottom_left, top_right); | |
| 187 col_seg_grid_.Init(grid_size, bottom_left, top_right); | |
| 188 table_grid_.Init(grid_size, bottom_left, top_right); | |
| 189 } | |
| 190 | |
| 191 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and | |
| 192 // insert leaders and rulers into the leader_and_ruling_grid_ | |
| 193 void TableFinder::InsertCleanPartitions(ColPartitionGrid *grid, | |
| 194 TO_BLOCK *block) { | |
| 195 // Calculate stats. This lets us filter partitions in AllowTextPartition() | |
| 196 // and filter blobs in AllowBlob(). | |
| 197 SetGlobalSpacings(grid); | |
| 198 | |
| 199 // Iterate the ColPartitions in the grid. | |
| 200 ColPartitionGridSearch gsearch(grid); | |
| 201 gsearch.SetUniqueMode(true); | |
| 202 gsearch.StartFullSearch(); | |
| 203 ColPartition *part = nullptr; | |
| 204 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 205 // Reject partitions with nothing useful inside of them. | |
| 206 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) { | |
| 207 continue; | |
| 208 } | |
| 209 ColPartition *clean_part = part->ShallowCopy(); | |
| 210 ColPartition *leader_part = nullptr; | |
| 211 if (part->IsLineType()) { | |
| 212 InsertRulingPartition(clean_part); | |
| 213 continue; | |
| 214 } | |
| 215 // Insert all non-text partitions to clean_parts | |
| 216 if (!part->IsTextType()) { | |
| 217 InsertImagePartition(clean_part); | |
| 218 continue; | |
| 219 } | |
| 220 // Insert text colpartitions after removing noisy components from them | |
| 221 // The leaders are split into a separate grid. | |
| 222 BLOBNBOX_CLIST *part_boxes = part->boxes(); | |
| 223 BLOBNBOX_C_IT pit(part_boxes); | |
| 224 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { | |
| 225 BLOBNBOX *pblob = pit.data(); | |
| 226 // Bad blobs... happens in UNLV set. | |
| 227 // news.3G1, page 17 (around x=6) | |
| 228 if (!AllowBlob(*pblob)) { | |
| 229 continue; | |
| 230 } | |
| 231 if (pblob->flow() == BTFT_LEADER) { | |
| 232 if (leader_part == nullptr) { | |
| 233 leader_part = part->ShallowCopy(); | |
| 234 leader_part->set_flow(BTFT_LEADER); | |
| 235 } | |
| 236 leader_part->AddBox(pblob); | |
| 237 } else if (pblob->region_type() != BRT_NOISE) { | |
| 238 clean_part->AddBox(pblob); | |
| 239 } | |
| 240 } | |
| 241 clean_part->ComputeLimits(); | |
| 242 ColPartition *fragmented = clean_part->CopyButDontOwnBlobs(); | |
| 243 InsertTextPartition(clean_part); | |
| 244 SplitAndInsertFragmentedTextPartition(fragmented); | |
| 245 if (leader_part != nullptr) { | |
| 246 // TODO(nbeato): Note that ComputeLimits does not update the column | |
| 247 // information. So the leader may appear to span more columns than it | |
| 248 // really does later on when IsInSameColumnAs gets called to test | |
| 249 // for adjacent leaders. | |
| 250 leader_part->ComputeLimits(); | |
| 251 InsertLeaderPartition(leader_part); | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 // Make the partition partners better for upper and lower neighbors. | |
| 256 clean_part_grid_.FindPartitionPartners(); | |
| 257 clean_part_grid_.RefinePartitionPartners(false); | |
| 258 } | |
| 259 | |
| 260 // High level function to perform table detection | |
| 261 void TableFinder::LocateTables(ColPartitionGrid *grid, | |
| 262 ColPartitionSet **all_columns, | |
| 263 WidthCallback width_cb, const FCOORD &reskew) { | |
| 264 // initialize spacing, neighbors, and columns | |
| 265 InitializePartitions(all_columns); | |
| 266 | |
| 267 #ifndef GRAPHICS_DISABLED | |
| 268 if (textord_show_tables) { | |
| 269 ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors"); | |
| 270 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); | |
| 271 DisplayColPartitions(table_win, &leader_and_ruling_grid_, | |
| 272 ScrollView::AQUAMARINE); | |
| 273 DisplayColPartitionConnections(table_win, &clean_part_grid_, | |
| 274 ScrollView::ORANGE); | |
| 275 | |
| 276 table_win = MakeWindow(100, 300, "Fragmented Text"); | |
| 277 DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE); | |
| 278 } | |
| 279 #endif // !GRAPHICS_DISABLED | |
| 280 | |
| 281 // mark, filter, and smooth candidate table partitions | |
| 282 MarkTablePartitions(); | |
| 283 | |
| 284 // Make single-column blocks from good_columns_ partitions. col_segments are | |
| 285 // moved to a grid later which takes the ownership | |
| 286 ColSegment_LIST column_blocks; | |
| 287 GetColumnBlocks(all_columns, &column_blocks); | |
| 288 // Set the ratio of candidate table partitions in each column | |
| 289 SetColumnsType(&column_blocks); | |
| 290 | |
| 291 // Move column segments to col_seg_grid_ | |
| 292 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_); | |
| 293 | |
| 294 // Detect split in column layout that might have occurred due to the | |
| 295 // presence of a table. In such a case, merge the corresponding columns. | |
| 296 GridMergeColumnBlocks(); | |
| 297 | |
| 298 // Group horizontally overlapping table partitions into table columns. | |
| 299 // table_columns created here get deleted at the end of this method. | |
| 300 ColSegment_LIST table_columns; | |
| 301 GetTableColumns(&table_columns); | |
| 302 | |
| 303 // Within each column, mark the range table regions occupy based on the | |
| 304 // table columns detected. table_regions are moved to a grid later which | |
| 305 // takes the ownership | |
| 306 ColSegment_LIST table_regions; | |
| 307 GetTableRegions(&table_columns, &table_regions); | |
| 308 | |
| 309 #ifndef GRAPHICS_DISABLED | |
| 310 if (textord_tablefind_show_mark) { | |
| 311 ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions"); | |
| 312 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE); | |
| 313 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW); | |
| 314 } | |
| 315 #endif // !GRAPHICS_DISABLED | |
| 316 | |
| 317 // Merge table regions across columns for tables spanning multiple | |
| 318 // columns | |
| 319 MoveColSegmentsToGrid(&table_regions, &table_grid_); | |
| 320 GridMergeTableRegions(); | |
| 321 | |
| 322 // Adjust table boundaries by including nearby horizontal lines and left | |
| 323 // out column headers | |
| 324 AdjustTableBoundaries(); | |
| 325 GridMergeTableRegions(); | |
| 326 | |
| 327 if (textord_tablefind_recognize_tables) { | |
| 328 // Remove false alarms consisting of a single column | |
| 329 DeleteSingleColumnTables(); | |
| 330 | |
| 331 #ifndef GRAPHICS_DISABLED | |
| 332 if (textord_show_tables) { | |
| 333 ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations"); | |
| 334 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); | |
| 335 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI); | |
| 336 table_grid_.DisplayBoxes(table_win); | |
| 337 } | |
| 338 #endif // !GRAPHICS_DISABLED | |
| 339 | |
| 340 // Find table grid structure and reject tables that are malformed. | |
| 341 RecognizeTables(); | |
| 342 GridMergeTableRegions(); | |
| 343 RecognizeTables(); | |
| 344 | |
| 345 #ifndef GRAPHICS_DISABLED | |
| 346 if (textord_show_tables) { | |
| 347 ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables"); | |
| 348 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE, | |
| 349 ScrollView::BLUE); | |
| 350 table_grid_.DisplayBoxes(table_win); | |
| 351 } | |
| 352 #endif // !GRAPHICS_DISABLED | |
| 353 } else { | |
| 354 // Remove false alarms consisting of a single column | |
| 355 // TODO(nbeato): verify this is a NOP after structured table rejection. | |
| 356 // Right now it isn't. If the recognize function is doing what it is | |
| 357 // supposed to do, this function is obsolete. | |
| 358 DeleteSingleColumnTables(); | |
| 359 | |
| 360 #ifndef GRAPHICS_DISABLED | |
| 361 if (textord_show_tables) { | |
| 362 ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables"); | |
| 363 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE, | |
| 364 ScrollView::BLUE); | |
| 365 table_grid_.DisplayBoxes(table_win); | |
| 366 } | |
| 367 #endif // !GRAPHICS_DISABLED | |
| 368 } | |
| 369 | |
| 370 // Merge all colpartitions in table regions to make them a single | |
| 371 // colpartition and revert types of isolated table cells not | |
| 372 // assigned to any table to their original types. | |
| 373 MakeTableBlocks(grid, all_columns, width_cb); | |
| 374 } | |
| 375 // All grids have the same dimensions. The clean_part_grid_ sizes are set from | |
| 376 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as | |
| 377 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_ | |
| 378 // dimensions instead of duplicated memory. | |
| 379 int TableFinder::gridsize() const { | |
| 380 return clean_part_grid_.gridsize(); | |
| 381 } | |
| 382 int TableFinder::gridwidth() const { | |
| 383 return clean_part_grid_.gridwidth(); | |
| 384 } | |
| 385 int TableFinder::gridheight() const { | |
| 386 return clean_part_grid_.gridheight(); | |
| 387 } | |
| 388 const ICOORD &TableFinder::bleft() const { | |
| 389 return clean_part_grid_.bleft(); | |
| 390 } | |
| 391 const ICOORD &TableFinder::tright() const { | |
| 392 return clean_part_grid_.tright(); | |
| 393 } | |
| 394 | |
| 395 void TableFinder::InsertTextPartition(ColPartition *part) { | |
| 396 ASSERT_HOST(part != nullptr); | |
| 397 if (AllowTextPartition(*part)) { | |
| 398 clean_part_grid_.InsertBBox(true, true, part); | |
| 399 } else { | |
| 400 delete part; | |
| 401 } | |
| 402 } | |
| 403 void TableFinder::InsertFragmentedTextPartition(ColPartition *part) { | |
| 404 ASSERT_HOST(part != nullptr); | |
| 405 if (AllowTextPartition(*part)) { | |
| 406 fragmented_text_grid_.InsertBBox(true, true, part); | |
| 407 } else { | |
| 408 delete part; | |
| 409 } | |
| 410 } | |
| 411 void TableFinder::InsertLeaderPartition(ColPartition *part) { | |
| 412 ASSERT_HOST(part != nullptr); | |
| 413 if (!part->IsEmpty() && part->bounding_box().area() > 0) { | |
| 414 leader_and_ruling_grid_.InsertBBox(true, true, part); | |
| 415 } else { | |
| 416 delete part; | |
| 417 } | |
| 418 } | |
| 419 void TableFinder::InsertRulingPartition(ColPartition *part) { | |
| 420 leader_and_ruling_grid_.InsertBBox(true, true, part); | |
| 421 } | |
| 422 void TableFinder::InsertImagePartition(ColPartition *part) { | |
| 423 // NOTE: If images are placed into a different grid in the future, | |
| 424 // the function SetPartitionSpacings needs to be updated. It should | |
| 425 // be the only thing that cares about image partitions. | |
| 426 clean_part_grid_.InsertBBox(true, true, part); | |
| 427 } | |
| 428 | |
| 429 // Splits a partition into its "words". The splits happen | |
| 430 // at locations with wide inter-blob spacing. This is useful | |
| 431 // because it allows the table recognize to "cut through" the | |
| 432 // text lines on the page. The assumption is that a table | |
| 433 // will have several lines with similar overlapping whitespace | |
| 434 // whereas text will not have this type of property. | |
| 435 // Note: The code assumes that blobs are sorted by the left side x! | |
| 436 // This will not work (as well) if the blobs are sorted by center/right. | |
| 437 void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition *part) { | |
| 438 ASSERT_HOST(part != nullptr); | |
| 439 // Bye bye empty partitions! | |
| 440 if (part->boxes()->empty()) { | |
| 441 delete part; | |
| 442 return; | |
| 443 } | |
| 444 | |
| 445 // The AllowBlob function prevents this. | |
| 446 ASSERT_HOST(part->median_width() > 0); | |
| 447 const double kThreshold = part->median_width() * kSplitPartitionSize; | |
| 448 | |
| 449 ColPartition *right_part = part; | |
| 450 bool found_split = true; | |
| 451 while (found_split) { | |
| 452 found_split = false; | |
| 453 BLOBNBOX_C_IT box_it(right_part->boxes()); | |
| 454 // Blobs are sorted left side first. If blobs overlap, | |
| 455 // the previous blob may have a "more right" right side. | |
| 456 // Account for this by always keeping the largest "right" | |
| 457 // so far. | |
| 458 int previous_right = INT32_MIN; | |
| 459 | |
| 460 // Look for the next split in the partition. | |
| 461 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) { | |
| 462 const TBOX &box = box_it.data()->bounding_box(); | |
| 463 if (previous_right != INT32_MIN && | |
| 464 box.left() - previous_right > kThreshold) { | |
| 465 // We have a split position. Split the partition in two pieces. | |
| 466 // Insert the left piece in the grid and keep processing the right. | |
| 467 int mid_x = (box.left() + previous_right) / 2; | |
| 468 ColPartition *left_part = right_part; | |
| 469 right_part = left_part->SplitAt(mid_x); | |
| 470 | |
| 471 InsertFragmentedTextPartition(left_part); | |
| 472 found_split = true; | |
| 473 break; | |
| 474 } | |
| 475 | |
| 476 // The right side of the previous blobs. | |
| 477 previous_right = std::max(previous_right, static_cast<int>(box.right())); | |
| 478 } | |
| 479 } | |
| 480 // When a split is not found, the right part is minimized | |
| 481 // as much as possible, so process it. | |
| 482 InsertFragmentedTextPartition(right_part); | |
| 483 } | |
| 484 | |
| 485 // Some simple criteria to filter out now. We want to make sure the | |
| 486 // average blob size in the partition is consistent with the | |
| 487 // global page stats. | |
| 488 // The area metric will almost always pass for multi-blob partitions. | |
| 489 // It is useful when filtering out noise caused by an isolated blob. | |
| 490 bool TableFinder::AllowTextPartition(const ColPartition &part) const { | |
| 491 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight; | |
| 492 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth; | |
| 493 const int median_area = global_median_xheight_ * global_median_blob_width_; | |
| 494 const double kAreaPerBlobRequired = median_area * kAllowTextArea; | |
| 495 // Keep comparisons strictly greater to disallow 0! | |
| 496 return part.median_height() > kHeightRequired && | |
| 497 part.median_width() > kWidthRequired && | |
| 498 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count(); | |
| 499 } | |
| 500 | |
| 501 // Same as above, applied to blobs. Keep in mind that | |
| 502 // leaders, commas, and periods are important in tables. | |
| 503 bool TableFinder::AllowBlob(const BLOBNBOX &blob) const { | |
| 504 const TBOX &box = blob.bounding_box(); | |
| 505 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight; | |
| 506 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth; | |
| 507 const int median_area = global_median_xheight_ * global_median_blob_width_; | |
| 508 const double kAreaRequired = median_area * kAllowBlobArea; | |
| 509 // Keep comparisons strictly greater to disallow 0! | |
| 510 return box.height() > kHeightRequired && box.width() > kWidthRequired && | |
| 511 box.area() > kAreaRequired; | |
| 512 } | |
| 513 | |
| 514 // TODO(nbeato): The grid that makes the window doesn't seem to matter. | |
| 515 // The only downside is that window messages will be caught by | |
| 516 // clean_part_grid_ instead of a useful object. This is a temporary solution | |
| 517 // for the debug windows created by the TableFinder. | |
| 518 #ifndef GRAPHICS_DISABLED | |
| 519 ScrollView *TableFinder::MakeWindow(int x, int y, const char *window_name) { | |
| 520 return clean_part_grid_.MakeWindow(x, y, window_name); | |
| 521 } | |
| 522 #endif | |
| 523 | |
| 524 // Make single-column blocks from good_columns_ partitions. | |
| 525 void TableFinder::GetColumnBlocks(ColPartitionSet **all_columns, | |
| 526 ColSegment_LIST *column_blocks) { | |
| 527 for (int i = 0; i < gridheight(); ++i) { | |
| 528 ColPartitionSet *columns = all_columns[i]; | |
| 529 if (columns != nullptr) { | |
| 530 ColSegment_LIST new_blocks; | |
| 531 // Get boxes from the current vertical position on the grid | |
| 532 columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(), | |
| 533 &new_blocks); | |
| 534 // Merge the new_blocks boxes into column_blocks if they are well-aligned | |
| 535 GroupColumnBlocks(&new_blocks, column_blocks); | |
| 536 } | |
| 537 } | |
| 538 } | |
| 539 | |
| 540 // Merge column segments into the current list if they are well aligned. | |
| 541 void TableFinder::GroupColumnBlocks(ColSegment_LIST *new_blocks, | |
| 542 ColSegment_LIST *column_blocks) { | |
| 543 ColSegment_IT src_it(new_blocks); | |
| 544 ColSegment_IT dest_it(column_blocks); | |
| 545 // iterate through the source list | |
| 546 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { | |
| 547 ColSegment *src_seg = src_it.data(); | |
| 548 const TBOX &src_box = src_seg->bounding_box(); | |
| 549 bool match_found = false; | |
| 550 // iterate through the destination list to find a matching column block | |
| 551 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) { | |
| 552 ColSegment *dest_seg = dest_it.data(); | |
| 553 TBOX dest_box = dest_seg->bounding_box(); | |
| 554 if (ConsecutiveBoxes(src_box, dest_box)) { | |
| 555 // If matching block is found, insert the current block into it | |
| 556 // and delete the source block. | |
| 557 dest_seg->InsertBox(src_box); | |
| 558 match_found = true; | |
| 559 delete src_it.extract(); | |
| 560 break; | |
| 561 } | |
| 562 } | |
| 563 // If no match is found, just append the source block to column_blocks | |
| 564 if (!match_found) { | |
| 565 dest_it.add_after_then_move(src_it.extract()); | |
| 566 } | |
| 567 } | |
| 568 } | |
| 569 | |
| 570 // are the two boxes immediate neighbors along the vertical direction | |
| 571 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) { | |
| 572 int x_margin = 20; | |
| 573 int y_margin = 5; | |
| 574 return (abs(b1.left() - b2.left()) < x_margin) && | |
| 575 (abs(b1.right() - b2.right()) < x_margin) && | |
| 576 (abs(b1.top() - b2.bottom()) < y_margin || | |
| 577 abs(b2.top() - b1.bottom()) < y_margin); | |
| 578 } | |
| 579 | |
| 580 // Set up info for clean_part_grid_ partitions to be valid during detection | |
| 581 // code. | |
| 582 void TableFinder::InitializePartitions(ColPartitionSet **all_columns) { | |
| 583 FindNeighbors(); | |
| 584 SetPartitionSpacings(&clean_part_grid_, all_columns); | |
| 585 SetGlobalSpacings(&clean_part_grid_); | |
| 586 } | |
| 587 | |
| 588 // Set left, right and top, bottom spacings of each colpartition. | |
| 589 void TableFinder::SetPartitionSpacings(ColPartitionGrid *grid, | |
| 590 ColPartitionSet **all_columns) { | |
| 591 // Iterate the ColPartitions in the grid. | |
| 592 ColPartitionGridSearch gsearch(grid); | |
| 593 gsearch.StartFullSearch(); | |
| 594 ColPartition *part = nullptr; | |
| 595 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 596 ColPartitionSet *columns = all_columns[gsearch.GridY()]; | |
| 597 TBOX box = part->bounding_box(); | |
| 598 int y = part->MidY(); | |
| 599 ColPartition *left_column = columns->ColumnContaining(box.left(), y); | |
| 600 ColPartition *right_column = columns->ColumnContaining(box.right(), y); | |
| 601 // set distance from left column as space to the left | |
| 602 if (left_column) { | |
| 603 int left_space = std::max(0, box.left() - left_column->LeftAtY(y)); | |
| 604 part->set_space_to_left(left_space); | |
| 605 } | |
| 606 // set distance from right column as space to the right | |
| 607 if (right_column) { | |
| 608 int right_space = std::max(0, right_column->RightAtY(y) - box.right()); | |
| 609 part->set_space_to_right(right_space); | |
| 610 } | |
| 611 | |
| 612 // Look for images that may be closer. | |
| 613 // NOTE: used to be part_grid_, might cause issues now | |
| 614 ColPartitionGridSearch hsearch(grid); | |
| 615 hsearch.StartSideSearch(box.left(), box.bottom(), box.top()); | |
| 616 ColPartition *neighbor = nullptr; | |
| 617 while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) { | |
| 618 if (neighbor->type() == PT_PULLOUT_IMAGE || | |
| 619 neighbor->type() == PT_FLOWING_IMAGE || | |
| 620 neighbor->type() == PT_HEADING_IMAGE) { | |
| 621 int right = neighbor->bounding_box().right(); | |
| 622 if (right < box.left()) { | |
| 623 int space = std::min(box.left() - right, part->space_to_left()); | |
| 624 part->set_space_to_left(space); | |
| 625 } | |
| 626 } | |
| 627 } | |
| 628 hsearch.StartSideSearch(box.left(), box.bottom(), box.top()); | |
| 629 neighbor = nullptr; | |
| 630 while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) { | |
| 631 if (neighbor->type() == PT_PULLOUT_IMAGE || | |
| 632 neighbor->type() == PT_FLOWING_IMAGE || | |
| 633 neighbor->type() == PT_HEADING_IMAGE) { | |
| 634 int left = neighbor->bounding_box().left(); | |
| 635 if (left > box.right()) { | |
| 636 int space = std::min(left - box.right(), part->space_to_right()); | |
| 637 part->set_space_to_right(space); | |
| 638 } | |
| 639 } | |
| 640 } | |
| 641 | |
| 642 ColPartition *upper_part = part->SingletonPartner(true); | |
| 643 if (upper_part) { | |
| 644 int space = | |
| 645 std::max(0, static_cast<int>(upper_part->bounding_box().bottom() - | |
| 646 part->bounding_box().bottom())); | |
| 647 part->set_space_above(space); | |
| 648 } else { | |
| 649 // TODO(nbeato): What constitutes a good value? | |
| 650 // 0 is the default value when not set, explicitly noting it needs to | |
| 651 // be something else. | |
| 652 part->set_space_above(INT32_MAX); | |
| 653 } | |
| 654 | |
| 655 ColPartition *lower_part = part->SingletonPartner(false); | |
| 656 if (lower_part) { | |
| 657 int space = | |
| 658 std::max(0, static_cast<int>(part->bounding_box().bottom() - | |
| 659 lower_part->bounding_box().bottom())); | |
| 660 part->set_space_below(space); | |
| 661 } else { | |
| 662 // TODO(nbeato): What constitutes a good value? | |
| 663 // 0 is the default value when not set, explicitly noting it needs to | |
| 664 // be something else. | |
| 665 part->set_space_below(INT32_MAX); | |
| 666 } | |
| 667 } | |
| 668 } | |
| 669 | |
| 670 // Set spacing and closest neighbors above and below a given colpartition. | |
| 671 void TableFinder::SetVerticalSpacing(ColPartition *part) { | |
| 672 TBOX box = part->bounding_box(); | |
| 673 int top_range = | |
| 674 std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y())); | |
| 675 int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing, | |
| 676 static_cast<int>(bleft().y())); | |
| 677 box.set_top(top_range); | |
| 678 box.set_bottom(bottom_range); | |
| 679 | |
| 680 TBOX part_box = part->bounding_box(); | |
| 681 // Start a rect search | |
| 682 ColPartitionGridSearch rectsearch(&clean_part_grid_); | |
| 683 rectsearch.StartRectSearch(box); | |
| 684 ColPartition *neighbor; | |
| 685 int min_space_above = kMaxVerticalSpacing; | |
| 686 int min_space_below = kMaxVerticalSpacing; | |
| 687 ColPartition *above_neighbor = nullptr; | |
| 688 ColPartition *below_neighbor = nullptr; | |
| 689 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) { | |
| 690 if (neighbor == part) { | |
| 691 continue; | |
| 692 } | |
| 693 TBOX neighbor_box = neighbor->bounding_box(); | |
| 694 if (neighbor_box.major_x_overlap(part_box)) { | |
| 695 int gap = abs(part->median_bottom() - neighbor->median_bottom()); | |
| 696 // If neighbor is below current partition | |
| 697 if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) { | |
| 698 min_space_below = gap; | |
| 699 below_neighbor = neighbor; | |
| 700 } // If neighbor is above current partition | |
| 701 else if (part_box.top() < neighbor_box.bottom() && | |
| 702 gap < min_space_above) { | |
| 703 min_space_above = gap; | |
| 704 above_neighbor = neighbor; | |
| 705 } | |
| 706 } | |
| 707 } | |
| 708 part->set_space_above(min_space_above); | |
| 709 part->set_space_below(min_space_below); | |
| 710 part->set_nearest_neighbor_above(above_neighbor); | |
| 711 part->set_nearest_neighbor_below(below_neighbor); | |
| 712 } | |
| 713 | |
| 714 // Set global spacing and x-height estimates | |
| 715 void TableFinder::SetGlobalSpacings(ColPartitionGrid *grid) { | |
| 716 STATS xheight_stats(0, kMaxVerticalSpacing); | |
| 717 STATS width_stats(0, kMaxBlobWidth); | |
| 718 STATS ledding_stats(0, kMaxVerticalSpacing); | |
| 719 // Iterate the ColPartitions in the grid. | |
| 720 ColPartitionGridSearch gsearch(grid); | |
| 721 gsearch.SetUniqueMode(true); | |
| 722 gsearch.StartFullSearch(); | |
| 723 ColPartition *part = nullptr; | |
| 724 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 725 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length. | |
| 726 // ComputeLimits needs to get called somewhere outside of TableFinder | |
| 727 // to make sure the partitions are properly initialized. | |
| 728 // When this is called, SmoothPartitionPartners dies in an assert after | |
| 729 // table find runs. Alternative solution. | |
| 730 // part->ComputeLimits(); | |
| 731 if (part->IsTextType()) { | |
| 732 // xheight_stats.add(part->median_height(), part->boxes_count()); | |
| 733 // width_stats.add(part->median_width(), part->boxes_count()); | |
| 734 | |
| 735 // This loop can be removed when above issues are fixed. | |
| 736 // Replace it with the 2 lines commented out above. | |
| 737 BLOBNBOX_C_IT it(part->boxes()); | |
| 738 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 739 xheight_stats.add(it.data()->bounding_box().height(), 1); | |
| 740 width_stats.add(it.data()->bounding_box().width(), 1); | |
| 741 } | |
| 742 | |
| 743 ledding_stats.add(part->space_above(), 1); | |
| 744 ledding_stats.add(part->space_below(), 1); | |
| 745 } | |
| 746 } | |
| 747 // Set estimates based on median of statistics obtained | |
| 748 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5)); | |
| 749 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5)); | |
| 750 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5)); | |
| 751 #ifndef GRAPHICS_DISABLED | |
| 752 if (textord_tablefind_show_stats) { | |
| 753 const char *kWindowName = "X-height (R), X-width (G), and ledding (B)"; | |
| 754 ScrollView *stats_win = MakeWindow(500, 10, kWindowName); | |
| 755 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED); | |
| 756 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN); | |
| 757 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE); | |
| 758 } | |
| 759 #endif // !GRAPHICS_DISABLED | |
| 760 } | |
| 761 | |
| 762 void TableFinder::set_global_median_xheight(int xheight) { | |
| 763 global_median_xheight_ = xheight; | |
| 764 } | |
| 765 void TableFinder::set_global_median_blob_width(int width) { | |
| 766 global_median_blob_width_ = width; | |
| 767 } | |
| 768 void TableFinder::set_global_median_ledding(int ledding) { | |
| 769 global_median_ledding_ = ledding; | |
| 770 } | |
| 771 | |
| 772 void TableFinder::FindNeighbors() { | |
| 773 ColPartitionGridSearch gsearch(&clean_part_grid_); | |
| 774 gsearch.StartFullSearch(); | |
| 775 ColPartition *part = nullptr; | |
| 776 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 777 // TODO(nbeato): Rename this function, meaning is different now. | |
| 778 // IT is finding nearest neighbors its own way | |
| 779 // SetVerticalSpacing(part); | |
| 780 | |
| 781 ColPartition *upper = part->SingletonPartner(true); | |
| 782 if (upper) { | |
| 783 part->set_nearest_neighbor_above(upper); | |
| 784 } | |
| 785 | |
| 786 ColPartition *lower = part->SingletonPartner(false); | |
| 787 if (lower) { | |
| 788 part->set_nearest_neighbor_below(lower); | |
| 789 } | |
| 790 } | |
| 791 } | |
| 792 | |
| 793 // High level interface. Input is an unmarked ColPartitionGrid | |
| 794 // (namely, clean_part_grid_). Partitions are identified using local | |
| 795 // information and filter/smoothed. The function exit should contain | |
| 796 // a good sampling of the table partitions. | |
| 797 void TableFinder::MarkTablePartitions() { | |
| 798 MarkPartitionsUsingLocalInformation(); | |
| 799 #ifndef GRAPHICS_DISABLED | |
| 800 if (textord_tablefind_show_mark) { | |
| 801 ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions"); | |
| 802 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); | |
| 803 DisplayColPartitions(table_win, &leader_and_ruling_grid_, | |
| 804 ScrollView::AQUAMARINE); | |
| 805 } | |
| 806 #endif | |
| 807 FilterFalseAlarms(); | |
| 808 #ifndef GRAPHICS_DISABLED | |
| 809 if (textord_tablefind_show_mark) { | |
| 810 ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions"); | |
| 811 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); | |
| 812 DisplayColPartitions(table_win, &leader_and_ruling_grid_, | |
| 813 ScrollView::AQUAMARINE); | |
| 814 } | |
| 815 #endif | |
| 816 SmoothTablePartitionRuns(); | |
| 817 #ifndef GRAPHICS_DISABLED | |
| 818 if (textord_tablefind_show_mark) { | |
| 819 ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions"); | |
| 820 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); | |
| 821 DisplayColPartitions(table_win, &leader_and_ruling_grid_, | |
| 822 ScrollView::AQUAMARINE); | |
| 823 } | |
| 824 #endif | |
| 825 FilterFalseAlarms(); | |
| 826 #ifndef GRAPHICS_DISABLED | |
| 827 if (textord_tablefind_show_mark || textord_show_tables) { | |
| 828 ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions"); | |
| 829 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); | |
| 830 DisplayColPartitions(table_win, &leader_and_ruling_grid_, | |
| 831 ScrollView::AQUAMARINE); | |
| 832 } | |
| 833 #endif | |
| 834 } | |
| 835 | |
| 836 // These types of partitions are marked as table partitions: | |
| 837 // 1- Partitions that have at lease one large gap between words | |
| 838 // 2- Partitions that consist of only one word (no significant gap | |
| 839 // between components) | |
| 840 // 3- Partitions that vertically overlap with other partitions within the | |
| 841 // same column. | |
| 842 // 4- Partitions with leaders before/after them. | |
| 843 void TableFinder::MarkPartitionsUsingLocalInformation() { | |
| 844 // Iterate the ColPartitions in the grid. | |
| 845 ColPartitionGridSearch gsearch(&clean_part_grid_); | |
| 846 gsearch.StartFullSearch(); | |
| 847 ColPartition *part = nullptr; | |
| 848 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 849 if (!part->IsTextType()) { // Only consider text partitions | |
| 850 continue; | |
| 851 } | |
| 852 // Only consider partitions in dominant font size or smaller | |
| 853 if (part->median_height() > kMaxTableCellXheight * global_median_xheight_) { | |
| 854 continue; | |
| 855 } | |
| 856 // Mark partitions with a large gap, or no significant gap as | |
| 857 // table partitions. | |
| 858 // Comments: It produces several false alarms at: | |
| 859 // - last line of a paragraph (fixed) | |
| 860 // - single word section headings | |
| 861 // - page headers and footers | |
| 862 // - numbered equations | |
| 863 // - line drawing regions | |
| 864 // TODO(faisal): detect and fix above-mentioned cases | |
| 865 if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) { | |
| 866 part->set_table_type(); | |
| 867 } | |
| 868 } | |
| 869 } | |
| 870 | |
| 871 // Check if the partition has at least one large gap between words or no | |
| 872 // significant gap at all | |
| 873 bool TableFinder::HasWideOrNoInterWordGap(ColPartition *part) const { | |
| 874 // Should only get text partitions. | |
| 875 ASSERT_HOST(part->IsTextType()); | |
| 876 // Blob access | |
| 877 BLOBNBOX_CLIST *part_boxes = part->boxes(); | |
| 878 BLOBNBOX_C_IT it(part_boxes); | |
| 879 // Check if this is a relatively small partition (such as a single word) | |
| 880 if (part->bounding_box().width() < | |
| 881 kMinBoxesInTextPartition * part->median_height() && | |
| 882 part_boxes->length() < kMinBoxesInTextPartition) { | |
| 883 return true; | |
| 884 } | |
| 885 | |
| 886 // Variables used to compute inter-blob spacing. | |
| 887 int previous_x1 = -1; | |
| 888 // Stores the maximum gap detected. | |
| 889 int largest_partition_gap_found = -1; | |
| 890 // Text partition gap limits. If this is text (and not a table), | |
| 891 // there should be at least one gap larger than min_gap and no gap | |
| 892 // larger than max_gap. | |
| 893 const double max_gap = kMaxGapInTextPartition * part->median_height(); | |
| 894 const double min_gap = kMinMaxGapInTextPartition * part->median_height(); | |
| 895 | |
| 896 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 897 BLOBNBOX *blob = it.data(); | |
| 898 int current_x0 = blob->bounding_box().left(); | |
| 899 int current_x1 = blob->bounding_box().right(); | |
| 900 if (previous_x1 != -1) { | |
| 901 int gap = current_x0 - previous_x1; | |
| 902 | |
| 903 // TODO(nbeato): Boxes may overlap? Huh? | |
| 904 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors | |
| 905 // on the top right of the page are filtered out with this line. | |
| 906 // Note 2: Iterating over blobs in a partition, so we are looking for | |
| 907 // spacing between the words. | |
| 908 if (gap < 0) { | |
| 909 // More likely case, the blobs slightly overlap. This can happen | |
| 910 // with diacritics (accents) or broken alphabet symbols (characters). | |
| 911 // Merge boxes together by taking max of right sides. | |
| 912 if (-gap < part->median_height() * kMaxBlobOverlapFactor) { | |
| 913 previous_x1 = std::max(previous_x1, current_x1); | |
| 914 continue; | |
| 915 } | |
| 916 // Extreme case, blobs overlap significantly in the same partition... | |
| 917 // This should not happen often (if at all), but it does. | |
| 918 // TODO(nbeato): investigate cases when this happens. | |
| 919 else { | |
| 920 // The behavior before was to completely ignore this case. | |
| 921 } | |
| 922 } | |
| 923 | |
| 924 // If a large enough gap is found, mark it as a table cell (return true) | |
| 925 if (gap > max_gap) { | |
| 926 return true; | |
| 927 } | |
| 928 if (gap > largest_partition_gap_found) { | |
| 929 largest_partition_gap_found = gap; | |
| 930 } | |
| 931 } | |
| 932 previous_x1 = current_x1; | |
| 933 } | |
| 934 // Since no large gap was found, return false if the partition is too | |
| 935 // long to be a data cell | |
| 936 if (part->bounding_box().width() > | |
| 937 kMaxBoxesInDataPartition * part->median_height() || | |
| 938 part_boxes->length() > kMaxBoxesInDataPartition) { | |
| 939 return false; | |
| 940 } | |
| 941 | |
| 942 // A partition may be a single blob. In this case, it's an isolated symbol | |
| 943 // or non-text (such as a ruling or image). | |
| 944 // Detect these as table partitions? Shouldn't this be case by case? | |
| 945 // The behavior before was to ignore this, making max_partition_gap < 0 | |
| 946 // and implicitly return true. Just making it explicit. | |
| 947 if (largest_partition_gap_found == -1) { | |
| 948 return true; | |
| 949 } | |
| 950 | |
| 951 // return true if the maximum gap found is smaller than the minimum allowed | |
| 952 // max_gap in a text partition. This indicates that there is no significant | |
| 953 // space in the partition, hence it is likely a single word. | |
| 954 return largest_partition_gap_found < min_gap; | |
| 955 } | |
| 956 | |
| 957 // A criteria for possible tables is that a table may have leaders | |
| 958 // between data cells. An aggressive solution to find such tables is to | |
| 959 // explicitly mark partitions that have adjacent leaders. | |
| 960 // Note that this includes overlapping leaders. However, it does not | |
| 961 // include leaders in different columns on the page. | |
| 962 // Possible false-positive will include lists, such as a table of contents. | |
| 963 // As these arise, the aggressive nature of this search may need to be | |
| 964 // trimmed down. | |
| 965 bool TableFinder::HasLeaderAdjacent(const ColPartition &part) { | |
| 966 if (part.flow() == BTFT_LEADER) { | |
| 967 return true; | |
| 968 } | |
| 969 // Search range is left and right bounded by an offset of the | |
| 970 // median xheight. This offset is to allow some tolerance to the | |
| 971 // the leaders on the page in the event that the alignment is still | |
| 972 // a bit off. | |
| 973 const TBOX &box = part.bounding_box(); | |
| 974 const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_; | |
| 975 const int top = box.top() + search_size; | |
| 976 const int bottom = box.bottom() - search_size; | |
| 977 ColPartitionGridSearch hsearch(&leader_and_ruling_grid_); | |
| 978 for (int direction = 0; direction < 2; ++direction) { | |
| 979 bool right_to_left = (direction == 0); | |
| 980 int x = right_to_left ? box.right() : box.left(); | |
| 981 hsearch.StartSideSearch(x, bottom, top); | |
| 982 ColPartition *leader = nullptr; | |
| 983 while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) { | |
| 984 // The leader could be a horizontal ruling in the grid. | |
| 985 // Make sure it is actually a leader. | |
| 986 if (leader->flow() != BTFT_LEADER) { | |
| 987 continue; | |
| 988 } | |
| 989 // This should not happen, they are in different grids. | |
| 990 ASSERT_HOST(&part != leader); | |
| 991 // Make sure the leader shares a page column with the partition, | |
| 992 // otherwise we are spreading across columns. | |
| 993 if (!part.IsInSameColumnAs(*leader)) { | |
| 994 break; | |
| 995 } | |
| 996 // There should be a significant vertical overlap | |
| 997 if (!leader->VSignificantCoreOverlap(part)) { | |
| 998 continue; | |
| 999 } | |
| 1000 // Leader passed all tests, so it is adjacent. | |
| 1001 return true; | |
| 1002 } | |
| 1003 } | |
| 1004 // No leaders are adjacent to the given partition. | |
| 1005 return false; | |
| 1006 } | |
| 1007 | |
| 1008 // Filter individual text partitions marked as table partitions | |
| 1009 // consisting of paragraph endings, small section headings, and | |
| 1010 // headers and footers. | |
| 1011 void TableFinder::FilterFalseAlarms() { | |
| 1012 FilterParagraphEndings(); | |
| 1013 FilterHeaderAndFooter(); | |
| 1014 // TODO(nbeato): Fully justified text as non-table? | |
| 1015 } | |
| 1016 | |
| 1017 void TableFinder::FilterParagraphEndings() { | |
| 1018 // Detect last line of paragraph | |
| 1019 // Iterate the ColPartitions in the grid. | |
| 1020 ColPartitionGridSearch gsearch(&clean_part_grid_); | |
| 1021 gsearch.StartFullSearch(); | |
| 1022 ColPartition *part = nullptr; | |
| 1023 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1024 if (part->type() != PT_TABLE) { | |
| 1025 continue; // Consider only table partitions | |
| 1026 } | |
| 1027 | |
| 1028 // Paragraph ending should have flowing text above it. | |
| 1029 ColPartition *upper_part = part->nearest_neighbor_above(); | |
| 1030 if (!upper_part) { | |
| 1031 continue; | |
| 1032 } | |
| 1033 if (upper_part->type() != PT_FLOWING_TEXT) { | |
| 1034 continue; | |
| 1035 } | |
| 1036 if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) { | |
| 1037 continue; | |
| 1038 } | |
| 1039 // Check if its the last line of a paragraph. | |
| 1040 // In most cases, a paragraph ending should be left-aligned to text line | |
| 1041 // above it. Sometimes, it could be a 2 line paragraph, in which case | |
| 1042 // the line above it is indented. | |
| 1043 // To account for that, check if the partition center is to | |
| 1044 // the left of the one above it. | |
| 1045 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2; | |
| 1046 int upper_mid = (upper_part->bounding_box().left() + | |
| 1047 upper_part->bounding_box().right()) / | |
| 1048 2; | |
| 1049 int current_spacing = 0; // spacing of the current line to margin | |
| 1050 int upper_spacing = 0; // spacing of the previous line to the margin | |
| 1051 if (left_to_right_language_) { | |
| 1052 // Left to right languages, use mid - left to figure out the distance | |
| 1053 // the middle is from the left margin. | |
| 1054 int left = std::min(part->bounding_box().left(), | |
| 1055 upper_part->bounding_box().left()); | |
| 1056 current_spacing = mid - left; | |
| 1057 upper_spacing = upper_mid - left; | |
| 1058 } else { | |
| 1059 // Right to left languages, use right - mid to figure out the distance | |
| 1060 // the middle is from the right margin. | |
| 1061 int right = std::max(part->bounding_box().right(), | |
| 1062 upper_part->bounding_box().right()); | |
| 1063 current_spacing = right - mid; | |
| 1064 upper_spacing = right - upper_mid; | |
| 1065 } | |
| 1066 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) { | |
| 1067 continue; | |
| 1068 } | |
| 1069 | |
| 1070 // Paragraphs should have similar fonts. | |
| 1071 if (!part->MatchingSizes(*upper_part) || | |
| 1072 !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance, | |
| 1073 kStrokeWidthConstantTolerance)) { | |
| 1074 continue; | |
| 1075 } | |
| 1076 | |
| 1077 // The last line of a paragraph should be left aligned. | |
| 1078 // TODO(nbeato): This would be untrue if the text was right aligned. | |
| 1079 // How often is that? | |
| 1080 if (part->space_to_left() > | |
| 1081 kMaxParagraphEndingLeftSpaceMultiple * part->median_height()) { | |
| 1082 continue; | |
| 1083 } | |
| 1084 // The line above it should be right aligned (assuming justified format). | |
| 1085 // Since we can't assume justified text, we compare whitespace to text. | |
| 1086 // The above line should have majority spanning text (or the current | |
| 1087 // line could have fit on the previous line). So compare | |
| 1088 // whitespace to text. | |
| 1089 if (upper_part->bounding_box().width() < | |
| 1090 kMinParagraphEndingTextToWhitespaceRatio * | |
| 1091 upper_part->space_to_right()) { | |
| 1092 continue; | |
| 1093 } | |
| 1094 | |
| 1095 // Ledding above the line should be less than ledding below | |
| 1096 if (part->space_above() >= part->space_below() || | |
| 1097 part->space_above() > 2 * global_median_ledding_) { | |
| 1098 continue; | |
| 1099 } | |
| 1100 | |
| 1101 // If all checks failed, it is probably text. | |
| 1102 part->clear_table_type(); | |
| 1103 } | |
| 1104 } | |
| 1105 | |
| 1106 void TableFinder::FilterHeaderAndFooter() { | |
| 1107 // Consider top-most text colpartition as header and bottom most as footer | |
| 1108 ColPartition *header = nullptr; | |
| 1109 ColPartition *footer = nullptr; | |
| 1110 int max_top = INT32_MIN; | |
| 1111 int min_bottom = INT32_MAX; | |
| 1112 ColPartitionGridSearch gsearch(&clean_part_grid_); | |
| 1113 gsearch.StartFullSearch(); | |
| 1114 ColPartition *part = nullptr; | |
| 1115 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1116 if (!part->IsTextType()) { | |
| 1117 continue; // Consider only text partitions | |
| 1118 } | |
| 1119 int top = part->bounding_box().top(); | |
| 1120 int bottom = part->bounding_box().bottom(); | |
| 1121 if (top > max_top) { | |
| 1122 max_top = top; | |
| 1123 header = part; | |
| 1124 } | |
| 1125 if (bottom < min_bottom) { | |
| 1126 min_bottom = bottom; | |
| 1127 footer = part; | |
| 1128 } | |
| 1129 } | |
| 1130 if (header) { | |
| 1131 header->clear_table_type(); | |
| 1132 } | |
| 1133 if (footer) { | |
| 1134 footer->clear_table_type(); | |
| 1135 } | |
| 1136 } | |
| 1137 | |
| 1138 // Mark all ColPartitions as table cells that have a table cell above | |
| 1139 // and below them | |
| 1140 // TODO(faisal): This is too aggressive at the moment. The method needs to | |
| 1141 // consider spacing and alignment as well. Detection of false alarm table cells | |
| 1142 // should also be done as part of it. | |
| 1143 void TableFinder::SmoothTablePartitionRuns() { | |
| 1144 // Iterate the ColPartitions in the grid. | |
| 1145 ColPartitionGridSearch gsearch(&clean_part_grid_); | |
| 1146 gsearch.StartFullSearch(); | |
| 1147 ColPartition *part = nullptr; | |
| 1148 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1149 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) { | |
| 1150 continue; // Consider only text partitions | |
| 1151 } | |
| 1152 ColPartition *upper_part = part->nearest_neighbor_above(); | |
| 1153 ColPartition *lower_part = part->nearest_neighbor_below(); | |
| 1154 if (!upper_part || !lower_part) { | |
| 1155 continue; | |
| 1156 } | |
| 1157 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) { | |
| 1158 part->set_table_type(); | |
| 1159 } | |
| 1160 } | |
| 1161 | |
| 1162 // Pass 2, do the opposite. If both the upper and lower neighbors | |
| 1163 // exist and are not tables, this probably shouldn't be a table. | |
| 1164 gsearch.StartFullSearch(); | |
| 1165 part = nullptr; | |
| 1166 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1167 if (part->type() != PT_TABLE) { | |
| 1168 continue; // Consider only text partitions | |
| 1169 } | |
| 1170 ColPartition *upper_part = part->nearest_neighbor_above(); | |
| 1171 ColPartition *lower_part = part->nearest_neighbor_below(); | |
| 1172 | |
| 1173 // table can't be by itself | |
| 1174 if ((upper_part && upper_part->type() != PT_TABLE) && | |
| 1175 (lower_part && lower_part->type() != PT_TABLE)) { | |
| 1176 part->clear_table_type(); | |
| 1177 } | |
| 1178 } | |
| 1179 } | |
| 1180 | |
| 1181 // Set the type of a column segment based on the ratio of table to text cells | |
| 1182 void TableFinder::SetColumnsType(ColSegment_LIST *column_blocks) { | |
| 1183 ColSegment_IT it(column_blocks); | |
| 1184 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 1185 ColSegment *seg = it.data(); | |
| 1186 TBOX box = seg->bounding_box(); | |
| 1187 int num_table_cells = 0; | |
| 1188 int num_text_cells = 0; | |
| 1189 ColPartitionGridSearch rsearch(&clean_part_grid_); | |
| 1190 rsearch.SetUniqueMode(true); | |
| 1191 rsearch.StartRectSearch(box); | |
| 1192 ColPartition *part = nullptr; | |
| 1193 while ((part = rsearch.NextRectSearch()) != nullptr) { | |
| 1194 if (part->type() == PT_TABLE) { | |
| 1195 num_table_cells++; | |
| 1196 } else if (part->type() == PT_FLOWING_TEXT) { | |
| 1197 num_text_cells++; | |
| 1198 } | |
| 1199 } | |
| 1200 // If a column block has no text or table partition in it, it is not needed | |
| 1201 // for table detection. | |
| 1202 if (!num_table_cells && !num_text_cells) { | |
| 1203 delete it.extract(); | |
| 1204 } else { | |
| 1205 seg->set_num_table_cells(num_table_cells); | |
| 1206 seg->set_num_text_cells(num_text_cells); | |
| 1207 // set column type based on the ratio of table to text cells | |
| 1208 seg->set_type(); | |
| 1209 } | |
| 1210 } | |
| 1211 } | |
| 1212 | |
| 1213 // Move column blocks to grid | |
| 1214 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments, | |
| 1215 ColSegmentGrid *col_seg_grid) { | |
| 1216 ColSegment_IT it(segments); | |
| 1217 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 1218 ColSegment *seg = it.extract(); | |
| 1219 col_seg_grid->InsertBBox(true, true, seg); | |
| 1220 } | |
| 1221 } | |
| 1222 | |
| 1223 // Merge column blocks if a split is detected due to the presence of a | |
| 1224 // table. A text block is considered split if it has multiple | |
| 1225 // neighboring blocks above/below it, and at least one of the | |
| 1226 // neighboring blocks is of table type (has a high density of table | |
| 1227 // partitions). In this case neighboring blocks in the direction | |
| 1228 // (above/below) of the table block are merged with the text block. | |
| 1229 | |
| 1230 // Comment: This method does not handle split due to a full page table | |
| 1231 // since table columns in this case do not have a text column on which | |
| 1232 // split decision can be based. | |
| 1233 void TableFinder::GridMergeColumnBlocks() { | |
| 1234 int margin = gridsize(); | |
| 1235 | |
| 1236 // Iterate the Column Blocks in the grid. | |
| 1237 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch( | |
| 1238 &col_seg_grid_); | |
| 1239 gsearch.StartFullSearch(); | |
| 1240 ColSegment *seg; | |
| 1241 while ((seg = gsearch.NextFullSearch()) != nullptr) { | |
| 1242 if (seg->type() != COL_TEXT) { | |
| 1243 continue; // only consider text blocks for split detection | |
| 1244 } | |
| 1245 bool neighbor_found = false; | |
| 1246 bool modified = false; // Modified at least once | |
| 1247 // keep expanding current box as long as neighboring table columns | |
| 1248 // are found above or below it. | |
| 1249 do { | |
| 1250 TBOX box = seg->bounding_box(); | |
| 1251 // slightly expand the search region vertically | |
| 1252 int top_range = | |
| 1253 std::min(box.top() + margin, static_cast<int>(tright().y())); | |
| 1254 int bottom_range = | |
| 1255 std::max(box.bottom() - margin, static_cast<int>(bleft().y())); | |
| 1256 box.set_top(top_range); | |
| 1257 box.set_bottom(bottom_range); | |
| 1258 neighbor_found = false; | |
| 1259 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch( | |
| 1260 &col_seg_grid_); | |
| 1261 rectsearch.StartRectSearch(box); | |
| 1262 ColSegment *neighbor = nullptr; | |
| 1263 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) { | |
| 1264 if (neighbor == seg) { | |
| 1265 continue; | |
| 1266 } | |
| 1267 const TBOX &neighbor_box = neighbor->bounding_box(); | |
| 1268 // If the neighbor box significantly overlaps with the current | |
| 1269 // box (due to the expansion of the current box in the | |
| 1270 // previous iteration of this loop), remove the neighbor box | |
| 1271 // and expand the current box to include it. | |
| 1272 if (neighbor_box.overlap_fraction(box) >= 0.9) { | |
| 1273 seg->InsertBox(neighbor_box); | |
| 1274 modified = true; | |
| 1275 rectsearch.RemoveBBox(); | |
| 1276 gsearch.RepositionIterator(); | |
| 1277 delete neighbor; | |
| 1278 continue; | |
| 1279 } | |
| 1280 // Only expand if the neighbor box is of table type | |
| 1281 if (neighbor->type() != COL_TABLE) { | |
| 1282 continue; | |
| 1283 } | |
| 1284 // Insert the neighbor box into the current column block | |
| 1285 if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) { | |
| 1286 seg->InsertBox(neighbor_box); | |
| 1287 neighbor_found = true; | |
| 1288 modified = true; | |
| 1289 rectsearch.RemoveBBox(); | |
| 1290 gsearch.RepositionIterator(); | |
| 1291 delete neighbor; | |
| 1292 } | |
| 1293 } | |
| 1294 } while (neighbor_found); | |
| 1295 if (modified) { | |
| 1296 // Because the box has changed, it has to be removed first. | |
| 1297 gsearch.RemoveBBox(); | |
| 1298 col_seg_grid_.InsertBBox(true, true, seg); | |
| 1299 gsearch.RepositionIterator(); | |
| 1300 } | |
| 1301 } | |
| 1302 } | |
| 1303 | |
| 1304 // Group horizontally overlapping table partitions into table columns. | |
| 1305 // TODO(faisal): This is too aggressive at the moment. The method should | |
| 1306 // consider more attributes to group table partitions together. Some common | |
| 1307 // errors are: | |
| 1308 // 1- page number is merged with a table column above it even | |
| 1309 // if there is a large vertical gap between them. | |
| 1310 // 2- column headers go on to catch one of the columns arbitrarily | |
| 1311 // 3- an isolated noise blob near page top or bottom merges with the table | |
| 1312 // column below/above it | |
| 1313 // 4- cells from two vertically adjacent tables merge together to make a | |
| 1314 // single column resulting in merging of the two tables | |
| 1315 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) { | |
| 1316 ColSegment_IT it(table_columns); | |
| 1317 // Iterate the ColPartitions in the grid. | |
| 1318 ColPartitionGridSearch gsearch(&clean_part_grid_); | |
| 1319 gsearch.StartFullSearch(); | |
| 1320 ColPartition *part; | |
| 1321 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1322 if (part->inside_table_column() || part->type() != PT_TABLE) { | |
| 1323 continue; // prevent a partition to be assigned to multiple columns | |
| 1324 } | |
| 1325 const TBOX &box = part->bounding_box(); | |
| 1326 auto *col = new ColSegment(); | |
| 1327 col->InsertBox(box); | |
| 1328 part->set_inside_table_column(true); | |
| 1329 // Start a search below the current cell to find bottom neighbours | |
| 1330 // Note: a full search will always process things above it first, so | |
| 1331 // this should be starting at the highest cell and working its way down. | |
| 1332 ColPartitionGridSearch vsearch(&clean_part_grid_); | |
| 1333 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom()); | |
| 1334 ColPartition *neighbor = nullptr; | |
| 1335 bool found_neighbours = false; | |
| 1336 while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) { | |
| 1337 // only consider neighbors not assigned to any column yet | |
| 1338 if (neighbor->inside_table_column()) { | |
| 1339 continue; | |
| 1340 } | |
| 1341 // Horizontal lines should not break the flow | |
| 1342 if (neighbor->IsHorizontalLine()) { | |
| 1343 continue; | |
| 1344 } | |
| 1345 // presence of a non-table neighbor marks the end of current | |
| 1346 // table column | |
| 1347 if (neighbor->type() != PT_TABLE) { | |
| 1348 break; | |
| 1349 } | |
| 1350 // add the neighbor partition to the table column | |
| 1351 const TBOX &neighbor_box = neighbor->bounding_box(); | |
| 1352 col->InsertBox(neighbor_box); | |
| 1353 neighbor->set_inside_table_column(true); | |
| 1354 found_neighbours = true; | |
| 1355 } | |
| 1356 if (found_neighbours) { | |
| 1357 it.add_after_then_move(col); | |
| 1358 } else { | |
| 1359 part->set_inside_table_column(false); | |
| 1360 delete col; | |
| 1361 } | |
| 1362 } | |
| 1363 } | |
| 1364 | |
| 1365 // Mark regions in a column that are x-bounded by the column boundaries and | |
| 1366 // y-bounded by the table columns' projection on the y-axis as table regions | |
| 1367 void TableFinder::GetTableRegions(ColSegment_LIST *table_columns, | |
| 1368 ColSegment_LIST *table_regions) { | |
| 1369 ColSegment_IT cit(table_columns); | |
| 1370 ColSegment_IT rit(table_regions); | |
| 1371 // Iterate through column blocks | |
| 1372 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch( | |
| 1373 &col_seg_grid_); | |
| 1374 gsearch.StartFullSearch(); | |
| 1375 ColSegment *part; | |
| 1376 int page_height = tright().y() - bleft().y(); | |
| 1377 ASSERT_HOST(page_height > 0); | |
| 1378 // create a bool array to hold projection on y-axis | |
| 1379 bool *table_region = new bool[page_height]; | |
| 1380 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1381 const TBOX &part_box = part->bounding_box(); | |
| 1382 // reset the projection array | |
| 1383 for (int i = 0; i < page_height; i++) { | |
| 1384 table_region[i] = false; | |
| 1385 } | |
| 1386 // iterate through all table columns to find regions in the current | |
| 1387 // page column block | |
| 1388 cit.move_to_first(); | |
| 1389 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) { | |
| 1390 TBOX col_box = cit.data()->bounding_box(); | |
| 1391 // find intersection region of table column and page column | |
| 1392 TBOX intersection_box = col_box.intersection(part_box); | |
| 1393 // project table column on the y-axis | |
| 1394 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) { | |
| 1395 table_region[i - bleft().y()] = true; | |
| 1396 } | |
| 1397 } | |
| 1398 // set x-limits of table regions to page column width | |
| 1399 TBOX current_table_box; | |
| 1400 current_table_box.set_left(part_box.left()); | |
| 1401 current_table_box.set_right(part_box.right()); | |
| 1402 // go through the y-axis projection to find runs of table | |
| 1403 // regions. Each run makes one table region. | |
| 1404 for (int i = 1; i < page_height; i++) { | |
| 1405 // detect start of a table region | |
| 1406 if (!table_region[i - 1] && table_region[i]) { | |
| 1407 current_table_box.set_bottom(i + bleft().y()); | |
| 1408 } | |
| 1409 // TODO(nbeato): Is it guaranteed that the last row is not a table region? | |
| 1410 // detect end of a table region | |
| 1411 if (table_region[i - 1] && !table_region[i]) { | |
| 1412 current_table_box.set_top(i + bleft().y()); | |
| 1413 if (!current_table_box.null_box()) { | |
| 1414 auto *seg = new ColSegment(); | |
| 1415 seg->InsertBox(current_table_box); | |
| 1416 rit.add_after_then_move(seg); | |
| 1417 } | |
| 1418 } | |
| 1419 } | |
| 1420 } | |
| 1421 delete[] table_region; | |
| 1422 } | |
| 1423 | |
| 1424 // Merge table regions corresponding to tables spanning multiple columns if | |
| 1425 // there is a colpartition (horizontal ruling line or normal text) that | |
| 1426 // touches both regions. | |
| 1427 // TODO(faisal): A rare error occurs if there are two horizontally adjacent | |
| 1428 // tables with aligned ruling lines. In this case, line finder returns a | |
| 1429 // single line and hence the tables get merged together | |
| 1430 void TableFinder::GridMergeTableRegions() { | |
| 1431 // Iterate the table regions in the grid. | |
| 1432 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch( | |
| 1433 &table_grid_); | |
| 1434 gsearch.StartFullSearch(); | |
| 1435 ColSegment *seg = nullptr; | |
| 1436 while ((seg = gsearch.NextFullSearch()) != nullptr) { | |
| 1437 bool neighbor_found = false; | |
| 1438 bool modified = false; // Modified at least once | |
| 1439 do { | |
| 1440 // Start a rectangle search x-bounded by the image and y by the table | |
| 1441 const TBOX &box = seg->bounding_box(); | |
| 1442 TBOX search_region(box); | |
| 1443 search_region.set_left(bleft().x()); | |
| 1444 search_region.set_right(tright().x()); | |
| 1445 neighbor_found = false; | |
| 1446 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch( | |
| 1447 &table_grid_); | |
| 1448 rectsearch.StartRectSearch(search_region); | |
| 1449 ColSegment *neighbor = nullptr; | |
| 1450 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) { | |
| 1451 if (neighbor == seg) { | |
| 1452 continue; | |
| 1453 } | |
| 1454 const TBOX &neighbor_box = neighbor->bounding_box(); | |
| 1455 // Check if a neighbor box has a large overlap with the table | |
| 1456 // region. This may happen as a result of merging two table | |
| 1457 // regions in the previous iteration. | |
| 1458 if (neighbor_box.overlap_fraction(box) >= 0.9) { | |
| 1459 seg->InsertBox(neighbor_box); | |
| 1460 rectsearch.RemoveBBox(); | |
| 1461 gsearch.RepositionIterator(); | |
| 1462 delete neighbor; | |
| 1463 modified = true; | |
| 1464 continue; | |
| 1465 } | |
| 1466 // Check if two table regions belong together based on a common | |
| 1467 // horizontal ruling line | |
| 1468 if (BelongToOneTable(box, neighbor_box)) { | |
| 1469 seg->InsertBox(neighbor_box); | |
| 1470 neighbor_found = true; | |
| 1471 modified = true; | |
| 1472 rectsearch.RemoveBBox(); | |
| 1473 gsearch.RepositionIterator(); | |
| 1474 delete neighbor; | |
| 1475 } | |
| 1476 } | |
| 1477 } while (neighbor_found); | |
| 1478 if (modified) { | |
| 1479 // Because the box has changed, it has to be removed first. | |
| 1480 gsearch.RemoveBBox(); | |
| 1481 table_grid_.InsertBBox(true, true, seg); | |
| 1482 gsearch.RepositionIterator(); | |
| 1483 } | |
| 1484 } | |
| 1485 } | |
| 1486 | |
| 1487 // Decide if two table regions belong to one table based on a common | |
| 1488 // horizontal ruling line or another colpartition | |
| 1489 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) { | |
| 1490 // Check the obvious case. Most likely not true because overlapping boxes | |
| 1491 // should already be merged, but seems like a good thing to do in case things | |
| 1492 // change. | |
| 1493 if (box1.overlap(box2)) { | |
| 1494 return true; | |
| 1495 } | |
| 1496 // Check for ColPartitions spanning both table regions | |
| 1497 TBOX bbox = box1.bounding_union(box2); | |
| 1498 // Start a rect search on bbox | |
| 1499 ColPartitionGridSearch rectsearch(&clean_part_grid_); | |
| 1500 rectsearch.StartRectSearch(bbox); | |
| 1501 ColPartition *part = nullptr; | |
| 1502 while ((part = rectsearch.NextRectSearch()) != nullptr) { | |
| 1503 const TBOX &part_box = part->bounding_box(); | |
| 1504 // return true if a colpartition spanning both table regions is found | |
| 1505 if (part_box.overlap(box1) && part_box.overlap(box2) && | |
| 1506 !part->IsImageType()) { | |
| 1507 return true; | |
| 1508 } | |
| 1509 } | |
| 1510 return false; | |
| 1511 } | |
| 1512 | |
| 1513 // Adjust table boundaries by: | |
| 1514 // - building a tight bounding box around all ColPartitions contained in it. | |
| 1515 // - expanding table boundaries to include all colpartitions that overlap the | |
| 1516 // table by more than half of their area | |
| 1517 // - expanding table boundaries to include nearby horizontal rule lines | |
| 1518 // - expanding table vertically to include left out column headers | |
| 1519 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually | |
| 1520 // makes following errors: | |
| 1521 // 1- horizontal lines consisting of underlines are included in the table if | |
| 1522 // they are close enough | |
| 1523 // 2- horizontal lines originating from noise tend to get merged with a table | |
| 1524 // near the top of the page | |
| 1525 // 3- the criteria for including horizontal lines is very generous. Many times | |
| 1526 // horizontal lines separating headers and footers get merged with a | |
| 1527 // single-column table in a multi-column page thereby including text | |
| 1528 // from the neighboring column inside the table | |
| 1529 // 4- the criteria for including left out column headers also tends to | |
| 1530 // occasionally include text-lines above the tables, typically from | |
| 1531 // table caption | |
| 1532 void TableFinder::AdjustTableBoundaries() { | |
| 1533 // Iterate the table regions in the grid | |
| 1534 ColSegment_CLIST adjusted_tables; | |
| 1535 ColSegment_C_IT it(&adjusted_tables); | |
| 1536 ColSegmentGridSearch gsearch(&table_grid_); | |
| 1537 gsearch.StartFullSearch(); | |
| 1538 ColSegment *table = nullptr; | |
| 1539 while ((table = gsearch.NextFullSearch()) != nullptr) { | |
| 1540 const TBOX &table_box = table->bounding_box(); | |
| 1541 TBOX grown_box = table_box; | |
| 1542 GrowTableBox(table_box, &grown_box); | |
| 1543 // To prevent a table from expanding again, do not insert the | |
| 1544 // modified box back to the grid. Instead move it to a list and | |
| 1545 // and remove it from the grid. The list is moved later back to the grid. | |
| 1546 if (!grown_box.null_box()) { | |
| 1547 auto *col = new ColSegment(); | |
| 1548 col->InsertBox(grown_box); | |
| 1549 it.add_after_then_move(col); | |
| 1550 } | |
| 1551 gsearch.RemoveBBox(); | |
| 1552 delete table; | |
| 1553 } | |
| 1554 // clear table grid to move final tables in it | |
| 1555 // TODO(nbeato): table_grid_ should already be empty. The above loop | |
| 1556 // removed everything. Maybe just assert it is empty? | |
| 1557 table_grid_.Clear(); | |
| 1558 it.move_to_first(); | |
| 1559 // move back final tables to table_grid_ | |
| 1560 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 1561 ColSegment *seg = it.extract(); | |
| 1562 table_grid_.InsertBBox(true, true, seg); | |
| 1563 } | |
| 1564 } | |
| 1565 | |
| 1566 void TableFinder::GrowTableBox(const TBOX &table_box, TBOX *result_box) { | |
| 1567 // TODO(nbeato): The growing code is a bit excessive right now. | |
| 1568 // By removing these lines, the partitions considered need | |
| 1569 // to have some overlap or be special cases. These lines could | |
| 1570 // be added again once a check is put in place to make sure that | |
| 1571 // growing tables don't stomp on a lot of non-table partitions. | |
| 1572 | |
| 1573 // search for horizontal ruling lines within the vertical margin | |
| 1574 // int vertical_margin = kRulingVerticalMargin * gridsize(); | |
| 1575 TBOX search_box = table_box; | |
| 1576 // int top = MIN(search_box.top() + vertical_margin, tright().y()); | |
| 1577 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y()); | |
| 1578 // search_box.set_top(top); | |
| 1579 // search_box.set_bottom(bottom); | |
| 1580 | |
| 1581 GrowTableToIncludePartials(table_box, search_box, result_box); | |
| 1582 GrowTableToIncludeLines(table_box, search_box, result_box); | |
| 1583 IncludeLeftOutColumnHeaders(result_box); | |
| 1584 } | |
| 1585 | |
| 1586 // Grow a table by increasing the size of the box to include | |
| 1587 // partitions with significant overlap with the table. | |
| 1588 void TableFinder::GrowTableToIncludePartials(const TBOX &table_box, | |
| 1589 const TBOX &search_range, | |
| 1590 TBOX *result_box) { | |
| 1591 // Rulings are in a different grid, so search 2 grids for rulings, text, | |
| 1592 // and table partitions that are not entirely within the new box. | |
| 1593 for (int i = 0; i < 2; ++i) { | |
| 1594 ColPartitionGrid *grid = | |
| 1595 (i == 0) ? &fragmented_text_grid_ : &leader_and_ruling_grid_; | |
| 1596 ColPartitionGridSearch rectsearch(grid); | |
| 1597 rectsearch.StartRectSearch(search_range); | |
| 1598 ColPartition *part = nullptr; | |
| 1599 while ((part = rectsearch.NextRectSearch()) != nullptr) { | |
| 1600 // Only include text and table types. | |
| 1601 if (part->IsImageType()) { | |
| 1602 continue; | |
| 1603 } | |
| 1604 const TBOX &part_box = part->bounding_box(); | |
| 1605 // Include partition in the table if more than half of it | |
| 1606 // is covered by the table | |
| 1607 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { | |
| 1608 *result_box = result_box->bounding_union(part_box); | |
| 1609 continue; | |
| 1610 } | |
| 1611 } | |
| 1612 } | |
| 1613 } | |
| 1614 | |
| 1615 // Grow a table by expanding to the extents of significantly | |
| 1616 // overlapping lines. | |
| 1617 void TableFinder::GrowTableToIncludeLines(const TBOX &table_box, | |
| 1618 const TBOX &search_range, | |
| 1619 TBOX *result_box) { | |
| 1620 ColPartitionGridSearch rsearch(&leader_and_ruling_grid_); | |
| 1621 rsearch.SetUniqueMode(true); | |
| 1622 rsearch.StartRectSearch(search_range); | |
| 1623 ColPartition *part = nullptr; | |
| 1624 while ((part = rsearch.NextRectSearch()) != nullptr) { | |
| 1625 // TODO(nbeato) This should also do vertical, but column | |
| 1626 // boundaries are breaking things. This function needs to be | |
| 1627 // updated to allow vertical lines as well. | |
| 1628 if (!part->IsLineType()) { | |
| 1629 continue; | |
| 1630 } | |
| 1631 // Avoid the following function call if the result of the | |
| 1632 // function is irrelevant. | |
| 1633 const TBOX &part_box = part->bounding_box(); | |
| 1634 if (result_box->contains(part_box)) { | |
| 1635 continue; | |
| 1636 } | |
| 1637 // Include a partially overlapping horizontal line only if the | |
| 1638 // extra ColPartitions that will be included due to expansion | |
| 1639 // have large side spacing w.r.t. columns containing them. | |
| 1640 if (HLineBelongsToTable(*part, table_box)) { | |
| 1641 *result_box = result_box->bounding_union(part_box); | |
| 1642 } | |
| 1643 // TODO(nbeato): Vertical | |
| 1644 } | |
| 1645 } | |
| 1646 | |
| 1647 // Checks whether the horizontal line belong to the table by looking at the | |
| 1648 // side spacing of extra ColPartitions that will be included in the table | |
| 1649 // due to expansion | |
| 1650 bool TableFinder::HLineBelongsToTable(const ColPartition &part, | |
| 1651 const TBOX &table_box) { | |
| 1652 if (!part.IsHorizontalLine()) { | |
| 1653 return false; | |
| 1654 } | |
| 1655 const TBOX &part_box = part.bounding_box(); | |
| 1656 if (!part_box.major_x_overlap(table_box)) { | |
| 1657 return false; | |
| 1658 } | |
| 1659 // Do not consider top-most horizontal line since it usually | |
| 1660 // originates from noise. | |
| 1661 // TODO(nbeato): I had to comment this out because the ruling grid doesn't | |
| 1662 // have neighbors solved. | |
| 1663 // if (!part.nearest_neighbor_above()) | |
| 1664 // return false; | |
| 1665 const TBOX bbox = part_box.bounding_union(table_box); | |
| 1666 // In the "unioned table" box (the table extents expanded by the line), | |
| 1667 // keep track of how many partitions have significant padding to the left | |
| 1668 // and right. If more than half of the partitions covered by the new table | |
| 1669 // have significant spacing, the line belongs to the table and the table | |
| 1670 // grows to include all of the partitions. | |
| 1671 int num_extra_partitions = 0; | |
| 1672 int extra_space_to_right = 0; | |
| 1673 int extra_space_to_left = 0; | |
| 1674 // Rulings are in a different grid, so search 2 grids for rulings, text, | |
| 1675 // and table partitions that are introduced by the new box. | |
| 1676 for (int i = 0; i < 2; ++i) { | |
| 1677 ColPartitionGrid *grid = | |
| 1678 (i == 0) ? &clean_part_grid_ : &leader_and_ruling_grid_; | |
| 1679 // Start a rect search on bbox | |
| 1680 ColPartitionGridSearch rectsearch(grid); | |
| 1681 rectsearch.SetUniqueMode(true); | |
| 1682 rectsearch.StartRectSearch(bbox); | |
| 1683 ColPartition *extra_part = nullptr; | |
| 1684 while ((extra_part = rectsearch.NextRectSearch()) != nullptr) { | |
| 1685 // ColPartition already in table | |
| 1686 const TBOX &extra_part_box = extra_part->bounding_box(); | |
| 1687 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { | |
| 1688 continue; | |
| 1689 } | |
| 1690 // Non-text ColPartitions do not contribute | |
| 1691 if (extra_part->IsImageType()) { | |
| 1692 continue; | |
| 1693 } | |
| 1694 // Consider this partition. | |
| 1695 num_extra_partitions++; | |
| 1696 // presence of a table cell is a strong hint, so just increment the scores | |
| 1697 // without looking at the spacing. | |
| 1698 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) { | |
| 1699 extra_space_to_right++; | |
| 1700 extra_space_to_left++; | |
| 1701 continue; | |
| 1702 } | |
| 1703 int space_threshold = kSideSpaceMargin * part.median_height(); | |
| 1704 if (extra_part->space_to_right() > space_threshold) { | |
| 1705 extra_space_to_right++; | |
| 1706 } | |
| 1707 if (extra_part->space_to_left() > space_threshold) { | |
| 1708 extra_space_to_left++; | |
| 1709 } | |
| 1710 } | |
| 1711 } | |
| 1712 // tprintf("%d %d %d\n", | |
| 1713 // num_extra_partitions,extra_space_to_right,extra_space_to_left); | |
| 1714 return (extra_space_to_right > num_extra_partitions / 2) || | |
| 1715 (extra_space_to_left > num_extra_partitions / 2); | |
| 1716 } | |
| 1717 | |
| 1718 // Look for isolated column headers above the given table box and | |
| 1719 // include them in the table | |
| 1720 void TableFinder::IncludeLeftOutColumnHeaders(TBOX *table_box) { | |
| 1721 // Start a search above the current table to look for column headers | |
| 1722 ColPartitionGridSearch vsearch(&clean_part_grid_); | |
| 1723 vsearch.StartVerticalSearch(table_box->left(), table_box->right(), | |
| 1724 table_box->top()); | |
| 1725 ColPartition *neighbor = nullptr; | |
| 1726 ColPartition *previous_neighbor = nullptr; | |
| 1727 while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) { | |
| 1728 // Max distance to find a table heading. | |
| 1729 const int max_distance = | |
| 1730 kMaxColumnHeaderDistance * neighbor->median_height(); | |
| 1731 int table_top = table_box->top(); | |
| 1732 const TBOX &box = neighbor->bounding_box(); | |
| 1733 // Do not continue if the next box is way above | |
| 1734 if (box.bottom() - table_top > max_distance) { | |
| 1735 break; | |
| 1736 } | |
| 1737 // Unconditionally include partitions of type TABLE or LINE | |
| 1738 // TODO(faisal): add some reasonable conditions here | |
| 1739 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) { | |
| 1740 table_box->set_top(box.top()); | |
| 1741 previous_neighbor = nullptr; | |
| 1742 continue; | |
| 1743 } | |
| 1744 // If there are two text partitions, one above the other, without a table | |
| 1745 // cell on their left or right side, consider them a barrier and quit | |
| 1746 if (previous_neighbor == nullptr) { | |
| 1747 previous_neighbor = neighbor; | |
| 1748 } else { | |
| 1749 const TBOX &previous_box = previous_neighbor->bounding_box(); | |
| 1750 if (!box.major_y_overlap(previous_box)) { | |
| 1751 break; | |
| 1752 } | |
| 1753 } | |
| 1754 } | |
| 1755 } | |
| 1756 | |
| 1757 // Remove false alarms consisting of a single column based on their | |
| 1758 // projection on the x-axis. Projection of a real table on the x-axis | |
| 1759 // should have at least one zero-valley larger than the global median | |
| 1760 // x-height of the page. | |
| 1761 void TableFinder::DeleteSingleColumnTables() { | |
| 1762 int page_width = tright().x() - bleft().x(); | |
| 1763 ASSERT_HOST(page_width > 0); | |
| 1764 // create an integer array to hold projection on x-axis | |
| 1765 int *table_xprojection = new int[page_width]; | |
| 1766 // Iterate through all tables in the table grid | |
| 1767 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search( | |
| 1768 &table_grid_); | |
| 1769 table_search.StartFullSearch(); | |
| 1770 ColSegment *table; | |
| 1771 while ((table = table_search.NextFullSearch()) != nullptr) { | |
| 1772 TBOX table_box = table->bounding_box(); | |
| 1773 // reset the projection array | |
| 1774 for (int i = 0; i < page_width; i++) { | |
| 1775 table_xprojection[i] = 0; | |
| 1776 } | |
| 1777 // Start a rect search on table_box | |
| 1778 ColPartitionGridSearch rectsearch(&clean_part_grid_); | |
| 1779 rectsearch.SetUniqueMode(true); | |
| 1780 rectsearch.StartRectSearch(table_box); | |
| 1781 ColPartition *part; | |
| 1782 while ((part = rectsearch.NextRectSearch()) != nullptr) { | |
| 1783 if (!part->IsTextType()) { | |
| 1784 continue; // Do not consider non-text partitions | |
| 1785 } | |
| 1786 if (part->flow() == BTFT_LEADER) { | |
| 1787 continue; // Assume leaders are in tables | |
| 1788 } | |
| 1789 TBOX part_box = part->bounding_box(); | |
| 1790 // Do not consider partitions partially covered by the table | |
| 1791 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) { | |
| 1792 continue; | |
| 1793 } | |
| 1794 BLOBNBOX_CLIST *part_boxes = part->boxes(); | |
| 1795 BLOBNBOX_C_IT pit(part_boxes); | |
| 1796 | |
| 1797 // Make sure overlapping blobs don't artificially inflate the number | |
| 1798 // of rows in the table. This happens frequently with things such as | |
| 1799 // decimals and split characters. Do this by assuming the column | |
| 1800 // partition is sorted mostly left to right and just clip | |
| 1801 // bounding boxes by the previous box's extent. | |
| 1802 int next_position_to_write = 0; | |
| 1803 | |
| 1804 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { | |
| 1805 BLOBNBOX *pblob = pit.data(); | |
| 1806 // ignore blob height for the purpose of projection since we | |
| 1807 // are only interested in finding valleys | |
| 1808 int xstart = pblob->bounding_box().left(); | |
| 1809 int xend = pblob->bounding_box().right(); | |
| 1810 | |
| 1811 xstart = std::max(xstart, next_position_to_write); | |
| 1812 for (int i = xstart; i < xend; i++) { | |
| 1813 table_xprojection[i - bleft().x()]++; | |
| 1814 } | |
| 1815 next_position_to_write = xend; | |
| 1816 } | |
| 1817 } | |
| 1818 // Find largest valley between two reasonable peaks in the table | |
| 1819 if (!GapInXProjection(table_xprojection, page_width)) { | |
| 1820 table_search.RemoveBBox(); | |
| 1821 delete table; | |
| 1822 } | |
| 1823 } | |
| 1824 delete[] table_xprojection; | |
| 1825 } | |
| 1826 | |
| 1827 // Return true if at least one gap larger than the global x-height | |
| 1828 // exists in the horizontal projection | |
| 1829 bool TableFinder::GapInXProjection(int *xprojection, int length) { | |
| 1830 // Find peak value of the histogram | |
| 1831 int peak_value = 0; | |
| 1832 for (int i = 0; i < length; i++) { | |
| 1833 if (xprojection[i] > peak_value) { | |
| 1834 peak_value = xprojection[i]; | |
| 1835 } | |
| 1836 } | |
| 1837 // Peak value represents the maximum number of horizontally | |
| 1838 // overlapping colpartitions, so this can be considered as the | |
| 1839 // number of rows in the table | |
| 1840 if (peak_value < kMinRowsInTable) { | |
| 1841 return false; | |
| 1842 } | |
| 1843 double projection_threshold = kSmallTableProjectionThreshold * peak_value; | |
| 1844 if (peak_value >= kLargeTableRowCount) { | |
| 1845 projection_threshold = kLargeTableProjectionThreshold * peak_value; | |
| 1846 } | |
| 1847 // Threshold the histogram | |
| 1848 for (int i = 0; i < length; i++) { | |
| 1849 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0; | |
| 1850 } | |
| 1851 // Find the largest run of zeros between two ones | |
| 1852 int largest_gap = 0; | |
| 1853 int run_start = -1; | |
| 1854 for (int i = 1; i < length; i++) { | |
| 1855 // detect start of a run of zeros | |
| 1856 if (xprojection[i - 1] && !xprojection[i]) { | |
| 1857 run_start = i; | |
| 1858 } | |
| 1859 // detect end of a run of zeros and update the value of largest gap | |
| 1860 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) { | |
| 1861 int gap = i - run_start; | |
| 1862 if (gap > largest_gap) { | |
| 1863 largest_gap = gap; | |
| 1864 } | |
| 1865 run_start = -1; | |
| 1866 } | |
| 1867 } | |
| 1868 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_; | |
| 1869 } | |
| 1870 | |
| 1871 // Given the location of a table "guess", try to overlay a cellular | |
| 1872 // grid in the location, adjusting the boundaries. | |
| 1873 // TODO(nbeato): Falsely introduces: | |
| 1874 // -headers/footers (not any worse, too much overlap destroys cells) | |
| 1875 // -page numbers (not worse, included because maximize margins) | |
| 1876 // -equations (nicely fit into a celluar grid, but more sparsely) | |
| 1877 // -figures (random text box, also sparse) | |
| 1878 // -small left-aligned text areas with overlapping positioned whitespace | |
| 1879 // (rejected before) | |
| 1880 // Overall, this just needs some more work. | |
| 1881 void TableFinder::RecognizeTables() { | |
| 1882 #ifndef GRAPHICS_DISABLED | |
| 1883 ScrollView *table_win = nullptr; | |
| 1884 if (textord_show_tables) { | |
| 1885 table_win = MakeWindow(0, 0, "Table Structure"); | |
| 1886 DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE, | |
| 1887 ScrollView::LIGHT_BLUE); | |
| 1888 // table_grid_.DisplayBoxes(table_win); | |
| 1889 } | |
| 1890 #endif | |
| 1891 | |
| 1892 TableRecognizer recognizer; | |
| 1893 recognizer.Init(); | |
| 1894 recognizer.set_line_grid(&leader_and_ruling_grid_); | |
| 1895 recognizer.set_text_grid(&fragmented_text_grid_); | |
| 1896 recognizer.set_max_text_height(global_median_xheight_ * 2.0); | |
| 1897 recognizer.set_min_height(1.5 * gridheight()); | |
| 1898 // Loop over all of the tables and try to fit them. | |
| 1899 // Store the good tables here. | |
| 1900 ColSegment_CLIST good_tables; | |
| 1901 ColSegment_C_IT good_it(&good_tables); | |
| 1902 | |
| 1903 ColSegmentGridSearch gsearch(&table_grid_); | |
| 1904 gsearch.StartFullSearch(); | |
| 1905 ColSegment *found_table = nullptr; | |
| 1906 while ((found_table = gsearch.NextFullSearch()) != nullptr) { | |
| 1907 gsearch.RemoveBBox(); | |
| 1908 | |
| 1909 // The goal is to make the tables persistent in a list. | |
| 1910 // When that happens, this will move into the search loop. | |
| 1911 const TBOX &found_box = found_table->bounding_box(); | |
| 1912 StructuredTable *table_structure = recognizer.RecognizeTable(found_box); | |
| 1913 | |
| 1914 // Process a table. Good tables are inserted into the grid again later on | |
| 1915 // We can't change boxes in the grid while it is running a search. | |
| 1916 if (table_structure != nullptr) { | |
| 1917 #ifndef GRAPHICS_DISABLED | |
| 1918 if (textord_show_tables) { | |
| 1919 table_structure->Display(table_win, ScrollView::LIME_GREEN); | |
| 1920 } | |
| 1921 #endif | |
| 1922 found_table->set_bounding_box(table_structure->bounding_box()); | |
| 1923 delete table_structure; | |
| 1924 good_it.add_after_then_move(found_table); | |
| 1925 } else { | |
| 1926 delete found_table; | |
| 1927 } | |
| 1928 } | |
| 1929 // TODO(nbeato): MERGE!! There is awesome info now available for merging. | |
| 1930 | |
| 1931 // At this point, the grid is empty. We can safely insert the good tables | |
| 1932 // back into grid. | |
| 1933 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) { | |
| 1934 table_grid_.InsertBBox(true, true, good_it.extract()); | |
| 1935 } | |
| 1936 } | |
| 1937 | |
| 1938 #ifndef GRAPHICS_DISABLED | |
| 1939 | |
| 1940 // Displays the column segments in some window. | |
| 1941 void TableFinder::DisplayColSegments(ScrollView *win, ColSegment_LIST *segments, | |
| 1942 ScrollView::Color color) { | |
| 1943 win->Pen(color); | |
| 1944 win->Brush(ScrollView::NONE); | |
| 1945 ColSegment_IT it(segments); | |
| 1946 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | |
| 1947 ColSegment *col = it.data(); | |
| 1948 const TBOX &box = col->bounding_box(); | |
| 1949 int left_x = box.left(); | |
| 1950 int right_x = box.right(); | |
| 1951 int top_y = box.top(); | |
| 1952 int bottom_y = box.bottom(); | |
| 1953 win->Rectangle(left_x, bottom_y, right_x, top_y); | |
| 1954 } | |
| 1955 win->UpdateWindow(); | |
| 1956 } | |
| 1957 | |
| 1958 // Displays the colpartitions using a new coloring on an existing window. | |
| 1959 // Note: This method is only for debug purpose during development and | |
| 1960 // would not be part of checked in code | |
| 1961 void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, | |
| 1962 ScrollView::Color default_color, | |
| 1963 ScrollView::Color table_color) { | |
| 1964 ScrollView::Color color = default_color; | |
| 1965 // Iterate the ColPartitions in the grid. | |
| 1966 ColPartitionGridSearch gsearch(grid); | |
| 1967 gsearch.StartFullSearch(); | |
| 1968 ColPartition *part = nullptr; | |
| 1969 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 1970 color = default_color; | |
| 1971 if (part->type() == PT_TABLE) { | |
| 1972 color = table_color; | |
| 1973 } | |
| 1974 | |
| 1975 const TBOX &box = part->bounding_box(); | |
| 1976 int left_x = box.left(); | |
| 1977 int right_x = box.right(); | |
| 1978 int top_y = box.top(); | |
| 1979 int bottom_y = box.bottom(); | |
| 1980 win->Brush(ScrollView::NONE); | |
| 1981 win->Pen(color); | |
| 1982 win->Rectangle(left_x, bottom_y, right_x, top_y); | |
| 1983 } | |
| 1984 win->UpdateWindow(); | |
| 1985 } | |
| 1986 | |
| 1987 void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, | |
| 1988 ScrollView::Color default_color) { | |
| 1989 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW); | |
| 1990 } | |
| 1991 | |
| 1992 void TableFinder::DisplayColPartitionConnections(ScrollView *win, | |
| 1993 ColPartitionGrid *grid, | |
| 1994 ScrollView::Color color) { | |
| 1995 // Iterate the ColPartitions in the grid. | |
| 1996 ColPartitionGridSearch gsearch(grid); | |
| 1997 gsearch.StartFullSearch(); | |
| 1998 ColPartition *part = nullptr; | |
| 1999 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 2000 const TBOX &box = part->bounding_box(); | |
| 2001 int left_x = box.left(); | |
| 2002 int right_x = box.right(); | |
| 2003 int top_y = box.top(); | |
| 2004 int bottom_y = box.bottom(); | |
| 2005 | |
| 2006 ColPartition *upper_part = part->nearest_neighbor_above(); | |
| 2007 if (upper_part) { | |
| 2008 const TBOX &upper_box = upper_part->bounding_box(); | |
| 2009 int mid_x = (left_x + right_x) / 2; | |
| 2010 int mid_y = (top_y + bottom_y) / 2; | |
| 2011 int other_x = (upper_box.left() + upper_box.right()) / 2; | |
| 2012 int other_y = (upper_box.top() + upper_box.bottom()) / 2; | |
| 2013 win->Brush(ScrollView::NONE); | |
| 2014 win->Pen(color); | |
| 2015 win->Line(mid_x, mid_y, other_x, other_y); | |
| 2016 } | |
| 2017 ColPartition *lower_part = part->nearest_neighbor_below(); | |
| 2018 if (lower_part) { | |
| 2019 const TBOX &lower_box = lower_part->bounding_box(); | |
| 2020 int mid_x = (left_x + right_x) / 2; | |
| 2021 int mid_y = (top_y + bottom_y) / 2; | |
| 2022 int other_x = (lower_box.left() + lower_box.right()) / 2; | |
| 2023 int other_y = (lower_box.top() + lower_box.bottom()) / 2; | |
| 2024 win->Brush(ScrollView::NONE); | |
| 2025 win->Pen(color); | |
| 2026 win->Line(mid_x, mid_y, other_x, other_y); | |
| 2027 } | |
| 2028 } | |
| 2029 win->UpdateWindow(); | |
| 2030 } | |
| 2031 | |
| 2032 #endif | |
| 2033 | |
| 2034 // Merge all colpartitions in table regions to make them a single | |
| 2035 // colpartition and revert types of isolated table cells not | |
| 2036 // assigned to any table to their original types. | |
| 2037 void TableFinder::MakeTableBlocks(ColPartitionGrid *grid, | |
| 2038 ColPartitionSet **all_columns, | |
| 2039 const WidthCallback &width_cb) { | |
| 2040 // Since we have table blocks already, remove table tags from all | |
| 2041 // colpartitions | |
| 2042 ColPartitionGridSearch gsearch(grid); | |
| 2043 gsearch.StartFullSearch(); | |
| 2044 ColPartition *part = nullptr; | |
| 2045 | |
| 2046 while ((part = gsearch.NextFullSearch()) != nullptr) { | |
| 2047 if (part->type() == PT_TABLE) { | |
| 2048 part->clear_table_type(); | |
| 2049 } | |
| 2050 } | |
| 2051 // Now make a single colpartition out of each table block and remove | |
| 2052 // all colpartitions contained within a table | |
| 2053 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search( | |
| 2054 &table_grid_); | |
| 2055 table_search.StartFullSearch(); | |
| 2056 ColSegment *table; | |
| 2057 while ((table = table_search.NextFullSearch()) != nullptr) { | |
| 2058 const TBOX &table_box = table->bounding_box(); | |
| 2059 // Start a rect search on table_box | |
| 2060 ColPartitionGridSearch rectsearch(grid); | |
| 2061 rectsearch.StartRectSearch(table_box); | |
| 2062 ColPartition *part; | |
| 2063 ColPartition *table_partition = nullptr; | |
| 2064 while ((part = rectsearch.NextRectSearch()) != nullptr) { | |
| 2065 // Do not consider image partitions | |
| 2066 if (!part->IsTextType()) { | |
| 2067 continue; | |
| 2068 } | |
| 2069 TBOX part_box = part->bounding_box(); | |
| 2070 // Include partition in the table if more than half of it | |
| 2071 // is covered by the table | |
| 2072 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { | |
| 2073 rectsearch.RemoveBBox(); | |
| 2074 if (table_partition) { | |
| 2075 table_partition->Absorb(part, width_cb); | |
| 2076 } else { | |
| 2077 table_partition = part; | |
| 2078 } | |
| 2079 } | |
| 2080 } | |
| 2081 // Insert table colpartition back to part_grid_ | |
| 2082 if (table_partition) { | |
| 2083 // To match the columns used when transforming to blocks, the new table | |
| 2084 // partition must have its first and last column set at the grid y that | |
| 2085 // corresponds to its bottom. | |
| 2086 const TBOX &table_box = table_partition->bounding_box(); | |
| 2087 int grid_x, grid_y; | |
| 2088 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y); | |
| 2089 table_partition->SetPartitionType(resolution_, all_columns[grid_y]); | |
| 2090 table_partition->set_table_type(); | |
| 2091 table_partition->set_blob_type(BRT_TEXT); | |
| 2092 table_partition->set_flow(BTFT_CHAIN); | |
| 2093 table_partition->SetBlobTypes(); | |
| 2094 grid->InsertBBox(true, true, table_partition); | |
| 2095 } | |
| 2096 } | |
| 2097 } | |
| 2098 | |
| 2099 //////// ColSegment code | |
| 2100 //////// | |
| 2101 ColSegment::ColSegment() | |
| 2102 : ELIST_LINK(), | |
| 2103 num_table_cells_(0), | |
| 2104 num_text_cells_(0), | |
| 2105 type_(COL_UNKNOWN) {} | |
| 2106 | |
| 2107 // Provides a color for BBGrid to draw the rectangle. | |
| 2108 ScrollView::Color ColSegment::BoxColor() const { | |
| 2109 const ScrollView::Color kBoxColors[PT_COUNT] = { | |
| 2110 ScrollView::YELLOW, | |
| 2111 ScrollView::BLUE, | |
| 2112 ScrollView::YELLOW, | |
| 2113 ScrollView::MAGENTA, | |
| 2114 }; | |
| 2115 return kBoxColors[type_]; | |
| 2116 } | |
| 2117 | |
| 2118 // Insert a box into this column segment | |
| 2119 void ColSegment::InsertBox(const TBOX &other) { | |
| 2120 bounding_box_ = bounding_box_.bounding_union(other); | |
| 2121 } | |
| 2122 | |
| 2123 // Set column segment type based on the ratio of text and table partitions | |
| 2124 // in it. | |
| 2125 void ColSegment::set_type() { | |
| 2126 if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) { | |
| 2127 type_ = COL_TABLE; | |
| 2128 } else if (num_text_cells_ > num_table_cells_) { | |
| 2129 type_ = COL_TEXT; | |
| 2130 } else { | |
| 2131 type_ = COL_MIXED; | |
| 2132 } | |
| 2133 } | |
| 2134 | |
| 2135 } // namespace tesseract. |
