comparison mupdf-source/thirdparty/tesseract/src/textord/tablefind.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author: Faisal Shafait (faisal.shafait@dfki.de)
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #ifdef HAVE_CONFIG_H
20 # include "config_auto.h"
21 #endif
22
23 #include <algorithm>
24 #include <cmath>
25 #include <utility>
26 #include "tablefind.h"
27
28 #include <allheaders.h>
29
30 #include "colpartitionset.h"
31 #include "tablerecog.h"
32
33 namespace tesseract {
34
35 // These numbers are used to calculate the global median stats.
36 // They just set an upper bound on the stats objects.
37 // Maximum vertical spacing between neighbor partitions.
38 const int kMaxVerticalSpacing = 500;
39 // Maximum width of a blob in a partition.
40 const int kMaxBlobWidth = 500;
41
42 // Minimum whitespace size to split a partition (measured as a multiple
43 // of a partition's median width).
44 const double kSplitPartitionSize = 2.0;
45 // To insert text, the partition must satisfy these size constraints
46 // in AllowTextPartition(). The idea is to filter noise partitions
47 // determined by the size compared to the global medians.
48 // TODO(nbeato): Need to find good numbers again.
49 const double kAllowTextHeight = 0.5;
50 const double kAllowTextWidth = 0.6;
51 const double kAllowTextArea = 0.8;
52 // The same thing applies to blobs (to filter noise).
53 // TODO(nbeato): These numbers are a shot in the dark...
54 // height and width are 0.5 * gridsize() in colfind.cpp
55 // area is a rough guess for the size of a period.
56 const double kAllowBlobHeight = 0.3;
57 const double kAllowBlobWidth = 0.4;
58 const double kAllowBlobArea = 0.05;
59
60 // Minimum number of components in a text partition. A partition having fewer
61 // components than that is more likely a data partition and is a candidate
62 // table cell.
63 const int kMinBoxesInTextPartition = 10;
64
65 // Maximum number of components that a data partition can have
66 const int kMaxBoxesInDataPartition = 20;
67
68 // Maximum allowed gap in a text partitions as a multiple of its median size.
69 const double kMaxGapInTextPartition = 4.0;
70
71 // Minimum value that the maximum gap in a text partition should have as a
72 // factor of its median size.
73 const double kMinMaxGapInTextPartition = 0.5;
74
75 // The amount of overlap that is "normal" for adjacent blobs in a text
76 // partition. This is used to calculate gap between overlapping blobs.
77 const double kMaxBlobOverlapFactor = 4.0;
78
79 // Maximum x-height a table partition can have as a multiple of global
80 // median x-height
81 const double kMaxTableCellXheight = 2.0;
82
83 // Maximum line spacing between a table column header and column contents
84 // for merging the two (as a multiple of the partition's median_height).
85 const int kMaxColumnHeaderDistance = 4;
86
87 // Minimum ratio of num_table_partitions to num_text_partitions in a column
88 // block to be called it a table column
89 const double kTableColumnThreshold = 3.0;
90
91 // Search for horizontal ruling lines within the vertical margin as a
92 // multiple of grid size
93 // const int kRulingVerticalMargin = 3;
94
95 // Minimum overlap that a colpartition must have with a table region
96 // to become part of that table
97 const double kMinOverlapWithTable = 0.6;
98
99 // Maximum side space (distance from column boundary) that a typical
100 // text-line in flowing text should have as a multiple of its x-height
101 // (Median size).
102 const int kSideSpaceMargin = 10;
103
104 // Fraction of the peak of x-projection of a table region to set the
105 // threshold for the x-projection histogram
106 const double kSmallTableProjectionThreshold = 0.35;
107 const double kLargeTableProjectionThreshold = 0.45;
108 // Minimum number of rows required to look for more rows in the projection.
109 const int kLargeTableRowCount = 6;
110
111 // Minimum number of rows in a table
112 const int kMinRowsInTable = 3;
113
114 // The amount of padding (multiplied by global_median_xheight_ during use)
115 // that is vertically added to the search adjacent leader search during
116 // ColPartition marking.
117 const int kAdjacentLeaderSearchPadding = 2;
118
119 // Used when filtering false positives. When finding the last line
120 // of a paragraph (typically left-aligned), the previous line should have
121 // its center to the right of the last line by this scaled amount.
122 const double kParagraphEndingPreviousLineRatio = 1.3;
123
124 // The maximum amount of whitespace allowed left of a paragraph ending.
125 // Do not filter a ColPartition with more than this space left of it.
126 const double kMaxParagraphEndingLeftSpaceMultiple = 3.0;
127
128 // Used when filtering false positives. The last line of a paragraph
129 // should be preceded by a line that is predominantly text. This is the
130 // ratio of text to whitespace (to the right of the text) that is required
131 // for the previous line to be a text.
132 const double kMinParagraphEndingTextToWhitespaceRatio = 3.0;
133
134 // When counting table columns, this is the required gap between two columns
135 // (it is multiplied by global_median_xheight_).
136 const double kMaxXProjectionGapFactor = 2.0;
137
138 // Used for similarity in partitions using stroke width. Values copied
139 // from ColFind.cpp in Ray's CL.
140 const double kStrokeWidthFractionalTolerance = 0.25;
141 const double kStrokeWidthConstantTolerance = 2.0;
142
143 #ifndef GRAPHICS_DISABLED
144 static BOOL_VAR(textord_show_tables, false, "Show table regions (ScrollView)");
145 static BOOL_VAR(textord_tablefind_show_mark, false,
146 "Debug table marking steps in detail (ScrollView)");
147 static BOOL_VAR(textord_tablefind_show_stats, false,
148 "Show page stats used in table finding (ScrollView)");
149 #endif
150 static BOOL_VAR(textord_tablefind_recognize_tables, false,
151 "Enables the table recognizer for table layout and filtering.");
152
153 // Templated helper function used to create destructor callbacks for the
154 // BBGrid::ClearGridData() method.
155 template <typename T>
156 void DeleteObject(T *object) {
157 delete object;
158 }
159
160 TableFinder::TableFinder()
161 : resolution_(0),
162 global_median_xheight_(0),
163 global_median_blob_width_(0),
164 global_median_ledding_(0),
165 left_to_right_language_(true) {}
166
167 TableFinder::~TableFinder() {
168 // ColPartitions and ColSegments created by this class for storage in grids
169 // need to be deleted explicitly.
170 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174 table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175 }
176
177 void TableFinder::set_left_to_right_language(bool order) {
178 left_to_right_language_ = order;
179 }
180
181 void TableFinder::Init(int grid_size, const ICOORD &bottom_left,
182 const ICOORD &top_right) {
183 // Initialize clean partitions list and grid
184 clean_part_grid_.Init(grid_size, bottom_left, top_right);
185 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186 fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187 col_seg_grid_.Init(grid_size, bottom_left, top_right);
188 table_grid_.Init(grid_size, bottom_left, top_right);
189 }
190
191 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
192 // insert leaders and rulers into the leader_and_ruling_grid_
193 void TableFinder::InsertCleanPartitions(ColPartitionGrid *grid,
194 TO_BLOCK *block) {
195 // Calculate stats. This lets us filter partitions in AllowTextPartition()
196 // and filter blobs in AllowBlob().
197 SetGlobalSpacings(grid);
198
199 // Iterate the ColPartitions in the grid.
200 ColPartitionGridSearch gsearch(grid);
201 gsearch.SetUniqueMode(true);
202 gsearch.StartFullSearch();
203 ColPartition *part = nullptr;
204 while ((part = gsearch.NextFullSearch()) != nullptr) {
205 // Reject partitions with nothing useful inside of them.
206 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) {
207 continue;
208 }
209 ColPartition *clean_part = part->ShallowCopy();
210 ColPartition *leader_part = nullptr;
211 if (part->IsLineType()) {
212 InsertRulingPartition(clean_part);
213 continue;
214 }
215 // Insert all non-text partitions to clean_parts
216 if (!part->IsTextType()) {
217 InsertImagePartition(clean_part);
218 continue;
219 }
220 // Insert text colpartitions after removing noisy components from them
221 // The leaders are split into a separate grid.
222 BLOBNBOX_CLIST *part_boxes = part->boxes();
223 BLOBNBOX_C_IT pit(part_boxes);
224 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
225 BLOBNBOX *pblob = pit.data();
226 // Bad blobs... happens in UNLV set.
227 // news.3G1, page 17 (around x=6)
228 if (!AllowBlob(*pblob)) {
229 continue;
230 }
231 if (pblob->flow() == BTFT_LEADER) {
232 if (leader_part == nullptr) {
233 leader_part = part->ShallowCopy();
234 leader_part->set_flow(BTFT_LEADER);
235 }
236 leader_part->AddBox(pblob);
237 } else if (pblob->region_type() != BRT_NOISE) {
238 clean_part->AddBox(pblob);
239 }
240 }
241 clean_part->ComputeLimits();
242 ColPartition *fragmented = clean_part->CopyButDontOwnBlobs();
243 InsertTextPartition(clean_part);
244 SplitAndInsertFragmentedTextPartition(fragmented);
245 if (leader_part != nullptr) {
246 // TODO(nbeato): Note that ComputeLimits does not update the column
247 // information. So the leader may appear to span more columns than it
248 // really does later on when IsInSameColumnAs gets called to test
249 // for adjacent leaders.
250 leader_part->ComputeLimits();
251 InsertLeaderPartition(leader_part);
252 }
253 }
254
255 // Make the partition partners better for upper and lower neighbors.
256 clean_part_grid_.FindPartitionPartners();
257 clean_part_grid_.RefinePartitionPartners(false);
258 }
259
260 // High level function to perform table detection
261 void TableFinder::LocateTables(ColPartitionGrid *grid,
262 ColPartitionSet **all_columns,
263 WidthCallback width_cb, const FCOORD &reskew) {
264 // initialize spacing, neighbors, and columns
265 InitializePartitions(all_columns);
266
267 #ifndef GRAPHICS_DISABLED
268 if (textord_show_tables) {
269 ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
270 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
271 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
272 ScrollView::AQUAMARINE);
273 DisplayColPartitionConnections(table_win, &clean_part_grid_,
274 ScrollView::ORANGE);
275
276 table_win = MakeWindow(100, 300, "Fragmented Text");
277 DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE);
278 }
279 #endif // !GRAPHICS_DISABLED
280
281 // mark, filter, and smooth candidate table partitions
282 MarkTablePartitions();
283
284 // Make single-column blocks from good_columns_ partitions. col_segments are
285 // moved to a grid later which takes the ownership
286 ColSegment_LIST column_blocks;
287 GetColumnBlocks(all_columns, &column_blocks);
288 // Set the ratio of candidate table partitions in each column
289 SetColumnsType(&column_blocks);
290
291 // Move column segments to col_seg_grid_
292 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
293
294 // Detect split in column layout that might have occurred due to the
295 // presence of a table. In such a case, merge the corresponding columns.
296 GridMergeColumnBlocks();
297
298 // Group horizontally overlapping table partitions into table columns.
299 // table_columns created here get deleted at the end of this method.
300 ColSegment_LIST table_columns;
301 GetTableColumns(&table_columns);
302
303 // Within each column, mark the range table regions occupy based on the
304 // table columns detected. table_regions are moved to a grid later which
305 // takes the ownership
306 ColSegment_LIST table_regions;
307 GetTableRegions(&table_columns, &table_regions);
308
309 #ifndef GRAPHICS_DISABLED
310 if (textord_tablefind_show_mark) {
311 ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions");
312 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
313 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
314 }
315 #endif // !GRAPHICS_DISABLED
316
317 // Merge table regions across columns for tables spanning multiple
318 // columns
319 MoveColSegmentsToGrid(&table_regions, &table_grid_);
320 GridMergeTableRegions();
321
322 // Adjust table boundaries by including nearby horizontal lines and left
323 // out column headers
324 AdjustTableBoundaries();
325 GridMergeTableRegions();
326
327 if (textord_tablefind_recognize_tables) {
328 // Remove false alarms consisting of a single column
329 DeleteSingleColumnTables();
330
331 #ifndef GRAPHICS_DISABLED
332 if (textord_show_tables) {
333 ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations");
334 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
335 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
336 table_grid_.DisplayBoxes(table_win);
337 }
338 #endif // !GRAPHICS_DISABLED
339
340 // Find table grid structure and reject tables that are malformed.
341 RecognizeTables();
342 GridMergeTableRegions();
343 RecognizeTables();
344
345 #ifndef GRAPHICS_DISABLED
346 if (textord_show_tables) {
347 ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables");
348 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE,
349 ScrollView::BLUE);
350 table_grid_.DisplayBoxes(table_win);
351 }
352 #endif // !GRAPHICS_DISABLED
353 } else {
354 // Remove false alarms consisting of a single column
355 // TODO(nbeato): verify this is a NOP after structured table rejection.
356 // Right now it isn't. If the recognize function is doing what it is
357 // supposed to do, this function is obsolete.
358 DeleteSingleColumnTables();
359
360 #ifndef GRAPHICS_DISABLED
361 if (textord_show_tables) {
362 ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables");
363 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE,
364 ScrollView::BLUE);
365 table_grid_.DisplayBoxes(table_win);
366 }
367 #endif // !GRAPHICS_DISABLED
368 }
369
370 // Merge all colpartitions in table regions to make them a single
371 // colpartition and revert types of isolated table cells not
372 // assigned to any table to their original types.
373 MakeTableBlocks(grid, all_columns, width_cb);
374 }
375 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
376 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
377 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
378 // dimensions instead of duplicated memory.
379 int TableFinder::gridsize() const {
380 return clean_part_grid_.gridsize();
381 }
382 int TableFinder::gridwidth() const {
383 return clean_part_grid_.gridwidth();
384 }
385 int TableFinder::gridheight() const {
386 return clean_part_grid_.gridheight();
387 }
388 const ICOORD &TableFinder::bleft() const {
389 return clean_part_grid_.bleft();
390 }
391 const ICOORD &TableFinder::tright() const {
392 return clean_part_grid_.tright();
393 }
394
395 void TableFinder::InsertTextPartition(ColPartition *part) {
396 ASSERT_HOST(part != nullptr);
397 if (AllowTextPartition(*part)) {
398 clean_part_grid_.InsertBBox(true, true, part);
399 } else {
400 delete part;
401 }
402 }
403 void TableFinder::InsertFragmentedTextPartition(ColPartition *part) {
404 ASSERT_HOST(part != nullptr);
405 if (AllowTextPartition(*part)) {
406 fragmented_text_grid_.InsertBBox(true, true, part);
407 } else {
408 delete part;
409 }
410 }
411 void TableFinder::InsertLeaderPartition(ColPartition *part) {
412 ASSERT_HOST(part != nullptr);
413 if (!part->IsEmpty() && part->bounding_box().area() > 0) {
414 leader_and_ruling_grid_.InsertBBox(true, true, part);
415 } else {
416 delete part;
417 }
418 }
419 void TableFinder::InsertRulingPartition(ColPartition *part) {
420 leader_and_ruling_grid_.InsertBBox(true, true, part);
421 }
422 void TableFinder::InsertImagePartition(ColPartition *part) {
423 // NOTE: If images are placed into a different grid in the future,
424 // the function SetPartitionSpacings needs to be updated. It should
425 // be the only thing that cares about image partitions.
426 clean_part_grid_.InsertBBox(true, true, part);
427 }
428
429 // Splits a partition into its "words". The splits happen
430 // at locations with wide inter-blob spacing. This is useful
431 // because it allows the table recognize to "cut through" the
432 // text lines on the page. The assumption is that a table
433 // will have several lines with similar overlapping whitespace
434 // whereas text will not have this type of property.
435 // Note: The code assumes that blobs are sorted by the left side x!
436 // This will not work (as well) if the blobs are sorted by center/right.
437 void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition *part) {
438 ASSERT_HOST(part != nullptr);
439 // Bye bye empty partitions!
440 if (part->boxes()->empty()) {
441 delete part;
442 return;
443 }
444
445 // The AllowBlob function prevents this.
446 ASSERT_HOST(part->median_width() > 0);
447 const double kThreshold = part->median_width() * kSplitPartitionSize;
448
449 ColPartition *right_part = part;
450 bool found_split = true;
451 while (found_split) {
452 found_split = false;
453 BLOBNBOX_C_IT box_it(right_part->boxes());
454 // Blobs are sorted left side first. If blobs overlap,
455 // the previous blob may have a "more right" right side.
456 // Account for this by always keeping the largest "right"
457 // so far.
458 int previous_right = INT32_MIN;
459
460 // Look for the next split in the partition.
461 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462 const TBOX &box = box_it.data()->bounding_box();
463 if (previous_right != INT32_MIN &&
464 box.left() - previous_right > kThreshold) {
465 // We have a split position. Split the partition in two pieces.
466 // Insert the left piece in the grid and keep processing the right.
467 int mid_x = (box.left() + previous_right) / 2;
468 ColPartition *left_part = right_part;
469 right_part = left_part->SplitAt(mid_x);
470
471 InsertFragmentedTextPartition(left_part);
472 found_split = true;
473 break;
474 }
475
476 // The right side of the previous blobs.
477 previous_right = std::max(previous_right, static_cast<int>(box.right()));
478 }
479 }
480 // When a split is not found, the right part is minimized
481 // as much as possible, so process it.
482 InsertFragmentedTextPartition(right_part);
483 }
484
485 // Some simple criteria to filter out now. We want to make sure the
486 // average blob size in the partition is consistent with the
487 // global page stats.
488 // The area metric will almost always pass for multi-blob partitions.
489 // It is useful when filtering out noise caused by an isolated blob.
490 bool TableFinder::AllowTextPartition(const ColPartition &part) const {
491 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
492 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
493 const int median_area = global_median_xheight_ * global_median_blob_width_;
494 const double kAreaPerBlobRequired = median_area * kAllowTextArea;
495 // Keep comparisons strictly greater to disallow 0!
496 return part.median_height() > kHeightRequired &&
497 part.median_width() > kWidthRequired &&
498 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
499 }
500
501 // Same as above, applied to blobs. Keep in mind that
502 // leaders, commas, and periods are important in tables.
503 bool TableFinder::AllowBlob(const BLOBNBOX &blob) const {
504 const TBOX &box = blob.bounding_box();
505 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
506 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
507 const int median_area = global_median_xheight_ * global_median_blob_width_;
508 const double kAreaRequired = median_area * kAllowBlobArea;
509 // Keep comparisons strictly greater to disallow 0!
510 return box.height() > kHeightRequired && box.width() > kWidthRequired &&
511 box.area() > kAreaRequired;
512 }
513
514 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
515 // The only downside is that window messages will be caught by
516 // clean_part_grid_ instead of a useful object. This is a temporary solution
517 // for the debug windows created by the TableFinder.
518 #ifndef GRAPHICS_DISABLED
519 ScrollView *TableFinder::MakeWindow(int x, int y, const char *window_name) {
520 return clean_part_grid_.MakeWindow(x, y, window_name);
521 }
522 #endif
523
524 // Make single-column blocks from good_columns_ partitions.
525 void TableFinder::GetColumnBlocks(ColPartitionSet **all_columns,
526 ColSegment_LIST *column_blocks) {
527 for (int i = 0; i < gridheight(); ++i) {
528 ColPartitionSet *columns = all_columns[i];
529 if (columns != nullptr) {
530 ColSegment_LIST new_blocks;
531 // Get boxes from the current vertical position on the grid
532 columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(),
533 &new_blocks);
534 // Merge the new_blocks boxes into column_blocks if they are well-aligned
535 GroupColumnBlocks(&new_blocks, column_blocks);
536 }
537 }
538 }
539
540 // Merge column segments into the current list if they are well aligned.
541 void TableFinder::GroupColumnBlocks(ColSegment_LIST *new_blocks,
542 ColSegment_LIST *column_blocks) {
543 ColSegment_IT src_it(new_blocks);
544 ColSegment_IT dest_it(column_blocks);
545 // iterate through the source list
546 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
547 ColSegment *src_seg = src_it.data();
548 const TBOX &src_box = src_seg->bounding_box();
549 bool match_found = false;
550 // iterate through the destination list to find a matching column block
551 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
552 ColSegment *dest_seg = dest_it.data();
553 TBOX dest_box = dest_seg->bounding_box();
554 if (ConsecutiveBoxes(src_box, dest_box)) {
555 // If matching block is found, insert the current block into it
556 // and delete the source block.
557 dest_seg->InsertBox(src_box);
558 match_found = true;
559 delete src_it.extract();
560 break;
561 }
562 }
563 // If no match is found, just append the source block to column_blocks
564 if (!match_found) {
565 dest_it.add_after_then_move(src_it.extract());
566 }
567 }
568 }
569
570 // are the two boxes immediate neighbors along the vertical direction
571 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
572 int x_margin = 20;
573 int y_margin = 5;
574 return (abs(b1.left() - b2.left()) < x_margin) &&
575 (abs(b1.right() - b2.right()) < x_margin) &&
576 (abs(b1.top() - b2.bottom()) < y_margin ||
577 abs(b2.top() - b1.bottom()) < y_margin);
578 }
579
580 // Set up info for clean_part_grid_ partitions to be valid during detection
581 // code.
582 void TableFinder::InitializePartitions(ColPartitionSet **all_columns) {
583 FindNeighbors();
584 SetPartitionSpacings(&clean_part_grid_, all_columns);
585 SetGlobalSpacings(&clean_part_grid_);
586 }
587
588 // Set left, right and top, bottom spacings of each colpartition.
589 void TableFinder::SetPartitionSpacings(ColPartitionGrid *grid,
590 ColPartitionSet **all_columns) {
591 // Iterate the ColPartitions in the grid.
592 ColPartitionGridSearch gsearch(grid);
593 gsearch.StartFullSearch();
594 ColPartition *part = nullptr;
595 while ((part = gsearch.NextFullSearch()) != nullptr) {
596 ColPartitionSet *columns = all_columns[gsearch.GridY()];
597 TBOX box = part->bounding_box();
598 int y = part->MidY();
599 ColPartition *left_column = columns->ColumnContaining(box.left(), y);
600 ColPartition *right_column = columns->ColumnContaining(box.right(), y);
601 // set distance from left column as space to the left
602 if (left_column) {
603 int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
604 part->set_space_to_left(left_space);
605 }
606 // set distance from right column as space to the right
607 if (right_column) {
608 int right_space = std::max(0, right_column->RightAtY(y) - box.right());
609 part->set_space_to_right(right_space);
610 }
611
612 // Look for images that may be closer.
613 // NOTE: used to be part_grid_, might cause issues now
614 ColPartitionGridSearch hsearch(grid);
615 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
616 ColPartition *neighbor = nullptr;
617 while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
618 if (neighbor->type() == PT_PULLOUT_IMAGE ||
619 neighbor->type() == PT_FLOWING_IMAGE ||
620 neighbor->type() == PT_HEADING_IMAGE) {
621 int right = neighbor->bounding_box().right();
622 if (right < box.left()) {
623 int space = std::min(box.left() - right, part->space_to_left());
624 part->set_space_to_left(space);
625 }
626 }
627 }
628 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
629 neighbor = nullptr;
630 while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
631 if (neighbor->type() == PT_PULLOUT_IMAGE ||
632 neighbor->type() == PT_FLOWING_IMAGE ||
633 neighbor->type() == PT_HEADING_IMAGE) {
634 int left = neighbor->bounding_box().left();
635 if (left > box.right()) {
636 int space = std::min(left - box.right(), part->space_to_right());
637 part->set_space_to_right(space);
638 }
639 }
640 }
641
642 ColPartition *upper_part = part->SingletonPartner(true);
643 if (upper_part) {
644 int space =
645 std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
646 part->bounding_box().bottom()));
647 part->set_space_above(space);
648 } else {
649 // TODO(nbeato): What constitutes a good value?
650 // 0 is the default value when not set, explicitly noting it needs to
651 // be something else.
652 part->set_space_above(INT32_MAX);
653 }
654
655 ColPartition *lower_part = part->SingletonPartner(false);
656 if (lower_part) {
657 int space =
658 std::max(0, static_cast<int>(part->bounding_box().bottom() -
659 lower_part->bounding_box().bottom()));
660 part->set_space_below(space);
661 } else {
662 // TODO(nbeato): What constitutes a good value?
663 // 0 is the default value when not set, explicitly noting it needs to
664 // be something else.
665 part->set_space_below(INT32_MAX);
666 }
667 }
668 }
669
670 // Set spacing and closest neighbors above and below a given colpartition.
671 void TableFinder::SetVerticalSpacing(ColPartition *part) {
672 TBOX box = part->bounding_box();
673 int top_range =
674 std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
675 int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing,
676 static_cast<int>(bleft().y()));
677 box.set_top(top_range);
678 box.set_bottom(bottom_range);
679
680 TBOX part_box = part->bounding_box();
681 // Start a rect search
682 ColPartitionGridSearch rectsearch(&clean_part_grid_);
683 rectsearch.StartRectSearch(box);
684 ColPartition *neighbor;
685 int min_space_above = kMaxVerticalSpacing;
686 int min_space_below = kMaxVerticalSpacing;
687 ColPartition *above_neighbor = nullptr;
688 ColPartition *below_neighbor = nullptr;
689 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
690 if (neighbor == part) {
691 continue;
692 }
693 TBOX neighbor_box = neighbor->bounding_box();
694 if (neighbor_box.major_x_overlap(part_box)) {
695 int gap = abs(part->median_bottom() - neighbor->median_bottom());
696 // If neighbor is below current partition
697 if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) {
698 min_space_below = gap;
699 below_neighbor = neighbor;
700 } // If neighbor is above current partition
701 else if (part_box.top() < neighbor_box.bottom() &&
702 gap < min_space_above) {
703 min_space_above = gap;
704 above_neighbor = neighbor;
705 }
706 }
707 }
708 part->set_space_above(min_space_above);
709 part->set_space_below(min_space_below);
710 part->set_nearest_neighbor_above(above_neighbor);
711 part->set_nearest_neighbor_below(below_neighbor);
712 }
713
714 // Set global spacing and x-height estimates
715 void TableFinder::SetGlobalSpacings(ColPartitionGrid *grid) {
716 STATS xheight_stats(0, kMaxVerticalSpacing);
717 STATS width_stats(0, kMaxBlobWidth);
718 STATS ledding_stats(0, kMaxVerticalSpacing);
719 // Iterate the ColPartitions in the grid.
720 ColPartitionGridSearch gsearch(grid);
721 gsearch.SetUniqueMode(true);
722 gsearch.StartFullSearch();
723 ColPartition *part = nullptr;
724 while ((part = gsearch.NextFullSearch()) != nullptr) {
725 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
726 // ComputeLimits needs to get called somewhere outside of TableFinder
727 // to make sure the partitions are properly initialized.
728 // When this is called, SmoothPartitionPartners dies in an assert after
729 // table find runs. Alternative solution.
730 // part->ComputeLimits();
731 if (part->IsTextType()) {
732 // xheight_stats.add(part->median_height(), part->boxes_count());
733 // width_stats.add(part->median_width(), part->boxes_count());
734
735 // This loop can be removed when above issues are fixed.
736 // Replace it with the 2 lines commented out above.
737 BLOBNBOX_C_IT it(part->boxes());
738 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
739 xheight_stats.add(it.data()->bounding_box().height(), 1);
740 width_stats.add(it.data()->bounding_box().width(), 1);
741 }
742
743 ledding_stats.add(part->space_above(), 1);
744 ledding_stats.add(part->space_below(), 1);
745 }
746 }
747 // Set estimates based on median of statistics obtained
748 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
749 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
750 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
751 #ifndef GRAPHICS_DISABLED
752 if (textord_tablefind_show_stats) {
753 const char *kWindowName = "X-height (R), X-width (G), and ledding (B)";
754 ScrollView *stats_win = MakeWindow(500, 10, kWindowName);
755 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
756 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
757 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
758 }
759 #endif // !GRAPHICS_DISABLED
760 }
761
762 void TableFinder::set_global_median_xheight(int xheight) {
763 global_median_xheight_ = xheight;
764 }
765 void TableFinder::set_global_median_blob_width(int width) {
766 global_median_blob_width_ = width;
767 }
768 void TableFinder::set_global_median_ledding(int ledding) {
769 global_median_ledding_ = ledding;
770 }
771
772 void TableFinder::FindNeighbors() {
773 ColPartitionGridSearch gsearch(&clean_part_grid_);
774 gsearch.StartFullSearch();
775 ColPartition *part = nullptr;
776 while ((part = gsearch.NextFullSearch()) != nullptr) {
777 // TODO(nbeato): Rename this function, meaning is different now.
778 // IT is finding nearest neighbors its own way
779 // SetVerticalSpacing(part);
780
781 ColPartition *upper = part->SingletonPartner(true);
782 if (upper) {
783 part->set_nearest_neighbor_above(upper);
784 }
785
786 ColPartition *lower = part->SingletonPartner(false);
787 if (lower) {
788 part->set_nearest_neighbor_below(lower);
789 }
790 }
791 }
792
793 // High level interface. Input is an unmarked ColPartitionGrid
794 // (namely, clean_part_grid_). Partitions are identified using local
795 // information and filter/smoothed. The function exit should contain
796 // a good sampling of the table partitions.
797 void TableFinder::MarkTablePartitions() {
798 MarkPartitionsUsingLocalInformation();
799 #ifndef GRAPHICS_DISABLED
800 if (textord_tablefind_show_mark) {
801 ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions");
802 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
803 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
804 ScrollView::AQUAMARINE);
805 }
806 #endif
807 FilterFalseAlarms();
808 #ifndef GRAPHICS_DISABLED
809 if (textord_tablefind_show_mark) {
810 ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions");
811 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
812 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
813 ScrollView::AQUAMARINE);
814 }
815 #endif
816 SmoothTablePartitionRuns();
817 #ifndef GRAPHICS_DISABLED
818 if (textord_tablefind_show_mark) {
819 ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
820 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
821 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
822 ScrollView::AQUAMARINE);
823 }
824 #endif
825 FilterFalseAlarms();
826 #ifndef GRAPHICS_DISABLED
827 if (textord_tablefind_show_mark || textord_show_tables) {
828 ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions");
829 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
830 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
831 ScrollView::AQUAMARINE);
832 }
833 #endif
834 }
835
836 // These types of partitions are marked as table partitions:
837 // 1- Partitions that have at lease one large gap between words
838 // 2- Partitions that consist of only one word (no significant gap
839 // between components)
840 // 3- Partitions that vertically overlap with other partitions within the
841 // same column.
842 // 4- Partitions with leaders before/after them.
843 void TableFinder::MarkPartitionsUsingLocalInformation() {
844 // Iterate the ColPartitions in the grid.
845 ColPartitionGridSearch gsearch(&clean_part_grid_);
846 gsearch.StartFullSearch();
847 ColPartition *part = nullptr;
848 while ((part = gsearch.NextFullSearch()) != nullptr) {
849 if (!part->IsTextType()) { // Only consider text partitions
850 continue;
851 }
852 // Only consider partitions in dominant font size or smaller
853 if (part->median_height() > kMaxTableCellXheight * global_median_xheight_) {
854 continue;
855 }
856 // Mark partitions with a large gap, or no significant gap as
857 // table partitions.
858 // Comments: It produces several false alarms at:
859 // - last line of a paragraph (fixed)
860 // - single word section headings
861 // - page headers and footers
862 // - numbered equations
863 // - line drawing regions
864 // TODO(faisal): detect and fix above-mentioned cases
865 if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) {
866 part->set_table_type();
867 }
868 }
869 }
870
871 // Check if the partition has at least one large gap between words or no
872 // significant gap at all
873 bool TableFinder::HasWideOrNoInterWordGap(ColPartition *part) const {
874 // Should only get text partitions.
875 ASSERT_HOST(part->IsTextType());
876 // Blob access
877 BLOBNBOX_CLIST *part_boxes = part->boxes();
878 BLOBNBOX_C_IT it(part_boxes);
879 // Check if this is a relatively small partition (such as a single word)
880 if (part->bounding_box().width() <
881 kMinBoxesInTextPartition * part->median_height() &&
882 part_boxes->length() < kMinBoxesInTextPartition) {
883 return true;
884 }
885
886 // Variables used to compute inter-blob spacing.
887 int previous_x1 = -1;
888 // Stores the maximum gap detected.
889 int largest_partition_gap_found = -1;
890 // Text partition gap limits. If this is text (and not a table),
891 // there should be at least one gap larger than min_gap and no gap
892 // larger than max_gap.
893 const double max_gap = kMaxGapInTextPartition * part->median_height();
894 const double min_gap = kMinMaxGapInTextPartition * part->median_height();
895
896 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
897 BLOBNBOX *blob = it.data();
898 int current_x0 = blob->bounding_box().left();
899 int current_x1 = blob->bounding_box().right();
900 if (previous_x1 != -1) {
901 int gap = current_x0 - previous_x1;
902
903 // TODO(nbeato): Boxes may overlap? Huh?
904 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
905 // on the top right of the page are filtered out with this line.
906 // Note 2: Iterating over blobs in a partition, so we are looking for
907 // spacing between the words.
908 if (gap < 0) {
909 // More likely case, the blobs slightly overlap. This can happen
910 // with diacritics (accents) or broken alphabet symbols (characters).
911 // Merge boxes together by taking max of right sides.
912 if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
913 previous_x1 = std::max(previous_x1, current_x1);
914 continue;
915 }
916 // Extreme case, blobs overlap significantly in the same partition...
917 // This should not happen often (if at all), but it does.
918 // TODO(nbeato): investigate cases when this happens.
919 else {
920 // The behavior before was to completely ignore this case.
921 }
922 }
923
924 // If a large enough gap is found, mark it as a table cell (return true)
925 if (gap > max_gap) {
926 return true;
927 }
928 if (gap > largest_partition_gap_found) {
929 largest_partition_gap_found = gap;
930 }
931 }
932 previous_x1 = current_x1;
933 }
934 // Since no large gap was found, return false if the partition is too
935 // long to be a data cell
936 if (part->bounding_box().width() >
937 kMaxBoxesInDataPartition * part->median_height() ||
938 part_boxes->length() > kMaxBoxesInDataPartition) {
939 return false;
940 }
941
942 // A partition may be a single blob. In this case, it's an isolated symbol
943 // or non-text (such as a ruling or image).
944 // Detect these as table partitions? Shouldn't this be case by case?
945 // The behavior before was to ignore this, making max_partition_gap < 0
946 // and implicitly return true. Just making it explicit.
947 if (largest_partition_gap_found == -1) {
948 return true;
949 }
950
951 // return true if the maximum gap found is smaller than the minimum allowed
952 // max_gap in a text partition. This indicates that there is no significant
953 // space in the partition, hence it is likely a single word.
954 return largest_partition_gap_found < min_gap;
955 }
956
957 // A criteria for possible tables is that a table may have leaders
958 // between data cells. An aggressive solution to find such tables is to
959 // explicitly mark partitions that have adjacent leaders.
960 // Note that this includes overlapping leaders. However, it does not
961 // include leaders in different columns on the page.
962 // Possible false-positive will include lists, such as a table of contents.
963 // As these arise, the aggressive nature of this search may need to be
964 // trimmed down.
965 bool TableFinder::HasLeaderAdjacent(const ColPartition &part) {
966 if (part.flow() == BTFT_LEADER) {
967 return true;
968 }
969 // Search range is left and right bounded by an offset of the
970 // median xheight. This offset is to allow some tolerance to the
971 // the leaders on the page in the event that the alignment is still
972 // a bit off.
973 const TBOX &box = part.bounding_box();
974 const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
975 const int top = box.top() + search_size;
976 const int bottom = box.bottom() - search_size;
977 ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
978 for (int direction = 0; direction < 2; ++direction) {
979 bool right_to_left = (direction == 0);
980 int x = right_to_left ? box.right() : box.left();
981 hsearch.StartSideSearch(x, bottom, top);
982 ColPartition *leader = nullptr;
983 while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
984 // The leader could be a horizontal ruling in the grid.
985 // Make sure it is actually a leader.
986 if (leader->flow() != BTFT_LEADER) {
987 continue;
988 }
989 // This should not happen, they are in different grids.
990 ASSERT_HOST(&part != leader);
991 // Make sure the leader shares a page column with the partition,
992 // otherwise we are spreading across columns.
993 if (!part.IsInSameColumnAs(*leader)) {
994 break;
995 }
996 // There should be a significant vertical overlap
997 if (!leader->VSignificantCoreOverlap(part)) {
998 continue;
999 }
1000 // Leader passed all tests, so it is adjacent.
1001 return true;
1002 }
1003 }
1004 // No leaders are adjacent to the given partition.
1005 return false;
1006 }
1007
1008 // Filter individual text partitions marked as table partitions
1009 // consisting of paragraph endings, small section headings, and
1010 // headers and footers.
1011 void TableFinder::FilterFalseAlarms() {
1012 FilterParagraphEndings();
1013 FilterHeaderAndFooter();
1014 // TODO(nbeato): Fully justified text as non-table?
1015 }
1016
1017 void TableFinder::FilterParagraphEndings() {
1018 // Detect last line of paragraph
1019 // Iterate the ColPartitions in the grid.
1020 ColPartitionGridSearch gsearch(&clean_part_grid_);
1021 gsearch.StartFullSearch();
1022 ColPartition *part = nullptr;
1023 while ((part = gsearch.NextFullSearch()) != nullptr) {
1024 if (part->type() != PT_TABLE) {
1025 continue; // Consider only table partitions
1026 }
1027
1028 // Paragraph ending should have flowing text above it.
1029 ColPartition *upper_part = part->nearest_neighbor_above();
1030 if (!upper_part) {
1031 continue;
1032 }
1033 if (upper_part->type() != PT_FLOWING_TEXT) {
1034 continue;
1035 }
1036 if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) {
1037 continue;
1038 }
1039 // Check if its the last line of a paragraph.
1040 // In most cases, a paragraph ending should be left-aligned to text line
1041 // above it. Sometimes, it could be a 2 line paragraph, in which case
1042 // the line above it is indented.
1043 // To account for that, check if the partition center is to
1044 // the left of the one above it.
1045 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1046 int upper_mid = (upper_part->bounding_box().left() +
1047 upper_part->bounding_box().right()) /
1048 2;
1049 int current_spacing = 0; // spacing of the current line to margin
1050 int upper_spacing = 0; // spacing of the previous line to the margin
1051 if (left_to_right_language_) {
1052 // Left to right languages, use mid - left to figure out the distance
1053 // the middle is from the left margin.
1054 int left = std::min(part->bounding_box().left(),
1055 upper_part->bounding_box().left());
1056 current_spacing = mid - left;
1057 upper_spacing = upper_mid - left;
1058 } else {
1059 // Right to left languages, use right - mid to figure out the distance
1060 // the middle is from the right margin.
1061 int right = std::max(part->bounding_box().right(),
1062 upper_part->bounding_box().right());
1063 current_spacing = right - mid;
1064 upper_spacing = right - upper_mid;
1065 }
1066 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) {
1067 continue;
1068 }
1069
1070 // Paragraphs should have similar fonts.
1071 if (!part->MatchingSizes(*upper_part) ||
1072 !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1073 kStrokeWidthConstantTolerance)) {
1074 continue;
1075 }
1076
1077 // The last line of a paragraph should be left aligned.
1078 // TODO(nbeato): This would be untrue if the text was right aligned.
1079 // How often is that?
1080 if (part->space_to_left() >
1081 kMaxParagraphEndingLeftSpaceMultiple * part->median_height()) {
1082 continue;
1083 }
1084 // The line above it should be right aligned (assuming justified format).
1085 // Since we can't assume justified text, we compare whitespace to text.
1086 // The above line should have majority spanning text (or the current
1087 // line could have fit on the previous line). So compare
1088 // whitespace to text.
1089 if (upper_part->bounding_box().width() <
1090 kMinParagraphEndingTextToWhitespaceRatio *
1091 upper_part->space_to_right()) {
1092 continue;
1093 }
1094
1095 // Ledding above the line should be less than ledding below
1096 if (part->space_above() >= part->space_below() ||
1097 part->space_above() > 2 * global_median_ledding_) {
1098 continue;
1099 }
1100
1101 // If all checks failed, it is probably text.
1102 part->clear_table_type();
1103 }
1104 }
1105
1106 void TableFinder::FilterHeaderAndFooter() {
1107 // Consider top-most text colpartition as header and bottom most as footer
1108 ColPartition *header = nullptr;
1109 ColPartition *footer = nullptr;
1110 int max_top = INT32_MIN;
1111 int min_bottom = INT32_MAX;
1112 ColPartitionGridSearch gsearch(&clean_part_grid_);
1113 gsearch.StartFullSearch();
1114 ColPartition *part = nullptr;
1115 while ((part = gsearch.NextFullSearch()) != nullptr) {
1116 if (!part->IsTextType()) {
1117 continue; // Consider only text partitions
1118 }
1119 int top = part->bounding_box().top();
1120 int bottom = part->bounding_box().bottom();
1121 if (top > max_top) {
1122 max_top = top;
1123 header = part;
1124 }
1125 if (bottom < min_bottom) {
1126 min_bottom = bottom;
1127 footer = part;
1128 }
1129 }
1130 if (header) {
1131 header->clear_table_type();
1132 }
1133 if (footer) {
1134 footer->clear_table_type();
1135 }
1136 }
1137
1138 // Mark all ColPartitions as table cells that have a table cell above
1139 // and below them
1140 // TODO(faisal): This is too aggressive at the moment. The method needs to
1141 // consider spacing and alignment as well. Detection of false alarm table cells
1142 // should also be done as part of it.
1143 void TableFinder::SmoothTablePartitionRuns() {
1144 // Iterate the ColPartitions in the grid.
1145 ColPartitionGridSearch gsearch(&clean_part_grid_);
1146 gsearch.StartFullSearch();
1147 ColPartition *part = nullptr;
1148 while ((part = gsearch.NextFullSearch()) != nullptr) {
1149 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) {
1150 continue; // Consider only text partitions
1151 }
1152 ColPartition *upper_part = part->nearest_neighbor_above();
1153 ColPartition *lower_part = part->nearest_neighbor_below();
1154 if (!upper_part || !lower_part) {
1155 continue;
1156 }
1157 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) {
1158 part->set_table_type();
1159 }
1160 }
1161
1162 // Pass 2, do the opposite. If both the upper and lower neighbors
1163 // exist and are not tables, this probably shouldn't be a table.
1164 gsearch.StartFullSearch();
1165 part = nullptr;
1166 while ((part = gsearch.NextFullSearch()) != nullptr) {
1167 if (part->type() != PT_TABLE) {
1168 continue; // Consider only text partitions
1169 }
1170 ColPartition *upper_part = part->nearest_neighbor_above();
1171 ColPartition *lower_part = part->nearest_neighbor_below();
1172
1173 // table can't be by itself
1174 if ((upper_part && upper_part->type() != PT_TABLE) &&
1175 (lower_part && lower_part->type() != PT_TABLE)) {
1176 part->clear_table_type();
1177 }
1178 }
1179 }
1180
1181 // Set the type of a column segment based on the ratio of table to text cells
1182 void TableFinder::SetColumnsType(ColSegment_LIST *column_blocks) {
1183 ColSegment_IT it(column_blocks);
1184 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1185 ColSegment *seg = it.data();
1186 TBOX box = seg->bounding_box();
1187 int num_table_cells = 0;
1188 int num_text_cells = 0;
1189 ColPartitionGridSearch rsearch(&clean_part_grid_);
1190 rsearch.SetUniqueMode(true);
1191 rsearch.StartRectSearch(box);
1192 ColPartition *part = nullptr;
1193 while ((part = rsearch.NextRectSearch()) != nullptr) {
1194 if (part->type() == PT_TABLE) {
1195 num_table_cells++;
1196 } else if (part->type() == PT_FLOWING_TEXT) {
1197 num_text_cells++;
1198 }
1199 }
1200 // If a column block has no text or table partition in it, it is not needed
1201 // for table detection.
1202 if (!num_table_cells && !num_text_cells) {
1203 delete it.extract();
1204 } else {
1205 seg->set_num_table_cells(num_table_cells);
1206 seg->set_num_text_cells(num_text_cells);
1207 // set column type based on the ratio of table to text cells
1208 seg->set_type();
1209 }
1210 }
1211 }
1212
1213 // Move column blocks to grid
1214 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1215 ColSegmentGrid *col_seg_grid) {
1216 ColSegment_IT it(segments);
1217 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1218 ColSegment *seg = it.extract();
1219 col_seg_grid->InsertBBox(true, true, seg);
1220 }
1221 }
1222
1223 // Merge column blocks if a split is detected due to the presence of a
1224 // table. A text block is considered split if it has multiple
1225 // neighboring blocks above/below it, and at least one of the
1226 // neighboring blocks is of table type (has a high density of table
1227 // partitions). In this case neighboring blocks in the direction
1228 // (above/below) of the table block are merged with the text block.
1229
1230 // Comment: This method does not handle split due to a full page table
1231 // since table columns in this case do not have a text column on which
1232 // split decision can be based.
1233 void TableFinder::GridMergeColumnBlocks() {
1234 int margin = gridsize();
1235
1236 // Iterate the Column Blocks in the grid.
1237 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1238 &col_seg_grid_);
1239 gsearch.StartFullSearch();
1240 ColSegment *seg;
1241 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1242 if (seg->type() != COL_TEXT) {
1243 continue; // only consider text blocks for split detection
1244 }
1245 bool neighbor_found = false;
1246 bool modified = false; // Modified at least once
1247 // keep expanding current box as long as neighboring table columns
1248 // are found above or below it.
1249 do {
1250 TBOX box = seg->bounding_box();
1251 // slightly expand the search region vertically
1252 int top_range =
1253 std::min(box.top() + margin, static_cast<int>(tright().y()));
1254 int bottom_range =
1255 std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1256 box.set_top(top_range);
1257 box.set_bottom(bottom_range);
1258 neighbor_found = false;
1259 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1260 &col_seg_grid_);
1261 rectsearch.StartRectSearch(box);
1262 ColSegment *neighbor = nullptr;
1263 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1264 if (neighbor == seg) {
1265 continue;
1266 }
1267 const TBOX &neighbor_box = neighbor->bounding_box();
1268 // If the neighbor box significantly overlaps with the current
1269 // box (due to the expansion of the current box in the
1270 // previous iteration of this loop), remove the neighbor box
1271 // and expand the current box to include it.
1272 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1273 seg->InsertBox(neighbor_box);
1274 modified = true;
1275 rectsearch.RemoveBBox();
1276 gsearch.RepositionIterator();
1277 delete neighbor;
1278 continue;
1279 }
1280 // Only expand if the neighbor box is of table type
1281 if (neighbor->type() != COL_TABLE) {
1282 continue;
1283 }
1284 // Insert the neighbor box into the current column block
1285 if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) {
1286 seg->InsertBox(neighbor_box);
1287 neighbor_found = true;
1288 modified = true;
1289 rectsearch.RemoveBBox();
1290 gsearch.RepositionIterator();
1291 delete neighbor;
1292 }
1293 }
1294 } while (neighbor_found);
1295 if (modified) {
1296 // Because the box has changed, it has to be removed first.
1297 gsearch.RemoveBBox();
1298 col_seg_grid_.InsertBBox(true, true, seg);
1299 gsearch.RepositionIterator();
1300 }
1301 }
1302 }
1303
1304 // Group horizontally overlapping table partitions into table columns.
1305 // TODO(faisal): This is too aggressive at the moment. The method should
1306 // consider more attributes to group table partitions together. Some common
1307 // errors are:
1308 // 1- page number is merged with a table column above it even
1309 // if there is a large vertical gap between them.
1310 // 2- column headers go on to catch one of the columns arbitrarily
1311 // 3- an isolated noise blob near page top or bottom merges with the table
1312 // column below/above it
1313 // 4- cells from two vertically adjacent tables merge together to make a
1314 // single column resulting in merging of the two tables
1315 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1316 ColSegment_IT it(table_columns);
1317 // Iterate the ColPartitions in the grid.
1318 ColPartitionGridSearch gsearch(&clean_part_grid_);
1319 gsearch.StartFullSearch();
1320 ColPartition *part;
1321 while ((part = gsearch.NextFullSearch()) != nullptr) {
1322 if (part->inside_table_column() || part->type() != PT_TABLE) {
1323 continue; // prevent a partition to be assigned to multiple columns
1324 }
1325 const TBOX &box = part->bounding_box();
1326 auto *col = new ColSegment();
1327 col->InsertBox(box);
1328 part->set_inside_table_column(true);
1329 // Start a search below the current cell to find bottom neighbours
1330 // Note: a full search will always process things above it first, so
1331 // this should be starting at the highest cell and working its way down.
1332 ColPartitionGridSearch vsearch(&clean_part_grid_);
1333 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1334 ColPartition *neighbor = nullptr;
1335 bool found_neighbours = false;
1336 while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1337 // only consider neighbors not assigned to any column yet
1338 if (neighbor->inside_table_column()) {
1339 continue;
1340 }
1341 // Horizontal lines should not break the flow
1342 if (neighbor->IsHorizontalLine()) {
1343 continue;
1344 }
1345 // presence of a non-table neighbor marks the end of current
1346 // table column
1347 if (neighbor->type() != PT_TABLE) {
1348 break;
1349 }
1350 // add the neighbor partition to the table column
1351 const TBOX &neighbor_box = neighbor->bounding_box();
1352 col->InsertBox(neighbor_box);
1353 neighbor->set_inside_table_column(true);
1354 found_neighbours = true;
1355 }
1356 if (found_neighbours) {
1357 it.add_after_then_move(col);
1358 } else {
1359 part->set_inside_table_column(false);
1360 delete col;
1361 }
1362 }
1363 }
1364
1365 // Mark regions in a column that are x-bounded by the column boundaries and
1366 // y-bounded by the table columns' projection on the y-axis as table regions
1367 void TableFinder::GetTableRegions(ColSegment_LIST *table_columns,
1368 ColSegment_LIST *table_regions) {
1369 ColSegment_IT cit(table_columns);
1370 ColSegment_IT rit(table_regions);
1371 // Iterate through column blocks
1372 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1373 &col_seg_grid_);
1374 gsearch.StartFullSearch();
1375 ColSegment *part;
1376 int page_height = tright().y() - bleft().y();
1377 ASSERT_HOST(page_height > 0);
1378 // create a bool array to hold projection on y-axis
1379 bool *table_region = new bool[page_height];
1380 while ((part = gsearch.NextFullSearch()) != nullptr) {
1381 const TBOX &part_box = part->bounding_box();
1382 // reset the projection array
1383 for (int i = 0; i < page_height; i++) {
1384 table_region[i] = false;
1385 }
1386 // iterate through all table columns to find regions in the current
1387 // page column block
1388 cit.move_to_first();
1389 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1390 TBOX col_box = cit.data()->bounding_box();
1391 // find intersection region of table column and page column
1392 TBOX intersection_box = col_box.intersection(part_box);
1393 // project table column on the y-axis
1394 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1395 table_region[i - bleft().y()] = true;
1396 }
1397 }
1398 // set x-limits of table regions to page column width
1399 TBOX current_table_box;
1400 current_table_box.set_left(part_box.left());
1401 current_table_box.set_right(part_box.right());
1402 // go through the y-axis projection to find runs of table
1403 // regions. Each run makes one table region.
1404 for (int i = 1; i < page_height; i++) {
1405 // detect start of a table region
1406 if (!table_region[i - 1] && table_region[i]) {
1407 current_table_box.set_bottom(i + bleft().y());
1408 }
1409 // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1410 // detect end of a table region
1411 if (table_region[i - 1] && !table_region[i]) {
1412 current_table_box.set_top(i + bleft().y());
1413 if (!current_table_box.null_box()) {
1414 auto *seg = new ColSegment();
1415 seg->InsertBox(current_table_box);
1416 rit.add_after_then_move(seg);
1417 }
1418 }
1419 }
1420 }
1421 delete[] table_region;
1422 }
1423
1424 // Merge table regions corresponding to tables spanning multiple columns if
1425 // there is a colpartition (horizontal ruling line or normal text) that
1426 // touches both regions.
1427 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
1428 // tables with aligned ruling lines. In this case, line finder returns a
1429 // single line and hence the tables get merged together
1430 void TableFinder::GridMergeTableRegions() {
1431 // Iterate the table regions in the grid.
1432 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1433 &table_grid_);
1434 gsearch.StartFullSearch();
1435 ColSegment *seg = nullptr;
1436 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1437 bool neighbor_found = false;
1438 bool modified = false; // Modified at least once
1439 do {
1440 // Start a rectangle search x-bounded by the image and y by the table
1441 const TBOX &box = seg->bounding_box();
1442 TBOX search_region(box);
1443 search_region.set_left(bleft().x());
1444 search_region.set_right(tright().x());
1445 neighbor_found = false;
1446 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1447 &table_grid_);
1448 rectsearch.StartRectSearch(search_region);
1449 ColSegment *neighbor = nullptr;
1450 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1451 if (neighbor == seg) {
1452 continue;
1453 }
1454 const TBOX &neighbor_box = neighbor->bounding_box();
1455 // Check if a neighbor box has a large overlap with the table
1456 // region. This may happen as a result of merging two table
1457 // regions in the previous iteration.
1458 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1459 seg->InsertBox(neighbor_box);
1460 rectsearch.RemoveBBox();
1461 gsearch.RepositionIterator();
1462 delete neighbor;
1463 modified = true;
1464 continue;
1465 }
1466 // Check if two table regions belong together based on a common
1467 // horizontal ruling line
1468 if (BelongToOneTable(box, neighbor_box)) {
1469 seg->InsertBox(neighbor_box);
1470 neighbor_found = true;
1471 modified = true;
1472 rectsearch.RemoveBBox();
1473 gsearch.RepositionIterator();
1474 delete neighbor;
1475 }
1476 }
1477 } while (neighbor_found);
1478 if (modified) {
1479 // Because the box has changed, it has to be removed first.
1480 gsearch.RemoveBBox();
1481 table_grid_.InsertBBox(true, true, seg);
1482 gsearch.RepositionIterator();
1483 }
1484 }
1485 }
1486
1487 // Decide if two table regions belong to one table based on a common
1488 // horizontal ruling line or another colpartition
1489 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1490 // Check the obvious case. Most likely not true because overlapping boxes
1491 // should already be merged, but seems like a good thing to do in case things
1492 // change.
1493 if (box1.overlap(box2)) {
1494 return true;
1495 }
1496 // Check for ColPartitions spanning both table regions
1497 TBOX bbox = box1.bounding_union(box2);
1498 // Start a rect search on bbox
1499 ColPartitionGridSearch rectsearch(&clean_part_grid_);
1500 rectsearch.StartRectSearch(bbox);
1501 ColPartition *part = nullptr;
1502 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1503 const TBOX &part_box = part->bounding_box();
1504 // return true if a colpartition spanning both table regions is found
1505 if (part_box.overlap(box1) && part_box.overlap(box2) &&
1506 !part->IsImageType()) {
1507 return true;
1508 }
1509 }
1510 return false;
1511 }
1512
1513 // Adjust table boundaries by:
1514 // - building a tight bounding box around all ColPartitions contained in it.
1515 // - expanding table boundaries to include all colpartitions that overlap the
1516 // table by more than half of their area
1517 // - expanding table boundaries to include nearby horizontal rule lines
1518 // - expanding table vertically to include left out column headers
1519 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1520 // makes following errors:
1521 // 1- horizontal lines consisting of underlines are included in the table if
1522 // they are close enough
1523 // 2- horizontal lines originating from noise tend to get merged with a table
1524 // near the top of the page
1525 // 3- the criteria for including horizontal lines is very generous. Many times
1526 // horizontal lines separating headers and footers get merged with a
1527 // single-column table in a multi-column page thereby including text
1528 // from the neighboring column inside the table
1529 // 4- the criteria for including left out column headers also tends to
1530 // occasionally include text-lines above the tables, typically from
1531 // table caption
1532 void TableFinder::AdjustTableBoundaries() {
1533 // Iterate the table regions in the grid
1534 ColSegment_CLIST adjusted_tables;
1535 ColSegment_C_IT it(&adjusted_tables);
1536 ColSegmentGridSearch gsearch(&table_grid_);
1537 gsearch.StartFullSearch();
1538 ColSegment *table = nullptr;
1539 while ((table = gsearch.NextFullSearch()) != nullptr) {
1540 const TBOX &table_box = table->bounding_box();
1541 TBOX grown_box = table_box;
1542 GrowTableBox(table_box, &grown_box);
1543 // To prevent a table from expanding again, do not insert the
1544 // modified box back to the grid. Instead move it to a list and
1545 // and remove it from the grid. The list is moved later back to the grid.
1546 if (!grown_box.null_box()) {
1547 auto *col = new ColSegment();
1548 col->InsertBox(grown_box);
1549 it.add_after_then_move(col);
1550 }
1551 gsearch.RemoveBBox();
1552 delete table;
1553 }
1554 // clear table grid to move final tables in it
1555 // TODO(nbeato): table_grid_ should already be empty. The above loop
1556 // removed everything. Maybe just assert it is empty?
1557 table_grid_.Clear();
1558 it.move_to_first();
1559 // move back final tables to table_grid_
1560 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1561 ColSegment *seg = it.extract();
1562 table_grid_.InsertBBox(true, true, seg);
1563 }
1564 }
1565
1566 void TableFinder::GrowTableBox(const TBOX &table_box, TBOX *result_box) {
1567 // TODO(nbeato): The growing code is a bit excessive right now.
1568 // By removing these lines, the partitions considered need
1569 // to have some overlap or be special cases. These lines could
1570 // be added again once a check is put in place to make sure that
1571 // growing tables don't stomp on a lot of non-table partitions.
1572
1573 // search for horizontal ruling lines within the vertical margin
1574 // int vertical_margin = kRulingVerticalMargin * gridsize();
1575 TBOX search_box = table_box;
1576 // int top = MIN(search_box.top() + vertical_margin, tright().y());
1577 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1578 // search_box.set_top(top);
1579 // search_box.set_bottom(bottom);
1580
1581 GrowTableToIncludePartials(table_box, search_box, result_box);
1582 GrowTableToIncludeLines(table_box, search_box, result_box);
1583 IncludeLeftOutColumnHeaders(result_box);
1584 }
1585
1586 // Grow a table by increasing the size of the box to include
1587 // partitions with significant overlap with the table.
1588 void TableFinder::GrowTableToIncludePartials(const TBOX &table_box,
1589 const TBOX &search_range,
1590 TBOX *result_box) {
1591 // Rulings are in a different grid, so search 2 grids for rulings, text,
1592 // and table partitions that are not entirely within the new box.
1593 for (int i = 0; i < 2; ++i) {
1594 ColPartitionGrid *grid =
1595 (i == 0) ? &fragmented_text_grid_ : &leader_and_ruling_grid_;
1596 ColPartitionGridSearch rectsearch(grid);
1597 rectsearch.StartRectSearch(search_range);
1598 ColPartition *part = nullptr;
1599 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1600 // Only include text and table types.
1601 if (part->IsImageType()) {
1602 continue;
1603 }
1604 const TBOX &part_box = part->bounding_box();
1605 // Include partition in the table if more than half of it
1606 // is covered by the table
1607 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1608 *result_box = result_box->bounding_union(part_box);
1609 continue;
1610 }
1611 }
1612 }
1613 }
1614
1615 // Grow a table by expanding to the extents of significantly
1616 // overlapping lines.
1617 void TableFinder::GrowTableToIncludeLines(const TBOX &table_box,
1618 const TBOX &search_range,
1619 TBOX *result_box) {
1620 ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
1621 rsearch.SetUniqueMode(true);
1622 rsearch.StartRectSearch(search_range);
1623 ColPartition *part = nullptr;
1624 while ((part = rsearch.NextRectSearch()) != nullptr) {
1625 // TODO(nbeato) This should also do vertical, but column
1626 // boundaries are breaking things. This function needs to be
1627 // updated to allow vertical lines as well.
1628 if (!part->IsLineType()) {
1629 continue;
1630 }
1631 // Avoid the following function call if the result of the
1632 // function is irrelevant.
1633 const TBOX &part_box = part->bounding_box();
1634 if (result_box->contains(part_box)) {
1635 continue;
1636 }
1637 // Include a partially overlapping horizontal line only if the
1638 // extra ColPartitions that will be included due to expansion
1639 // have large side spacing w.r.t. columns containing them.
1640 if (HLineBelongsToTable(*part, table_box)) {
1641 *result_box = result_box->bounding_union(part_box);
1642 }
1643 // TODO(nbeato): Vertical
1644 }
1645 }
1646
1647 // Checks whether the horizontal line belong to the table by looking at the
1648 // side spacing of extra ColPartitions that will be included in the table
1649 // due to expansion
1650 bool TableFinder::HLineBelongsToTable(const ColPartition &part,
1651 const TBOX &table_box) {
1652 if (!part.IsHorizontalLine()) {
1653 return false;
1654 }
1655 const TBOX &part_box = part.bounding_box();
1656 if (!part_box.major_x_overlap(table_box)) {
1657 return false;
1658 }
1659 // Do not consider top-most horizontal line since it usually
1660 // originates from noise.
1661 // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1662 // have neighbors solved.
1663 // if (!part.nearest_neighbor_above())
1664 // return false;
1665 const TBOX bbox = part_box.bounding_union(table_box);
1666 // In the "unioned table" box (the table extents expanded by the line),
1667 // keep track of how many partitions have significant padding to the left
1668 // and right. If more than half of the partitions covered by the new table
1669 // have significant spacing, the line belongs to the table and the table
1670 // grows to include all of the partitions.
1671 int num_extra_partitions = 0;
1672 int extra_space_to_right = 0;
1673 int extra_space_to_left = 0;
1674 // Rulings are in a different grid, so search 2 grids for rulings, text,
1675 // and table partitions that are introduced by the new box.
1676 for (int i = 0; i < 2; ++i) {
1677 ColPartitionGrid *grid =
1678 (i == 0) ? &clean_part_grid_ : &leader_and_ruling_grid_;
1679 // Start a rect search on bbox
1680 ColPartitionGridSearch rectsearch(grid);
1681 rectsearch.SetUniqueMode(true);
1682 rectsearch.StartRectSearch(bbox);
1683 ColPartition *extra_part = nullptr;
1684 while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1685 // ColPartition already in table
1686 const TBOX &extra_part_box = extra_part->bounding_box();
1687 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1688 continue;
1689 }
1690 // Non-text ColPartitions do not contribute
1691 if (extra_part->IsImageType()) {
1692 continue;
1693 }
1694 // Consider this partition.
1695 num_extra_partitions++;
1696 // presence of a table cell is a strong hint, so just increment the scores
1697 // without looking at the spacing.
1698 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1699 extra_space_to_right++;
1700 extra_space_to_left++;
1701 continue;
1702 }
1703 int space_threshold = kSideSpaceMargin * part.median_height();
1704 if (extra_part->space_to_right() > space_threshold) {
1705 extra_space_to_right++;
1706 }
1707 if (extra_part->space_to_left() > space_threshold) {
1708 extra_space_to_left++;
1709 }
1710 }
1711 }
1712 // tprintf("%d %d %d\n",
1713 // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1714 return (extra_space_to_right > num_extra_partitions / 2) ||
1715 (extra_space_to_left > num_extra_partitions / 2);
1716 }
1717
1718 // Look for isolated column headers above the given table box and
1719 // include them in the table
1720 void TableFinder::IncludeLeftOutColumnHeaders(TBOX *table_box) {
1721 // Start a search above the current table to look for column headers
1722 ColPartitionGridSearch vsearch(&clean_part_grid_);
1723 vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1724 table_box->top());
1725 ColPartition *neighbor = nullptr;
1726 ColPartition *previous_neighbor = nullptr;
1727 while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1728 // Max distance to find a table heading.
1729 const int max_distance =
1730 kMaxColumnHeaderDistance * neighbor->median_height();
1731 int table_top = table_box->top();
1732 const TBOX &box = neighbor->bounding_box();
1733 // Do not continue if the next box is way above
1734 if (box.bottom() - table_top > max_distance) {
1735 break;
1736 }
1737 // Unconditionally include partitions of type TABLE or LINE
1738 // TODO(faisal): add some reasonable conditions here
1739 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1740 table_box->set_top(box.top());
1741 previous_neighbor = nullptr;
1742 continue;
1743 }
1744 // If there are two text partitions, one above the other, without a table
1745 // cell on their left or right side, consider them a barrier and quit
1746 if (previous_neighbor == nullptr) {
1747 previous_neighbor = neighbor;
1748 } else {
1749 const TBOX &previous_box = previous_neighbor->bounding_box();
1750 if (!box.major_y_overlap(previous_box)) {
1751 break;
1752 }
1753 }
1754 }
1755 }
1756
1757 // Remove false alarms consisting of a single column based on their
1758 // projection on the x-axis. Projection of a real table on the x-axis
1759 // should have at least one zero-valley larger than the global median
1760 // x-height of the page.
1761 void TableFinder::DeleteSingleColumnTables() {
1762 int page_width = tright().x() - bleft().x();
1763 ASSERT_HOST(page_width > 0);
1764 // create an integer array to hold projection on x-axis
1765 int *table_xprojection = new int[page_width];
1766 // Iterate through all tables in the table grid
1767 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
1768 &table_grid_);
1769 table_search.StartFullSearch();
1770 ColSegment *table;
1771 while ((table = table_search.NextFullSearch()) != nullptr) {
1772 TBOX table_box = table->bounding_box();
1773 // reset the projection array
1774 for (int i = 0; i < page_width; i++) {
1775 table_xprojection[i] = 0;
1776 }
1777 // Start a rect search on table_box
1778 ColPartitionGridSearch rectsearch(&clean_part_grid_);
1779 rectsearch.SetUniqueMode(true);
1780 rectsearch.StartRectSearch(table_box);
1781 ColPartition *part;
1782 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1783 if (!part->IsTextType()) {
1784 continue; // Do not consider non-text partitions
1785 }
1786 if (part->flow() == BTFT_LEADER) {
1787 continue; // Assume leaders are in tables
1788 }
1789 TBOX part_box = part->bounding_box();
1790 // Do not consider partitions partially covered by the table
1791 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) {
1792 continue;
1793 }
1794 BLOBNBOX_CLIST *part_boxes = part->boxes();
1795 BLOBNBOX_C_IT pit(part_boxes);
1796
1797 // Make sure overlapping blobs don't artificially inflate the number
1798 // of rows in the table. This happens frequently with things such as
1799 // decimals and split characters. Do this by assuming the column
1800 // partition is sorted mostly left to right and just clip
1801 // bounding boxes by the previous box's extent.
1802 int next_position_to_write = 0;
1803
1804 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1805 BLOBNBOX *pblob = pit.data();
1806 // ignore blob height for the purpose of projection since we
1807 // are only interested in finding valleys
1808 int xstart = pblob->bounding_box().left();
1809 int xend = pblob->bounding_box().right();
1810
1811 xstart = std::max(xstart, next_position_to_write);
1812 for (int i = xstart; i < xend; i++) {
1813 table_xprojection[i - bleft().x()]++;
1814 }
1815 next_position_to_write = xend;
1816 }
1817 }
1818 // Find largest valley between two reasonable peaks in the table
1819 if (!GapInXProjection(table_xprojection, page_width)) {
1820 table_search.RemoveBBox();
1821 delete table;
1822 }
1823 }
1824 delete[] table_xprojection;
1825 }
1826
1827 // Return true if at least one gap larger than the global x-height
1828 // exists in the horizontal projection
1829 bool TableFinder::GapInXProjection(int *xprojection, int length) {
1830 // Find peak value of the histogram
1831 int peak_value = 0;
1832 for (int i = 0; i < length; i++) {
1833 if (xprojection[i] > peak_value) {
1834 peak_value = xprojection[i];
1835 }
1836 }
1837 // Peak value represents the maximum number of horizontally
1838 // overlapping colpartitions, so this can be considered as the
1839 // number of rows in the table
1840 if (peak_value < kMinRowsInTable) {
1841 return false;
1842 }
1843 double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1844 if (peak_value >= kLargeTableRowCount) {
1845 projection_threshold = kLargeTableProjectionThreshold * peak_value;
1846 }
1847 // Threshold the histogram
1848 for (int i = 0; i < length; i++) {
1849 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1850 }
1851 // Find the largest run of zeros between two ones
1852 int largest_gap = 0;
1853 int run_start = -1;
1854 for (int i = 1; i < length; i++) {
1855 // detect start of a run of zeros
1856 if (xprojection[i - 1] && !xprojection[i]) {
1857 run_start = i;
1858 }
1859 // detect end of a run of zeros and update the value of largest gap
1860 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1861 int gap = i - run_start;
1862 if (gap > largest_gap) {
1863 largest_gap = gap;
1864 }
1865 run_start = -1;
1866 }
1867 }
1868 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1869 }
1870
1871 // Given the location of a table "guess", try to overlay a cellular
1872 // grid in the location, adjusting the boundaries.
1873 // TODO(nbeato): Falsely introduces:
1874 // -headers/footers (not any worse, too much overlap destroys cells)
1875 // -page numbers (not worse, included because maximize margins)
1876 // -equations (nicely fit into a celluar grid, but more sparsely)
1877 // -figures (random text box, also sparse)
1878 // -small left-aligned text areas with overlapping positioned whitespace
1879 // (rejected before)
1880 // Overall, this just needs some more work.
1881 void TableFinder::RecognizeTables() {
1882 #ifndef GRAPHICS_DISABLED
1883 ScrollView *table_win = nullptr;
1884 if (textord_show_tables) {
1885 table_win = MakeWindow(0, 0, "Table Structure");
1886 DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE,
1887 ScrollView::LIGHT_BLUE);
1888 // table_grid_.DisplayBoxes(table_win);
1889 }
1890 #endif
1891
1892 TableRecognizer recognizer;
1893 recognizer.Init();
1894 recognizer.set_line_grid(&leader_and_ruling_grid_);
1895 recognizer.set_text_grid(&fragmented_text_grid_);
1896 recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1897 recognizer.set_min_height(1.5 * gridheight());
1898 // Loop over all of the tables and try to fit them.
1899 // Store the good tables here.
1900 ColSegment_CLIST good_tables;
1901 ColSegment_C_IT good_it(&good_tables);
1902
1903 ColSegmentGridSearch gsearch(&table_grid_);
1904 gsearch.StartFullSearch();
1905 ColSegment *found_table = nullptr;
1906 while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1907 gsearch.RemoveBBox();
1908
1909 // The goal is to make the tables persistent in a list.
1910 // When that happens, this will move into the search loop.
1911 const TBOX &found_box = found_table->bounding_box();
1912 StructuredTable *table_structure = recognizer.RecognizeTable(found_box);
1913
1914 // Process a table. Good tables are inserted into the grid again later on
1915 // We can't change boxes in the grid while it is running a search.
1916 if (table_structure != nullptr) {
1917 #ifndef GRAPHICS_DISABLED
1918 if (textord_show_tables) {
1919 table_structure->Display(table_win, ScrollView::LIME_GREEN);
1920 }
1921 #endif
1922 found_table->set_bounding_box(table_structure->bounding_box());
1923 delete table_structure;
1924 good_it.add_after_then_move(found_table);
1925 } else {
1926 delete found_table;
1927 }
1928 }
1929 // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1930
1931 // At this point, the grid is empty. We can safely insert the good tables
1932 // back into grid.
1933 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
1934 table_grid_.InsertBBox(true, true, good_it.extract());
1935 }
1936 }
1937
1938 #ifndef GRAPHICS_DISABLED
1939
1940 // Displays the column segments in some window.
1941 void TableFinder::DisplayColSegments(ScrollView *win, ColSegment_LIST *segments,
1942 ScrollView::Color color) {
1943 win->Pen(color);
1944 win->Brush(ScrollView::NONE);
1945 ColSegment_IT it(segments);
1946 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1947 ColSegment *col = it.data();
1948 const TBOX &box = col->bounding_box();
1949 int left_x = box.left();
1950 int right_x = box.right();
1951 int top_y = box.top();
1952 int bottom_y = box.bottom();
1953 win->Rectangle(left_x, bottom_y, right_x, top_y);
1954 }
1955 win->UpdateWindow();
1956 }
1957
1958 // Displays the colpartitions using a new coloring on an existing window.
1959 // Note: This method is only for debug purpose during development and
1960 // would not be part of checked in code
1961 void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,
1962 ScrollView::Color default_color,
1963 ScrollView::Color table_color) {
1964 ScrollView::Color color = default_color;
1965 // Iterate the ColPartitions in the grid.
1966 ColPartitionGridSearch gsearch(grid);
1967 gsearch.StartFullSearch();
1968 ColPartition *part = nullptr;
1969 while ((part = gsearch.NextFullSearch()) != nullptr) {
1970 color = default_color;
1971 if (part->type() == PT_TABLE) {
1972 color = table_color;
1973 }
1974
1975 const TBOX &box = part->bounding_box();
1976 int left_x = box.left();
1977 int right_x = box.right();
1978 int top_y = box.top();
1979 int bottom_y = box.bottom();
1980 win->Brush(ScrollView::NONE);
1981 win->Pen(color);
1982 win->Rectangle(left_x, bottom_y, right_x, top_y);
1983 }
1984 win->UpdateWindow();
1985 }
1986
1987 void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,
1988 ScrollView::Color default_color) {
1989 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1990 }
1991
1992 void TableFinder::DisplayColPartitionConnections(ScrollView *win,
1993 ColPartitionGrid *grid,
1994 ScrollView::Color color) {
1995 // Iterate the ColPartitions in the grid.
1996 ColPartitionGridSearch gsearch(grid);
1997 gsearch.StartFullSearch();
1998 ColPartition *part = nullptr;
1999 while ((part = gsearch.NextFullSearch()) != nullptr) {
2000 const TBOX &box = part->bounding_box();
2001 int left_x = box.left();
2002 int right_x = box.right();
2003 int top_y = box.top();
2004 int bottom_y = box.bottom();
2005
2006 ColPartition *upper_part = part->nearest_neighbor_above();
2007 if (upper_part) {
2008 const TBOX &upper_box = upper_part->bounding_box();
2009 int mid_x = (left_x + right_x) / 2;
2010 int mid_y = (top_y + bottom_y) / 2;
2011 int other_x = (upper_box.left() + upper_box.right()) / 2;
2012 int other_y = (upper_box.top() + upper_box.bottom()) / 2;
2013 win->Brush(ScrollView::NONE);
2014 win->Pen(color);
2015 win->Line(mid_x, mid_y, other_x, other_y);
2016 }
2017 ColPartition *lower_part = part->nearest_neighbor_below();
2018 if (lower_part) {
2019 const TBOX &lower_box = lower_part->bounding_box();
2020 int mid_x = (left_x + right_x) / 2;
2021 int mid_y = (top_y + bottom_y) / 2;
2022 int other_x = (lower_box.left() + lower_box.right()) / 2;
2023 int other_y = (lower_box.top() + lower_box.bottom()) / 2;
2024 win->Brush(ScrollView::NONE);
2025 win->Pen(color);
2026 win->Line(mid_x, mid_y, other_x, other_y);
2027 }
2028 }
2029 win->UpdateWindow();
2030 }
2031
2032 #endif
2033
2034 // Merge all colpartitions in table regions to make them a single
2035 // colpartition and revert types of isolated table cells not
2036 // assigned to any table to their original types.
2037 void TableFinder::MakeTableBlocks(ColPartitionGrid *grid,
2038 ColPartitionSet **all_columns,
2039 const WidthCallback &width_cb) {
2040 // Since we have table blocks already, remove table tags from all
2041 // colpartitions
2042 ColPartitionGridSearch gsearch(grid);
2043 gsearch.StartFullSearch();
2044 ColPartition *part = nullptr;
2045
2046 while ((part = gsearch.NextFullSearch()) != nullptr) {
2047 if (part->type() == PT_TABLE) {
2048 part->clear_table_type();
2049 }
2050 }
2051 // Now make a single colpartition out of each table block and remove
2052 // all colpartitions contained within a table
2053 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
2054 &table_grid_);
2055 table_search.StartFullSearch();
2056 ColSegment *table;
2057 while ((table = table_search.NextFullSearch()) != nullptr) {
2058 const TBOX &table_box = table->bounding_box();
2059 // Start a rect search on table_box
2060 ColPartitionGridSearch rectsearch(grid);
2061 rectsearch.StartRectSearch(table_box);
2062 ColPartition *part;
2063 ColPartition *table_partition = nullptr;
2064 while ((part = rectsearch.NextRectSearch()) != nullptr) {
2065 // Do not consider image partitions
2066 if (!part->IsTextType()) {
2067 continue;
2068 }
2069 TBOX part_box = part->bounding_box();
2070 // Include partition in the table if more than half of it
2071 // is covered by the table
2072 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2073 rectsearch.RemoveBBox();
2074 if (table_partition) {
2075 table_partition->Absorb(part, width_cb);
2076 } else {
2077 table_partition = part;
2078 }
2079 }
2080 }
2081 // Insert table colpartition back to part_grid_
2082 if (table_partition) {
2083 // To match the columns used when transforming to blocks, the new table
2084 // partition must have its first and last column set at the grid y that
2085 // corresponds to its bottom.
2086 const TBOX &table_box = table_partition->bounding_box();
2087 int grid_x, grid_y;
2088 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2089 table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2090 table_partition->set_table_type();
2091 table_partition->set_blob_type(BRT_TEXT);
2092 table_partition->set_flow(BTFT_CHAIN);
2093 table_partition->SetBlobTypes();
2094 grid->InsertBBox(true, true, table_partition);
2095 }
2096 }
2097 }
2098
2099 //////// ColSegment code
2100 ////////
2101 ColSegment::ColSegment()
2102 : ELIST_LINK(),
2103 num_table_cells_(0),
2104 num_text_cells_(0),
2105 type_(COL_UNKNOWN) {}
2106
2107 // Provides a color for BBGrid to draw the rectangle.
2108 ScrollView::Color ColSegment::BoxColor() const {
2109 const ScrollView::Color kBoxColors[PT_COUNT] = {
2110 ScrollView::YELLOW,
2111 ScrollView::BLUE,
2112 ScrollView::YELLOW,
2113 ScrollView::MAGENTA,
2114 };
2115 return kBoxColors[type_];
2116 }
2117
2118 // Insert a box into this column segment
2119 void ColSegment::InsertBox(const TBOX &other) {
2120 bounding_box_ = bounding_box_.bounding_union(other);
2121 }
2122
2123 // Set column segment type based on the ratio of text and table partitions
2124 // in it.
2125 void ColSegment::set_type() {
2126 if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) {
2127 type_ = COL_TABLE;
2128 } else if (num_text_cells_ > num_table_cells_) {
2129 type_ = COL_TEXT;
2130 } else {
2131 type_ = COL_MIXED;
2132 }
2133 }
2134
2135 } // namespace tesseract.