comparison mupdf-source/thirdparty/tesseract/src/textord/tablerecog.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: tablerecog.cpp
3 // Description: Helper class to help structure table areas. Given an bounding
4 // box from TableFinder, the TableRecognizer should give a
5 // StructuredTable (maybe a list in the future) of "good" tables
6 // in that area.
7 // Author: Nicholas Beato
8 //
9 // (C) Copyright 2009, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
20 ///////////////////////////////////////////////////////////////////////
21
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h"
24 #endif
25
26 #include "tablerecog.h"
27
28 #include <algorithm>
29
30 namespace tesseract {
31
32 // The amount of space required between the ColPartitions in 2 columns
33 // of a non-lined table as a multiple of the median width.
34 const double kHorizontalSpacing = 0.30;
35 // The amount of space required between the ColPartitions in 2 rows
36 // of a non-lined table as multiples of the median height.
37 const double kVerticalSpacing = -0.2;
38 // The number of cells that the grid lines may intersect.
39 // See FindCellSplitLocations for explanation.
40 const int kCellSplitRowThreshold = 0;
41 const int kCellSplitColumnThreshold = 0;
42 // For "lined tables", the number of required lines. Currently a guess.
43 const int kLinedTableMinVerticalLines = 3;
44 const int kLinedTableMinHorizontalLines = 3;
45 // Number of columns required, as a fraction of the most columns found.
46 // None of these are tweaked at all.
47 const double kRequiredColumns = 0.7;
48 // The tolerance for comparing margins of potential tables.
49 const double kMarginFactor = 1.1;
50 // The first and last row should be consistent cell height.
51 // This factor is the first and last row cell height max.
52 const double kMaxRowSize = 2.5;
53 // Number of filled columns required to form a strong table row.
54 // For small tables, this is an absolute number.
55 const double kGoodRowNumberOfColumnsSmall[] = {2, 2, 2, 2, 2, 3, 3};
56 // For large tables, it is a relative number
57 const double kGoodRowNumberOfColumnsLarge = 0.7;
58 // The amount of area that must be covered in a cell by ColPartitions to
59 // be considered "filled"
60 const double kMinFilledArea = 0.35;
61
62 // Indicates that a table row is weak. This means that it has
63 // many missing data cells or very large cell heights compared.
64 // to the rest of the table.
65 // Code is buggy right now. It is disabled in the calling function.
66 // It seems like sometimes the row that is passed in is not correct
67 // sometimes (like a phantom row is introduced). There's something going
68 // on in the cell_y_ data member before this is called... not certain.
69 static bool IsWeakTableRow(StructuredTable *table, int row) {
70 if (!table->VerifyRowFilled(row)) {
71 return false;
72 }
73
74 double threshold;
75 if (table->column_count() < countof(kGoodRowNumberOfColumnsSmall)) {
76 threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
77 } else {
78 threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
79 }
80
81 return table->CountFilledCellsInRow(row) < threshold;
82 }
83
84 ////////
85 //////// StructuredTable Class
86 ////////
87
88 StructuredTable::StructuredTable()
89 : text_grid_(nullptr)
90 , line_grid_(nullptr)
91 , is_lined_(false)
92 , space_above_(0)
93 , space_below_(0)
94 , space_left_(0)
95 , space_right_(0)
96 , median_cell_height_(0)
97 , median_cell_width_(0)
98 , max_text_height_(INT32_MAX) {}
99
100 void StructuredTable::Init() {}
101
102 void StructuredTable::set_text_grid(ColPartitionGrid *text_grid) {
103 text_grid_ = text_grid;
104 }
105 void StructuredTable::set_line_grid(ColPartitionGrid *line_grid) {
106 line_grid_ = line_grid;
107 }
108 void StructuredTable::set_max_text_height(int height) {
109 max_text_height_ = height;
110 }
111 bool StructuredTable::is_lined() const {
112 return is_lined_;
113 }
114 unsigned StructuredTable::row_count() const {
115 return cell_y_.empty() ? 0 : cell_y_.size() - 1;
116 }
117 unsigned StructuredTable::column_count() const {
118 return cell_x_.empty() ? 0 : cell_x_.size() - 1;
119 }
120 unsigned StructuredTable::cell_count() const {
121 return row_count() * column_count();
122 }
123 void StructuredTable::set_bounding_box(const TBOX &box) {
124 bounding_box_ = box;
125 }
126 const TBOX &StructuredTable::bounding_box() const {
127 return bounding_box_;
128 }
129 int StructuredTable::median_cell_height() {
130 return median_cell_height_;
131 }
132 int StructuredTable::median_cell_width() {
133 return median_cell_width_;
134 }
135 int StructuredTable::row_height(unsigned row) const {
136 ASSERT_HOST(row < row_count());
137 return cell_y_[row + 1] - cell_y_[row];
138 }
139 int StructuredTable::column_width(unsigned column) const {
140 ASSERT_HOST(column < column_count());
141 return cell_x_[column + 1] - cell_x_[column];
142 }
143 int StructuredTable::space_above() const {
144 return space_above_;
145 }
146 int StructuredTable::space_below() const {
147 return space_below_;
148 }
149
150 // At this point, we know that the lines are contained
151 // by the box (by FindLinesBoundingBox).
152 // So try to find the cell structure and make sure it works out.
153 // The assumption is that all lines span the table. If this
154 // assumption fails, the VerifyLinedTable method will
155 // abort the lined table. The TableRecognizer will fall
156 // back on FindWhitespacedStructure.
157 bool StructuredTable::FindLinedStructure() {
158 ClearStructure();
159
160 // Search for all of the lines in the current box.
161 // Update the cellular structure with the exact lines.
162 ColPartitionGridSearch box_search(line_grid_);
163 box_search.SetUniqueMode(true);
164 box_search.StartRectSearch(bounding_box_);
165 ColPartition *line = nullptr;
166
167 while ((line = box_search.NextRectSearch()) != nullptr) {
168 if (line->IsHorizontalLine()) {
169 cell_y_.push_back(line->MidY());
170 }
171 if (line->IsVerticalLine()) {
172 cell_x_.push_back(line->MidX());
173 }
174 }
175
176 // HasSignificantLines should guarantee cells.
177 // Because that code is a different class, just gracefully
178 // return false. This could be an assert.
179 if (cell_x_.size() < 3 || cell_y_.size() < 3) {
180 return false;
181 }
182
183 // Sort and remove duplicates that may have occurred due to split lines.
184 std::sort(cell_x_.begin(), cell_x_.end());
185 auto last_x = std::unique(cell_x_.begin(), cell_x_.end());
186 cell_x_.erase(last_x, cell_x_.end());
187 std::sort(cell_y_.begin(), cell_y_.end());
188 auto last_y = std::unique(cell_y_.begin(), cell_y_.end());
189 cell_y_.erase(last_y, cell_y_.end());
190
191 // The border should be the extents of line boxes, not middle.
192 cell_x_[0] = bounding_box_.left();
193 cell_x_[cell_x_.size() - 1] = bounding_box_.right();
194 cell_y_[0] = bounding_box_.bottom();
195 cell_y_[cell_y_.size() - 1] = bounding_box_.top();
196
197 // Remove duplicates that may have occurred due to moving the borders.
198 last_x = std::unique(cell_x_.begin(), cell_x_.end());
199 cell_x_.erase(last_x, cell_x_.end());
200 last_y = std::unique(cell_y_.begin(), cell_y_.end());
201 cell_y_.erase(last_y, cell_y_.end());
202
203 CalculateMargins();
204 CalculateStats();
205 is_lined_ = VerifyLinedTableCells();
206 return is_lined_;
207 }
208
209 // Finds the cellular structure given a particular box.
210 bool StructuredTable::FindWhitespacedStructure() {
211 ClearStructure();
212 FindWhitespacedColumns();
213 FindWhitespacedRows();
214
215 if (!VerifyWhitespacedTable()) {
216 return false;
217 } else {
218 bounding_box_.set_left(cell_x_[0]);
219 bounding_box_.set_right(cell_x_[cell_x_.size() - 1]);
220 bounding_box_.set_bottom(cell_y_[0]);
221 bounding_box_.set_top(cell_y_[cell_y_.size() - 1]);
222 AbsorbNearbyLines();
223 CalculateMargins();
224 CalculateStats();
225 return true;
226 }
227 }
228
229 // Tests if a partition fits inside the table structure.
230 // Partitions must fully span a grid line in order to intersect it.
231 // This means that a partition does not intersect a line
232 // that it "just" touches. This is mainly because the assumption
233 // throughout the code is that "0" distance is a very very small space.
234 bool StructuredTable::DoesPartitionFit(const ColPartition &part) const {
235 const TBOX &box = part.bounding_box();
236 for (int i : cell_x_) {
237 if (box.left() < i && i < box.right()) {
238 return false;
239 }
240 }
241 for (int i : cell_y_) {
242 if (box.bottom() < i && i < box.top()) {
243 return false;
244 }
245 }
246 return true;
247 }
248
249 // Checks if a sub-table has multiple data cells filled.
250 int StructuredTable::CountFilledCells() {
251 return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
252 }
253 int StructuredTable::CountFilledCellsInRow(int row) {
254 return CountFilledCells(row, row, 0, column_count() - 1);
255 }
256 int StructuredTable::CountFilledCellsInColumn(int column) {
257 return CountFilledCells(0, row_count() - 1, column, column);
258 }
259 int StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start,
260 unsigned column_end) {
261 ASSERT_HOST(row_start <= row_end && row_end < row_count());
262 ASSERT_HOST(column_start <= column_end && column_end < column_count());
263 int cell_count = 0;
264 TBOX cell_box;
265 for (unsigned row = row_start; row <= row_end; ++row) {
266 cell_box.set_bottom(cell_y_[row]);
267 cell_box.set_top(cell_y_[row + 1]);
268 for (unsigned col = column_start; col <= column_end; ++col) {
269 cell_box.set_left(cell_x_[col]);
270 cell_box.set_right(cell_x_[col + 1]);
271 if (CountPartitions(cell_box) > 0) {
272 ++cell_count;
273 }
274 }
275 }
276 return cell_count;
277 }
278
279 // Makes sure that at least one cell in a row has substantial area filled.
280 // This can filter out large whitespace caused by growing tables too far
281 // and page numbers.
282 bool StructuredTable::VerifyRowFilled(int row) {
283 for (unsigned i = 0; i < column_count(); ++i) {
284 auto area_filled = CalculateCellFilledPercentage(row, i);
285 if (area_filled >= kMinFilledArea) {
286 return true;
287 }
288 }
289 return false;
290 }
291
292 // Finds the filled area in a cell.
293 // Assume ColPartitions do not overlap for simplicity (even though they do).
294 double StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) {
295 ASSERT_HOST(row <= row_count());
296 ASSERT_HOST(column <= column_count());
297 const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]);
298 ASSERT_HOST(!kCellBox.null_box());
299
300 ColPartitionGridSearch gsearch(text_grid_);
301 gsearch.SetUniqueMode(true);
302 gsearch.StartRectSearch(kCellBox);
303 double area_covered = 0;
304 ColPartition *text = nullptr;
305 while ((text = gsearch.NextRectSearch()) != nullptr) {
306 if (text->IsTextType()) {
307 area_covered += text->bounding_box().intersection(kCellBox).area();
308 }
309 }
310 const int32_t current_area = kCellBox.area();
311 if (current_area == 0) {
312 return 1.0;
313 }
314 return std::min(1.0, area_covered / current_area);
315 }
316
317 #ifndef GRAPHICS_DISABLED
318
319 void StructuredTable::Display(ScrollView *window, ScrollView::Color color) {
320 window->Brush(ScrollView::NONE);
321 window->Pen(color);
322 window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(),
323 bounding_box_.top());
324 for (int i : cell_x_) {
325 window->Line(i, bounding_box_.bottom(), i, bounding_box_.top());
326 }
327 for (int i : cell_y_) {
328 window->Line(bounding_box_.left(), i, bounding_box_.right(), i);
329 }
330 window->UpdateWindow();
331 }
332
333 #endif
334
335 // Clear structure information.
336 void StructuredTable::ClearStructure() {
337 cell_x_.clear();
338 cell_y_.clear();
339 is_lined_ = false;
340 space_above_ = 0;
341 space_below_ = 0;
342 space_left_ = 0;
343 space_right_ = 0;
344 median_cell_height_ = 0;
345 median_cell_width_ = 0;
346 }
347
348 // When a table has lines, the lines should not intersect any partitions.
349 // The following function makes sure the previous assumption is met.
350 bool StructuredTable::VerifyLinedTableCells() {
351 // Function only called when lines exist.
352 ASSERT_HOST(cell_y_.size() >= 2 && cell_x_.size() >= 2);
353 for (int i : cell_y_) {
354 if (CountHorizontalIntersections(i) > 0) {
355 return false;
356 }
357 }
358 for (int i : cell_x_) {
359 if (CountVerticalIntersections(i) > 0) {
360 return false;
361 }
362 }
363 return true;
364 }
365
366 // TODO(nbeato): Could be much better than this.
367 // Examples:
368 // - Calculate the percentage of filled cells.
369 // - Calculate the average number of ColPartitions per cell.
370 // - Calculate the number of cells per row with partitions.
371 // - Check if ColPartitions in adjacent cells are similar.
372 // - Check that all columns are at least a certain width.
373 // - etc.
374 bool StructuredTable::VerifyWhitespacedTable() {
375 // criteria for a table, must be at least 2x3 or 3x2
376 return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
377 }
378
379 // Finds vertical splits in the ColPartitions of text_grid_ by considering
380 // all possible "good" guesses. A good guess is just the left/right sides of
381 // the partitions, since these locations will uniquely define where the
382 // extremal values where the splits can occur. The split happens
383 // in the middle of the two nearest partitions.
384 void StructuredTable::FindWhitespacedColumns() {
385 // Set of the extents of all partitions on the page.
386 std::vector<int> left_sides;
387 std::vector<int> right_sides;
388
389 // Look at each text partition. We want to find the partitions
390 // that have extremal left/right sides. These will give us a basis
391 // for the table columns.
392 ColPartitionGridSearch gsearch(text_grid_);
393 gsearch.SetUniqueMode(true);
394 gsearch.StartRectSearch(bounding_box_);
395 ColPartition *text = nullptr;
396 while ((text = gsearch.NextRectSearch()) != nullptr) {
397 if (!text->IsTextType()) {
398 continue;
399 }
400
401 ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
402 int spacing = static_cast<int>(text->median_width() * kHorizontalSpacing / 2.0 + 0.5);
403 left_sides.push_back(text->bounding_box().left() - spacing);
404 right_sides.push_back(text->bounding_box().right() + spacing);
405 }
406 // It causes disaster below, so avoid it!
407 if (left_sides.empty() || right_sides.empty()) {
408 return;
409 }
410
411 // Since data may be inserted in grid order, we sort the left/right sides.
412 std::sort(left_sides.begin(), left_sides.end());
413 std::sort(right_sides.begin(), right_sides.end());
414
415 // At this point, in the "merged list", we expect to have a left side,
416 // followed by either more left sides or a right side. The last number
417 // should be a right side. We find places where the splits occur by looking
418 // for "valleys". If we want to force gap sizes or allow overlap, change
419 // the spacing above. If you want to let lines "slice" partitions as long
420 // as it is infrequent, change the following function.
421 FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, &cell_x_);
422 }
423
424 // Finds horizontal splits in the ColPartitions of text_grid_ by considering
425 // all possible "good" guesses. A good guess is just the bottom/top sides of
426 // the partitions, since these locations will uniquely define where the
427 // extremal values where the splits can occur. The split happens
428 // in the middle of the two nearest partitions.
429 void StructuredTable::FindWhitespacedRows() {
430 // Set of the extents of all partitions on the page.
431 std::vector<int> bottom_sides;
432 std::vector<int> top_sides;
433 // We will be "shrinking" partitions, so keep the min/max around to
434 // make sure the bottom/top lines do not intersect text.
435 int min_bottom = INT32_MAX;
436 int max_top = INT32_MIN;
437
438 // Look at each text partition. We want to find the partitions
439 // that have extremal bottom/top sides. These will give us a basis
440 // for the table rows. Because the textlines can be skewed and close due
441 // to warping, the height of the partitions is toned down a little bit.
442 ColPartitionGridSearch gsearch(text_grid_);
443 gsearch.SetUniqueMode(true);
444 gsearch.StartRectSearch(bounding_box_);
445 ColPartition *text = nullptr;
446 while ((text = gsearch.NextRectSearch()) != nullptr) {
447 if (!text->IsTextType()) {
448 continue;
449 }
450
451 ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
452 min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));
453 max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));
454
455 // Ignore "tall" text partitions, as these are usually false positive
456 // vertical text or multiple lines pulled together.
457 if (text->bounding_box().height() > max_text_height_) {
458 continue;
459 }
460
461 int spacing = static_cast<int>(text->bounding_box().height() * kVerticalSpacing / 2.0 + 0.5);
462 int bottom = text->bounding_box().bottom() - spacing;
463 int top = text->bounding_box().top() + spacing;
464 // For horizontal text, the factor can be negative. This should
465 // probably cause a warning or failure. I haven't actually checked if
466 // it happens.
467 if (bottom >= top) {
468 continue;
469 }
470
471 bottom_sides.push_back(bottom);
472 top_sides.push_back(top);
473 }
474 // It causes disaster below, so avoid it!
475 if (bottom_sides.empty() || top_sides.empty()) {
476 return;
477 }
478
479 // Since data may be inserted in grid order, we sort the bottom/top sides.
480 std::sort(bottom_sides.begin(), bottom_sides.end());
481 std::sort(top_sides.begin(), top_sides.end());
482
483 // At this point, in the "merged list", we expect to have a bottom side,
484 // followed by either more bottom sides or a top side. The last number
485 // should be a top side. We find places where the splits occur by looking
486 // for "valleys". If we want to force gap sizes or allow overlap, change
487 // the spacing above. If you want to let lines "slice" partitions as long
488 // as it is infrequent, change the following function.
489 FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, &cell_y_);
490
491 // Recover the min/max correctly since it was shifted.
492 cell_y_[0] = min_bottom;
493 cell_y_[cell_y_.size() - 1] = max_top;
494 }
495
496 void StructuredTable::CalculateMargins() {
497 space_above_ = INT32_MAX;
498 space_below_ = INT32_MAX;
499 space_right_ = INT32_MAX;
500 space_left_ = INT32_MAX;
501 UpdateMargins(text_grid_);
502 UpdateMargins(line_grid_);
503 }
504 // Finds the nearest partition in grid to the table
505 // boundaries and updates the margin.
506 void StructuredTable::UpdateMargins(ColPartitionGrid *grid) {
507 int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
508 space_below_ = std::min(space_below_, below);
509 int above = FindVerticalMargin(grid, bounding_box_.top(), false);
510 space_above_ = std::min(space_above_, above);
511 int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
512 space_left_ = std::min(space_left_, left);
513 int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
514 space_right_ = std::min(space_right_, right);
515 }
516 int StructuredTable::FindVerticalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
517 ColPartitionGridSearch gsearch(grid);
518 gsearch.SetUniqueMode(true);
519 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), border);
520 ColPartition *part = nullptr;
521 while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {
522 if (!part->IsTextType() && !part->IsHorizontalLine()) {
523 continue;
524 }
525 int distance =
526 decrease ? border - part->bounding_box().top() : part->bounding_box().bottom() - border;
527 if (distance >= 0) {
528 return distance;
529 }
530 }
531 return INT32_MAX;
532 }
533 int StructuredTable::FindHorizontalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
534 ColPartitionGridSearch gsearch(grid);
535 gsearch.SetUniqueMode(true);
536 gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
537 ColPartition *part = nullptr;
538 while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {
539 if (!part->IsTextType() && !part->IsVerticalLine()) {
540 continue;
541 }
542 int distance =
543 decrease ? border - part->bounding_box().right() : part->bounding_box().left() - border;
544 if (distance >= 0) {
545 return distance;
546 }
547 }
548 return INT32_MAX;
549 }
550
551 void StructuredTable::CalculateStats() {
552 const int kMaxCellHeight = 1000;
553 const int kMaxCellWidth = 1000;
554 STATS height_stats(0, kMaxCellHeight);
555 STATS width_stats(0, kMaxCellWidth);
556
557 for (unsigned i = 0; i < row_count(); ++i) {
558 height_stats.add(row_height(i), column_count());
559 }
560 for (unsigned i = 0; i < column_count(); ++i) {
561 width_stats.add(column_width(i), row_count());
562 }
563
564 median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
565 median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
566 }
567
568 // Looks for grid lines near the current bounding box and
569 // grows the bounding box to include them if no intersections
570 // will occur as a result. This is necessary because the margins
571 // are calculated relative to the closest line/text. If the
572 // line isn't absorbed, the margin will be the distance to the line.
573 void StructuredTable::AbsorbNearbyLines() {
574 ColPartitionGridSearch gsearch(line_grid_);
575 gsearch.SetUniqueMode(true);
576
577 // Is the closest line above good? Loop multiple times for tables with
578 // multi-line (sometimes 2) borders. Limit the number of lines by
579 // making sure they stay within a table cell or so.
580 ColPartition *line = nullptr;
581 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.top());
582 while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {
583 if (!line->IsHorizontalLine()) {
584 break;
585 }
586 TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, bounding_box_.right(),
587 line->MidY());
588 if (text_search.height() > median_cell_height_ * 2) {
589 break;
590 }
591 if (CountPartitions(text_search) > 0) {
592 break;
593 }
594 bounding_box_.set_top(line->MidY());
595 }
596 // As above, is the closest line below good?
597 line = nullptr;
598 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.bottom());
599 while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {
600 if (!line->IsHorizontalLine()) {
601 break;
602 }
603 TBOX text_search(bounding_box_.left(), line->MidY(), bounding_box_.right(),
604 bounding_box_.bottom() - 1);
605 if (text_search.height() > median_cell_height_ * 2) {
606 break;
607 }
608 if (CountPartitions(text_search) > 0) {
609 break;
610 }
611 bounding_box_.set_bottom(line->MidY());
612 }
613 // TODO(nbeato): vertical lines
614 }
615
616 // This function will find all "0 valleys" (of any length) given two
617 // arrays. The arrays are the mins and maxes of partitions (either
618 // left and right or bottom and top). Since the min/max lists are generated
619 // with pairs of increasing integers, we can make some assumptions in
620 // the function about ordering of the overall list, which are shown in the
621 // asserts.
622 // The algorithm works as follows:
623 // While there are numbers to process, take the smallest number.
624 // If it is from the min_list, increment the "hill" counter.
625 // Otherwise, decrement the "hill" counter.
626 // In the process of doing this, keep track of "crossing" the
627 // desired height.
628 // The first/last items are extremal values of the list and known.
629 // NOTE: This function assumes the lists are sorted!
630 void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,
631 const std::vector<int> &max_list, int max_merged,
632 std::vector<int> *locations) {
633 locations->clear();
634 ASSERT_HOST(min_list.size() == max_list.size());
635 if (min_list.empty()) {
636 return;
637 }
638 ASSERT_HOST(min_list.at(0) < max_list.at(0));
639 ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));
640
641 locations->push_back(min_list.at(0));
642 unsigned min_index = 0;
643 unsigned max_index = 0;
644 int stacked_partitions = 0;
645 int last_cross_position = INT32_MAX;
646 // max_index will expire after min_index.
647 // However, we can't "increase" the hill size if min_index expired.
648 // So finish processing when min_index expires.
649 while (min_index < min_list.size()) {
650 // Increase the hill count.
651 if (min_list[min_index] < max_list[max_index]) {
652 ++stacked_partitions;
653 if (last_cross_position != INT32_MAX && stacked_partitions > max_merged) {
654 int mid = (last_cross_position + min_list[min_index]) / 2;
655 locations->push_back(mid);
656 last_cross_position = INT32_MAX;
657 }
658 ++min_index;
659 } else {
660 // Decrease the hill count.
661 --stacked_partitions;
662 if (last_cross_position == INT32_MAX && stacked_partitions <= max_merged) {
663 last_cross_position = max_list[max_index];
664 }
665 ++max_index;
666 }
667 }
668 locations->push_back(max_list.at(max_list.size() - 1));
669 }
670
671 // Counts the number of partitions in the table
672 // box that intersection the given x value.
673 int StructuredTable::CountVerticalIntersections(int x) {
674 int count = 0;
675 // Make a small box to keep the search time down.
676 const int kGridSize = text_grid_->gridsize();
677 TBOX vertical_box = bounding_box_;
678 vertical_box.set_left(x - kGridSize);
679 vertical_box.set_right(x + kGridSize);
680
681 ColPartitionGridSearch gsearch(text_grid_);
682 gsearch.SetUniqueMode(true);
683 gsearch.StartRectSearch(vertical_box);
684 ColPartition *text = nullptr;
685 while ((text = gsearch.NextRectSearch()) != nullptr) {
686 if (!text->IsTextType()) {
687 continue;
688 }
689 const TBOX &box = text->bounding_box();
690 if (box.left() < x && x < box.right()) {
691 ++count;
692 }
693 }
694 return count;
695 }
696
697 // Counts the number of partitions in the table
698 // box that intersection the given y value.
699 int StructuredTable::CountHorizontalIntersections(int y) {
700 int count = 0;
701 // Make a small box to keep the search time down.
702 const int kGridSize = text_grid_->gridsize();
703 TBOX horizontal_box = bounding_box_;
704 horizontal_box.set_bottom(y - kGridSize);
705 horizontal_box.set_top(y + kGridSize);
706
707 ColPartitionGridSearch gsearch(text_grid_);
708 gsearch.SetUniqueMode(true);
709 gsearch.StartRectSearch(horizontal_box);
710 ColPartition *text = nullptr;
711 while ((text = gsearch.NextRectSearch()) != nullptr) {
712 if (!text->IsTextType()) {
713 continue;
714 }
715
716 const TBOX &box = text->bounding_box();
717 if (box.bottom() < y && y < box.top()) {
718 ++count;
719 }
720 }
721 return count;
722 }
723
724 // Counts how many text partitions are in this box.
725 // This is used to count partitions in cells, as that can indicate
726 // how "strong" a potential table row/column (or even full table) actually is.
727 int StructuredTable::CountPartitions(const TBOX &box) {
728 ColPartitionGridSearch gsearch(text_grid_);
729 gsearch.SetUniqueMode(true);
730 gsearch.StartRectSearch(box);
731 int count = 0;
732 ColPartition *text = nullptr;
733 while ((text = gsearch.NextRectSearch()) != nullptr) {
734 if (text->IsTextType()) {
735 ++count;
736 }
737 }
738 return count;
739 }
740
741 ////////
742 //////// TableRecognizer Class
743 ////////
744
745 void TableRecognizer::Init() {}
746
747 void TableRecognizer::set_text_grid(ColPartitionGrid *text_grid) {
748 text_grid_ = text_grid;
749 }
750 void TableRecognizer::set_line_grid(ColPartitionGrid *line_grid) {
751 line_grid_ = line_grid;
752 }
753 void TableRecognizer::set_min_height(int height) {
754 min_height_ = height;
755 }
756 void TableRecognizer::set_min_width(int width) {
757 min_width_ = width;
758 }
759 void TableRecognizer::set_max_text_height(int height) {
760 max_text_height_ = height;
761 }
762
763 StructuredTable *TableRecognizer::RecognizeTable(const TBOX &guess) {
764 auto *table = new StructuredTable();
765 table->Init();
766 table->set_text_grid(text_grid_);
767 table->set_line_grid(line_grid_);
768 table->set_max_text_height(max_text_height_);
769
770 // Try to solve this simple case, a table with *both*
771 // vertical and horizontal lines.
772 if (RecognizeLinedTable(guess, table)) {
773 return table;
774 }
775
776 // Fallback to whitespace if that failed.
777 // TODO(nbeato): Break this apart to take advantage of horizontal
778 // lines or vertical lines when present.
779 if (RecognizeWhitespacedTable(guess, table)) {
780 return table;
781 }
782
783 // No table found...
784 delete table;
785 return nullptr;
786 }
787
788 bool TableRecognizer::RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table) {
789 if (!HasSignificantLines(guess_box)) {
790 return false;
791 }
792 TBOX line_bound = guess_box;
793 if (!FindLinesBoundingBox(&line_bound)) {
794 return false;
795 }
796 table->set_bounding_box(line_bound);
797 return table->FindLinedStructure();
798 }
799
800 // Quick implementation. Just count the number of lines in the box.
801 // A better implementation would counter intersections and look for connected
802 // components. It could even go as far as finding similar length lines.
803 // To account for these possible issues, the VerifyLinedTableCells function
804 // will reject lined tables that cause intersections with text on the page.
805 // TODO(nbeato): look for "better" lines
806 bool TableRecognizer::HasSignificantLines(const TBOX &guess) {
807 ColPartitionGridSearch box_search(line_grid_);
808 box_search.SetUniqueMode(true);
809 box_search.StartRectSearch(guess);
810 ColPartition *line = nullptr;
811 int vertical_count = 0;
812 int horizontal_count = 0;
813
814 while ((line = box_search.NextRectSearch()) != nullptr) {
815 if (line->IsHorizontalLine()) {
816 ++horizontal_count;
817 }
818 if (line->IsVerticalLine()) {
819 ++vertical_count;
820 }
821 }
822
823 return vertical_count >= kLinedTableMinVerticalLines &&
824 horizontal_count >= kLinedTableMinHorizontalLines;
825 }
826
827 // Given a bounding box with a bunch of horizontal / vertical lines,
828 // we just find the extents of all of these lines iteratively.
829 // The box will be at least as large as guess. This
830 // could possibly be a bad assumption.
831 // It is guaranteed to halt in at least O(n * gridarea) where n
832 // is the number of lines.
833 // The assumption is that growing the box iteratively will add lines
834 // several times, but eventually we'll find the extents.
835 //
836 // For tables, the approach is a bit aggressive, a single line (which could be
837 // noise or a column ruling) can destroy the table inside.
838 //
839 // TODO(nbeato): This is a quick first implementation.
840 // A better implementation would actually look for consistency
841 // in extents of the lines and find the extents using lines
842 // that clearly describe the table. This would allow the
843 // lines to "vote" for height/width. An approach like
844 // this would solve issues with page layout rulings.
845 // I haven't looked for these issues yet, so I can't even
846 // say they happen confidently.
847 bool TableRecognizer::FindLinesBoundingBox(TBOX *bounding_box) {
848 // The first iteration will tell us if there are lines
849 // present and shrink the box to a minimal iterative size.
850 if (!FindLinesBoundingBoxIteration(bounding_box)) {
851 return false;
852 }
853
854 // Keep growing until the area of the table stabilizes.
855 // The box can only get bigger, increasing area.
856 bool changed = true;
857 while (changed) {
858 changed = false;
859 int old_area = bounding_box->area();
860 bool check = FindLinesBoundingBoxIteration(bounding_box);
861 // At this point, the function will return true.
862 ASSERT_HOST(check);
863 ASSERT_HOST(bounding_box->area() >= old_area);
864 changed = (bounding_box->area() > old_area);
865 }
866
867 return true;
868 }
869
870 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX *bounding_box) {
871 // Search for all of the lines in the current box, keeping track of extents.
872 ColPartitionGridSearch box_search(line_grid_);
873 box_search.SetUniqueMode(true);
874 box_search.StartRectSearch(*bounding_box);
875 ColPartition *line = nullptr;
876 bool first_line = true;
877
878 while ((line = box_search.NextRectSearch()) != nullptr) {
879 if (line->IsLineType()) {
880 if (first_line) {
881 // The first iteration can shrink the box.
882 *bounding_box = line->bounding_box();
883 first_line = false;
884 } else {
885 *bounding_box += line->bounding_box();
886 }
887 }
888 }
889 return !first_line;
890 }
891
892 // The goal of this function is to move the table boundaries around and find
893 // a table that maximizes the whitespace around the table while maximizing
894 // the cellular structure. As a result, it gets confused by headers, footers,
895 // and merged columns (text that crosses columns). There is a tolerance
896 // that allows a few partitions to count towards potential cell merges.
897 // It's the max_merged parameter to FindPartitionLocations.
898 // It can work, but it needs some false positive remove on boundaries.
899 // For now, the grid structure must not intersect any partitions.
900 // Also, small tolerance is added to the horizontal lines for tightly packed
901 // tables. The tolerance is added by adjusting the bounding boxes of the
902 // partitions (in FindHorizontalPartitions). The current implementation
903 // only adjusts the vertical extents of the table.
904 //
905 // Also note. This was hacked at a lot. It could probably use some
906 // more hacking at to find a good set of border conditions and then a
907 // nice clean up.
908 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table) {
909 TBOX best_box = guess_box; // Best borders known.
910 int best_below = 0; // Margin size above best table.
911 int best_above = 0; // Margin size below best table.
912 TBOX adjusted = guess_box; // The search box.
913
914 // We assume that the guess box is somewhat accurate, so we don't allow
915 // the adjusted border to pass half of the guessed area. This prevents
916 // "negative" tables from forming.
917 const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
918 // Keeps track of the most columns in an accepted table. The resulting table
919 // may be less than the max, but we don't want to stray too far.
920 unsigned best_cols = 0;
921 // Make sure we find a good border.
922 bool found_good_border = false;
923
924 // Find the bottom of the table by trying a few different locations. For
925 // each location, the top, left, and right are fixed. We start the search
926 // in a smaller table to favor best_cols getting a good estimate sooner.
927 int last_bottom = INT32_MAX;
928 int bottom =
929 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true);
930 int top =
931 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
932 adjusted.set_top(top);
933
934 // Headers/footers can be spaced far from everything.
935 // Make sure that the space below is greater than the space above
936 // the lowest row.
937 int previous_below = 0;
938 const int kMaxChances = 10;
939 int chances = kMaxChances;
940 while (bottom != last_bottom) {
941 adjusted.set_bottom(bottom);
942
943 if (adjusted.height() >= min_height_) {
944 // Try to fit the grid on the current box. We give it a chance
945 // if the number of columns didn't significantly drop.
946 table->set_bounding_box(adjusted);
947 if (table->FindWhitespacedStructure() &&
948 table->column_count() >= best_cols * kRequiredColumns) {
949 if (false && IsWeakTableRow(table, 0)) {
950 // Currently buggy, but was looking promising so disabled.
951 --chances;
952 } else {
953 // We favor 2 things,
954 // 1- Adding rows that have partitioned data.
955 // 2- Better margins (to find header/footer).
956 // For better tables, we just look for multiple cells in the
957 // bottom row with data in them.
958 // For margins, the space below the last row should
959 // be better than a table with the last row removed.
960 chances = kMaxChances;
961 double max_row_height = kMaxRowSize * table->median_cell_height();
962 if ((table->space_below() * kMarginFactor >= best_below &&
963 table->space_below() >= previous_below) ||
964 (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) {
965 best_box.set_bottom(bottom);
966 best_below = table->space_below();
967 best_cols = std::max(table->column_count(), best_cols);
968 found_good_border = true;
969 }
970 }
971 previous_below = table->space_below();
972 } else {
973 --chances;
974 }
975 }
976 if (chances <= 0) {
977 break;
978 }
979
980 last_bottom = bottom;
981 bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true);
982 }
983 if (!found_good_border) {
984 return false;
985 }
986
987 // TODO(nbeato) comments: follow modified code above... put it in a function!
988 found_good_border = false;
989 int last_top = INT32_MIN;
990 top =
991 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
992 int previous_above = 0;
993 chances = kMaxChances;
994
995 adjusted.set_bottom(best_box.bottom());
996 while (last_top != top) {
997 adjusted.set_top(top);
998 if (adjusted.height() >= min_height_) {
999 table->set_bounding_box(adjusted);
1000 if (table->FindWhitespacedStructure() &&
1001 table->column_count() >= best_cols * kRequiredColumns) {
1002 int last_row = table->row_count() - 1;
1003 if (false && IsWeakTableRow(table, last_row)) {
1004 // Currently buggy, but was looking promising so disabled.
1005 --chances;
1006 } else {
1007 chances = kMaxChances;
1008 double max_row_height = kMaxRowSize * table->median_cell_height();
1009 if ((table->space_above() * kMarginFactor >= best_above &&
1010 table->space_above() >= previous_above) ||
1011 (table->CountFilledCellsInRow(last_row) > 1 &&
1012 table->row_height(last_row) < max_row_height)) {
1013 best_box.set_top(top);
1014 best_above = table->space_above();
1015 best_cols = std::max(table->column_count(), best_cols);
1016 found_good_border = true;
1017 }
1018 }
1019 previous_above = table->space_above();
1020 } else {
1021 --chances;
1022 }
1023 }
1024 if (chances <= 0) {
1025 break;
1026 }
1027
1028 last_top = top;
1029 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false);
1030 }
1031
1032 if (!found_good_border) {
1033 return false;
1034 }
1035
1036 // If we get here, this shouldn't happen. It can be an assert, but
1037 // I haven't tested it enough to make it crash things.
1038 if (best_box.null_box()) {
1039 return false;
1040 }
1041
1042 // Given the best locations, fit the box to those locations.
1043 table->set_bounding_box(best_box);
1044 return table->FindWhitespacedStructure();
1045 }
1046
1047 // Finds the closest value to y that can safely cause a horizontal
1048 // split in the partitions.
1049 // This function has been buggy and not as reliable as I would've
1050 // liked. I suggest finding all of the splits using the
1051 // FindPartitionLocations once and then just keeping the results
1052 // of that function cached somewhere.
1053 int TableRecognizer::NextHorizontalSplit(int left, int right, int y, bool top_to_bottom) {
1054 ColPartitionGridSearch gsearch(text_grid_);
1055 gsearch.SetUniqueMode(true);
1056 gsearch.StartVerticalSearch(left, right, y);
1057 ColPartition *text = nullptr;
1058 int last_y = y;
1059 while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {
1060 if (!text->IsTextType() || !text->IsHorizontalType()) {
1061 continue;
1062 }
1063 if (text->bounding_box().height() > max_text_height_) {
1064 continue;
1065 }
1066
1067 const TBOX &text_box = text->bounding_box();
1068 if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
1069 last_y = std::min(last_y, static_cast<int>(text_box.bottom()));
1070 continue;
1071 }
1072 if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
1073 last_y = std::max(last_y, static_cast<int>(text_box.top()));
1074 continue;
1075 }
1076
1077 return last_y;
1078 }
1079 // If none is found, we at least want to preserve the min/max,
1080 // which defines the overlap of y with the last partition in the grid.
1081 return last_y;
1082 }
1083
1084 } // namespace tesseract