Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/tablerecog.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tablerecog.cpp | |
| 3 // Description: Helper class to help structure table areas. Given an bounding | |
| 4 // box from TableFinder, the TableRecognizer should give a | |
| 5 // StructuredTable (maybe a list in the future) of "good" tables | |
| 6 // in that area. | |
| 7 // Author: Nicholas Beato | |
| 8 // | |
| 9 // (C) Copyright 2009, Google Inc. | |
| 10 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 11 // you may not use this file except in compliance with the License. | |
| 12 // You may obtain a copy of the License at | |
| 13 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 14 // Unless required by applicable law or agreed to in writing, software | |
| 15 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 17 // See the License for the specific language governing permissions and | |
| 18 // limitations under the License. | |
| 19 // | |
| 20 /////////////////////////////////////////////////////////////////////// | |
| 21 | |
| 22 #ifdef HAVE_CONFIG_H | |
| 23 # include "config_auto.h" | |
| 24 #endif | |
| 25 | |
| 26 #include "tablerecog.h" | |
| 27 | |
| 28 #include <algorithm> | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
| 32 // The amount of space required between the ColPartitions in 2 columns | |
| 33 // of a non-lined table as a multiple of the median width. | |
| 34 const double kHorizontalSpacing = 0.30; | |
| 35 // The amount of space required between the ColPartitions in 2 rows | |
| 36 // of a non-lined table as multiples of the median height. | |
| 37 const double kVerticalSpacing = -0.2; | |
| 38 // The number of cells that the grid lines may intersect. | |
| 39 // See FindCellSplitLocations for explanation. | |
| 40 const int kCellSplitRowThreshold = 0; | |
| 41 const int kCellSplitColumnThreshold = 0; | |
| 42 // For "lined tables", the number of required lines. Currently a guess. | |
| 43 const int kLinedTableMinVerticalLines = 3; | |
| 44 const int kLinedTableMinHorizontalLines = 3; | |
| 45 // Number of columns required, as a fraction of the most columns found. | |
| 46 // None of these are tweaked at all. | |
| 47 const double kRequiredColumns = 0.7; | |
| 48 // The tolerance for comparing margins of potential tables. | |
| 49 const double kMarginFactor = 1.1; | |
| 50 // The first and last row should be consistent cell height. | |
| 51 // This factor is the first and last row cell height max. | |
| 52 const double kMaxRowSize = 2.5; | |
| 53 // Number of filled columns required to form a strong table row. | |
| 54 // For small tables, this is an absolute number. | |
| 55 const double kGoodRowNumberOfColumnsSmall[] = {2, 2, 2, 2, 2, 3, 3}; | |
| 56 // For large tables, it is a relative number | |
| 57 const double kGoodRowNumberOfColumnsLarge = 0.7; | |
| 58 // The amount of area that must be covered in a cell by ColPartitions to | |
| 59 // be considered "filled" | |
| 60 const double kMinFilledArea = 0.35; | |
| 61 | |
| 62 // Indicates that a table row is weak. This means that it has | |
| 63 // many missing data cells or very large cell heights compared. | |
| 64 // to the rest of the table. | |
| 65 // Code is buggy right now. It is disabled in the calling function. | |
| 66 // It seems like sometimes the row that is passed in is not correct | |
| 67 // sometimes (like a phantom row is introduced). There's something going | |
| 68 // on in the cell_y_ data member before this is called... not certain. | |
| 69 static bool IsWeakTableRow(StructuredTable *table, int row) { | |
| 70 if (!table->VerifyRowFilled(row)) { | |
| 71 return false; | |
| 72 } | |
| 73 | |
| 74 double threshold; | |
| 75 if (table->column_count() < countof(kGoodRowNumberOfColumnsSmall)) { | |
| 76 threshold = kGoodRowNumberOfColumnsSmall[table->column_count()]; | |
| 77 } else { | |
| 78 threshold = table->column_count() * kGoodRowNumberOfColumnsLarge; | |
| 79 } | |
| 80 | |
| 81 return table->CountFilledCellsInRow(row) < threshold; | |
| 82 } | |
| 83 | |
| 84 //////// | |
| 85 //////// StructuredTable Class | |
| 86 //////// | |
| 87 | |
| 88 StructuredTable::StructuredTable() | |
| 89 : text_grid_(nullptr) | |
| 90 , line_grid_(nullptr) | |
| 91 , is_lined_(false) | |
| 92 , space_above_(0) | |
| 93 , space_below_(0) | |
| 94 , space_left_(0) | |
| 95 , space_right_(0) | |
| 96 , median_cell_height_(0) | |
| 97 , median_cell_width_(0) | |
| 98 , max_text_height_(INT32_MAX) {} | |
| 99 | |
| 100 void StructuredTable::Init() {} | |
| 101 | |
| 102 void StructuredTable::set_text_grid(ColPartitionGrid *text_grid) { | |
| 103 text_grid_ = text_grid; | |
| 104 } | |
| 105 void StructuredTable::set_line_grid(ColPartitionGrid *line_grid) { | |
| 106 line_grid_ = line_grid; | |
| 107 } | |
| 108 void StructuredTable::set_max_text_height(int height) { | |
| 109 max_text_height_ = height; | |
| 110 } | |
| 111 bool StructuredTable::is_lined() const { | |
| 112 return is_lined_; | |
| 113 } | |
| 114 unsigned StructuredTable::row_count() const { | |
| 115 return cell_y_.empty() ? 0 : cell_y_.size() - 1; | |
| 116 } | |
| 117 unsigned StructuredTable::column_count() const { | |
| 118 return cell_x_.empty() ? 0 : cell_x_.size() - 1; | |
| 119 } | |
| 120 unsigned StructuredTable::cell_count() const { | |
| 121 return row_count() * column_count(); | |
| 122 } | |
| 123 void StructuredTable::set_bounding_box(const TBOX &box) { | |
| 124 bounding_box_ = box; | |
| 125 } | |
| 126 const TBOX &StructuredTable::bounding_box() const { | |
| 127 return bounding_box_; | |
| 128 } | |
| 129 int StructuredTable::median_cell_height() { | |
| 130 return median_cell_height_; | |
| 131 } | |
| 132 int StructuredTable::median_cell_width() { | |
| 133 return median_cell_width_; | |
| 134 } | |
| 135 int StructuredTable::row_height(unsigned row) const { | |
| 136 ASSERT_HOST(row < row_count()); | |
| 137 return cell_y_[row + 1] - cell_y_[row]; | |
| 138 } | |
| 139 int StructuredTable::column_width(unsigned column) const { | |
| 140 ASSERT_HOST(column < column_count()); | |
| 141 return cell_x_[column + 1] - cell_x_[column]; | |
| 142 } | |
| 143 int StructuredTable::space_above() const { | |
| 144 return space_above_; | |
| 145 } | |
| 146 int StructuredTable::space_below() const { | |
| 147 return space_below_; | |
| 148 } | |
| 149 | |
| 150 // At this point, we know that the lines are contained | |
| 151 // by the box (by FindLinesBoundingBox). | |
| 152 // So try to find the cell structure and make sure it works out. | |
| 153 // The assumption is that all lines span the table. If this | |
| 154 // assumption fails, the VerifyLinedTable method will | |
| 155 // abort the lined table. The TableRecognizer will fall | |
| 156 // back on FindWhitespacedStructure. | |
| 157 bool StructuredTable::FindLinedStructure() { | |
| 158 ClearStructure(); | |
| 159 | |
| 160 // Search for all of the lines in the current box. | |
| 161 // Update the cellular structure with the exact lines. | |
| 162 ColPartitionGridSearch box_search(line_grid_); | |
| 163 box_search.SetUniqueMode(true); | |
| 164 box_search.StartRectSearch(bounding_box_); | |
| 165 ColPartition *line = nullptr; | |
| 166 | |
| 167 while ((line = box_search.NextRectSearch()) != nullptr) { | |
| 168 if (line->IsHorizontalLine()) { | |
| 169 cell_y_.push_back(line->MidY()); | |
| 170 } | |
| 171 if (line->IsVerticalLine()) { | |
| 172 cell_x_.push_back(line->MidX()); | |
| 173 } | |
| 174 } | |
| 175 | |
| 176 // HasSignificantLines should guarantee cells. | |
| 177 // Because that code is a different class, just gracefully | |
| 178 // return false. This could be an assert. | |
| 179 if (cell_x_.size() < 3 || cell_y_.size() < 3) { | |
| 180 return false; | |
| 181 } | |
| 182 | |
| 183 // Sort and remove duplicates that may have occurred due to split lines. | |
| 184 std::sort(cell_x_.begin(), cell_x_.end()); | |
| 185 auto last_x = std::unique(cell_x_.begin(), cell_x_.end()); | |
| 186 cell_x_.erase(last_x, cell_x_.end()); | |
| 187 std::sort(cell_y_.begin(), cell_y_.end()); | |
| 188 auto last_y = std::unique(cell_y_.begin(), cell_y_.end()); | |
| 189 cell_y_.erase(last_y, cell_y_.end()); | |
| 190 | |
| 191 // The border should be the extents of line boxes, not middle. | |
| 192 cell_x_[0] = bounding_box_.left(); | |
| 193 cell_x_[cell_x_.size() - 1] = bounding_box_.right(); | |
| 194 cell_y_[0] = bounding_box_.bottom(); | |
| 195 cell_y_[cell_y_.size() - 1] = bounding_box_.top(); | |
| 196 | |
| 197 // Remove duplicates that may have occurred due to moving the borders. | |
| 198 last_x = std::unique(cell_x_.begin(), cell_x_.end()); | |
| 199 cell_x_.erase(last_x, cell_x_.end()); | |
| 200 last_y = std::unique(cell_y_.begin(), cell_y_.end()); | |
| 201 cell_y_.erase(last_y, cell_y_.end()); | |
| 202 | |
| 203 CalculateMargins(); | |
| 204 CalculateStats(); | |
| 205 is_lined_ = VerifyLinedTableCells(); | |
| 206 return is_lined_; | |
| 207 } | |
| 208 | |
| 209 // Finds the cellular structure given a particular box. | |
| 210 bool StructuredTable::FindWhitespacedStructure() { | |
| 211 ClearStructure(); | |
| 212 FindWhitespacedColumns(); | |
| 213 FindWhitespacedRows(); | |
| 214 | |
| 215 if (!VerifyWhitespacedTable()) { | |
| 216 return false; | |
| 217 } else { | |
| 218 bounding_box_.set_left(cell_x_[0]); | |
| 219 bounding_box_.set_right(cell_x_[cell_x_.size() - 1]); | |
| 220 bounding_box_.set_bottom(cell_y_[0]); | |
| 221 bounding_box_.set_top(cell_y_[cell_y_.size() - 1]); | |
| 222 AbsorbNearbyLines(); | |
| 223 CalculateMargins(); | |
| 224 CalculateStats(); | |
| 225 return true; | |
| 226 } | |
| 227 } | |
| 228 | |
| 229 // Tests if a partition fits inside the table structure. | |
| 230 // Partitions must fully span a grid line in order to intersect it. | |
| 231 // This means that a partition does not intersect a line | |
| 232 // that it "just" touches. This is mainly because the assumption | |
| 233 // throughout the code is that "0" distance is a very very small space. | |
| 234 bool StructuredTable::DoesPartitionFit(const ColPartition &part) const { | |
| 235 const TBOX &box = part.bounding_box(); | |
| 236 for (int i : cell_x_) { | |
| 237 if (box.left() < i && i < box.right()) { | |
| 238 return false; | |
| 239 } | |
| 240 } | |
| 241 for (int i : cell_y_) { | |
| 242 if (box.bottom() < i && i < box.top()) { | |
| 243 return false; | |
| 244 } | |
| 245 } | |
| 246 return true; | |
| 247 } | |
| 248 | |
| 249 // Checks if a sub-table has multiple data cells filled. | |
| 250 int StructuredTable::CountFilledCells() { | |
| 251 return CountFilledCells(0, row_count() - 1, 0, column_count() - 1); | |
| 252 } | |
| 253 int StructuredTable::CountFilledCellsInRow(int row) { | |
| 254 return CountFilledCells(row, row, 0, column_count() - 1); | |
| 255 } | |
| 256 int StructuredTable::CountFilledCellsInColumn(int column) { | |
| 257 return CountFilledCells(0, row_count() - 1, column, column); | |
| 258 } | |
| 259 int StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, | |
| 260 unsigned column_end) { | |
| 261 ASSERT_HOST(row_start <= row_end && row_end < row_count()); | |
| 262 ASSERT_HOST(column_start <= column_end && column_end < column_count()); | |
| 263 int cell_count = 0; | |
| 264 TBOX cell_box; | |
| 265 for (unsigned row = row_start; row <= row_end; ++row) { | |
| 266 cell_box.set_bottom(cell_y_[row]); | |
| 267 cell_box.set_top(cell_y_[row + 1]); | |
| 268 for (unsigned col = column_start; col <= column_end; ++col) { | |
| 269 cell_box.set_left(cell_x_[col]); | |
| 270 cell_box.set_right(cell_x_[col + 1]); | |
| 271 if (CountPartitions(cell_box) > 0) { | |
| 272 ++cell_count; | |
| 273 } | |
| 274 } | |
| 275 } | |
| 276 return cell_count; | |
| 277 } | |
| 278 | |
| 279 // Makes sure that at least one cell in a row has substantial area filled. | |
| 280 // This can filter out large whitespace caused by growing tables too far | |
| 281 // and page numbers. | |
| 282 bool StructuredTable::VerifyRowFilled(int row) { | |
| 283 for (unsigned i = 0; i < column_count(); ++i) { | |
| 284 auto area_filled = CalculateCellFilledPercentage(row, i); | |
| 285 if (area_filled >= kMinFilledArea) { | |
| 286 return true; | |
| 287 } | |
| 288 } | |
| 289 return false; | |
| 290 } | |
| 291 | |
| 292 // Finds the filled area in a cell. | |
| 293 // Assume ColPartitions do not overlap for simplicity (even though they do). | |
| 294 double StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) { | |
| 295 ASSERT_HOST(row <= row_count()); | |
| 296 ASSERT_HOST(column <= column_count()); | |
| 297 const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]); | |
| 298 ASSERT_HOST(!kCellBox.null_box()); | |
| 299 | |
| 300 ColPartitionGridSearch gsearch(text_grid_); | |
| 301 gsearch.SetUniqueMode(true); | |
| 302 gsearch.StartRectSearch(kCellBox); | |
| 303 double area_covered = 0; | |
| 304 ColPartition *text = nullptr; | |
| 305 while ((text = gsearch.NextRectSearch()) != nullptr) { | |
| 306 if (text->IsTextType()) { | |
| 307 area_covered += text->bounding_box().intersection(kCellBox).area(); | |
| 308 } | |
| 309 } | |
| 310 const int32_t current_area = kCellBox.area(); | |
| 311 if (current_area == 0) { | |
| 312 return 1.0; | |
| 313 } | |
| 314 return std::min(1.0, area_covered / current_area); | |
| 315 } | |
| 316 | |
| 317 #ifndef GRAPHICS_DISABLED | |
| 318 | |
| 319 void StructuredTable::Display(ScrollView *window, ScrollView::Color color) { | |
| 320 window->Brush(ScrollView::NONE); | |
| 321 window->Pen(color); | |
| 322 window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(), | |
| 323 bounding_box_.top()); | |
| 324 for (int i : cell_x_) { | |
| 325 window->Line(i, bounding_box_.bottom(), i, bounding_box_.top()); | |
| 326 } | |
| 327 for (int i : cell_y_) { | |
| 328 window->Line(bounding_box_.left(), i, bounding_box_.right(), i); | |
| 329 } | |
| 330 window->UpdateWindow(); | |
| 331 } | |
| 332 | |
| 333 #endif | |
| 334 | |
| 335 // Clear structure information. | |
| 336 void StructuredTable::ClearStructure() { | |
| 337 cell_x_.clear(); | |
| 338 cell_y_.clear(); | |
| 339 is_lined_ = false; | |
| 340 space_above_ = 0; | |
| 341 space_below_ = 0; | |
| 342 space_left_ = 0; | |
| 343 space_right_ = 0; | |
| 344 median_cell_height_ = 0; | |
| 345 median_cell_width_ = 0; | |
| 346 } | |
| 347 | |
| 348 // When a table has lines, the lines should not intersect any partitions. | |
| 349 // The following function makes sure the previous assumption is met. | |
| 350 bool StructuredTable::VerifyLinedTableCells() { | |
| 351 // Function only called when lines exist. | |
| 352 ASSERT_HOST(cell_y_.size() >= 2 && cell_x_.size() >= 2); | |
| 353 for (int i : cell_y_) { | |
| 354 if (CountHorizontalIntersections(i) > 0) { | |
| 355 return false; | |
| 356 } | |
| 357 } | |
| 358 for (int i : cell_x_) { | |
| 359 if (CountVerticalIntersections(i) > 0) { | |
| 360 return false; | |
| 361 } | |
| 362 } | |
| 363 return true; | |
| 364 } | |
| 365 | |
| 366 // TODO(nbeato): Could be much better than this. | |
| 367 // Examples: | |
| 368 // - Calculate the percentage of filled cells. | |
| 369 // - Calculate the average number of ColPartitions per cell. | |
| 370 // - Calculate the number of cells per row with partitions. | |
| 371 // - Check if ColPartitions in adjacent cells are similar. | |
| 372 // - Check that all columns are at least a certain width. | |
| 373 // - etc. | |
| 374 bool StructuredTable::VerifyWhitespacedTable() { | |
| 375 // criteria for a table, must be at least 2x3 or 3x2 | |
| 376 return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6; | |
| 377 } | |
| 378 | |
| 379 // Finds vertical splits in the ColPartitions of text_grid_ by considering | |
| 380 // all possible "good" guesses. A good guess is just the left/right sides of | |
| 381 // the partitions, since these locations will uniquely define where the | |
| 382 // extremal values where the splits can occur. The split happens | |
| 383 // in the middle of the two nearest partitions. | |
| 384 void StructuredTable::FindWhitespacedColumns() { | |
| 385 // Set of the extents of all partitions on the page. | |
| 386 std::vector<int> left_sides; | |
| 387 std::vector<int> right_sides; | |
| 388 | |
| 389 // Look at each text partition. We want to find the partitions | |
| 390 // that have extremal left/right sides. These will give us a basis | |
| 391 // for the table columns. | |
| 392 ColPartitionGridSearch gsearch(text_grid_); | |
| 393 gsearch.SetUniqueMode(true); | |
| 394 gsearch.StartRectSearch(bounding_box_); | |
| 395 ColPartition *text = nullptr; | |
| 396 while ((text = gsearch.NextRectSearch()) != nullptr) { | |
| 397 if (!text->IsTextType()) { | |
| 398 continue; | |
| 399 } | |
| 400 | |
| 401 ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right()); | |
| 402 int spacing = static_cast<int>(text->median_width() * kHorizontalSpacing / 2.0 + 0.5); | |
| 403 left_sides.push_back(text->bounding_box().left() - spacing); | |
| 404 right_sides.push_back(text->bounding_box().right() + spacing); | |
| 405 } | |
| 406 // It causes disaster below, so avoid it! | |
| 407 if (left_sides.empty() || right_sides.empty()) { | |
| 408 return; | |
| 409 } | |
| 410 | |
| 411 // Since data may be inserted in grid order, we sort the left/right sides. | |
| 412 std::sort(left_sides.begin(), left_sides.end()); | |
| 413 std::sort(right_sides.begin(), right_sides.end()); | |
| 414 | |
| 415 // At this point, in the "merged list", we expect to have a left side, | |
| 416 // followed by either more left sides or a right side. The last number | |
| 417 // should be a right side. We find places where the splits occur by looking | |
| 418 // for "valleys". If we want to force gap sizes or allow overlap, change | |
| 419 // the spacing above. If you want to let lines "slice" partitions as long | |
| 420 // as it is infrequent, change the following function. | |
| 421 FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, &cell_x_); | |
| 422 } | |
| 423 | |
| 424 // Finds horizontal splits in the ColPartitions of text_grid_ by considering | |
| 425 // all possible "good" guesses. A good guess is just the bottom/top sides of | |
| 426 // the partitions, since these locations will uniquely define where the | |
| 427 // extremal values where the splits can occur. The split happens | |
| 428 // in the middle of the two nearest partitions. | |
| 429 void StructuredTable::FindWhitespacedRows() { | |
| 430 // Set of the extents of all partitions on the page. | |
| 431 std::vector<int> bottom_sides; | |
| 432 std::vector<int> top_sides; | |
| 433 // We will be "shrinking" partitions, so keep the min/max around to | |
| 434 // make sure the bottom/top lines do not intersect text. | |
| 435 int min_bottom = INT32_MAX; | |
| 436 int max_top = INT32_MIN; | |
| 437 | |
| 438 // Look at each text partition. We want to find the partitions | |
| 439 // that have extremal bottom/top sides. These will give us a basis | |
| 440 // for the table rows. Because the textlines can be skewed and close due | |
| 441 // to warping, the height of the partitions is toned down a little bit. | |
| 442 ColPartitionGridSearch gsearch(text_grid_); | |
| 443 gsearch.SetUniqueMode(true); | |
| 444 gsearch.StartRectSearch(bounding_box_); | |
| 445 ColPartition *text = nullptr; | |
| 446 while ((text = gsearch.NextRectSearch()) != nullptr) { | |
| 447 if (!text->IsTextType()) { | |
| 448 continue; | |
| 449 } | |
| 450 | |
| 451 ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top()); | |
| 452 min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom())); | |
| 453 max_top = std::max(max_top, static_cast<int>(text->bounding_box().top())); | |
| 454 | |
| 455 // Ignore "tall" text partitions, as these are usually false positive | |
| 456 // vertical text or multiple lines pulled together. | |
| 457 if (text->bounding_box().height() > max_text_height_) { | |
| 458 continue; | |
| 459 } | |
| 460 | |
| 461 int spacing = static_cast<int>(text->bounding_box().height() * kVerticalSpacing / 2.0 + 0.5); | |
| 462 int bottom = text->bounding_box().bottom() - spacing; | |
| 463 int top = text->bounding_box().top() + spacing; | |
| 464 // For horizontal text, the factor can be negative. This should | |
| 465 // probably cause a warning or failure. I haven't actually checked if | |
| 466 // it happens. | |
| 467 if (bottom >= top) { | |
| 468 continue; | |
| 469 } | |
| 470 | |
| 471 bottom_sides.push_back(bottom); | |
| 472 top_sides.push_back(top); | |
| 473 } | |
| 474 // It causes disaster below, so avoid it! | |
| 475 if (bottom_sides.empty() || top_sides.empty()) { | |
| 476 return; | |
| 477 } | |
| 478 | |
| 479 // Since data may be inserted in grid order, we sort the bottom/top sides. | |
| 480 std::sort(bottom_sides.begin(), bottom_sides.end()); | |
| 481 std::sort(top_sides.begin(), top_sides.end()); | |
| 482 | |
| 483 // At this point, in the "merged list", we expect to have a bottom side, | |
| 484 // followed by either more bottom sides or a top side. The last number | |
| 485 // should be a top side. We find places where the splits occur by looking | |
| 486 // for "valleys". If we want to force gap sizes or allow overlap, change | |
| 487 // the spacing above. If you want to let lines "slice" partitions as long | |
| 488 // as it is infrequent, change the following function. | |
| 489 FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, &cell_y_); | |
| 490 | |
| 491 // Recover the min/max correctly since it was shifted. | |
| 492 cell_y_[0] = min_bottom; | |
| 493 cell_y_[cell_y_.size() - 1] = max_top; | |
| 494 } | |
| 495 | |
| 496 void StructuredTable::CalculateMargins() { | |
| 497 space_above_ = INT32_MAX; | |
| 498 space_below_ = INT32_MAX; | |
| 499 space_right_ = INT32_MAX; | |
| 500 space_left_ = INT32_MAX; | |
| 501 UpdateMargins(text_grid_); | |
| 502 UpdateMargins(line_grid_); | |
| 503 } | |
| 504 // Finds the nearest partition in grid to the table | |
| 505 // boundaries and updates the margin. | |
| 506 void StructuredTable::UpdateMargins(ColPartitionGrid *grid) { | |
| 507 int below = FindVerticalMargin(grid, bounding_box_.bottom(), true); | |
| 508 space_below_ = std::min(space_below_, below); | |
| 509 int above = FindVerticalMargin(grid, bounding_box_.top(), false); | |
| 510 space_above_ = std::min(space_above_, above); | |
| 511 int left = FindHorizontalMargin(grid, bounding_box_.left(), true); | |
| 512 space_left_ = std::min(space_left_, left); | |
| 513 int right = FindHorizontalMargin(grid, bounding_box_.right(), false); | |
| 514 space_right_ = std::min(space_right_, right); | |
| 515 } | |
| 516 int StructuredTable::FindVerticalMargin(ColPartitionGrid *grid, int border, bool decrease) const { | |
| 517 ColPartitionGridSearch gsearch(grid); | |
| 518 gsearch.SetUniqueMode(true); | |
| 519 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), border); | |
| 520 ColPartition *part = nullptr; | |
| 521 while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) { | |
| 522 if (!part->IsTextType() && !part->IsHorizontalLine()) { | |
| 523 continue; | |
| 524 } | |
| 525 int distance = | |
| 526 decrease ? border - part->bounding_box().top() : part->bounding_box().bottom() - border; | |
| 527 if (distance >= 0) { | |
| 528 return distance; | |
| 529 } | |
| 530 } | |
| 531 return INT32_MAX; | |
| 532 } | |
| 533 int StructuredTable::FindHorizontalMargin(ColPartitionGrid *grid, int border, bool decrease) const { | |
| 534 ColPartitionGridSearch gsearch(grid); | |
| 535 gsearch.SetUniqueMode(true); | |
| 536 gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top()); | |
| 537 ColPartition *part = nullptr; | |
| 538 while ((part = gsearch.NextSideSearch(decrease)) != nullptr) { | |
| 539 if (!part->IsTextType() && !part->IsVerticalLine()) { | |
| 540 continue; | |
| 541 } | |
| 542 int distance = | |
| 543 decrease ? border - part->bounding_box().right() : part->bounding_box().left() - border; | |
| 544 if (distance >= 0) { | |
| 545 return distance; | |
| 546 } | |
| 547 } | |
| 548 return INT32_MAX; | |
| 549 } | |
| 550 | |
| 551 void StructuredTable::CalculateStats() { | |
| 552 const int kMaxCellHeight = 1000; | |
| 553 const int kMaxCellWidth = 1000; | |
| 554 STATS height_stats(0, kMaxCellHeight); | |
| 555 STATS width_stats(0, kMaxCellWidth); | |
| 556 | |
| 557 for (unsigned i = 0; i < row_count(); ++i) { | |
| 558 height_stats.add(row_height(i), column_count()); | |
| 559 } | |
| 560 for (unsigned i = 0; i < column_count(); ++i) { | |
| 561 width_stats.add(column_width(i), row_count()); | |
| 562 } | |
| 563 | |
| 564 median_cell_height_ = static_cast<int>(height_stats.median() + 0.5); | |
| 565 median_cell_width_ = static_cast<int>(width_stats.median() + 0.5); | |
| 566 } | |
| 567 | |
| 568 // Looks for grid lines near the current bounding box and | |
| 569 // grows the bounding box to include them if no intersections | |
| 570 // will occur as a result. This is necessary because the margins | |
| 571 // are calculated relative to the closest line/text. If the | |
| 572 // line isn't absorbed, the margin will be the distance to the line. | |
| 573 void StructuredTable::AbsorbNearbyLines() { | |
| 574 ColPartitionGridSearch gsearch(line_grid_); | |
| 575 gsearch.SetUniqueMode(true); | |
| 576 | |
| 577 // Is the closest line above good? Loop multiple times for tables with | |
| 578 // multi-line (sometimes 2) borders. Limit the number of lines by | |
| 579 // making sure they stay within a table cell or so. | |
| 580 ColPartition *line = nullptr; | |
| 581 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.top()); | |
| 582 while ((line = gsearch.NextVerticalSearch(false)) != nullptr) { | |
| 583 if (!line->IsHorizontalLine()) { | |
| 584 break; | |
| 585 } | |
| 586 TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, bounding_box_.right(), | |
| 587 line->MidY()); | |
| 588 if (text_search.height() > median_cell_height_ * 2) { | |
| 589 break; | |
| 590 } | |
| 591 if (CountPartitions(text_search) > 0) { | |
| 592 break; | |
| 593 } | |
| 594 bounding_box_.set_top(line->MidY()); | |
| 595 } | |
| 596 // As above, is the closest line below good? | |
| 597 line = nullptr; | |
| 598 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.bottom()); | |
| 599 while ((line = gsearch.NextVerticalSearch(true)) != nullptr) { | |
| 600 if (!line->IsHorizontalLine()) { | |
| 601 break; | |
| 602 } | |
| 603 TBOX text_search(bounding_box_.left(), line->MidY(), bounding_box_.right(), | |
| 604 bounding_box_.bottom() - 1); | |
| 605 if (text_search.height() > median_cell_height_ * 2) { | |
| 606 break; | |
| 607 } | |
| 608 if (CountPartitions(text_search) > 0) { | |
| 609 break; | |
| 610 } | |
| 611 bounding_box_.set_bottom(line->MidY()); | |
| 612 } | |
| 613 // TODO(nbeato): vertical lines | |
| 614 } | |
| 615 | |
| 616 // This function will find all "0 valleys" (of any length) given two | |
| 617 // arrays. The arrays are the mins and maxes of partitions (either | |
| 618 // left and right or bottom and top). Since the min/max lists are generated | |
| 619 // with pairs of increasing integers, we can make some assumptions in | |
| 620 // the function about ordering of the overall list, which are shown in the | |
| 621 // asserts. | |
| 622 // The algorithm works as follows: | |
| 623 // While there are numbers to process, take the smallest number. | |
| 624 // If it is from the min_list, increment the "hill" counter. | |
| 625 // Otherwise, decrement the "hill" counter. | |
| 626 // In the process of doing this, keep track of "crossing" the | |
| 627 // desired height. | |
| 628 // The first/last items are extremal values of the list and known. | |
| 629 // NOTE: This function assumes the lists are sorted! | |
| 630 void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list, | |
| 631 const std::vector<int> &max_list, int max_merged, | |
| 632 std::vector<int> *locations) { | |
| 633 locations->clear(); | |
| 634 ASSERT_HOST(min_list.size() == max_list.size()); | |
| 635 if (min_list.empty()) { | |
| 636 return; | |
| 637 } | |
| 638 ASSERT_HOST(min_list.at(0) < max_list.at(0)); | |
| 639 ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1)); | |
| 640 | |
| 641 locations->push_back(min_list.at(0)); | |
| 642 unsigned min_index = 0; | |
| 643 unsigned max_index = 0; | |
| 644 int stacked_partitions = 0; | |
| 645 int last_cross_position = INT32_MAX; | |
| 646 // max_index will expire after min_index. | |
| 647 // However, we can't "increase" the hill size if min_index expired. | |
| 648 // So finish processing when min_index expires. | |
| 649 while (min_index < min_list.size()) { | |
| 650 // Increase the hill count. | |
| 651 if (min_list[min_index] < max_list[max_index]) { | |
| 652 ++stacked_partitions; | |
| 653 if (last_cross_position != INT32_MAX && stacked_partitions > max_merged) { | |
| 654 int mid = (last_cross_position + min_list[min_index]) / 2; | |
| 655 locations->push_back(mid); | |
| 656 last_cross_position = INT32_MAX; | |
| 657 } | |
| 658 ++min_index; | |
| 659 } else { | |
| 660 // Decrease the hill count. | |
| 661 --stacked_partitions; | |
| 662 if (last_cross_position == INT32_MAX && stacked_partitions <= max_merged) { | |
| 663 last_cross_position = max_list[max_index]; | |
| 664 } | |
| 665 ++max_index; | |
| 666 } | |
| 667 } | |
| 668 locations->push_back(max_list.at(max_list.size() - 1)); | |
| 669 } | |
| 670 | |
| 671 // Counts the number of partitions in the table | |
| 672 // box that intersection the given x value. | |
| 673 int StructuredTable::CountVerticalIntersections(int x) { | |
| 674 int count = 0; | |
| 675 // Make a small box to keep the search time down. | |
| 676 const int kGridSize = text_grid_->gridsize(); | |
| 677 TBOX vertical_box = bounding_box_; | |
| 678 vertical_box.set_left(x - kGridSize); | |
| 679 vertical_box.set_right(x + kGridSize); | |
| 680 | |
| 681 ColPartitionGridSearch gsearch(text_grid_); | |
| 682 gsearch.SetUniqueMode(true); | |
| 683 gsearch.StartRectSearch(vertical_box); | |
| 684 ColPartition *text = nullptr; | |
| 685 while ((text = gsearch.NextRectSearch()) != nullptr) { | |
| 686 if (!text->IsTextType()) { | |
| 687 continue; | |
| 688 } | |
| 689 const TBOX &box = text->bounding_box(); | |
| 690 if (box.left() < x && x < box.right()) { | |
| 691 ++count; | |
| 692 } | |
| 693 } | |
| 694 return count; | |
| 695 } | |
| 696 | |
| 697 // Counts the number of partitions in the table | |
| 698 // box that intersection the given y value. | |
| 699 int StructuredTable::CountHorizontalIntersections(int y) { | |
| 700 int count = 0; | |
| 701 // Make a small box to keep the search time down. | |
| 702 const int kGridSize = text_grid_->gridsize(); | |
| 703 TBOX horizontal_box = bounding_box_; | |
| 704 horizontal_box.set_bottom(y - kGridSize); | |
| 705 horizontal_box.set_top(y + kGridSize); | |
| 706 | |
| 707 ColPartitionGridSearch gsearch(text_grid_); | |
| 708 gsearch.SetUniqueMode(true); | |
| 709 gsearch.StartRectSearch(horizontal_box); | |
| 710 ColPartition *text = nullptr; | |
| 711 while ((text = gsearch.NextRectSearch()) != nullptr) { | |
| 712 if (!text->IsTextType()) { | |
| 713 continue; | |
| 714 } | |
| 715 | |
| 716 const TBOX &box = text->bounding_box(); | |
| 717 if (box.bottom() < y && y < box.top()) { | |
| 718 ++count; | |
| 719 } | |
| 720 } | |
| 721 return count; | |
| 722 } | |
| 723 | |
| 724 // Counts how many text partitions are in this box. | |
| 725 // This is used to count partitions in cells, as that can indicate | |
| 726 // how "strong" a potential table row/column (or even full table) actually is. | |
| 727 int StructuredTable::CountPartitions(const TBOX &box) { | |
| 728 ColPartitionGridSearch gsearch(text_grid_); | |
| 729 gsearch.SetUniqueMode(true); | |
| 730 gsearch.StartRectSearch(box); | |
| 731 int count = 0; | |
| 732 ColPartition *text = nullptr; | |
| 733 while ((text = gsearch.NextRectSearch()) != nullptr) { | |
| 734 if (text->IsTextType()) { | |
| 735 ++count; | |
| 736 } | |
| 737 } | |
| 738 return count; | |
| 739 } | |
| 740 | |
| 741 //////// | |
| 742 //////// TableRecognizer Class | |
| 743 //////// | |
| 744 | |
| 745 void TableRecognizer::Init() {} | |
| 746 | |
| 747 void TableRecognizer::set_text_grid(ColPartitionGrid *text_grid) { | |
| 748 text_grid_ = text_grid; | |
| 749 } | |
| 750 void TableRecognizer::set_line_grid(ColPartitionGrid *line_grid) { | |
| 751 line_grid_ = line_grid; | |
| 752 } | |
| 753 void TableRecognizer::set_min_height(int height) { | |
| 754 min_height_ = height; | |
| 755 } | |
| 756 void TableRecognizer::set_min_width(int width) { | |
| 757 min_width_ = width; | |
| 758 } | |
| 759 void TableRecognizer::set_max_text_height(int height) { | |
| 760 max_text_height_ = height; | |
| 761 } | |
| 762 | |
| 763 StructuredTable *TableRecognizer::RecognizeTable(const TBOX &guess) { | |
| 764 auto *table = new StructuredTable(); | |
| 765 table->Init(); | |
| 766 table->set_text_grid(text_grid_); | |
| 767 table->set_line_grid(line_grid_); | |
| 768 table->set_max_text_height(max_text_height_); | |
| 769 | |
| 770 // Try to solve this simple case, a table with *both* | |
| 771 // vertical and horizontal lines. | |
| 772 if (RecognizeLinedTable(guess, table)) { | |
| 773 return table; | |
| 774 } | |
| 775 | |
| 776 // Fallback to whitespace if that failed. | |
| 777 // TODO(nbeato): Break this apart to take advantage of horizontal | |
| 778 // lines or vertical lines when present. | |
| 779 if (RecognizeWhitespacedTable(guess, table)) { | |
| 780 return table; | |
| 781 } | |
| 782 | |
| 783 // No table found... | |
| 784 delete table; | |
| 785 return nullptr; | |
| 786 } | |
| 787 | |
| 788 bool TableRecognizer::RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table) { | |
| 789 if (!HasSignificantLines(guess_box)) { | |
| 790 return false; | |
| 791 } | |
| 792 TBOX line_bound = guess_box; | |
| 793 if (!FindLinesBoundingBox(&line_bound)) { | |
| 794 return false; | |
| 795 } | |
| 796 table->set_bounding_box(line_bound); | |
| 797 return table->FindLinedStructure(); | |
| 798 } | |
| 799 | |
| 800 // Quick implementation. Just count the number of lines in the box. | |
| 801 // A better implementation would counter intersections and look for connected | |
| 802 // components. It could even go as far as finding similar length lines. | |
| 803 // To account for these possible issues, the VerifyLinedTableCells function | |
| 804 // will reject lined tables that cause intersections with text on the page. | |
| 805 // TODO(nbeato): look for "better" lines | |
| 806 bool TableRecognizer::HasSignificantLines(const TBOX &guess) { | |
| 807 ColPartitionGridSearch box_search(line_grid_); | |
| 808 box_search.SetUniqueMode(true); | |
| 809 box_search.StartRectSearch(guess); | |
| 810 ColPartition *line = nullptr; | |
| 811 int vertical_count = 0; | |
| 812 int horizontal_count = 0; | |
| 813 | |
| 814 while ((line = box_search.NextRectSearch()) != nullptr) { | |
| 815 if (line->IsHorizontalLine()) { | |
| 816 ++horizontal_count; | |
| 817 } | |
| 818 if (line->IsVerticalLine()) { | |
| 819 ++vertical_count; | |
| 820 } | |
| 821 } | |
| 822 | |
| 823 return vertical_count >= kLinedTableMinVerticalLines && | |
| 824 horizontal_count >= kLinedTableMinHorizontalLines; | |
| 825 } | |
| 826 | |
| 827 // Given a bounding box with a bunch of horizontal / vertical lines, | |
| 828 // we just find the extents of all of these lines iteratively. | |
| 829 // The box will be at least as large as guess. This | |
| 830 // could possibly be a bad assumption. | |
| 831 // It is guaranteed to halt in at least O(n * gridarea) where n | |
| 832 // is the number of lines. | |
| 833 // The assumption is that growing the box iteratively will add lines | |
| 834 // several times, but eventually we'll find the extents. | |
| 835 // | |
| 836 // For tables, the approach is a bit aggressive, a single line (which could be | |
| 837 // noise or a column ruling) can destroy the table inside. | |
| 838 // | |
| 839 // TODO(nbeato): This is a quick first implementation. | |
| 840 // A better implementation would actually look for consistency | |
| 841 // in extents of the lines and find the extents using lines | |
| 842 // that clearly describe the table. This would allow the | |
| 843 // lines to "vote" for height/width. An approach like | |
| 844 // this would solve issues with page layout rulings. | |
| 845 // I haven't looked for these issues yet, so I can't even | |
| 846 // say they happen confidently. | |
| 847 bool TableRecognizer::FindLinesBoundingBox(TBOX *bounding_box) { | |
| 848 // The first iteration will tell us if there are lines | |
| 849 // present and shrink the box to a minimal iterative size. | |
| 850 if (!FindLinesBoundingBoxIteration(bounding_box)) { | |
| 851 return false; | |
| 852 } | |
| 853 | |
| 854 // Keep growing until the area of the table stabilizes. | |
| 855 // The box can only get bigger, increasing area. | |
| 856 bool changed = true; | |
| 857 while (changed) { | |
| 858 changed = false; | |
| 859 int old_area = bounding_box->area(); | |
| 860 bool check = FindLinesBoundingBoxIteration(bounding_box); | |
| 861 // At this point, the function will return true. | |
| 862 ASSERT_HOST(check); | |
| 863 ASSERT_HOST(bounding_box->area() >= old_area); | |
| 864 changed = (bounding_box->area() > old_area); | |
| 865 } | |
| 866 | |
| 867 return true; | |
| 868 } | |
| 869 | |
| 870 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX *bounding_box) { | |
| 871 // Search for all of the lines in the current box, keeping track of extents. | |
| 872 ColPartitionGridSearch box_search(line_grid_); | |
| 873 box_search.SetUniqueMode(true); | |
| 874 box_search.StartRectSearch(*bounding_box); | |
| 875 ColPartition *line = nullptr; | |
| 876 bool first_line = true; | |
| 877 | |
| 878 while ((line = box_search.NextRectSearch()) != nullptr) { | |
| 879 if (line->IsLineType()) { | |
| 880 if (first_line) { | |
| 881 // The first iteration can shrink the box. | |
| 882 *bounding_box = line->bounding_box(); | |
| 883 first_line = false; | |
| 884 } else { | |
| 885 *bounding_box += line->bounding_box(); | |
| 886 } | |
| 887 } | |
| 888 } | |
| 889 return !first_line; | |
| 890 } | |
| 891 | |
| 892 // The goal of this function is to move the table boundaries around and find | |
| 893 // a table that maximizes the whitespace around the table while maximizing | |
| 894 // the cellular structure. As a result, it gets confused by headers, footers, | |
| 895 // and merged columns (text that crosses columns). There is a tolerance | |
| 896 // that allows a few partitions to count towards potential cell merges. | |
| 897 // It's the max_merged parameter to FindPartitionLocations. | |
| 898 // It can work, but it needs some false positive remove on boundaries. | |
| 899 // For now, the grid structure must not intersect any partitions. | |
| 900 // Also, small tolerance is added to the horizontal lines for tightly packed | |
| 901 // tables. The tolerance is added by adjusting the bounding boxes of the | |
| 902 // partitions (in FindHorizontalPartitions). The current implementation | |
| 903 // only adjusts the vertical extents of the table. | |
| 904 // | |
| 905 // Also note. This was hacked at a lot. It could probably use some | |
| 906 // more hacking at to find a good set of border conditions and then a | |
| 907 // nice clean up. | |
| 908 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table) { | |
| 909 TBOX best_box = guess_box; // Best borders known. | |
| 910 int best_below = 0; // Margin size above best table. | |
| 911 int best_above = 0; // Margin size below best table. | |
| 912 TBOX adjusted = guess_box; // The search box. | |
| 913 | |
| 914 // We assume that the guess box is somewhat accurate, so we don't allow | |
| 915 // the adjusted border to pass half of the guessed area. This prevents | |
| 916 // "negative" tables from forming. | |
| 917 const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2; | |
| 918 // Keeps track of the most columns in an accepted table. The resulting table | |
| 919 // may be less than the max, but we don't want to stray too far. | |
| 920 unsigned best_cols = 0; | |
| 921 // Make sure we find a good border. | |
| 922 bool found_good_border = false; | |
| 923 | |
| 924 // Find the bottom of the table by trying a few different locations. For | |
| 925 // each location, the top, left, and right are fixed. We start the search | |
| 926 // in a smaller table to favor best_cols getting a good estimate sooner. | |
| 927 int last_bottom = INT32_MAX; | |
| 928 int bottom = | |
| 929 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true); | |
| 930 int top = | |
| 931 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false); | |
| 932 adjusted.set_top(top); | |
| 933 | |
| 934 // Headers/footers can be spaced far from everything. | |
| 935 // Make sure that the space below is greater than the space above | |
| 936 // the lowest row. | |
| 937 int previous_below = 0; | |
| 938 const int kMaxChances = 10; | |
| 939 int chances = kMaxChances; | |
| 940 while (bottom != last_bottom) { | |
| 941 adjusted.set_bottom(bottom); | |
| 942 | |
| 943 if (adjusted.height() >= min_height_) { | |
| 944 // Try to fit the grid on the current box. We give it a chance | |
| 945 // if the number of columns didn't significantly drop. | |
| 946 table->set_bounding_box(adjusted); | |
| 947 if (table->FindWhitespacedStructure() && | |
| 948 table->column_count() >= best_cols * kRequiredColumns) { | |
| 949 if (false && IsWeakTableRow(table, 0)) { | |
| 950 // Currently buggy, but was looking promising so disabled. | |
| 951 --chances; | |
| 952 } else { | |
| 953 // We favor 2 things, | |
| 954 // 1- Adding rows that have partitioned data. | |
| 955 // 2- Better margins (to find header/footer). | |
| 956 // For better tables, we just look for multiple cells in the | |
| 957 // bottom row with data in them. | |
| 958 // For margins, the space below the last row should | |
| 959 // be better than a table with the last row removed. | |
| 960 chances = kMaxChances; | |
| 961 double max_row_height = kMaxRowSize * table->median_cell_height(); | |
| 962 if ((table->space_below() * kMarginFactor >= best_below && | |
| 963 table->space_below() >= previous_below) || | |
| 964 (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) { | |
| 965 best_box.set_bottom(bottom); | |
| 966 best_below = table->space_below(); | |
| 967 best_cols = std::max(table->column_count(), best_cols); | |
| 968 found_good_border = true; | |
| 969 } | |
| 970 } | |
| 971 previous_below = table->space_below(); | |
| 972 } else { | |
| 973 --chances; | |
| 974 } | |
| 975 } | |
| 976 if (chances <= 0) { | |
| 977 break; | |
| 978 } | |
| 979 | |
| 980 last_bottom = bottom; | |
| 981 bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true); | |
| 982 } | |
| 983 if (!found_good_border) { | |
| 984 return false; | |
| 985 } | |
| 986 | |
| 987 // TODO(nbeato) comments: follow modified code above... put it in a function! | |
| 988 found_good_border = false; | |
| 989 int last_top = INT32_MIN; | |
| 990 top = | |
| 991 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false); | |
| 992 int previous_above = 0; | |
| 993 chances = kMaxChances; | |
| 994 | |
| 995 adjusted.set_bottom(best_box.bottom()); | |
| 996 while (last_top != top) { | |
| 997 adjusted.set_top(top); | |
| 998 if (adjusted.height() >= min_height_) { | |
| 999 table->set_bounding_box(adjusted); | |
| 1000 if (table->FindWhitespacedStructure() && | |
| 1001 table->column_count() >= best_cols * kRequiredColumns) { | |
| 1002 int last_row = table->row_count() - 1; | |
| 1003 if (false && IsWeakTableRow(table, last_row)) { | |
| 1004 // Currently buggy, but was looking promising so disabled. | |
| 1005 --chances; | |
| 1006 } else { | |
| 1007 chances = kMaxChances; | |
| 1008 double max_row_height = kMaxRowSize * table->median_cell_height(); | |
| 1009 if ((table->space_above() * kMarginFactor >= best_above && | |
| 1010 table->space_above() >= previous_above) || | |
| 1011 (table->CountFilledCellsInRow(last_row) > 1 && | |
| 1012 table->row_height(last_row) < max_row_height)) { | |
| 1013 best_box.set_top(top); | |
| 1014 best_above = table->space_above(); | |
| 1015 best_cols = std::max(table->column_count(), best_cols); | |
| 1016 found_good_border = true; | |
| 1017 } | |
| 1018 } | |
| 1019 previous_above = table->space_above(); | |
| 1020 } else { | |
| 1021 --chances; | |
| 1022 } | |
| 1023 } | |
| 1024 if (chances <= 0) { | |
| 1025 break; | |
| 1026 } | |
| 1027 | |
| 1028 last_top = top; | |
| 1029 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false); | |
| 1030 } | |
| 1031 | |
| 1032 if (!found_good_border) { | |
| 1033 return false; | |
| 1034 } | |
| 1035 | |
| 1036 // If we get here, this shouldn't happen. It can be an assert, but | |
| 1037 // I haven't tested it enough to make it crash things. | |
| 1038 if (best_box.null_box()) { | |
| 1039 return false; | |
| 1040 } | |
| 1041 | |
| 1042 // Given the best locations, fit the box to those locations. | |
| 1043 table->set_bounding_box(best_box); | |
| 1044 return table->FindWhitespacedStructure(); | |
| 1045 } | |
| 1046 | |
| 1047 // Finds the closest value to y that can safely cause a horizontal | |
| 1048 // split in the partitions. | |
| 1049 // This function has been buggy and not as reliable as I would've | |
| 1050 // liked. I suggest finding all of the splits using the | |
| 1051 // FindPartitionLocations once and then just keeping the results | |
| 1052 // of that function cached somewhere. | |
| 1053 int TableRecognizer::NextHorizontalSplit(int left, int right, int y, bool top_to_bottom) { | |
| 1054 ColPartitionGridSearch gsearch(text_grid_); | |
| 1055 gsearch.SetUniqueMode(true); | |
| 1056 gsearch.StartVerticalSearch(left, right, y); | |
| 1057 ColPartition *text = nullptr; | |
| 1058 int last_y = y; | |
| 1059 while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) { | |
| 1060 if (!text->IsTextType() || !text->IsHorizontalType()) { | |
| 1061 continue; | |
| 1062 } | |
| 1063 if (text->bounding_box().height() > max_text_height_) { | |
| 1064 continue; | |
| 1065 } | |
| 1066 | |
| 1067 const TBOX &text_box = text->bounding_box(); | |
| 1068 if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) { | |
| 1069 last_y = std::min(last_y, static_cast<int>(text_box.bottom())); | |
| 1070 continue; | |
| 1071 } | |
| 1072 if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) { | |
| 1073 last_y = std::max(last_y, static_cast<int>(text_box.top())); | |
| 1074 continue; | |
| 1075 } | |
| 1076 | |
| 1077 return last_y; | |
| 1078 } | |
| 1079 // If none is found, we at least want to preserve the min/max, | |
| 1080 // which defines the overlap of y with the last partition in the grid. | |
| 1081 return last_y; | |
| 1082 } | |
| 1083 | |
| 1084 } // namespace tesseract |
