diff mupdf-source/thirdparty/tesseract/src/textord/tablerecog.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/textord/tablerecog.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,1084 @@
+///////////////////////////////////////////////////////////////////////
+// File:        tablerecog.cpp
+// Description: Helper class to help structure table areas. Given an bounding
+//              box from TableFinder, the TableRecognizer should give a
+//              StructuredTable (maybe a list in the future) of "good" tables
+//              in that area.
+// Author:      Nicholas Beato
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "tablerecog.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
+// The amount of space required between the ColPartitions in 2 columns
+// of a non-lined table as a multiple of the median width.
+const double kHorizontalSpacing = 0.30;
+// The amount of space required between the ColPartitions in 2 rows
+// of a non-lined table as multiples of the median height.
+const double kVerticalSpacing = -0.2;
+// The number of cells that the grid lines may intersect.
+// See FindCellSplitLocations for explanation.
+const int kCellSplitRowThreshold = 0;
+const int kCellSplitColumnThreshold = 0;
+// For "lined tables", the number of required lines. Currently a guess.
+const int kLinedTableMinVerticalLines = 3;
+const int kLinedTableMinHorizontalLines = 3;
+// Number of columns required, as a fraction of the most columns found.
+// None of these are tweaked at all.
+const double kRequiredColumns = 0.7;
+// The tolerance for comparing margins of potential tables.
+const double kMarginFactor = 1.1;
+// The first and last row should be consistent cell height.
+// This factor is the first and last row cell height max.
+const double kMaxRowSize = 2.5;
+// Number of filled columns required to form a strong table row.
+// For small tables, this is an absolute number.
+const double kGoodRowNumberOfColumnsSmall[] = {2, 2, 2, 2, 2, 3, 3};
+// For large tables, it is a relative number
+const double kGoodRowNumberOfColumnsLarge = 0.7;
+// The amount of area that must be covered in a cell by ColPartitions to
+// be considered "filled"
+const double kMinFilledArea = 0.35;
+
+// Indicates that a table row is weak. This means that it has
+// many missing data cells or very large cell heights compared.
+// to the rest of the table.
+// Code is buggy right now. It is disabled in the calling function.
+// It seems like sometimes the row that is passed in is not correct
+// sometimes (like a phantom row is introduced). There's something going
+// on in the cell_y_ data member before this is called... not certain.
+static bool IsWeakTableRow(StructuredTable *table, int row) {
+  if (!table->VerifyRowFilled(row)) {
+    return false;
+  }
+
+  double threshold;
+  if (table->column_count() < countof(kGoodRowNumberOfColumnsSmall)) {
+    threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
+  } else {
+    threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
+  }
+
+  return table->CountFilledCellsInRow(row) < threshold;
+}
+
+////////
+//////// StructuredTable Class
+////////
+
+StructuredTable::StructuredTable()
+    : text_grid_(nullptr)
+    , line_grid_(nullptr)
+    , is_lined_(false)
+    , space_above_(0)
+    , space_below_(0)
+    , space_left_(0)
+    , space_right_(0)
+    , median_cell_height_(0)
+    , median_cell_width_(0)
+    , max_text_height_(INT32_MAX) {}
+
+void StructuredTable::Init() {}
+
+void StructuredTable::set_text_grid(ColPartitionGrid *text_grid) {
+  text_grid_ = text_grid;
+}
+void StructuredTable::set_line_grid(ColPartitionGrid *line_grid) {
+  line_grid_ = line_grid;
+}
+void StructuredTable::set_max_text_height(int height) {
+  max_text_height_ = height;
+}
+bool StructuredTable::is_lined() const {
+  return is_lined_;
+}
+unsigned StructuredTable::row_count() const {
+  return cell_y_.empty() ? 0 : cell_y_.size() - 1;
+}
+unsigned StructuredTable::column_count() const {
+  return cell_x_.empty() ? 0 : cell_x_.size() - 1;
+}
+unsigned StructuredTable::cell_count() const {
+  return row_count() * column_count();
+}
+void StructuredTable::set_bounding_box(const TBOX &box) {
+  bounding_box_ = box;
+}
+const TBOX &StructuredTable::bounding_box() const {
+  return bounding_box_;
+}
+int StructuredTable::median_cell_height() {
+  return median_cell_height_;
+}
+int StructuredTable::median_cell_width() {
+  return median_cell_width_;
+}
+int StructuredTable::row_height(unsigned row) const {
+  ASSERT_HOST(row < row_count());
+  return cell_y_[row + 1] - cell_y_[row];
+}
+int StructuredTable::column_width(unsigned column) const {
+  ASSERT_HOST(column < column_count());
+  return cell_x_[column + 1] - cell_x_[column];
+}
+int StructuredTable::space_above() const {
+  return space_above_;
+}
+int StructuredTable::space_below() const {
+  return space_below_;
+}
+
+// At this point, we know that the lines are contained
+// by the box (by FindLinesBoundingBox).
+// So try to find the cell structure and make sure it works out.
+// The assumption is that all lines span the table. If this
+// assumption fails, the VerifyLinedTable method will
+// abort the lined table. The TableRecognizer will fall
+// back on FindWhitespacedStructure.
+bool StructuredTable::FindLinedStructure() {
+  ClearStructure();
+
+  // Search for all of the lines in the current box.
+  // Update the cellular structure with the exact lines.
+  ColPartitionGridSearch box_search(line_grid_);
+  box_search.SetUniqueMode(true);
+  box_search.StartRectSearch(bounding_box_);
+  ColPartition *line = nullptr;
+
+  while ((line = box_search.NextRectSearch()) != nullptr) {
+    if (line->IsHorizontalLine()) {
+      cell_y_.push_back(line->MidY());
+    }
+    if (line->IsVerticalLine()) {
+      cell_x_.push_back(line->MidX());
+    }
+  }
+
+  // HasSignificantLines should guarantee cells.
+  // Because that code is a different class, just gracefully
+  // return false. This could be an assert.
+  if (cell_x_.size() < 3 || cell_y_.size() < 3) {
+    return false;
+  }
+
+  // Sort and remove duplicates that may have occurred due to split lines.
+  std::sort(cell_x_.begin(), cell_x_.end());
+  auto last_x = std::unique(cell_x_.begin(), cell_x_.end());
+  cell_x_.erase(last_x, cell_x_.end());
+  std::sort(cell_y_.begin(), cell_y_.end());
+  auto last_y = std::unique(cell_y_.begin(), cell_y_.end());
+  cell_y_.erase(last_y, cell_y_.end());
+
+  // The border should be the extents of line boxes, not middle.
+  cell_x_[0] = bounding_box_.left();
+  cell_x_[cell_x_.size() - 1] = bounding_box_.right();
+  cell_y_[0] = bounding_box_.bottom();
+  cell_y_[cell_y_.size() - 1] = bounding_box_.top();
+
+  // Remove duplicates that may have occurred due to moving the borders.
+  last_x = std::unique(cell_x_.begin(), cell_x_.end());
+  cell_x_.erase(last_x, cell_x_.end());
+  last_y = std::unique(cell_y_.begin(), cell_y_.end());
+  cell_y_.erase(last_y, cell_y_.end());
+
+  CalculateMargins();
+  CalculateStats();
+  is_lined_ = VerifyLinedTableCells();
+  return is_lined_;
+}
+
+// Finds the cellular structure given a particular box.
+bool StructuredTable::FindWhitespacedStructure() {
+  ClearStructure();
+  FindWhitespacedColumns();
+  FindWhitespacedRows();
+
+  if (!VerifyWhitespacedTable()) {
+    return false;
+  } else {
+    bounding_box_.set_left(cell_x_[0]);
+    bounding_box_.set_right(cell_x_[cell_x_.size() - 1]);
+    bounding_box_.set_bottom(cell_y_[0]);
+    bounding_box_.set_top(cell_y_[cell_y_.size() - 1]);
+    AbsorbNearbyLines();
+    CalculateMargins();
+    CalculateStats();
+    return true;
+  }
+}
+
+// Tests if a partition fits inside the table structure.
+// Partitions must fully span a grid line in order to intersect it.
+// This means that a partition does not intersect a line
+// that it "just" touches. This is mainly because the assumption
+// throughout the code is that "0" distance is a very very small space.
+bool StructuredTable::DoesPartitionFit(const ColPartition &part) const {
+  const TBOX &box = part.bounding_box();
+  for (int i : cell_x_) {
+    if (box.left() < i && i < box.right()) {
+      return false;
+    }
+  }
+  for (int i : cell_y_) {
+    if (box.bottom() < i && i < box.top()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Checks if a sub-table has multiple data cells filled.
+int StructuredTable::CountFilledCells() {
+  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
+}
+int StructuredTable::CountFilledCellsInRow(int row) {
+  return CountFilledCells(row, row, 0, column_count() - 1);
+}
+int StructuredTable::CountFilledCellsInColumn(int column) {
+  return CountFilledCells(0, row_count() - 1, column, column);
+}
+int StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start,
+                                      unsigned column_end) {
+  ASSERT_HOST(row_start <= row_end && row_end < row_count());
+  ASSERT_HOST(column_start <= column_end && column_end < column_count());
+  int cell_count = 0;
+  TBOX cell_box;
+  for (unsigned row = row_start; row <= row_end; ++row) {
+    cell_box.set_bottom(cell_y_[row]);
+    cell_box.set_top(cell_y_[row + 1]);
+    for (unsigned col = column_start; col <= column_end; ++col) {
+      cell_box.set_left(cell_x_[col]);
+      cell_box.set_right(cell_x_[col + 1]);
+      if (CountPartitions(cell_box) > 0) {
+        ++cell_count;
+      }
+    }
+  }
+  return cell_count;
+}
+
+// Makes sure that at least one cell in a row has substantial area filled.
+// This can filter out large whitespace caused by growing tables too far
+// and page numbers.
+bool StructuredTable::VerifyRowFilled(int row) {
+  for (unsigned i = 0; i < column_count(); ++i) {
+    auto area_filled = CalculateCellFilledPercentage(row, i);
+    if (area_filled >= kMinFilledArea) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Finds the filled area in a cell.
+// Assume ColPartitions do not overlap for simplicity (even though they do).
+double StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) {
+  ASSERT_HOST(row <= row_count());
+  ASSERT_HOST(column <= column_count());
+  const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]);
+  ASSERT_HOST(!kCellBox.null_box());
+
+  ColPartitionGridSearch gsearch(text_grid_);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartRectSearch(kCellBox);
+  double area_covered = 0;
+  ColPartition *text = nullptr;
+  while ((text = gsearch.NextRectSearch()) != nullptr) {
+    if (text->IsTextType()) {
+      area_covered += text->bounding_box().intersection(kCellBox).area();
+    }
+  }
+  const int32_t current_area = kCellBox.area();
+  if (current_area == 0) {
+    return 1.0;
+  }
+  return std::min(1.0, area_covered / current_area);
+}
+
+#ifndef GRAPHICS_DISABLED
+
+void StructuredTable::Display(ScrollView *window, ScrollView::Color color) {
+  window->Brush(ScrollView::NONE);
+  window->Pen(color);
+  window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(),
+                    bounding_box_.top());
+  for (int i : cell_x_) {
+    window->Line(i, bounding_box_.bottom(), i, bounding_box_.top());
+  }
+  for (int i : cell_y_) {
+    window->Line(bounding_box_.left(), i, bounding_box_.right(), i);
+  }
+  window->UpdateWindow();
+}
+
+#endif
+
+// Clear structure information.
+void StructuredTable::ClearStructure() {
+  cell_x_.clear();
+  cell_y_.clear();
+  is_lined_ = false;
+  space_above_ = 0;
+  space_below_ = 0;
+  space_left_ = 0;
+  space_right_ = 0;
+  median_cell_height_ = 0;
+  median_cell_width_ = 0;
+}
+
+// When a table has lines, the lines should not intersect any partitions.
+// The following function makes sure the previous assumption is met.
+bool StructuredTable::VerifyLinedTableCells() {
+  // Function only called when lines exist.
+  ASSERT_HOST(cell_y_.size() >= 2 && cell_x_.size() >= 2);
+  for (int i : cell_y_) {
+    if (CountHorizontalIntersections(i) > 0) {
+      return false;
+    }
+  }
+  for (int i : cell_x_) {
+    if (CountVerticalIntersections(i) > 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// TODO(nbeato): Could be much better than this.
+// Examples:
+//   - Calculate the percentage of filled cells.
+//   - Calculate the average number of ColPartitions per cell.
+//   - Calculate the number of cells per row with partitions.
+//   - Check if ColPartitions in adjacent cells are similar.
+//   - Check that all columns are at least a certain width.
+//   - etc.
+bool StructuredTable::VerifyWhitespacedTable() {
+  // criteria for a table, must be at least 2x3 or 3x2
+  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
+}
+
+// Finds vertical splits in the ColPartitions of text_grid_ by considering
+// all possible "good" guesses. A good guess is just the left/right sides of
+// the partitions, since these locations will uniquely define where the
+// extremal values where the splits can occur. The split happens
+// in the middle of the two nearest partitions.
+void StructuredTable::FindWhitespacedColumns() {
+  // Set of the extents of all partitions on the page.
+  std::vector<int> left_sides;
+  std::vector<int> right_sides;
+
+  // Look at each text partition. We want to find the partitions
+  // that have extremal left/right sides. These will give us a basis
+  // for the table columns.
+  ColPartitionGridSearch gsearch(text_grid_);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartRectSearch(bounding_box_);
+  ColPartition *text = nullptr;
+  while ((text = gsearch.NextRectSearch()) != nullptr) {
+    if (!text->IsTextType()) {
+      continue;
+    }
+
+    ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
+    int spacing = static_cast<int>(text->median_width() * kHorizontalSpacing / 2.0 + 0.5);
+    left_sides.push_back(text->bounding_box().left() - spacing);
+    right_sides.push_back(text->bounding_box().right() + spacing);
+  }
+  // It causes disaster below, so avoid it!
+  if (left_sides.empty() || right_sides.empty()) {
+    return;
+  }
+
+  // Since data may be inserted in grid order, we sort the left/right sides.
+  std::sort(left_sides.begin(), left_sides.end());
+  std::sort(right_sides.begin(), right_sides.end());
+
+  // At this point, in the "merged list", we expect to have a left side,
+  // followed by either more left sides or a right side. The last number
+  // should be a right side. We find places where the splits occur by looking
+  // for "valleys". If we want to force gap sizes or allow overlap, change
+  // the spacing above. If you want to let lines "slice" partitions as long
+  // as it is infrequent, change the following function.
+  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, &cell_x_);
+}
+
+// Finds horizontal splits in the ColPartitions of text_grid_ by considering
+// all possible "good" guesses. A good guess is just the bottom/top sides of
+// the partitions, since these locations will uniquely define where the
+// extremal values where the splits can occur. The split happens
+// in the middle of the two nearest partitions.
+void StructuredTable::FindWhitespacedRows() {
+  // Set of the extents of all partitions on the page.
+  std::vector<int> bottom_sides;
+  std::vector<int> top_sides;
+  // We will be "shrinking" partitions, so keep the min/max around to
+  // make sure the bottom/top lines do not intersect text.
+  int min_bottom = INT32_MAX;
+  int max_top = INT32_MIN;
+
+  // Look at each text partition. We want to find the partitions
+  // that have extremal bottom/top sides. These will give us a basis
+  // for the table rows. Because the textlines can be skewed and close due
+  // to warping, the height of the partitions is toned down a little bit.
+  ColPartitionGridSearch gsearch(text_grid_);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartRectSearch(bounding_box_);
+  ColPartition *text = nullptr;
+  while ((text = gsearch.NextRectSearch()) != nullptr) {
+    if (!text->IsTextType()) {
+      continue;
+    }
+
+    ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
+    min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));
+    max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));
+
+    // Ignore "tall" text partitions, as these are usually false positive
+    // vertical text or multiple lines pulled together.
+    if (text->bounding_box().height() > max_text_height_) {
+      continue;
+    }
+
+    int spacing = static_cast<int>(text->bounding_box().height() * kVerticalSpacing / 2.0 + 0.5);
+    int bottom = text->bounding_box().bottom() - spacing;
+    int top = text->bounding_box().top() + spacing;
+    // For horizontal text, the factor can be negative. This should
+    // probably cause a warning or failure. I haven't actually checked if
+    // it happens.
+    if (bottom >= top) {
+      continue;
+    }
+
+    bottom_sides.push_back(bottom);
+    top_sides.push_back(top);
+  }
+  // It causes disaster below, so avoid it!
+  if (bottom_sides.empty() || top_sides.empty()) {
+    return;
+  }
+
+  // Since data may be inserted in grid order, we sort the bottom/top sides.
+  std::sort(bottom_sides.begin(), bottom_sides.end());
+  std::sort(top_sides.begin(), top_sides.end());
+
+  // At this point, in the "merged list", we expect to have a bottom side,
+  // followed by either more bottom sides or a top side. The last number
+  // should be a top side. We find places where the splits occur by looking
+  // for "valleys". If we want to force gap sizes or allow overlap, change
+  // the spacing above. If you want to let lines "slice" partitions as long
+  // as it is infrequent, change the following function.
+  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, &cell_y_);
+
+  // Recover the min/max correctly since it was shifted.
+  cell_y_[0] = min_bottom;
+  cell_y_[cell_y_.size() - 1] = max_top;
+}
+
+void StructuredTable::CalculateMargins() {
+  space_above_ = INT32_MAX;
+  space_below_ = INT32_MAX;
+  space_right_ = INT32_MAX;
+  space_left_ = INT32_MAX;
+  UpdateMargins(text_grid_);
+  UpdateMargins(line_grid_);
+}
+// Finds the nearest partition in grid to the table
+// boundaries and updates the margin.
+void StructuredTable::UpdateMargins(ColPartitionGrid *grid) {
+  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
+  space_below_ = std::min(space_below_, below);
+  int above = FindVerticalMargin(grid, bounding_box_.top(), false);
+  space_above_ = std::min(space_above_, above);
+  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
+  space_left_ = std::min(space_left_, left);
+  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
+  space_right_ = std::min(space_right_, right);
+}
+int StructuredTable::FindVerticalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
+  ColPartitionGridSearch gsearch(grid);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), border);
+  ColPartition *part = nullptr;
+  while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {
+    if (!part->IsTextType() && !part->IsHorizontalLine()) {
+      continue;
+    }
+    int distance =
+        decrease ? border - part->bounding_box().top() : part->bounding_box().bottom() - border;
+    if (distance >= 0) {
+      return distance;
+    }
+  }
+  return INT32_MAX;
+}
+int StructuredTable::FindHorizontalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
+  ColPartitionGridSearch gsearch(grid);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
+  ColPartition *part = nullptr;
+  while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {
+    if (!part->IsTextType() && !part->IsVerticalLine()) {
+      continue;
+    }
+    int distance =
+        decrease ? border - part->bounding_box().right() : part->bounding_box().left() - border;
+    if (distance >= 0) {
+      return distance;
+    }
+  }
+  return INT32_MAX;
+}
+
+void StructuredTable::CalculateStats() {
+  const int kMaxCellHeight = 1000;
+  const int kMaxCellWidth = 1000;
+  STATS height_stats(0, kMaxCellHeight);
+  STATS width_stats(0, kMaxCellWidth);
+
+  for (unsigned i = 0; i < row_count(); ++i) {
+    height_stats.add(row_height(i), column_count());
+  }
+  for (unsigned i = 0; i < column_count(); ++i) {
+    width_stats.add(column_width(i), row_count());
+  }
+
+  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
+  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
+}
+
+// Looks for grid lines near the current bounding box and
+// grows the bounding box to include them if no intersections
+// will occur as a result. This is necessary because the margins
+// are calculated relative to the closest line/text. If the
+// line isn't absorbed, the margin will be the distance to the line.
+void StructuredTable::AbsorbNearbyLines() {
+  ColPartitionGridSearch gsearch(line_grid_);
+  gsearch.SetUniqueMode(true);
+
+  // Is the closest line above good? Loop multiple times for tables with
+  // multi-line (sometimes 2) borders. Limit the number of lines by
+  // making sure they stay within a table cell or so.
+  ColPartition *line = nullptr;
+  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.top());
+  while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {
+    if (!line->IsHorizontalLine()) {
+      break;
+    }
+    TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, bounding_box_.right(),
+                     line->MidY());
+    if (text_search.height() > median_cell_height_ * 2) {
+      break;
+    }
+    if (CountPartitions(text_search) > 0) {
+      break;
+    }
+    bounding_box_.set_top(line->MidY());
+  }
+  // As above, is the closest line below good?
+  line = nullptr;
+  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.bottom());
+  while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {
+    if (!line->IsHorizontalLine()) {
+      break;
+    }
+    TBOX text_search(bounding_box_.left(), line->MidY(), bounding_box_.right(),
+                     bounding_box_.bottom() - 1);
+    if (text_search.height() > median_cell_height_ * 2) {
+      break;
+    }
+    if (CountPartitions(text_search) > 0) {
+      break;
+    }
+    bounding_box_.set_bottom(line->MidY());
+  }
+  // TODO(nbeato): vertical lines
+}
+
+// This function will find all "0 valleys" (of any length) given two
+// arrays. The arrays are the mins and maxes of partitions (either
+// left and right or bottom and top). Since the min/max lists are generated
+// with pairs of increasing integers, we can make some assumptions in
+// the function about ordering of the overall list, which are shown in the
+// asserts.
+// The algorithm works as follows:
+//   While there are numbers to process, take the smallest number.
+//     If it is from the min_list, increment the "hill" counter.
+//     Otherwise, decrement the "hill" counter.
+//     In the process of doing this, keep track of "crossing" the
+//     desired height.
+// The first/last items are extremal values of the list and known.
+// NOTE: This function assumes the lists are sorted!
+void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,
+                                             const std::vector<int> &max_list, int max_merged,
+                                             std::vector<int> *locations) {
+  locations->clear();
+  ASSERT_HOST(min_list.size() == max_list.size());
+  if (min_list.empty()) {
+    return;
+  }
+  ASSERT_HOST(min_list.at(0) < max_list.at(0));
+  ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));
+
+  locations->push_back(min_list.at(0));
+  unsigned min_index = 0;
+  unsigned max_index = 0;
+  int stacked_partitions = 0;
+  int last_cross_position = INT32_MAX;
+  // max_index will expire after min_index.
+  // However, we can't "increase" the hill size if min_index expired.
+  // So finish processing when min_index expires.
+  while (min_index < min_list.size()) {
+    // Increase the hill count.
+    if (min_list[min_index] < max_list[max_index]) {
+      ++stacked_partitions;
+      if (last_cross_position != INT32_MAX && stacked_partitions > max_merged) {
+        int mid = (last_cross_position + min_list[min_index]) / 2;
+        locations->push_back(mid);
+        last_cross_position = INT32_MAX;
+      }
+      ++min_index;
+    } else {
+      // Decrease the hill count.
+      --stacked_partitions;
+      if (last_cross_position == INT32_MAX && stacked_partitions <= max_merged) {
+        last_cross_position = max_list[max_index];
+      }
+      ++max_index;
+    }
+  }
+  locations->push_back(max_list.at(max_list.size() - 1));
+}
+
+// Counts the number of partitions in the table
+// box that intersection the given x value.
+int StructuredTable::CountVerticalIntersections(int x) {
+  int count = 0;
+  // Make a small box to keep the search time down.
+  const int kGridSize = text_grid_->gridsize();
+  TBOX vertical_box = bounding_box_;
+  vertical_box.set_left(x - kGridSize);
+  vertical_box.set_right(x + kGridSize);
+
+  ColPartitionGridSearch gsearch(text_grid_);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartRectSearch(vertical_box);
+  ColPartition *text = nullptr;
+  while ((text = gsearch.NextRectSearch()) != nullptr) {
+    if (!text->IsTextType()) {
+      continue;
+    }
+    const TBOX &box = text->bounding_box();
+    if (box.left() < x && x < box.right()) {
+      ++count;
+    }
+  }
+  return count;
+}
+
+// Counts the number of partitions in the table
+// box that intersection the given y value.
+int StructuredTable::CountHorizontalIntersections(int y) {
+  int count = 0;
+  // Make a small box to keep the search time down.
+  const int kGridSize = text_grid_->gridsize();
+  TBOX horizontal_box = bounding_box_;
+  horizontal_box.set_bottom(y - kGridSize);
+  horizontal_box.set_top(y + kGridSize);
+
+  ColPartitionGridSearch gsearch(text_grid_);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartRectSearch(horizontal_box);
+  ColPartition *text = nullptr;
+  while ((text = gsearch.NextRectSearch()) != nullptr) {
+    if (!text->IsTextType()) {
+      continue;
+    }
+
+    const TBOX &box = text->bounding_box();
+    if (box.bottom() < y && y < box.top()) {
+      ++count;
+    }
+  }
+  return count;
+}
+
+// Counts how many text partitions are in this box.
+// This is used to count partitions in cells, as that can indicate
+// how "strong" a potential table row/column (or even full table) actually is.
+int StructuredTable::CountPartitions(const TBOX &box) {
+  ColPartitionGridSearch gsearch(text_grid_);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartRectSearch(box);
+  int count = 0;
+  ColPartition *text = nullptr;
+  while ((text = gsearch.NextRectSearch()) != nullptr) {
+    if (text->IsTextType()) {
+      ++count;
+    }
+  }
+  return count;
+}
+
+////////
+//////// TableRecognizer Class
+////////
+
+void TableRecognizer::Init() {}
+
+void TableRecognizer::set_text_grid(ColPartitionGrid *text_grid) {
+  text_grid_ = text_grid;
+}
+void TableRecognizer::set_line_grid(ColPartitionGrid *line_grid) {
+  line_grid_ = line_grid;
+}
+void TableRecognizer::set_min_height(int height) {
+  min_height_ = height;
+}
+void TableRecognizer::set_min_width(int width) {
+  min_width_ = width;
+}
+void TableRecognizer::set_max_text_height(int height) {
+  max_text_height_ = height;
+}
+
+StructuredTable *TableRecognizer::RecognizeTable(const TBOX &guess) {
+  auto *table = new StructuredTable();
+  table->Init();
+  table->set_text_grid(text_grid_);
+  table->set_line_grid(line_grid_);
+  table->set_max_text_height(max_text_height_);
+
+  // Try to solve this simple case, a table with *both*
+  // vertical and horizontal lines.
+  if (RecognizeLinedTable(guess, table)) {
+    return table;
+  }
+
+  // Fallback to whitespace if that failed.
+  // TODO(nbeato): Break this apart to take advantage of horizontal
+  // lines or vertical lines when present.
+  if (RecognizeWhitespacedTable(guess, table)) {
+    return table;
+  }
+
+  // No table found...
+  delete table;
+  return nullptr;
+}
+
+bool TableRecognizer::RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table) {
+  if (!HasSignificantLines(guess_box)) {
+    return false;
+  }
+  TBOX line_bound = guess_box;
+  if (!FindLinesBoundingBox(&line_bound)) {
+    return false;
+  }
+  table->set_bounding_box(line_bound);
+  return table->FindLinedStructure();
+}
+
+// Quick implementation. Just count the number of lines in the box.
+// A better implementation would counter intersections and look for connected
+// components. It could even go as far as finding similar length lines.
+// To account for these possible issues, the VerifyLinedTableCells function
+// will reject lined tables that cause intersections with text on the page.
+// TODO(nbeato): look for "better" lines
+bool TableRecognizer::HasSignificantLines(const TBOX &guess) {
+  ColPartitionGridSearch box_search(line_grid_);
+  box_search.SetUniqueMode(true);
+  box_search.StartRectSearch(guess);
+  ColPartition *line = nullptr;
+  int vertical_count = 0;
+  int horizontal_count = 0;
+
+  while ((line = box_search.NextRectSearch()) != nullptr) {
+    if (line->IsHorizontalLine()) {
+      ++horizontal_count;
+    }
+    if (line->IsVerticalLine()) {
+      ++vertical_count;
+    }
+  }
+
+  return vertical_count >= kLinedTableMinVerticalLines &&
+         horizontal_count >= kLinedTableMinHorizontalLines;
+}
+
+// Given a bounding box with a bunch of horizontal / vertical lines,
+// we just find the extents of all of these lines iteratively.
+// The box will be at least as large as guess. This
+// could possibly be a bad assumption.
+// It is guaranteed to halt in at least O(n * gridarea) where n
+// is the number of lines.
+// The assumption is that growing the box iteratively will add lines
+// several times, but eventually we'll find the extents.
+//
+// For tables, the approach is a bit aggressive, a single line (which could be
+// noise or a column ruling) can destroy the table inside.
+//
+// TODO(nbeato): This is a quick first implementation.
+// A better implementation would actually look for consistency
+// in extents of the lines and find the extents using lines
+// that clearly describe the table. This would allow the
+// lines to "vote" for height/width. An approach like
+// this would solve issues with page layout rulings.
+// I haven't looked for these issues yet, so I can't even
+// say they happen confidently.
+bool TableRecognizer::FindLinesBoundingBox(TBOX *bounding_box) {
+  // The first iteration will tell us if there are lines
+  // present and shrink the box to a minimal iterative size.
+  if (!FindLinesBoundingBoxIteration(bounding_box)) {
+    return false;
+  }
+
+  // Keep growing until the area of the table stabilizes.
+  // The box can only get bigger, increasing area.
+  bool changed = true;
+  while (changed) {
+    changed = false;
+    int old_area = bounding_box->area();
+    bool check = FindLinesBoundingBoxIteration(bounding_box);
+    // At this point, the function will return true.
+    ASSERT_HOST(check);
+    ASSERT_HOST(bounding_box->area() >= old_area);
+    changed = (bounding_box->area() > old_area);
+  }
+
+  return true;
+}
+
+bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX *bounding_box) {
+  // Search for all of the lines in the current box, keeping track of extents.
+  ColPartitionGridSearch box_search(line_grid_);
+  box_search.SetUniqueMode(true);
+  box_search.StartRectSearch(*bounding_box);
+  ColPartition *line = nullptr;
+  bool first_line = true;
+
+  while ((line = box_search.NextRectSearch()) != nullptr) {
+    if (line->IsLineType()) {
+      if (first_line) {
+        // The first iteration can shrink the box.
+        *bounding_box = line->bounding_box();
+        first_line = false;
+      } else {
+        *bounding_box += line->bounding_box();
+      }
+    }
+  }
+  return !first_line;
+}
+
+// The goal of this function is to move the table boundaries around and find
+// a table that maximizes the whitespace around the table while maximizing
+// the cellular structure. As a result, it gets confused by headers, footers,
+// and merged columns (text that crosses columns). There is a tolerance
+// that allows a few partitions to count towards potential cell merges.
+// It's the max_merged parameter to FindPartitionLocations.
+// It can work, but it needs some false positive remove on boundaries.
+// For now, the grid structure must not intersect any partitions.
+// Also, small tolerance is added to the horizontal lines for tightly packed
+// tables. The tolerance is added by adjusting the bounding boxes of the
+// partitions (in FindHorizontalPartitions). The current implementation
+// only adjusts the vertical extents of the table.
+//
+// Also note. This was hacked at a lot. It could probably use some
+// more hacking at to find a good set of border conditions and then a
+// nice clean up.
+bool TableRecognizer::RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table) {
+  TBOX best_box = guess_box; // Best borders known.
+  int best_below = 0;        // Margin size above best table.
+  int best_above = 0;        // Margin size below best table.
+  TBOX adjusted = guess_box; // The search box.
+
+  // We assume that the guess box is somewhat accurate, so we don't allow
+  // the adjusted border to pass half of the guessed area. This prevents
+  // "negative" tables from forming.
+  const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
+  // Keeps track of the most columns in an accepted table. The resulting table
+  // may be less than the max, but we don't want to stray too far.
+  unsigned best_cols = 0;
+  // Make sure we find a good border.
+  bool found_good_border = false;
+
+  // Find the bottom of the table by trying a few different locations. For
+  // each location, the top, left, and right are fixed. We start the search
+  // in a smaller table to favor best_cols getting a good estimate sooner.
+  int last_bottom = INT32_MAX;
+  int bottom =
+      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true);
+  int top =
+      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
+  adjusted.set_top(top);
+
+  // Headers/footers can be spaced far from everything.
+  // Make sure that the space below is greater than the space above
+  // the lowest row.
+  int previous_below = 0;
+  const int kMaxChances = 10;
+  int chances = kMaxChances;
+  while (bottom != last_bottom) {
+    adjusted.set_bottom(bottom);
+
+    if (adjusted.height() >= min_height_) {
+      // Try to fit the grid on the current box. We give it a chance
+      // if the number of columns didn't significantly drop.
+      table->set_bounding_box(adjusted);
+      if (table->FindWhitespacedStructure() &&
+          table->column_count() >= best_cols * kRequiredColumns) {
+        if (false && IsWeakTableRow(table, 0)) {
+          // Currently buggy, but was looking promising so disabled.
+          --chances;
+        } else {
+          // We favor 2 things,
+          //   1- Adding rows that have partitioned data.
+          //   2- Better margins (to find header/footer).
+          // For better tables, we just look for multiple cells in the
+          // bottom row with data in them.
+          // For margins, the space below the last row should
+          // be better than a table with the last row removed.
+          chances = kMaxChances;
+          double max_row_height = kMaxRowSize * table->median_cell_height();
+          if ((table->space_below() * kMarginFactor >= best_below &&
+               table->space_below() >= previous_below) ||
+              (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) {
+            best_box.set_bottom(bottom);
+            best_below = table->space_below();
+            best_cols = std::max(table->column_count(), best_cols);
+            found_good_border = true;
+          }
+        }
+        previous_below = table->space_below();
+      } else {
+        --chances;
+      }
+    }
+    if (chances <= 0) {
+      break;
+    }
+
+    last_bottom = bottom;
+    bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true);
+  }
+  if (!found_good_border) {
+    return false;
+  }
+
+  // TODO(nbeato) comments: follow modified code above... put it in a function!
+  found_good_border = false;
+  int last_top = INT32_MIN;
+  top =
+      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
+  int previous_above = 0;
+  chances = kMaxChances;
+
+  adjusted.set_bottom(best_box.bottom());
+  while (last_top != top) {
+    adjusted.set_top(top);
+    if (adjusted.height() >= min_height_) {
+      table->set_bounding_box(adjusted);
+      if (table->FindWhitespacedStructure() &&
+          table->column_count() >= best_cols * kRequiredColumns) {
+        int last_row = table->row_count() - 1;
+        if (false && IsWeakTableRow(table, last_row)) {
+          // Currently buggy, but was looking promising so disabled.
+          --chances;
+        } else {
+          chances = kMaxChances;
+          double max_row_height = kMaxRowSize * table->median_cell_height();
+          if ((table->space_above() * kMarginFactor >= best_above &&
+               table->space_above() >= previous_above) ||
+              (table->CountFilledCellsInRow(last_row) > 1 &&
+               table->row_height(last_row) < max_row_height)) {
+            best_box.set_top(top);
+            best_above = table->space_above();
+            best_cols = std::max(table->column_count(), best_cols);
+            found_good_border = true;
+          }
+        }
+        previous_above = table->space_above();
+      } else {
+        --chances;
+      }
+    }
+    if (chances <= 0) {
+      break;
+    }
+
+    last_top = top;
+    top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false);
+  }
+
+  if (!found_good_border) {
+    return false;
+  }
+
+  // If we get here, this shouldn't happen. It can be an assert, but
+  // I haven't tested it enough to make it crash things.
+  if (best_box.null_box()) {
+    return false;
+  }
+
+  // Given the best locations, fit the box to those locations.
+  table->set_bounding_box(best_box);
+  return table->FindWhitespacedStructure();
+}
+
+// Finds the closest value to y that can safely cause a horizontal
+// split in the partitions.
+// This function has been buggy and not as reliable as I would've
+// liked. I suggest finding all of the splits using the
+// FindPartitionLocations once and then just keeping the results
+// of that function cached somewhere.
+int TableRecognizer::NextHorizontalSplit(int left, int right, int y, bool top_to_bottom) {
+  ColPartitionGridSearch gsearch(text_grid_);
+  gsearch.SetUniqueMode(true);
+  gsearch.StartVerticalSearch(left, right, y);
+  ColPartition *text = nullptr;
+  int last_y = y;
+  while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {
+    if (!text->IsTextType() || !text->IsHorizontalType()) {
+      continue;
+    }
+    if (text->bounding_box().height() > max_text_height_) {
+      continue;
+    }
+
+    const TBOX &text_box = text->bounding_box();
+    if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
+      last_y = std::min(last_y, static_cast<int>(text_box.bottom()));
+      continue;
+    }
+    if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
+      last_y = std::max(last_y, static_cast<int>(text_box.top()));
+      continue;
+    }
+
+    return last_y;
+  }
+  // If none is found, we at least want to preserve the min/max,
+  // which defines the overlap of y with the last partition in the grid.
+  return last_y;
+}
+
+} // namespace tesseract