Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/tablefind.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/tablefind.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,2135 @@ +/////////////////////////////////////////////////////////////////////// +// File: tablefind.cpp +// Description: Helper classes to find tables from ColPartitions. +// Author: Faisal Shafait (faisal.shafait@dfki.de) +// +// (C) Copyright 2009, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include <algorithm> +#include <cmath> +#include <utility> +#include "tablefind.h" + +#include <allheaders.h> + +#include "colpartitionset.h" +#include "tablerecog.h" + +namespace tesseract { + +// These numbers are used to calculate the global median stats. +// They just set an upper bound on the stats objects. +// Maximum vertical spacing between neighbor partitions. +const int kMaxVerticalSpacing = 500; +// Maximum width of a blob in a partition. +const int kMaxBlobWidth = 500; + +// Minimum whitespace size to split a partition (measured as a multiple +// of a partition's median width). +const double kSplitPartitionSize = 2.0; +// To insert text, the partition must satisfy these size constraints +// in AllowTextPartition(). The idea is to filter noise partitions +// determined by the size compared to the global medians. +// TODO(nbeato): Need to find good numbers again. +const double kAllowTextHeight = 0.5; +const double kAllowTextWidth = 0.6; +const double kAllowTextArea = 0.8; +// The same thing applies to blobs (to filter noise). +// TODO(nbeato): These numbers are a shot in the dark... +// height and width are 0.5 * gridsize() in colfind.cpp +// area is a rough guess for the size of a period. +const double kAllowBlobHeight = 0.3; +const double kAllowBlobWidth = 0.4; +const double kAllowBlobArea = 0.05; + +// Minimum number of components in a text partition. A partition having fewer +// components than that is more likely a data partition and is a candidate +// table cell. +const int kMinBoxesInTextPartition = 10; + +// Maximum number of components that a data partition can have +const int kMaxBoxesInDataPartition = 20; + +// Maximum allowed gap in a text partitions as a multiple of its median size. +const double kMaxGapInTextPartition = 4.0; + +// Minimum value that the maximum gap in a text partition should have as a +// factor of its median size. +const double kMinMaxGapInTextPartition = 0.5; + +// The amount of overlap that is "normal" for adjacent blobs in a text +// partition. This is used to calculate gap between overlapping blobs. +const double kMaxBlobOverlapFactor = 4.0; + +// Maximum x-height a table partition can have as a multiple of global +// median x-height +const double kMaxTableCellXheight = 2.0; + +// Maximum line spacing between a table column header and column contents +// for merging the two (as a multiple of the partition's median_height). +const int kMaxColumnHeaderDistance = 4; + +// Minimum ratio of num_table_partitions to num_text_partitions in a column +// block to be called it a table column +const double kTableColumnThreshold = 3.0; + +// Search for horizontal ruling lines within the vertical margin as a +// multiple of grid size +// const int kRulingVerticalMargin = 3; + +// Minimum overlap that a colpartition must have with a table region +// to become part of that table +const double kMinOverlapWithTable = 0.6; + +// Maximum side space (distance from column boundary) that a typical +// text-line in flowing text should have as a multiple of its x-height +// (Median size). +const int kSideSpaceMargin = 10; + +// Fraction of the peak of x-projection of a table region to set the +// threshold for the x-projection histogram +const double kSmallTableProjectionThreshold = 0.35; +const double kLargeTableProjectionThreshold = 0.45; +// Minimum number of rows required to look for more rows in the projection. +const int kLargeTableRowCount = 6; + +// Minimum number of rows in a table +const int kMinRowsInTable = 3; + +// The amount of padding (multiplied by global_median_xheight_ during use) +// that is vertically added to the search adjacent leader search during +// ColPartition marking. +const int kAdjacentLeaderSearchPadding = 2; + +// Used when filtering false positives. When finding the last line +// of a paragraph (typically left-aligned), the previous line should have +// its center to the right of the last line by this scaled amount. +const double kParagraphEndingPreviousLineRatio = 1.3; + +// The maximum amount of whitespace allowed left of a paragraph ending. +// Do not filter a ColPartition with more than this space left of it. +const double kMaxParagraphEndingLeftSpaceMultiple = 3.0; + +// Used when filtering false positives. The last line of a paragraph +// should be preceded by a line that is predominantly text. This is the +// ratio of text to whitespace (to the right of the text) that is required +// for the previous line to be a text. +const double kMinParagraphEndingTextToWhitespaceRatio = 3.0; + +// When counting table columns, this is the required gap between two columns +// (it is multiplied by global_median_xheight_). +const double kMaxXProjectionGapFactor = 2.0; + +// Used for similarity in partitions using stroke width. Values copied +// from ColFind.cpp in Ray's CL. +const double kStrokeWidthFractionalTolerance = 0.25; +const double kStrokeWidthConstantTolerance = 2.0; + +#ifndef GRAPHICS_DISABLED +static BOOL_VAR(textord_show_tables, false, "Show table regions (ScrollView)"); +static BOOL_VAR(textord_tablefind_show_mark, false, + "Debug table marking steps in detail (ScrollView)"); +static BOOL_VAR(textord_tablefind_show_stats, false, + "Show page stats used in table finding (ScrollView)"); +#endif +static BOOL_VAR(textord_tablefind_recognize_tables, false, + "Enables the table recognizer for table layout and filtering."); + +// Templated helper function used to create destructor callbacks for the +// BBGrid::ClearGridData() method. +template <typename T> +void DeleteObject(T *object) { + delete object; +} + +TableFinder::TableFinder() + : resolution_(0), + global_median_xheight_(0), + global_median_blob_width_(0), + global_median_ledding_(0), + left_to_right_language_(true) {} + +TableFinder::~TableFinder() { + // ColPartitions and ColSegments created by this class for storage in grids + // need to be deleted explicitly. + clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>); + leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>); + fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>); + col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>); + table_grid_.ClearGridData(&DeleteObject<ColSegment>); +} + +void TableFinder::set_left_to_right_language(bool order) { + left_to_right_language_ = order; +} + +void TableFinder::Init(int grid_size, const ICOORD &bottom_left, + const ICOORD &top_right) { + // Initialize clean partitions list and grid + clean_part_grid_.Init(grid_size, bottom_left, top_right); + leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right); + fragmented_text_grid_.Init(grid_size, bottom_left, top_right); + col_seg_grid_.Init(grid_size, bottom_left, top_right); + table_grid_.Init(grid_size, bottom_left, top_right); +} + +// Copy cleaned partitions from part_grid_ to clean_part_grid_ and +// insert leaders and rulers into the leader_and_ruling_grid_ +void TableFinder::InsertCleanPartitions(ColPartitionGrid *grid, + TO_BLOCK *block) { + // Calculate stats. This lets us filter partitions in AllowTextPartition() + // and filter blobs in AllowBlob(). + SetGlobalSpacings(grid); + + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(grid); + gsearch.SetUniqueMode(true); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + // Reject partitions with nothing useful inside of them. + if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) { + continue; + } + ColPartition *clean_part = part->ShallowCopy(); + ColPartition *leader_part = nullptr; + if (part->IsLineType()) { + InsertRulingPartition(clean_part); + continue; + } + // Insert all non-text partitions to clean_parts + if (!part->IsTextType()) { + InsertImagePartition(clean_part); + continue; + } + // Insert text colpartitions after removing noisy components from them + // The leaders are split into a separate grid. + BLOBNBOX_CLIST *part_boxes = part->boxes(); + BLOBNBOX_C_IT pit(part_boxes); + for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { + BLOBNBOX *pblob = pit.data(); + // Bad blobs... happens in UNLV set. + // news.3G1, page 17 (around x=6) + if (!AllowBlob(*pblob)) { + continue; + } + if (pblob->flow() == BTFT_LEADER) { + if (leader_part == nullptr) { + leader_part = part->ShallowCopy(); + leader_part->set_flow(BTFT_LEADER); + } + leader_part->AddBox(pblob); + } else if (pblob->region_type() != BRT_NOISE) { + clean_part->AddBox(pblob); + } + } + clean_part->ComputeLimits(); + ColPartition *fragmented = clean_part->CopyButDontOwnBlobs(); + InsertTextPartition(clean_part); + SplitAndInsertFragmentedTextPartition(fragmented); + if (leader_part != nullptr) { + // TODO(nbeato): Note that ComputeLimits does not update the column + // information. So the leader may appear to span more columns than it + // really does later on when IsInSameColumnAs gets called to test + // for adjacent leaders. + leader_part->ComputeLimits(); + InsertLeaderPartition(leader_part); + } + } + + // Make the partition partners better for upper and lower neighbors. + clean_part_grid_.FindPartitionPartners(); + clean_part_grid_.RefinePartitionPartners(false); +} + +// High level function to perform table detection +void TableFinder::LocateTables(ColPartitionGrid *grid, + ColPartitionSet **all_columns, + WidthCallback width_cb, const FCOORD &reskew) { + // initialize spacing, neighbors, and columns + InitializePartitions(all_columns); + +#ifndef GRAPHICS_DISABLED + if (textord_show_tables) { + ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors"); + DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); + DisplayColPartitions(table_win, &leader_and_ruling_grid_, + ScrollView::AQUAMARINE); + DisplayColPartitionConnections(table_win, &clean_part_grid_, + ScrollView::ORANGE); + + table_win = MakeWindow(100, 300, "Fragmented Text"); + DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE); + } +#endif // !GRAPHICS_DISABLED + + // mark, filter, and smooth candidate table partitions + MarkTablePartitions(); + + // Make single-column blocks from good_columns_ partitions. col_segments are + // moved to a grid later which takes the ownership + ColSegment_LIST column_blocks; + GetColumnBlocks(all_columns, &column_blocks); + // Set the ratio of candidate table partitions in each column + SetColumnsType(&column_blocks); + + // Move column segments to col_seg_grid_ + MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_); + + // Detect split in column layout that might have occurred due to the + // presence of a table. In such a case, merge the corresponding columns. + GridMergeColumnBlocks(); + + // Group horizontally overlapping table partitions into table columns. + // table_columns created here get deleted at the end of this method. + ColSegment_LIST table_columns; + GetTableColumns(&table_columns); + + // Within each column, mark the range table regions occupy based on the + // table columns detected. table_regions are moved to a grid later which + // takes the ownership + ColSegment_LIST table_regions; + GetTableRegions(&table_columns, &table_regions); + +#ifndef GRAPHICS_DISABLED + if (textord_tablefind_show_mark) { + ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions"); + DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE); + DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW); + } +#endif // !GRAPHICS_DISABLED + + // Merge table regions across columns for tables spanning multiple + // columns + MoveColSegmentsToGrid(&table_regions, &table_grid_); + GridMergeTableRegions(); + + // Adjust table boundaries by including nearby horizontal lines and left + // out column headers + AdjustTableBoundaries(); + GridMergeTableRegions(); + + if (textord_tablefind_recognize_tables) { + // Remove false alarms consisting of a single column + DeleteSingleColumnTables(); + +#ifndef GRAPHICS_DISABLED + if (textord_show_tables) { + ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations"); + DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); + DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI); + table_grid_.DisplayBoxes(table_win); + } +#endif // !GRAPHICS_DISABLED + + // Find table grid structure and reject tables that are malformed. + RecognizeTables(); + GridMergeTableRegions(); + RecognizeTables(); + +#ifndef GRAPHICS_DISABLED + if (textord_show_tables) { + ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables"); + DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE, + ScrollView::BLUE); + table_grid_.DisplayBoxes(table_win); + } +#endif // !GRAPHICS_DISABLED + } else { + // Remove false alarms consisting of a single column + // TODO(nbeato): verify this is a NOP after structured table rejection. + // Right now it isn't. If the recognize function is doing what it is + // supposed to do, this function is obsolete. + DeleteSingleColumnTables(); + +#ifndef GRAPHICS_DISABLED + if (textord_show_tables) { + ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables"); + DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE, + ScrollView::BLUE); + table_grid_.DisplayBoxes(table_win); + } +#endif // !GRAPHICS_DISABLED + } + + // Merge all colpartitions in table regions to make them a single + // colpartition and revert types of isolated table cells not + // assigned to any table to their original types. + MakeTableBlocks(grid, all_columns, width_cb); +} +// All grids have the same dimensions. The clean_part_grid_ sizes are set from +// the part_grid_ that is passed to InsertCleanPartitions, which was the same as +// the grid that is the base of ColumnFinder. Just return the clean_part_grid_ +// dimensions instead of duplicated memory. +int TableFinder::gridsize() const { + return clean_part_grid_.gridsize(); +} +int TableFinder::gridwidth() const { + return clean_part_grid_.gridwidth(); +} +int TableFinder::gridheight() const { + return clean_part_grid_.gridheight(); +} +const ICOORD &TableFinder::bleft() const { + return clean_part_grid_.bleft(); +} +const ICOORD &TableFinder::tright() const { + return clean_part_grid_.tright(); +} + +void TableFinder::InsertTextPartition(ColPartition *part) { + ASSERT_HOST(part != nullptr); + if (AllowTextPartition(*part)) { + clean_part_grid_.InsertBBox(true, true, part); + } else { + delete part; + } +} +void TableFinder::InsertFragmentedTextPartition(ColPartition *part) { + ASSERT_HOST(part != nullptr); + if (AllowTextPartition(*part)) { + fragmented_text_grid_.InsertBBox(true, true, part); + } else { + delete part; + } +} +void TableFinder::InsertLeaderPartition(ColPartition *part) { + ASSERT_HOST(part != nullptr); + if (!part->IsEmpty() && part->bounding_box().area() > 0) { + leader_and_ruling_grid_.InsertBBox(true, true, part); + } else { + delete part; + } +} +void TableFinder::InsertRulingPartition(ColPartition *part) { + leader_and_ruling_grid_.InsertBBox(true, true, part); +} +void TableFinder::InsertImagePartition(ColPartition *part) { + // NOTE: If images are placed into a different grid in the future, + // the function SetPartitionSpacings needs to be updated. It should + // be the only thing that cares about image partitions. + clean_part_grid_.InsertBBox(true, true, part); +} + +// Splits a partition into its "words". The splits happen +// at locations with wide inter-blob spacing. This is useful +// because it allows the table recognize to "cut through" the +// text lines on the page. The assumption is that a table +// will have several lines with similar overlapping whitespace +// whereas text will not have this type of property. +// Note: The code assumes that blobs are sorted by the left side x! +// This will not work (as well) if the blobs are sorted by center/right. +void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition *part) { + ASSERT_HOST(part != nullptr); + // Bye bye empty partitions! + if (part->boxes()->empty()) { + delete part; + return; + } + + // The AllowBlob function prevents this. + ASSERT_HOST(part->median_width() > 0); + const double kThreshold = part->median_width() * kSplitPartitionSize; + + ColPartition *right_part = part; + bool found_split = true; + while (found_split) { + found_split = false; + BLOBNBOX_C_IT box_it(right_part->boxes()); + // Blobs are sorted left side first. If blobs overlap, + // the previous blob may have a "more right" right side. + // Account for this by always keeping the largest "right" + // so far. + int previous_right = INT32_MIN; + + // Look for the next split in the partition. + for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) { + const TBOX &box = box_it.data()->bounding_box(); + if (previous_right != INT32_MIN && + box.left() - previous_right > kThreshold) { + // We have a split position. Split the partition in two pieces. + // Insert the left piece in the grid and keep processing the right. + int mid_x = (box.left() + previous_right) / 2; + ColPartition *left_part = right_part; + right_part = left_part->SplitAt(mid_x); + + InsertFragmentedTextPartition(left_part); + found_split = true; + break; + } + + // The right side of the previous blobs. + previous_right = std::max(previous_right, static_cast<int>(box.right())); + } + } + // When a split is not found, the right part is minimized + // as much as possible, so process it. + InsertFragmentedTextPartition(right_part); +} + +// Some simple criteria to filter out now. We want to make sure the +// average blob size in the partition is consistent with the +// global page stats. +// The area metric will almost always pass for multi-blob partitions. +// It is useful when filtering out noise caused by an isolated blob. +bool TableFinder::AllowTextPartition(const ColPartition &part) const { + const double kHeightRequired = global_median_xheight_ * kAllowTextHeight; + const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth; + const int median_area = global_median_xheight_ * global_median_blob_width_; + const double kAreaPerBlobRequired = median_area * kAllowTextArea; + // Keep comparisons strictly greater to disallow 0! + return part.median_height() > kHeightRequired && + part.median_width() > kWidthRequired && + part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count(); +} + +// Same as above, applied to blobs. Keep in mind that +// leaders, commas, and periods are important in tables. +bool TableFinder::AllowBlob(const BLOBNBOX &blob) const { + const TBOX &box = blob.bounding_box(); + const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight; + const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth; + const int median_area = global_median_xheight_ * global_median_blob_width_; + const double kAreaRequired = median_area * kAllowBlobArea; + // Keep comparisons strictly greater to disallow 0! + return box.height() > kHeightRequired && box.width() > kWidthRequired && + box.area() > kAreaRequired; +} + +// TODO(nbeato): The grid that makes the window doesn't seem to matter. +// The only downside is that window messages will be caught by +// clean_part_grid_ instead of a useful object. This is a temporary solution +// for the debug windows created by the TableFinder. +#ifndef GRAPHICS_DISABLED +ScrollView *TableFinder::MakeWindow(int x, int y, const char *window_name) { + return clean_part_grid_.MakeWindow(x, y, window_name); +} +#endif + +// Make single-column blocks from good_columns_ partitions. +void TableFinder::GetColumnBlocks(ColPartitionSet **all_columns, + ColSegment_LIST *column_blocks) { + for (int i = 0; i < gridheight(); ++i) { + ColPartitionSet *columns = all_columns[i]; + if (columns != nullptr) { + ColSegment_LIST new_blocks; + // Get boxes from the current vertical position on the grid + columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(), + &new_blocks); + // Merge the new_blocks boxes into column_blocks if they are well-aligned + GroupColumnBlocks(&new_blocks, column_blocks); + } + } +} + +// Merge column segments into the current list if they are well aligned. +void TableFinder::GroupColumnBlocks(ColSegment_LIST *new_blocks, + ColSegment_LIST *column_blocks) { + ColSegment_IT src_it(new_blocks); + ColSegment_IT dest_it(column_blocks); + // iterate through the source list + for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { + ColSegment *src_seg = src_it.data(); + const TBOX &src_box = src_seg->bounding_box(); + bool match_found = false; + // iterate through the destination list to find a matching column block + for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) { + ColSegment *dest_seg = dest_it.data(); + TBOX dest_box = dest_seg->bounding_box(); + if (ConsecutiveBoxes(src_box, dest_box)) { + // If matching block is found, insert the current block into it + // and delete the source block. + dest_seg->InsertBox(src_box); + match_found = true; + delete src_it.extract(); + break; + } + } + // If no match is found, just append the source block to column_blocks + if (!match_found) { + dest_it.add_after_then_move(src_it.extract()); + } + } +} + +// are the two boxes immediate neighbors along the vertical direction +bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) { + int x_margin = 20; + int y_margin = 5; + return (abs(b1.left() - b2.left()) < x_margin) && + (abs(b1.right() - b2.right()) < x_margin) && + (abs(b1.top() - b2.bottom()) < y_margin || + abs(b2.top() - b1.bottom()) < y_margin); +} + +// Set up info for clean_part_grid_ partitions to be valid during detection +// code. +void TableFinder::InitializePartitions(ColPartitionSet **all_columns) { + FindNeighbors(); + SetPartitionSpacings(&clean_part_grid_, all_columns); + SetGlobalSpacings(&clean_part_grid_); +} + +// Set left, right and top, bottom spacings of each colpartition. +void TableFinder::SetPartitionSpacings(ColPartitionGrid *grid, + ColPartitionSet **all_columns) { + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(grid); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + ColPartitionSet *columns = all_columns[gsearch.GridY()]; + TBOX box = part->bounding_box(); + int y = part->MidY(); + ColPartition *left_column = columns->ColumnContaining(box.left(), y); + ColPartition *right_column = columns->ColumnContaining(box.right(), y); + // set distance from left column as space to the left + if (left_column) { + int left_space = std::max(0, box.left() - left_column->LeftAtY(y)); + part->set_space_to_left(left_space); + } + // set distance from right column as space to the right + if (right_column) { + int right_space = std::max(0, right_column->RightAtY(y) - box.right()); + part->set_space_to_right(right_space); + } + + // Look for images that may be closer. + // NOTE: used to be part_grid_, might cause issues now + ColPartitionGridSearch hsearch(grid); + hsearch.StartSideSearch(box.left(), box.bottom(), box.top()); + ColPartition *neighbor = nullptr; + while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) { + if (neighbor->type() == PT_PULLOUT_IMAGE || + neighbor->type() == PT_FLOWING_IMAGE || + neighbor->type() == PT_HEADING_IMAGE) { + int right = neighbor->bounding_box().right(); + if (right < box.left()) { + int space = std::min(box.left() - right, part->space_to_left()); + part->set_space_to_left(space); + } + } + } + hsearch.StartSideSearch(box.left(), box.bottom(), box.top()); + neighbor = nullptr; + while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) { + if (neighbor->type() == PT_PULLOUT_IMAGE || + neighbor->type() == PT_FLOWING_IMAGE || + neighbor->type() == PT_HEADING_IMAGE) { + int left = neighbor->bounding_box().left(); + if (left > box.right()) { + int space = std::min(left - box.right(), part->space_to_right()); + part->set_space_to_right(space); + } + } + } + + ColPartition *upper_part = part->SingletonPartner(true); + if (upper_part) { + int space = + std::max(0, static_cast<int>(upper_part->bounding_box().bottom() - + part->bounding_box().bottom())); + part->set_space_above(space); + } else { + // TODO(nbeato): What constitutes a good value? + // 0 is the default value when not set, explicitly noting it needs to + // be something else. + part->set_space_above(INT32_MAX); + } + + ColPartition *lower_part = part->SingletonPartner(false); + if (lower_part) { + int space = + std::max(0, static_cast<int>(part->bounding_box().bottom() - + lower_part->bounding_box().bottom())); + part->set_space_below(space); + } else { + // TODO(nbeato): What constitutes a good value? + // 0 is the default value when not set, explicitly noting it needs to + // be something else. + part->set_space_below(INT32_MAX); + } + } +} + +// Set spacing and closest neighbors above and below a given colpartition. +void TableFinder::SetVerticalSpacing(ColPartition *part) { + TBOX box = part->bounding_box(); + int top_range = + std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y())); + int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing, + static_cast<int>(bleft().y())); + box.set_top(top_range); + box.set_bottom(bottom_range); + + TBOX part_box = part->bounding_box(); + // Start a rect search + ColPartitionGridSearch rectsearch(&clean_part_grid_); + rectsearch.StartRectSearch(box); + ColPartition *neighbor; + int min_space_above = kMaxVerticalSpacing; + int min_space_below = kMaxVerticalSpacing; + ColPartition *above_neighbor = nullptr; + ColPartition *below_neighbor = nullptr; + while ((neighbor = rectsearch.NextRectSearch()) != nullptr) { + if (neighbor == part) { + continue; + } + TBOX neighbor_box = neighbor->bounding_box(); + if (neighbor_box.major_x_overlap(part_box)) { + int gap = abs(part->median_bottom() - neighbor->median_bottom()); + // If neighbor is below current partition + if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) { + min_space_below = gap; + below_neighbor = neighbor; + } // If neighbor is above current partition + else if (part_box.top() < neighbor_box.bottom() && + gap < min_space_above) { + min_space_above = gap; + above_neighbor = neighbor; + } + } + } + part->set_space_above(min_space_above); + part->set_space_below(min_space_below); + part->set_nearest_neighbor_above(above_neighbor); + part->set_nearest_neighbor_below(below_neighbor); +} + +// Set global spacing and x-height estimates +void TableFinder::SetGlobalSpacings(ColPartitionGrid *grid) { + STATS xheight_stats(0, kMaxVerticalSpacing); + STATS width_stats(0, kMaxBlobWidth); + STATS ledding_stats(0, kMaxVerticalSpacing); + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(grid); + gsearch.SetUniqueMode(true); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + // TODO(nbeato): HACK HACK HACK! medians are equal to partition length. + // ComputeLimits needs to get called somewhere outside of TableFinder + // to make sure the partitions are properly initialized. + // When this is called, SmoothPartitionPartners dies in an assert after + // table find runs. Alternative solution. + // part->ComputeLimits(); + if (part->IsTextType()) { + // xheight_stats.add(part->median_height(), part->boxes_count()); + // width_stats.add(part->median_width(), part->boxes_count()); + + // This loop can be removed when above issues are fixed. + // Replace it with the 2 lines commented out above. + BLOBNBOX_C_IT it(part->boxes()); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + xheight_stats.add(it.data()->bounding_box().height(), 1); + width_stats.add(it.data()->bounding_box().width(), 1); + } + + ledding_stats.add(part->space_above(), 1); + ledding_stats.add(part->space_below(), 1); + } + } + // Set estimates based on median of statistics obtained + set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5)); + set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5)); + set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5)); +#ifndef GRAPHICS_DISABLED + if (textord_tablefind_show_stats) { + const char *kWindowName = "X-height (R), X-width (G), and ledding (B)"; + ScrollView *stats_win = MakeWindow(500, 10, kWindowName); + xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED); + width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN); + ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE); + } +#endif // !GRAPHICS_DISABLED +} + +void TableFinder::set_global_median_xheight(int xheight) { + global_median_xheight_ = xheight; +} +void TableFinder::set_global_median_blob_width(int width) { + global_median_blob_width_ = width; +} +void TableFinder::set_global_median_ledding(int ledding) { + global_median_ledding_ = ledding; +} + +void TableFinder::FindNeighbors() { + ColPartitionGridSearch gsearch(&clean_part_grid_); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + // TODO(nbeato): Rename this function, meaning is different now. + // IT is finding nearest neighbors its own way + // SetVerticalSpacing(part); + + ColPartition *upper = part->SingletonPartner(true); + if (upper) { + part->set_nearest_neighbor_above(upper); + } + + ColPartition *lower = part->SingletonPartner(false); + if (lower) { + part->set_nearest_neighbor_below(lower); + } + } +} + +// High level interface. Input is an unmarked ColPartitionGrid +// (namely, clean_part_grid_). Partitions are identified using local +// information and filter/smoothed. The function exit should contain +// a good sampling of the table partitions. +void TableFinder::MarkTablePartitions() { + MarkPartitionsUsingLocalInformation(); +#ifndef GRAPHICS_DISABLED + if (textord_tablefind_show_mark) { + ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions"); + DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); + DisplayColPartitions(table_win, &leader_and_ruling_grid_, + ScrollView::AQUAMARINE); + } +#endif + FilterFalseAlarms(); +#ifndef GRAPHICS_DISABLED + if (textord_tablefind_show_mark) { + ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions"); + DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); + DisplayColPartitions(table_win, &leader_and_ruling_grid_, + ScrollView::AQUAMARINE); + } +#endif + SmoothTablePartitionRuns(); +#ifndef GRAPHICS_DISABLED + if (textord_tablefind_show_mark) { + ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions"); + DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); + DisplayColPartitions(table_win, &leader_and_ruling_grid_, + ScrollView::AQUAMARINE); + } +#endif + FilterFalseAlarms(); +#ifndef GRAPHICS_DISABLED + if (textord_tablefind_show_mark || textord_show_tables) { + ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions"); + DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); + DisplayColPartitions(table_win, &leader_and_ruling_grid_, + ScrollView::AQUAMARINE); + } +#endif +} + +// These types of partitions are marked as table partitions: +// 1- Partitions that have at lease one large gap between words +// 2- Partitions that consist of only one word (no significant gap +// between components) +// 3- Partitions that vertically overlap with other partitions within the +// same column. +// 4- Partitions with leaders before/after them. +void TableFinder::MarkPartitionsUsingLocalInformation() { + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(&clean_part_grid_); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (!part->IsTextType()) { // Only consider text partitions + continue; + } + // Only consider partitions in dominant font size or smaller + if (part->median_height() > kMaxTableCellXheight * global_median_xheight_) { + continue; + } + // Mark partitions with a large gap, or no significant gap as + // table partitions. + // Comments: It produces several false alarms at: + // - last line of a paragraph (fixed) + // - single word section headings + // - page headers and footers + // - numbered equations + // - line drawing regions + // TODO(faisal): detect and fix above-mentioned cases + if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) { + part->set_table_type(); + } + } +} + +// Check if the partition has at least one large gap between words or no +// significant gap at all +bool TableFinder::HasWideOrNoInterWordGap(ColPartition *part) const { + // Should only get text partitions. + ASSERT_HOST(part->IsTextType()); + // Blob access + BLOBNBOX_CLIST *part_boxes = part->boxes(); + BLOBNBOX_C_IT it(part_boxes); + // Check if this is a relatively small partition (such as a single word) + if (part->bounding_box().width() < + kMinBoxesInTextPartition * part->median_height() && + part_boxes->length() < kMinBoxesInTextPartition) { + return true; + } + + // Variables used to compute inter-blob spacing. + int previous_x1 = -1; + // Stores the maximum gap detected. + int largest_partition_gap_found = -1; + // Text partition gap limits. If this is text (and not a table), + // there should be at least one gap larger than min_gap and no gap + // larger than max_gap. + const double max_gap = kMaxGapInTextPartition * part->median_height(); + const double min_gap = kMinMaxGapInTextPartition * part->median_height(); + + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + BLOBNBOX *blob = it.data(); + int current_x0 = blob->bounding_box().left(); + int current_x1 = blob->bounding_box().right(); + if (previous_x1 != -1) { + int gap = current_x0 - previous_x1; + + // TODO(nbeato): Boxes may overlap? Huh? + // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors + // on the top right of the page are filtered out with this line. + // Note 2: Iterating over blobs in a partition, so we are looking for + // spacing between the words. + if (gap < 0) { + // More likely case, the blobs slightly overlap. This can happen + // with diacritics (accents) or broken alphabet symbols (characters). + // Merge boxes together by taking max of right sides. + if (-gap < part->median_height() * kMaxBlobOverlapFactor) { + previous_x1 = std::max(previous_x1, current_x1); + continue; + } + // Extreme case, blobs overlap significantly in the same partition... + // This should not happen often (if at all), but it does. + // TODO(nbeato): investigate cases when this happens. + else { + // The behavior before was to completely ignore this case. + } + } + + // If a large enough gap is found, mark it as a table cell (return true) + if (gap > max_gap) { + return true; + } + if (gap > largest_partition_gap_found) { + largest_partition_gap_found = gap; + } + } + previous_x1 = current_x1; + } + // Since no large gap was found, return false if the partition is too + // long to be a data cell + if (part->bounding_box().width() > + kMaxBoxesInDataPartition * part->median_height() || + part_boxes->length() > kMaxBoxesInDataPartition) { + return false; + } + + // A partition may be a single blob. In this case, it's an isolated symbol + // or non-text (such as a ruling or image). + // Detect these as table partitions? Shouldn't this be case by case? + // The behavior before was to ignore this, making max_partition_gap < 0 + // and implicitly return true. Just making it explicit. + if (largest_partition_gap_found == -1) { + return true; + } + + // return true if the maximum gap found is smaller than the minimum allowed + // max_gap in a text partition. This indicates that there is no significant + // space in the partition, hence it is likely a single word. + return largest_partition_gap_found < min_gap; +} + +// A criteria for possible tables is that a table may have leaders +// between data cells. An aggressive solution to find such tables is to +// explicitly mark partitions that have adjacent leaders. +// Note that this includes overlapping leaders. However, it does not +// include leaders in different columns on the page. +// Possible false-positive will include lists, such as a table of contents. +// As these arise, the aggressive nature of this search may need to be +// trimmed down. +bool TableFinder::HasLeaderAdjacent(const ColPartition &part) { + if (part.flow() == BTFT_LEADER) { + return true; + } + // Search range is left and right bounded by an offset of the + // median xheight. This offset is to allow some tolerance to the + // the leaders on the page in the event that the alignment is still + // a bit off. + const TBOX &box = part.bounding_box(); + const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_; + const int top = box.top() + search_size; + const int bottom = box.bottom() - search_size; + ColPartitionGridSearch hsearch(&leader_and_ruling_grid_); + for (int direction = 0; direction < 2; ++direction) { + bool right_to_left = (direction == 0); + int x = right_to_left ? box.right() : box.left(); + hsearch.StartSideSearch(x, bottom, top); + ColPartition *leader = nullptr; + while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) { + // The leader could be a horizontal ruling in the grid. + // Make sure it is actually a leader. + if (leader->flow() != BTFT_LEADER) { + continue; + } + // This should not happen, they are in different grids. + ASSERT_HOST(&part != leader); + // Make sure the leader shares a page column with the partition, + // otherwise we are spreading across columns. + if (!part.IsInSameColumnAs(*leader)) { + break; + } + // There should be a significant vertical overlap + if (!leader->VSignificantCoreOverlap(part)) { + continue; + } + // Leader passed all tests, so it is adjacent. + return true; + } + } + // No leaders are adjacent to the given partition. + return false; +} + +// Filter individual text partitions marked as table partitions +// consisting of paragraph endings, small section headings, and +// headers and footers. +void TableFinder::FilterFalseAlarms() { + FilterParagraphEndings(); + FilterHeaderAndFooter(); + // TODO(nbeato): Fully justified text as non-table? +} + +void TableFinder::FilterParagraphEndings() { + // Detect last line of paragraph + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(&clean_part_grid_); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (part->type() != PT_TABLE) { + continue; // Consider only table partitions + } + + // Paragraph ending should have flowing text above it. + ColPartition *upper_part = part->nearest_neighbor_above(); + if (!upper_part) { + continue; + } + if (upper_part->type() != PT_FLOWING_TEXT) { + continue; + } + if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) { + continue; + } + // Check if its the last line of a paragraph. + // In most cases, a paragraph ending should be left-aligned to text line + // above it. Sometimes, it could be a 2 line paragraph, in which case + // the line above it is indented. + // To account for that, check if the partition center is to + // the left of the one above it. + int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2; + int upper_mid = (upper_part->bounding_box().left() + + upper_part->bounding_box().right()) / + 2; + int current_spacing = 0; // spacing of the current line to margin + int upper_spacing = 0; // spacing of the previous line to the margin + if (left_to_right_language_) { + // Left to right languages, use mid - left to figure out the distance + // the middle is from the left margin. + int left = std::min(part->bounding_box().left(), + upper_part->bounding_box().left()); + current_spacing = mid - left; + upper_spacing = upper_mid - left; + } else { + // Right to left languages, use right - mid to figure out the distance + // the middle is from the right margin. + int right = std::max(part->bounding_box().right(), + upper_part->bounding_box().right()); + current_spacing = right - mid; + upper_spacing = right - upper_mid; + } + if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) { + continue; + } + + // Paragraphs should have similar fonts. + if (!part->MatchingSizes(*upper_part) || + !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance, + kStrokeWidthConstantTolerance)) { + continue; + } + + // The last line of a paragraph should be left aligned. + // TODO(nbeato): This would be untrue if the text was right aligned. + // How often is that? + if (part->space_to_left() > + kMaxParagraphEndingLeftSpaceMultiple * part->median_height()) { + continue; + } + // The line above it should be right aligned (assuming justified format). + // Since we can't assume justified text, we compare whitespace to text. + // The above line should have majority spanning text (or the current + // line could have fit on the previous line). So compare + // whitespace to text. + if (upper_part->bounding_box().width() < + kMinParagraphEndingTextToWhitespaceRatio * + upper_part->space_to_right()) { + continue; + } + + // Ledding above the line should be less than ledding below + if (part->space_above() >= part->space_below() || + part->space_above() > 2 * global_median_ledding_) { + continue; + } + + // If all checks failed, it is probably text. + part->clear_table_type(); + } +} + +void TableFinder::FilterHeaderAndFooter() { + // Consider top-most text colpartition as header and bottom most as footer + ColPartition *header = nullptr; + ColPartition *footer = nullptr; + int max_top = INT32_MIN; + int min_bottom = INT32_MAX; + ColPartitionGridSearch gsearch(&clean_part_grid_); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (!part->IsTextType()) { + continue; // Consider only text partitions + } + int top = part->bounding_box().top(); + int bottom = part->bounding_box().bottom(); + if (top > max_top) { + max_top = top; + header = part; + } + if (bottom < min_bottom) { + min_bottom = bottom; + footer = part; + } + } + if (header) { + header->clear_table_type(); + } + if (footer) { + footer->clear_table_type(); + } +} + +// Mark all ColPartitions as table cells that have a table cell above +// and below them +// TODO(faisal): This is too aggressive at the moment. The method needs to +// consider spacing and alignment as well. Detection of false alarm table cells +// should also be done as part of it. +void TableFinder::SmoothTablePartitionRuns() { + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(&clean_part_grid_); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) { + continue; // Consider only text partitions + } + ColPartition *upper_part = part->nearest_neighbor_above(); + ColPartition *lower_part = part->nearest_neighbor_below(); + if (!upper_part || !lower_part) { + continue; + } + if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) { + part->set_table_type(); + } + } + + // Pass 2, do the opposite. If both the upper and lower neighbors + // exist and are not tables, this probably shouldn't be a table. + gsearch.StartFullSearch(); + part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (part->type() != PT_TABLE) { + continue; // Consider only text partitions + } + ColPartition *upper_part = part->nearest_neighbor_above(); + ColPartition *lower_part = part->nearest_neighbor_below(); + + // table can't be by itself + if ((upper_part && upper_part->type() != PT_TABLE) && + (lower_part && lower_part->type() != PT_TABLE)) { + part->clear_table_type(); + } + } +} + +// Set the type of a column segment based on the ratio of table to text cells +void TableFinder::SetColumnsType(ColSegment_LIST *column_blocks) { + ColSegment_IT it(column_blocks); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColSegment *seg = it.data(); + TBOX box = seg->bounding_box(); + int num_table_cells = 0; + int num_text_cells = 0; + ColPartitionGridSearch rsearch(&clean_part_grid_); + rsearch.SetUniqueMode(true); + rsearch.StartRectSearch(box); + ColPartition *part = nullptr; + while ((part = rsearch.NextRectSearch()) != nullptr) { + if (part->type() == PT_TABLE) { + num_table_cells++; + } else if (part->type() == PT_FLOWING_TEXT) { + num_text_cells++; + } + } + // If a column block has no text or table partition in it, it is not needed + // for table detection. + if (!num_table_cells && !num_text_cells) { + delete it.extract(); + } else { + seg->set_num_table_cells(num_table_cells); + seg->set_num_text_cells(num_text_cells); + // set column type based on the ratio of table to text cells + seg->set_type(); + } + } +} + +// Move column blocks to grid +void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments, + ColSegmentGrid *col_seg_grid) { + ColSegment_IT it(segments); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColSegment *seg = it.extract(); + col_seg_grid->InsertBBox(true, true, seg); + } +} + +// Merge column blocks if a split is detected due to the presence of a +// table. A text block is considered split if it has multiple +// neighboring blocks above/below it, and at least one of the +// neighboring blocks is of table type (has a high density of table +// partitions). In this case neighboring blocks in the direction +// (above/below) of the table block are merged with the text block. + +// Comment: This method does not handle split due to a full page table +// since table columns in this case do not have a text column on which +// split decision can be based. +void TableFinder::GridMergeColumnBlocks() { + int margin = gridsize(); + + // Iterate the Column Blocks in the grid. + GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch( + &col_seg_grid_); + gsearch.StartFullSearch(); + ColSegment *seg; + while ((seg = gsearch.NextFullSearch()) != nullptr) { + if (seg->type() != COL_TEXT) { + continue; // only consider text blocks for split detection + } + bool neighbor_found = false; + bool modified = false; // Modified at least once + // keep expanding current box as long as neighboring table columns + // are found above or below it. + do { + TBOX box = seg->bounding_box(); + // slightly expand the search region vertically + int top_range = + std::min(box.top() + margin, static_cast<int>(tright().y())); + int bottom_range = + std::max(box.bottom() - margin, static_cast<int>(bleft().y())); + box.set_top(top_range); + box.set_bottom(bottom_range); + neighbor_found = false; + GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch( + &col_seg_grid_); + rectsearch.StartRectSearch(box); + ColSegment *neighbor = nullptr; + while ((neighbor = rectsearch.NextRectSearch()) != nullptr) { + if (neighbor == seg) { + continue; + } + const TBOX &neighbor_box = neighbor->bounding_box(); + // If the neighbor box significantly overlaps with the current + // box (due to the expansion of the current box in the + // previous iteration of this loop), remove the neighbor box + // and expand the current box to include it. + if (neighbor_box.overlap_fraction(box) >= 0.9) { + seg->InsertBox(neighbor_box); + modified = true; + rectsearch.RemoveBBox(); + gsearch.RepositionIterator(); + delete neighbor; + continue; + } + // Only expand if the neighbor box is of table type + if (neighbor->type() != COL_TABLE) { + continue; + } + // Insert the neighbor box into the current column block + if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) { + seg->InsertBox(neighbor_box); + neighbor_found = true; + modified = true; + rectsearch.RemoveBBox(); + gsearch.RepositionIterator(); + delete neighbor; + } + } + } while (neighbor_found); + if (modified) { + // Because the box has changed, it has to be removed first. + gsearch.RemoveBBox(); + col_seg_grid_.InsertBBox(true, true, seg); + gsearch.RepositionIterator(); + } + } +} + +// Group horizontally overlapping table partitions into table columns. +// TODO(faisal): This is too aggressive at the moment. The method should +// consider more attributes to group table partitions together. Some common +// errors are: +// 1- page number is merged with a table column above it even +// if there is a large vertical gap between them. +// 2- column headers go on to catch one of the columns arbitrarily +// 3- an isolated noise blob near page top or bottom merges with the table +// column below/above it +// 4- cells from two vertically adjacent tables merge together to make a +// single column resulting in merging of the two tables +void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) { + ColSegment_IT it(table_columns); + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(&clean_part_grid_); + gsearch.StartFullSearch(); + ColPartition *part; + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (part->inside_table_column() || part->type() != PT_TABLE) { + continue; // prevent a partition to be assigned to multiple columns + } + const TBOX &box = part->bounding_box(); + auto *col = new ColSegment(); + col->InsertBox(box); + part->set_inside_table_column(true); + // Start a search below the current cell to find bottom neighbours + // Note: a full search will always process things above it first, so + // this should be starting at the highest cell and working its way down. + ColPartitionGridSearch vsearch(&clean_part_grid_); + vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom()); + ColPartition *neighbor = nullptr; + bool found_neighbours = false; + while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) { + // only consider neighbors not assigned to any column yet + if (neighbor->inside_table_column()) { + continue; + } + // Horizontal lines should not break the flow + if (neighbor->IsHorizontalLine()) { + continue; + } + // presence of a non-table neighbor marks the end of current + // table column + if (neighbor->type() != PT_TABLE) { + break; + } + // add the neighbor partition to the table column + const TBOX &neighbor_box = neighbor->bounding_box(); + col->InsertBox(neighbor_box); + neighbor->set_inside_table_column(true); + found_neighbours = true; + } + if (found_neighbours) { + it.add_after_then_move(col); + } else { + part->set_inside_table_column(false); + delete col; + } + } +} + +// Mark regions in a column that are x-bounded by the column boundaries and +// y-bounded by the table columns' projection on the y-axis as table regions +void TableFinder::GetTableRegions(ColSegment_LIST *table_columns, + ColSegment_LIST *table_regions) { + ColSegment_IT cit(table_columns); + ColSegment_IT rit(table_regions); + // Iterate through column blocks + GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch( + &col_seg_grid_); + gsearch.StartFullSearch(); + ColSegment *part; + int page_height = tright().y() - bleft().y(); + ASSERT_HOST(page_height > 0); + // create a bool array to hold projection on y-axis + bool *table_region = new bool[page_height]; + while ((part = gsearch.NextFullSearch()) != nullptr) { + const TBOX &part_box = part->bounding_box(); + // reset the projection array + for (int i = 0; i < page_height; i++) { + table_region[i] = false; + } + // iterate through all table columns to find regions in the current + // page column block + cit.move_to_first(); + for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) { + TBOX col_box = cit.data()->bounding_box(); + // find intersection region of table column and page column + TBOX intersection_box = col_box.intersection(part_box); + // project table column on the y-axis + for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) { + table_region[i - bleft().y()] = true; + } + } + // set x-limits of table regions to page column width + TBOX current_table_box; + current_table_box.set_left(part_box.left()); + current_table_box.set_right(part_box.right()); + // go through the y-axis projection to find runs of table + // regions. Each run makes one table region. + for (int i = 1; i < page_height; i++) { + // detect start of a table region + if (!table_region[i - 1] && table_region[i]) { + current_table_box.set_bottom(i + bleft().y()); + } + // TODO(nbeato): Is it guaranteed that the last row is not a table region? + // detect end of a table region + if (table_region[i - 1] && !table_region[i]) { + current_table_box.set_top(i + bleft().y()); + if (!current_table_box.null_box()) { + auto *seg = new ColSegment(); + seg->InsertBox(current_table_box); + rit.add_after_then_move(seg); + } + } + } + } + delete[] table_region; +} + +// Merge table regions corresponding to tables spanning multiple columns if +// there is a colpartition (horizontal ruling line or normal text) that +// touches both regions. +// TODO(faisal): A rare error occurs if there are two horizontally adjacent +// tables with aligned ruling lines. In this case, line finder returns a +// single line and hence the tables get merged together +void TableFinder::GridMergeTableRegions() { + // Iterate the table regions in the grid. + GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch( + &table_grid_); + gsearch.StartFullSearch(); + ColSegment *seg = nullptr; + while ((seg = gsearch.NextFullSearch()) != nullptr) { + bool neighbor_found = false; + bool modified = false; // Modified at least once + do { + // Start a rectangle search x-bounded by the image and y by the table + const TBOX &box = seg->bounding_box(); + TBOX search_region(box); + search_region.set_left(bleft().x()); + search_region.set_right(tright().x()); + neighbor_found = false; + GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch( + &table_grid_); + rectsearch.StartRectSearch(search_region); + ColSegment *neighbor = nullptr; + while ((neighbor = rectsearch.NextRectSearch()) != nullptr) { + if (neighbor == seg) { + continue; + } + const TBOX &neighbor_box = neighbor->bounding_box(); + // Check if a neighbor box has a large overlap with the table + // region. This may happen as a result of merging two table + // regions in the previous iteration. + if (neighbor_box.overlap_fraction(box) >= 0.9) { + seg->InsertBox(neighbor_box); + rectsearch.RemoveBBox(); + gsearch.RepositionIterator(); + delete neighbor; + modified = true; + continue; + } + // Check if two table regions belong together based on a common + // horizontal ruling line + if (BelongToOneTable(box, neighbor_box)) { + seg->InsertBox(neighbor_box); + neighbor_found = true; + modified = true; + rectsearch.RemoveBBox(); + gsearch.RepositionIterator(); + delete neighbor; + } + } + } while (neighbor_found); + if (modified) { + // Because the box has changed, it has to be removed first. + gsearch.RemoveBBox(); + table_grid_.InsertBBox(true, true, seg); + gsearch.RepositionIterator(); + } + } +} + +// Decide if two table regions belong to one table based on a common +// horizontal ruling line or another colpartition +bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) { + // Check the obvious case. Most likely not true because overlapping boxes + // should already be merged, but seems like a good thing to do in case things + // change. + if (box1.overlap(box2)) { + return true; + } + // Check for ColPartitions spanning both table regions + TBOX bbox = box1.bounding_union(box2); + // Start a rect search on bbox + ColPartitionGridSearch rectsearch(&clean_part_grid_); + rectsearch.StartRectSearch(bbox); + ColPartition *part = nullptr; + while ((part = rectsearch.NextRectSearch()) != nullptr) { + const TBOX &part_box = part->bounding_box(); + // return true if a colpartition spanning both table regions is found + if (part_box.overlap(box1) && part_box.overlap(box2) && + !part->IsImageType()) { + return true; + } + } + return false; +} + +// Adjust table boundaries by: +// - building a tight bounding box around all ColPartitions contained in it. +// - expanding table boundaries to include all colpartitions that overlap the +// table by more than half of their area +// - expanding table boundaries to include nearby horizontal rule lines +// - expanding table vertically to include left out column headers +// TODO(faisal): Expansion of table boundaries is quite aggressive. It usually +// makes following errors: +// 1- horizontal lines consisting of underlines are included in the table if +// they are close enough +// 2- horizontal lines originating from noise tend to get merged with a table +// near the top of the page +// 3- the criteria for including horizontal lines is very generous. Many times +// horizontal lines separating headers and footers get merged with a +// single-column table in a multi-column page thereby including text +// from the neighboring column inside the table +// 4- the criteria for including left out column headers also tends to +// occasionally include text-lines above the tables, typically from +// table caption +void TableFinder::AdjustTableBoundaries() { + // Iterate the table regions in the grid + ColSegment_CLIST adjusted_tables; + ColSegment_C_IT it(&adjusted_tables); + ColSegmentGridSearch gsearch(&table_grid_); + gsearch.StartFullSearch(); + ColSegment *table = nullptr; + while ((table = gsearch.NextFullSearch()) != nullptr) { + const TBOX &table_box = table->bounding_box(); + TBOX grown_box = table_box; + GrowTableBox(table_box, &grown_box); + // To prevent a table from expanding again, do not insert the + // modified box back to the grid. Instead move it to a list and + // and remove it from the grid. The list is moved later back to the grid. + if (!grown_box.null_box()) { + auto *col = new ColSegment(); + col->InsertBox(grown_box); + it.add_after_then_move(col); + } + gsearch.RemoveBBox(); + delete table; + } + // clear table grid to move final tables in it + // TODO(nbeato): table_grid_ should already be empty. The above loop + // removed everything. Maybe just assert it is empty? + table_grid_.Clear(); + it.move_to_first(); + // move back final tables to table_grid_ + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColSegment *seg = it.extract(); + table_grid_.InsertBBox(true, true, seg); + } +} + +void TableFinder::GrowTableBox(const TBOX &table_box, TBOX *result_box) { + // TODO(nbeato): The growing code is a bit excessive right now. + // By removing these lines, the partitions considered need + // to have some overlap or be special cases. These lines could + // be added again once a check is put in place to make sure that + // growing tables don't stomp on a lot of non-table partitions. + + // search for horizontal ruling lines within the vertical margin + // int vertical_margin = kRulingVerticalMargin * gridsize(); + TBOX search_box = table_box; + // int top = MIN(search_box.top() + vertical_margin, tright().y()); + // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y()); + // search_box.set_top(top); + // search_box.set_bottom(bottom); + + GrowTableToIncludePartials(table_box, search_box, result_box); + GrowTableToIncludeLines(table_box, search_box, result_box); + IncludeLeftOutColumnHeaders(result_box); +} + +// Grow a table by increasing the size of the box to include +// partitions with significant overlap with the table. +void TableFinder::GrowTableToIncludePartials(const TBOX &table_box, + const TBOX &search_range, + TBOX *result_box) { + // Rulings are in a different grid, so search 2 grids for rulings, text, + // and table partitions that are not entirely within the new box. + for (int i = 0; i < 2; ++i) { + ColPartitionGrid *grid = + (i == 0) ? &fragmented_text_grid_ : &leader_and_ruling_grid_; + ColPartitionGridSearch rectsearch(grid); + rectsearch.StartRectSearch(search_range); + ColPartition *part = nullptr; + while ((part = rectsearch.NextRectSearch()) != nullptr) { + // Only include text and table types. + if (part->IsImageType()) { + continue; + } + const TBOX &part_box = part->bounding_box(); + // Include partition in the table if more than half of it + // is covered by the table + if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { + *result_box = result_box->bounding_union(part_box); + continue; + } + } + } +} + +// Grow a table by expanding to the extents of significantly +// overlapping lines. +void TableFinder::GrowTableToIncludeLines(const TBOX &table_box, + const TBOX &search_range, + TBOX *result_box) { + ColPartitionGridSearch rsearch(&leader_and_ruling_grid_); + rsearch.SetUniqueMode(true); + rsearch.StartRectSearch(search_range); + ColPartition *part = nullptr; + while ((part = rsearch.NextRectSearch()) != nullptr) { + // TODO(nbeato) This should also do vertical, but column + // boundaries are breaking things. This function needs to be + // updated to allow vertical lines as well. + if (!part->IsLineType()) { + continue; + } + // Avoid the following function call if the result of the + // function is irrelevant. + const TBOX &part_box = part->bounding_box(); + if (result_box->contains(part_box)) { + continue; + } + // Include a partially overlapping horizontal line only if the + // extra ColPartitions that will be included due to expansion + // have large side spacing w.r.t. columns containing them. + if (HLineBelongsToTable(*part, table_box)) { + *result_box = result_box->bounding_union(part_box); + } + // TODO(nbeato): Vertical + } +} + +// Checks whether the horizontal line belong to the table by looking at the +// side spacing of extra ColPartitions that will be included in the table +// due to expansion +bool TableFinder::HLineBelongsToTable(const ColPartition &part, + const TBOX &table_box) { + if (!part.IsHorizontalLine()) { + return false; + } + const TBOX &part_box = part.bounding_box(); + if (!part_box.major_x_overlap(table_box)) { + return false; + } + // Do not consider top-most horizontal line since it usually + // originates from noise. + // TODO(nbeato): I had to comment this out because the ruling grid doesn't + // have neighbors solved. + // if (!part.nearest_neighbor_above()) + // return false; + const TBOX bbox = part_box.bounding_union(table_box); + // In the "unioned table" box (the table extents expanded by the line), + // keep track of how many partitions have significant padding to the left + // and right. If more than half of the partitions covered by the new table + // have significant spacing, the line belongs to the table and the table + // grows to include all of the partitions. + int num_extra_partitions = 0; + int extra_space_to_right = 0; + int extra_space_to_left = 0; + // Rulings are in a different grid, so search 2 grids for rulings, text, + // and table partitions that are introduced by the new box. + for (int i = 0; i < 2; ++i) { + ColPartitionGrid *grid = + (i == 0) ? &clean_part_grid_ : &leader_and_ruling_grid_; + // Start a rect search on bbox + ColPartitionGridSearch rectsearch(grid); + rectsearch.SetUniqueMode(true); + rectsearch.StartRectSearch(bbox); + ColPartition *extra_part = nullptr; + while ((extra_part = rectsearch.NextRectSearch()) != nullptr) { + // ColPartition already in table + const TBOX &extra_part_box = extra_part->bounding_box(); + if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { + continue; + } + // Non-text ColPartitions do not contribute + if (extra_part->IsImageType()) { + continue; + } + // Consider this partition. + num_extra_partitions++; + // presence of a table cell is a strong hint, so just increment the scores + // without looking at the spacing. + if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) { + extra_space_to_right++; + extra_space_to_left++; + continue; + } + int space_threshold = kSideSpaceMargin * part.median_height(); + if (extra_part->space_to_right() > space_threshold) { + extra_space_to_right++; + } + if (extra_part->space_to_left() > space_threshold) { + extra_space_to_left++; + } + } + } + // tprintf("%d %d %d\n", + // num_extra_partitions,extra_space_to_right,extra_space_to_left); + return (extra_space_to_right > num_extra_partitions / 2) || + (extra_space_to_left > num_extra_partitions / 2); +} + +// Look for isolated column headers above the given table box and +// include them in the table +void TableFinder::IncludeLeftOutColumnHeaders(TBOX *table_box) { + // Start a search above the current table to look for column headers + ColPartitionGridSearch vsearch(&clean_part_grid_); + vsearch.StartVerticalSearch(table_box->left(), table_box->right(), + table_box->top()); + ColPartition *neighbor = nullptr; + ColPartition *previous_neighbor = nullptr; + while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) { + // Max distance to find a table heading. + const int max_distance = + kMaxColumnHeaderDistance * neighbor->median_height(); + int table_top = table_box->top(); + const TBOX &box = neighbor->bounding_box(); + // Do not continue if the next box is way above + if (box.bottom() - table_top > max_distance) { + break; + } + // Unconditionally include partitions of type TABLE or LINE + // TODO(faisal): add some reasonable conditions here + if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) { + table_box->set_top(box.top()); + previous_neighbor = nullptr; + continue; + } + // If there are two text partitions, one above the other, without a table + // cell on their left or right side, consider them a barrier and quit + if (previous_neighbor == nullptr) { + previous_neighbor = neighbor; + } else { + const TBOX &previous_box = previous_neighbor->bounding_box(); + if (!box.major_y_overlap(previous_box)) { + break; + } + } + } +} + +// Remove false alarms consisting of a single column based on their +// projection on the x-axis. Projection of a real table on the x-axis +// should have at least one zero-valley larger than the global median +// x-height of the page. +void TableFinder::DeleteSingleColumnTables() { + int page_width = tright().x() - bleft().x(); + ASSERT_HOST(page_width > 0); + // create an integer array to hold projection on x-axis + int *table_xprojection = new int[page_width]; + // Iterate through all tables in the table grid + GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search( + &table_grid_); + table_search.StartFullSearch(); + ColSegment *table; + while ((table = table_search.NextFullSearch()) != nullptr) { + TBOX table_box = table->bounding_box(); + // reset the projection array + for (int i = 0; i < page_width; i++) { + table_xprojection[i] = 0; + } + // Start a rect search on table_box + ColPartitionGridSearch rectsearch(&clean_part_grid_); + rectsearch.SetUniqueMode(true); + rectsearch.StartRectSearch(table_box); + ColPartition *part; + while ((part = rectsearch.NextRectSearch()) != nullptr) { + if (!part->IsTextType()) { + continue; // Do not consider non-text partitions + } + if (part->flow() == BTFT_LEADER) { + continue; // Assume leaders are in tables + } + TBOX part_box = part->bounding_box(); + // Do not consider partitions partially covered by the table + if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) { + continue; + } + BLOBNBOX_CLIST *part_boxes = part->boxes(); + BLOBNBOX_C_IT pit(part_boxes); + + // Make sure overlapping blobs don't artificially inflate the number + // of rows in the table. This happens frequently with things such as + // decimals and split characters. Do this by assuming the column + // partition is sorted mostly left to right and just clip + // bounding boxes by the previous box's extent. + int next_position_to_write = 0; + + for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { + BLOBNBOX *pblob = pit.data(); + // ignore blob height for the purpose of projection since we + // are only interested in finding valleys + int xstart = pblob->bounding_box().left(); + int xend = pblob->bounding_box().right(); + + xstart = std::max(xstart, next_position_to_write); + for (int i = xstart; i < xend; i++) { + table_xprojection[i - bleft().x()]++; + } + next_position_to_write = xend; + } + } + // Find largest valley between two reasonable peaks in the table + if (!GapInXProjection(table_xprojection, page_width)) { + table_search.RemoveBBox(); + delete table; + } + } + delete[] table_xprojection; +} + +// Return true if at least one gap larger than the global x-height +// exists in the horizontal projection +bool TableFinder::GapInXProjection(int *xprojection, int length) { + // Find peak value of the histogram + int peak_value = 0; + for (int i = 0; i < length; i++) { + if (xprojection[i] > peak_value) { + peak_value = xprojection[i]; + } + } + // Peak value represents the maximum number of horizontally + // overlapping colpartitions, so this can be considered as the + // number of rows in the table + if (peak_value < kMinRowsInTable) { + return false; + } + double projection_threshold = kSmallTableProjectionThreshold * peak_value; + if (peak_value >= kLargeTableRowCount) { + projection_threshold = kLargeTableProjectionThreshold * peak_value; + } + // Threshold the histogram + for (int i = 0; i < length; i++) { + xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0; + } + // Find the largest run of zeros between two ones + int largest_gap = 0; + int run_start = -1; + for (int i = 1; i < length; i++) { + // detect start of a run of zeros + if (xprojection[i - 1] && !xprojection[i]) { + run_start = i; + } + // detect end of a run of zeros and update the value of largest gap + if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) { + int gap = i - run_start; + if (gap > largest_gap) { + largest_gap = gap; + } + run_start = -1; + } + } + return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_; +} + +// Given the location of a table "guess", try to overlay a cellular +// grid in the location, adjusting the boundaries. +// TODO(nbeato): Falsely introduces: +// -headers/footers (not any worse, too much overlap destroys cells) +// -page numbers (not worse, included because maximize margins) +// -equations (nicely fit into a celluar grid, but more sparsely) +// -figures (random text box, also sparse) +// -small left-aligned text areas with overlapping positioned whitespace +// (rejected before) +// Overall, this just needs some more work. +void TableFinder::RecognizeTables() { +#ifndef GRAPHICS_DISABLED + ScrollView *table_win = nullptr; + if (textord_show_tables) { + table_win = MakeWindow(0, 0, "Table Structure"); + DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE, + ScrollView::LIGHT_BLUE); + // table_grid_.DisplayBoxes(table_win); + } +#endif + + TableRecognizer recognizer; + recognizer.Init(); + recognizer.set_line_grid(&leader_and_ruling_grid_); + recognizer.set_text_grid(&fragmented_text_grid_); + recognizer.set_max_text_height(global_median_xheight_ * 2.0); + recognizer.set_min_height(1.5 * gridheight()); + // Loop over all of the tables and try to fit them. + // Store the good tables here. + ColSegment_CLIST good_tables; + ColSegment_C_IT good_it(&good_tables); + + ColSegmentGridSearch gsearch(&table_grid_); + gsearch.StartFullSearch(); + ColSegment *found_table = nullptr; + while ((found_table = gsearch.NextFullSearch()) != nullptr) { + gsearch.RemoveBBox(); + + // The goal is to make the tables persistent in a list. + // When that happens, this will move into the search loop. + const TBOX &found_box = found_table->bounding_box(); + StructuredTable *table_structure = recognizer.RecognizeTable(found_box); + + // Process a table. Good tables are inserted into the grid again later on + // We can't change boxes in the grid while it is running a search. + if (table_structure != nullptr) { +#ifndef GRAPHICS_DISABLED + if (textord_show_tables) { + table_structure->Display(table_win, ScrollView::LIME_GREEN); + } +#endif + found_table->set_bounding_box(table_structure->bounding_box()); + delete table_structure; + good_it.add_after_then_move(found_table); + } else { + delete found_table; + } + } + // TODO(nbeato): MERGE!! There is awesome info now available for merging. + + // At this point, the grid is empty. We can safely insert the good tables + // back into grid. + for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) { + table_grid_.InsertBBox(true, true, good_it.extract()); + } +} + +#ifndef GRAPHICS_DISABLED + +// Displays the column segments in some window. +void TableFinder::DisplayColSegments(ScrollView *win, ColSegment_LIST *segments, + ScrollView::Color color) { + win->Pen(color); + win->Brush(ScrollView::NONE); + ColSegment_IT it(segments); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + ColSegment *col = it.data(); + const TBOX &box = col->bounding_box(); + int left_x = box.left(); + int right_x = box.right(); + int top_y = box.top(); + int bottom_y = box.bottom(); + win->Rectangle(left_x, bottom_y, right_x, top_y); + } + win->UpdateWindow(); +} + +// Displays the colpartitions using a new coloring on an existing window. +// Note: This method is only for debug purpose during development and +// would not be part of checked in code +void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, + ScrollView::Color default_color, + ScrollView::Color table_color) { + ScrollView::Color color = default_color; + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(grid); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + color = default_color; + if (part->type() == PT_TABLE) { + color = table_color; + } + + const TBOX &box = part->bounding_box(); + int left_x = box.left(); + int right_x = box.right(); + int top_y = box.top(); + int bottom_y = box.bottom(); + win->Brush(ScrollView::NONE); + win->Pen(color); + win->Rectangle(left_x, bottom_y, right_x, top_y); + } + win->UpdateWindow(); +} + +void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, + ScrollView::Color default_color) { + DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW); +} + +void TableFinder::DisplayColPartitionConnections(ScrollView *win, + ColPartitionGrid *grid, + ScrollView::Color color) { + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(grid); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + while ((part = gsearch.NextFullSearch()) != nullptr) { + const TBOX &box = part->bounding_box(); + int left_x = box.left(); + int right_x = box.right(); + int top_y = box.top(); + int bottom_y = box.bottom(); + + ColPartition *upper_part = part->nearest_neighbor_above(); + if (upper_part) { + const TBOX &upper_box = upper_part->bounding_box(); + int mid_x = (left_x + right_x) / 2; + int mid_y = (top_y + bottom_y) / 2; + int other_x = (upper_box.left() + upper_box.right()) / 2; + int other_y = (upper_box.top() + upper_box.bottom()) / 2; + win->Brush(ScrollView::NONE); + win->Pen(color); + win->Line(mid_x, mid_y, other_x, other_y); + } + ColPartition *lower_part = part->nearest_neighbor_below(); + if (lower_part) { + const TBOX &lower_box = lower_part->bounding_box(); + int mid_x = (left_x + right_x) / 2; + int mid_y = (top_y + bottom_y) / 2; + int other_x = (lower_box.left() + lower_box.right()) / 2; + int other_y = (lower_box.top() + lower_box.bottom()) / 2; + win->Brush(ScrollView::NONE); + win->Pen(color); + win->Line(mid_x, mid_y, other_x, other_y); + } + } + win->UpdateWindow(); +} + +#endif + +// Merge all colpartitions in table regions to make them a single +// colpartition and revert types of isolated table cells not +// assigned to any table to their original types. +void TableFinder::MakeTableBlocks(ColPartitionGrid *grid, + ColPartitionSet **all_columns, + const WidthCallback &width_cb) { + // Since we have table blocks already, remove table tags from all + // colpartitions + ColPartitionGridSearch gsearch(grid); + gsearch.StartFullSearch(); + ColPartition *part = nullptr; + + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (part->type() == PT_TABLE) { + part->clear_table_type(); + } + } + // Now make a single colpartition out of each table block and remove + // all colpartitions contained within a table + GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search( + &table_grid_); + table_search.StartFullSearch(); + ColSegment *table; + while ((table = table_search.NextFullSearch()) != nullptr) { + const TBOX &table_box = table->bounding_box(); + // Start a rect search on table_box + ColPartitionGridSearch rectsearch(grid); + rectsearch.StartRectSearch(table_box); + ColPartition *part; + ColPartition *table_partition = nullptr; + while ((part = rectsearch.NextRectSearch()) != nullptr) { + // Do not consider image partitions + if (!part->IsTextType()) { + continue; + } + TBOX part_box = part->bounding_box(); + // Include partition in the table if more than half of it + // is covered by the table + if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { + rectsearch.RemoveBBox(); + if (table_partition) { + table_partition->Absorb(part, width_cb); + } else { + table_partition = part; + } + } + } + // Insert table colpartition back to part_grid_ + if (table_partition) { + // To match the columns used when transforming to blocks, the new table + // partition must have its first and last column set at the grid y that + // corresponds to its bottom. + const TBOX &table_box = table_partition->bounding_box(); + int grid_x, grid_y; + grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y); + table_partition->SetPartitionType(resolution_, all_columns[grid_y]); + table_partition->set_table_type(); + table_partition->set_blob_type(BRT_TEXT); + table_partition->set_flow(BTFT_CHAIN); + table_partition->SetBlobTypes(); + grid->InsertBBox(true, true, table_partition); + } + } +} + +//////// ColSegment code +//////// +ColSegment::ColSegment() + : ELIST_LINK(), + num_table_cells_(0), + num_text_cells_(0), + type_(COL_UNKNOWN) {} + +// Provides a color for BBGrid to draw the rectangle. +ScrollView::Color ColSegment::BoxColor() const { + const ScrollView::Color kBoxColors[PT_COUNT] = { + ScrollView::YELLOW, + ScrollView::BLUE, + ScrollView::YELLOW, + ScrollView::MAGENTA, + }; + return kBoxColors[type_]; +} + +// Insert a box into this column segment +void ColSegment::InsertBox(const TBOX &other) { + bounding_box_ = bounding_box_.bounding_union(other); +} + +// Set column segment type based on the ratio of text and table partitions +// in it. +void ColSegment::set_type() { + if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) { + type_ = COL_TABLE; + } else if (num_text_cells_ > num_table_cells_) { + type_ = COL_TEXT; + } else { + type_ = COL_MIXED; + } +} + +} // namespace tesseract.
