Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/tablerecog.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/tablerecog.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,367 @@ +/////////////////////////////////////////////////////////////////////// +// File: tablerecog.h +// Description: Functions to detect structure of tables. +// Author: Nicholas Beato +// +// (C) Copyright 2010, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TABLERECOG_H_ +#define TABLERECOG_H_ + +#include "colpartitiongrid.h" + +namespace tesseract { + +// There are 2 classes in this file. They have 2 different purposes. +// - StructuredTable contains the methods to find the structure given +// a specific bounding box and grow that structure. +// - TableRecognizer contains the methods to adjust the possible positions +// of a table without worrying about structure. +// +// To use these classes, the assumption is that the TableFinder will +// have a guess of the location of a table (or possibly over/undersegmented +// tables). The TableRecognizer is responsible for finding the table boundaries +// at a high level. The StructuredTable class is responsible for determining +// the structure of the table and trying to maximize its bounds while retaining +// the structure. +// (The latter part is not implemented yet, but that was the goal). +// +// While on the boundary discussion, keep in mind that this is a first pass. +// There should eventually be some things like internal structure checks, +// and, more importantly, surrounding text flow checks. +// + +// Usage: +// The StructuredTable class contains methods to query a potential table. +// It has functions to find structure, count rows, find ColPartitions that +// intersect gridlines, etc. It is not meant to blindly find a table. It +// is meant to start with a known table location and enhance it. +// Usage: +// ColPartitionGrid text_grid, line_grid; // init +// TBOX table_box; // known location of table location +// +// StructuredTable table; +// table.Init(); // construction code +// table.set_text_grid(/* text */); // These 2 grids can be the same! +// table.set_line_grid(/* lines */); +// table.set_min_text_height(10); // Filter vertical and tall text. +// // IMPORTANT! The table needs to be told where it is! +// table.set_bounding_box(table_box); // Set initial table location. +// if (table.FindWhitespacedStructure()) { +// // process table +// table.column_count(); // number of columns +// table.row_count(); // number of rows +// table.cells_count(); // number of cells +// table.bounding_box(); // updated bounding box +// // etc. +// } +// +class TESS_API StructuredTable { +public: + StructuredTable(); + ~StructuredTable() = default; + + // Initialization code. Must be called after the constructor. + void Init(); + + // Sets the grids used by the table. These can be changed between + // calls to Recognize. They are treated as read-only data. + void set_text_grid(ColPartitionGrid *text); + void set_line_grid(ColPartitionGrid *lines); + // Filters text partitions that are ridiculously tall to prevent + // merging rows. + void set_max_text_height(int height); + + // Basic accessors. Some are treated as attributes despite having indirect + // representation. + bool is_lined() const; + unsigned row_count() const; + unsigned column_count() const; + unsigned cell_count() const; + void set_bounding_box(const TBOX &box); + const TBOX &bounding_box() const; + int median_cell_height(); + int median_cell_width(); + int row_height(unsigned row) const; + int column_width(unsigned column) const; + int space_above() const; + int space_below() const; + + // Given enough horizontal and vertical lines in a region, create this table + // based on the structure given by the lines. Return true if it worked out. + // Code assumes the lines exist. It is the caller's responsibility to check + // for lines and find an appropriate bounding box. + bool FindLinedStructure(); + + // The main subroutine for finding generic table structure. The function + // finds the grid structure in the given box. Returns true if a good grid + // exists, implying that "this" table is valid. + bool FindWhitespacedStructure(); + + //////// + //////// Functions to query table info. + //////// + + // Returns true if inserting part into the table does not cause any + // cell merges. + bool DoesPartitionFit(const ColPartition &part) const; + // Checks if a sub-table has multiple data cells filled. + int CountFilledCells(); + int CountFilledCellsInRow(int row); + int CountFilledCellsInColumn(int column); + int CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, unsigned column_end); + + // Makes sure that at least one cell in a row has substantial area filled. + // This can filter out large whitespace caused by growing tables too far + // and page numbers. + // (currently bugged for some reason). + bool VerifyRowFilled(int row); + // Finds the filled area in a cell. + double CalculateCellFilledPercentage(unsigned row, unsigned column); + + // Debug display, draws the table in the given color. If the table is not + // valid, the table and "best" grid lines are still drawn in the given color. + void Display(ScrollView *window, ScrollView::Color color); + +protected: + // Clear the structure information. + void ClearStructure(); + + //////// + //////// Lined tables + //////// + + // Verifies the lines do not intersect partitions. This happens when + // the lines are in column boundaries and extend the full page. As a result, + // the grid lines go through column text. The condition is detectable. + bool VerifyLinedTableCells(); + + //////// + //////// Tables with whitespace + //////// + + // This is the function to change if you want to filter resulting tables + // better. Right now it just checks for a minimum cell count and such. + // You could add things like maximum number of ColPartitions per cell or + // similar. + bool VerifyWhitespacedTable(); + // Find the columns of a table using whitespace. + void FindWhitespacedColumns(); + // Find the rows of a table using whitespace. + void FindWhitespacedRows(); + + //////// + //////// Functions to provide information about the table. + //////// + + // Calculates the whitespace around the table using the table boundary and + // the supplied grids (set_text_grid and set_line_grid). + void CalculateMargins(); + // Update the table margins with the supplied grid. This is + // only called by calculate margins to use multiple grid sources. + void UpdateMargins(ColPartitionGrid *grid); + int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const; + int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const; + // Calculates stats on the table, namely the median cell height and width. + void CalculateStats(); + + //////// + //////// Functions to try to "fix" some table errors. + //////// + + // Given a whitespaced table, this looks for bordering lines that might + // be page layout boxes around the table. It is necessary to get the margins + // correct on the table. If the lines are not joined, the margins will be + // the distance to the line, which is not right. + void AbsorbNearbyLines(); + + // Nice utility function for finding partition gaps. You feed it a sorted + // list of all of the mins/maxes of the partitions in the table, and it gives + // you the gaps (middle). This works for both vertical and horizontal + // gaps. + // + // If you want to allow slight overlap in the division and the partitions, + // just scale down the partitions before inserting them in the list. + // Likewise, you can force at least some space between partitions. + // This trick is how the horizontal partitions are done (since the page + // skew could make it hard to find splits in the text). + // + // As a result, "0 distance" between closest partitions causes a gap. + // This is not a programmatic assumption. It is intentional and simplifies + // things. + // + // "max_merged" indicates both the minimum number of stacked partitions + // to cause a cell (add 1 to it), and the maximum number of partitions that + // a grid line can intersect. For example, if max_merged is 0, then lines + // are inserted wherever space exists between partitions. If it is 2, + // lines may intersect 2 partitions at most, but you also need at least + // 2 partitions to generate a line. + static void FindCellSplitLocations(const std::vector<int> &min_list, + const std::vector<int> &max_list, int max_merged, + std::vector<int> *locations); + + //////// + //////// Utility function for table queries + //////// + + // Counts the number of ColPartitions that intersect vertical cell + // division at this x value. Used by VerifyLinedTable. + int CountVerticalIntersections(int x); + int CountHorizontalIntersections(int y); + + // Counts how many text partitions are in this box. + int CountPartitions(const TBOX &box); + + //////// + //////// Data members. + //////// + + // Input data, used as read only data to make decisions. + ColPartitionGrid *text_grid_; // Text ColPartitions + ColPartitionGrid *line_grid_; // Line ColPartitions + // Table structure. + // bounding box is a convenient external representation. + // cell_x_ and cell_y_ indicate the grid lines. + TBOX bounding_box_; // Bounding box + std::vector<int> cell_x_; // Locations of vertical divisions (sorted) + std::vector<int> cell_y_; // Locations of horizontal divisions (sorted) + bool is_lined_; // Is the table backed up by a line structure + // Table margins, set via CalculateMargins + int space_above_; + int space_below_; + int space_left_; + int space_right_; + int median_cell_height_; + int median_cell_width_; + // Filters, used to prevent awkward partitions from destroying structure. + int max_text_height_; +}; + +class TESS_API TableRecognizer { +public: + TableRecognizer() = default; + ~TableRecognizer() = default; + + // Initialization code. Must be called after the constructor. + void Init(); + + //////// + //////// Pre-recognize methods to initial table constraints. + //////// + + // Sets the grids used by the table. These can be changed between + // calls to Recognize. They are treated as read-only data. + void set_text_grid(ColPartitionGrid *text); + void set_line_grid(ColPartitionGrid *lines); + // Sets some additional constraints on the table. + void set_min_height(int height); + void set_min_width(int width); + // Filters text partitions that are ridiculously tall to prevent + // merging rows. Note that "filters" refers to allowing horizontal + // cells to slice through them on the premise that they were + // merged text rows during previous layout. + void set_max_text_height(int height); + + // Given a guess location, the RecognizeTable function will try to find a + // structured grid in the area. On success, it will return a new + // StructuredTable (and assumes you will delete it). Otherwise, + // nullptr is returned. + // + // Keep in mind, this may "overgrow" or "undergrow" the size of guess. + // Ideally, there is either a one-to-one correspondence between + // the guess and table or no table at all. This is not the best of + // assumptions right now, but was made to try to keep things simple in + // the first pass. + // + // If a line structure is available on the page in the given region, + // the table will use the linear structure as it is. + // Otherwise, it will try to maximize the whitespace around it while keeping + // a grid structure. This is somewhat working. + // + // Since the combination of adjustments can get high, effort was + // originally made to keep the number of adjustments linear in the number + // of partitions. The underlying structure finding code used to be + // much more complex. I don't know how necessary this constraint is anymore. + // The evaluation of a possible table is kept within O(nlogn) in the size of + // the table (where size is the number of partitions in the table). + // As a result, the algorithm is capable of O(n^2 log n). Depending + // on the grid search size, it may be higher. + // + // Last note: it is possible to just try all partition boundaries at a high + // level O(n^4) and do a verification scheme (at least O(nlogn)). If there + // area 200 partitions on a page, this could be too costly. Effort could go + // into pruning the search, but I opted for something quicker. I'm confident + // that the independent adjustments can get similar results and keep the + // complextiy down. However, the other approach could work without using + // TableFinder at all if it is fast enough. It comes down to properly + // deciding what is a table. The code currently relies on TableFinder's + // guess to the location of a table for that. + StructuredTable *RecognizeTable(const TBOX &guess_box); + +protected: + //////// + //////// Lined tables + //////// + + // Returns true if the given box has a lined table within it. The + // table argument will be updated with the table if the table exists. + bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table); + // Returns true if the given box has a large number of horizontal and + // vertical lines present. If so, we assume the extent of these lines + // uniquely defines a table and find that table via SolveLinedTable. + bool HasSignificantLines(const TBOX &guess); + + // Given enough horizontal and vertical lines in a region, find a bounding + // box that encloses all of them (as well as newly introduced lines). + // The bounding box is the smallest box that encloses the lines in guess + // without having any lines sticking out of it. + // bounding_box is an in/out parameter. + // On input, it in the extents of the box to search. + // On output, it is the resulting bounding box. + bool FindLinesBoundingBox(TBOX *bounding_box); + // Iteration in above search. + // bounding_box is an in/out parameter. + // On input, it in the extents of the box to search. + // On output, it is the resulting bounding box. + bool FindLinesBoundingBoxIteration(TBOX *bounding_box); + + //////// + //////// Generic "whitespaced" tables + //////// + + // Returns true if the given box has a whitespaced table within it. The + // table argument will be updated if the table exists. Also note + // that this method will fail if the guess_box center is not + // mostly within the table. + bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table); + + // Finds the location of a horizontal split relative to y. + // This function is mostly unused now. If the SolveWhitespacedTable + // changes much, it can be removed. Note, it isn't really as reliable + // as I thought. I went with alternatives for most of the other uses. + int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom); + + // Input data, used as read only data to make decisions. + ColPartitionGrid *text_grid_ = nullptr; // Text ColPartitions + ColPartitionGrid *line_grid_ = nullptr; // Line ColPartitions + // Table constraints, a "good" table must satisfy these. + int min_height_ = 0; + int min_width_ = 0; + // Filters, used to prevent awkward partitions from destroying structure. + int max_text_height_ = INT32_MAX; // Horizontal lines may intersect taller text. +}; + +} // namespace tesseract + +#endif /* TABLERECOG_H_ */
