Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/tablerecog.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tablerecog.h | |
| 3 // Description: Functions to detect structure of tables. | |
| 4 // Author: Nicholas Beato | |
| 5 // | |
| 6 // (C) Copyright 2010, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TABLERECOG_H_ | |
| 20 #define TABLERECOG_H_ | |
| 21 | |
| 22 #include "colpartitiongrid.h" | |
| 23 | |
| 24 namespace tesseract { | |
| 25 | |
| 26 // There are 2 classes in this file. They have 2 different purposes. | |
| 27 // - StructuredTable contains the methods to find the structure given | |
| 28 // a specific bounding box and grow that structure. | |
| 29 // - TableRecognizer contains the methods to adjust the possible positions | |
| 30 // of a table without worrying about structure. | |
| 31 // | |
| 32 // To use these classes, the assumption is that the TableFinder will | |
| 33 // have a guess of the location of a table (or possibly over/undersegmented | |
| 34 // tables). The TableRecognizer is responsible for finding the table boundaries | |
| 35 // at a high level. The StructuredTable class is responsible for determining | |
| 36 // the structure of the table and trying to maximize its bounds while retaining | |
| 37 // the structure. | |
| 38 // (The latter part is not implemented yet, but that was the goal). | |
| 39 // | |
| 40 // While on the boundary discussion, keep in mind that this is a first pass. | |
| 41 // There should eventually be some things like internal structure checks, | |
| 42 // and, more importantly, surrounding text flow checks. | |
| 43 // | |
| 44 | |
| 45 // Usage: | |
| 46 // The StructuredTable class contains methods to query a potential table. | |
| 47 // It has functions to find structure, count rows, find ColPartitions that | |
| 48 // intersect gridlines, etc. It is not meant to blindly find a table. It | |
| 49 // is meant to start with a known table location and enhance it. | |
| 50 // Usage: | |
| 51 // ColPartitionGrid text_grid, line_grid; // init | |
| 52 // TBOX table_box; // known location of table location | |
| 53 // | |
| 54 // StructuredTable table; | |
| 55 // table.Init(); // construction code | |
| 56 // table.set_text_grid(/* text */); // These 2 grids can be the same! | |
| 57 // table.set_line_grid(/* lines */); | |
| 58 // table.set_min_text_height(10); // Filter vertical and tall text. | |
| 59 // // IMPORTANT! The table needs to be told where it is! | |
| 60 // table.set_bounding_box(table_box); // Set initial table location. | |
| 61 // if (table.FindWhitespacedStructure()) { | |
| 62 // // process table | |
| 63 // table.column_count(); // number of columns | |
| 64 // table.row_count(); // number of rows | |
| 65 // table.cells_count(); // number of cells | |
| 66 // table.bounding_box(); // updated bounding box | |
| 67 // // etc. | |
| 68 // } | |
| 69 // | |
| 70 class TESS_API StructuredTable { | |
| 71 public: | |
| 72 StructuredTable(); | |
| 73 ~StructuredTable() = default; | |
| 74 | |
| 75 // Initialization code. Must be called after the constructor. | |
| 76 void Init(); | |
| 77 | |
| 78 // Sets the grids used by the table. These can be changed between | |
| 79 // calls to Recognize. They are treated as read-only data. | |
| 80 void set_text_grid(ColPartitionGrid *text); | |
| 81 void set_line_grid(ColPartitionGrid *lines); | |
| 82 // Filters text partitions that are ridiculously tall to prevent | |
| 83 // merging rows. | |
| 84 void set_max_text_height(int height); | |
| 85 | |
| 86 // Basic accessors. Some are treated as attributes despite having indirect | |
| 87 // representation. | |
| 88 bool is_lined() const; | |
| 89 unsigned row_count() const; | |
| 90 unsigned column_count() const; | |
| 91 unsigned cell_count() const; | |
| 92 void set_bounding_box(const TBOX &box); | |
| 93 const TBOX &bounding_box() const; | |
| 94 int median_cell_height(); | |
| 95 int median_cell_width(); | |
| 96 int row_height(unsigned row) const; | |
| 97 int column_width(unsigned column) const; | |
| 98 int space_above() const; | |
| 99 int space_below() const; | |
| 100 | |
| 101 // Given enough horizontal and vertical lines in a region, create this table | |
| 102 // based on the structure given by the lines. Return true if it worked out. | |
| 103 // Code assumes the lines exist. It is the caller's responsibility to check | |
| 104 // for lines and find an appropriate bounding box. | |
| 105 bool FindLinedStructure(); | |
| 106 | |
| 107 // The main subroutine for finding generic table structure. The function | |
| 108 // finds the grid structure in the given box. Returns true if a good grid | |
| 109 // exists, implying that "this" table is valid. | |
| 110 bool FindWhitespacedStructure(); | |
| 111 | |
| 112 //////// | |
| 113 //////// Functions to query table info. | |
| 114 //////// | |
| 115 | |
| 116 // Returns true if inserting part into the table does not cause any | |
| 117 // cell merges. | |
| 118 bool DoesPartitionFit(const ColPartition &part) const; | |
| 119 // Checks if a sub-table has multiple data cells filled. | |
| 120 int CountFilledCells(); | |
| 121 int CountFilledCellsInRow(int row); | |
| 122 int CountFilledCellsInColumn(int column); | |
| 123 int CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, unsigned column_end); | |
| 124 | |
| 125 // Makes sure that at least one cell in a row has substantial area filled. | |
| 126 // This can filter out large whitespace caused by growing tables too far | |
| 127 // and page numbers. | |
| 128 // (currently bugged for some reason). | |
| 129 bool VerifyRowFilled(int row); | |
| 130 // Finds the filled area in a cell. | |
| 131 double CalculateCellFilledPercentage(unsigned row, unsigned column); | |
| 132 | |
| 133 // Debug display, draws the table in the given color. If the table is not | |
| 134 // valid, the table and "best" grid lines are still drawn in the given color. | |
| 135 void Display(ScrollView *window, ScrollView::Color color); | |
| 136 | |
| 137 protected: | |
| 138 // Clear the structure information. | |
| 139 void ClearStructure(); | |
| 140 | |
| 141 //////// | |
| 142 //////// Lined tables | |
| 143 //////// | |
| 144 | |
| 145 // Verifies the lines do not intersect partitions. This happens when | |
| 146 // the lines are in column boundaries and extend the full page. As a result, | |
| 147 // the grid lines go through column text. The condition is detectable. | |
| 148 bool VerifyLinedTableCells(); | |
| 149 | |
| 150 //////// | |
| 151 //////// Tables with whitespace | |
| 152 //////// | |
| 153 | |
| 154 // This is the function to change if you want to filter resulting tables | |
| 155 // better. Right now it just checks for a minimum cell count and such. | |
| 156 // You could add things like maximum number of ColPartitions per cell or | |
| 157 // similar. | |
| 158 bool VerifyWhitespacedTable(); | |
| 159 // Find the columns of a table using whitespace. | |
| 160 void FindWhitespacedColumns(); | |
| 161 // Find the rows of a table using whitespace. | |
| 162 void FindWhitespacedRows(); | |
| 163 | |
| 164 //////// | |
| 165 //////// Functions to provide information about the table. | |
| 166 //////// | |
| 167 | |
| 168 // Calculates the whitespace around the table using the table boundary and | |
| 169 // the supplied grids (set_text_grid and set_line_grid). | |
| 170 void CalculateMargins(); | |
| 171 // Update the table margins with the supplied grid. This is | |
| 172 // only called by calculate margins to use multiple grid sources. | |
| 173 void UpdateMargins(ColPartitionGrid *grid); | |
| 174 int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const; | |
| 175 int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const; | |
| 176 // Calculates stats on the table, namely the median cell height and width. | |
| 177 void CalculateStats(); | |
| 178 | |
| 179 //////// | |
| 180 //////// Functions to try to "fix" some table errors. | |
| 181 //////// | |
| 182 | |
| 183 // Given a whitespaced table, this looks for bordering lines that might | |
| 184 // be page layout boxes around the table. It is necessary to get the margins | |
| 185 // correct on the table. If the lines are not joined, the margins will be | |
| 186 // the distance to the line, which is not right. | |
| 187 void AbsorbNearbyLines(); | |
| 188 | |
| 189 // Nice utility function for finding partition gaps. You feed it a sorted | |
| 190 // list of all of the mins/maxes of the partitions in the table, and it gives | |
| 191 // you the gaps (middle). This works for both vertical and horizontal | |
| 192 // gaps. | |
| 193 // | |
| 194 // If you want to allow slight overlap in the division and the partitions, | |
| 195 // just scale down the partitions before inserting them in the list. | |
| 196 // Likewise, you can force at least some space between partitions. | |
| 197 // This trick is how the horizontal partitions are done (since the page | |
| 198 // skew could make it hard to find splits in the text). | |
| 199 // | |
| 200 // As a result, "0 distance" between closest partitions causes a gap. | |
| 201 // This is not a programmatic assumption. It is intentional and simplifies | |
| 202 // things. | |
| 203 // | |
| 204 // "max_merged" indicates both the minimum number of stacked partitions | |
| 205 // to cause a cell (add 1 to it), and the maximum number of partitions that | |
| 206 // a grid line can intersect. For example, if max_merged is 0, then lines | |
| 207 // are inserted wherever space exists between partitions. If it is 2, | |
| 208 // lines may intersect 2 partitions at most, but you also need at least | |
| 209 // 2 partitions to generate a line. | |
| 210 static void FindCellSplitLocations(const std::vector<int> &min_list, | |
| 211 const std::vector<int> &max_list, int max_merged, | |
| 212 std::vector<int> *locations); | |
| 213 | |
| 214 //////// | |
| 215 //////// Utility function for table queries | |
| 216 //////// | |
| 217 | |
| 218 // Counts the number of ColPartitions that intersect vertical cell | |
| 219 // division at this x value. Used by VerifyLinedTable. | |
| 220 int CountVerticalIntersections(int x); | |
| 221 int CountHorizontalIntersections(int y); | |
| 222 | |
| 223 // Counts how many text partitions are in this box. | |
| 224 int CountPartitions(const TBOX &box); | |
| 225 | |
| 226 //////// | |
| 227 //////// Data members. | |
| 228 //////// | |
| 229 | |
| 230 // Input data, used as read only data to make decisions. | |
| 231 ColPartitionGrid *text_grid_; // Text ColPartitions | |
| 232 ColPartitionGrid *line_grid_; // Line ColPartitions | |
| 233 // Table structure. | |
| 234 // bounding box is a convenient external representation. | |
| 235 // cell_x_ and cell_y_ indicate the grid lines. | |
| 236 TBOX bounding_box_; // Bounding box | |
| 237 std::vector<int> cell_x_; // Locations of vertical divisions (sorted) | |
| 238 std::vector<int> cell_y_; // Locations of horizontal divisions (sorted) | |
| 239 bool is_lined_; // Is the table backed up by a line structure | |
| 240 // Table margins, set via CalculateMargins | |
| 241 int space_above_; | |
| 242 int space_below_; | |
| 243 int space_left_; | |
| 244 int space_right_; | |
| 245 int median_cell_height_; | |
| 246 int median_cell_width_; | |
| 247 // Filters, used to prevent awkward partitions from destroying structure. | |
| 248 int max_text_height_; | |
| 249 }; | |
| 250 | |
| 251 class TESS_API TableRecognizer { | |
| 252 public: | |
| 253 TableRecognizer() = default; | |
| 254 ~TableRecognizer() = default; | |
| 255 | |
| 256 // Initialization code. Must be called after the constructor. | |
| 257 void Init(); | |
| 258 | |
| 259 //////// | |
| 260 //////// Pre-recognize methods to initial table constraints. | |
| 261 //////// | |
| 262 | |
| 263 // Sets the grids used by the table. These can be changed between | |
| 264 // calls to Recognize. They are treated as read-only data. | |
| 265 void set_text_grid(ColPartitionGrid *text); | |
| 266 void set_line_grid(ColPartitionGrid *lines); | |
| 267 // Sets some additional constraints on the table. | |
| 268 void set_min_height(int height); | |
| 269 void set_min_width(int width); | |
| 270 // Filters text partitions that are ridiculously tall to prevent | |
| 271 // merging rows. Note that "filters" refers to allowing horizontal | |
| 272 // cells to slice through them on the premise that they were | |
| 273 // merged text rows during previous layout. | |
| 274 void set_max_text_height(int height); | |
| 275 | |
| 276 // Given a guess location, the RecognizeTable function will try to find a | |
| 277 // structured grid in the area. On success, it will return a new | |
| 278 // StructuredTable (and assumes you will delete it). Otherwise, | |
| 279 // nullptr is returned. | |
| 280 // | |
| 281 // Keep in mind, this may "overgrow" or "undergrow" the size of guess. | |
| 282 // Ideally, there is either a one-to-one correspondence between | |
| 283 // the guess and table or no table at all. This is not the best of | |
| 284 // assumptions right now, but was made to try to keep things simple in | |
| 285 // the first pass. | |
| 286 // | |
| 287 // If a line structure is available on the page in the given region, | |
| 288 // the table will use the linear structure as it is. | |
| 289 // Otherwise, it will try to maximize the whitespace around it while keeping | |
| 290 // a grid structure. This is somewhat working. | |
| 291 // | |
| 292 // Since the combination of adjustments can get high, effort was | |
| 293 // originally made to keep the number of adjustments linear in the number | |
| 294 // of partitions. The underlying structure finding code used to be | |
| 295 // much more complex. I don't know how necessary this constraint is anymore. | |
| 296 // The evaluation of a possible table is kept within O(nlogn) in the size of | |
| 297 // the table (where size is the number of partitions in the table). | |
| 298 // As a result, the algorithm is capable of O(n^2 log n). Depending | |
| 299 // on the grid search size, it may be higher. | |
| 300 // | |
| 301 // Last note: it is possible to just try all partition boundaries at a high | |
| 302 // level O(n^4) and do a verification scheme (at least O(nlogn)). If there | |
| 303 // area 200 partitions on a page, this could be too costly. Effort could go | |
| 304 // into pruning the search, but I opted for something quicker. I'm confident | |
| 305 // that the independent adjustments can get similar results and keep the | |
| 306 // complextiy down. However, the other approach could work without using | |
| 307 // TableFinder at all if it is fast enough. It comes down to properly | |
| 308 // deciding what is a table. The code currently relies on TableFinder's | |
| 309 // guess to the location of a table for that. | |
| 310 StructuredTable *RecognizeTable(const TBOX &guess_box); | |
| 311 | |
| 312 protected: | |
| 313 //////// | |
| 314 //////// Lined tables | |
| 315 //////// | |
| 316 | |
| 317 // Returns true if the given box has a lined table within it. The | |
| 318 // table argument will be updated with the table if the table exists. | |
| 319 bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table); | |
| 320 // Returns true if the given box has a large number of horizontal and | |
| 321 // vertical lines present. If so, we assume the extent of these lines | |
| 322 // uniquely defines a table and find that table via SolveLinedTable. | |
| 323 bool HasSignificantLines(const TBOX &guess); | |
| 324 | |
| 325 // Given enough horizontal and vertical lines in a region, find a bounding | |
| 326 // box that encloses all of them (as well as newly introduced lines). | |
| 327 // The bounding box is the smallest box that encloses the lines in guess | |
| 328 // without having any lines sticking out of it. | |
| 329 // bounding_box is an in/out parameter. | |
| 330 // On input, it in the extents of the box to search. | |
| 331 // On output, it is the resulting bounding box. | |
| 332 bool FindLinesBoundingBox(TBOX *bounding_box); | |
| 333 // Iteration in above search. | |
| 334 // bounding_box is an in/out parameter. | |
| 335 // On input, it in the extents of the box to search. | |
| 336 // On output, it is the resulting bounding box. | |
| 337 bool FindLinesBoundingBoxIteration(TBOX *bounding_box); | |
| 338 | |
| 339 //////// | |
| 340 //////// Generic "whitespaced" tables | |
| 341 //////// | |
| 342 | |
| 343 // Returns true if the given box has a whitespaced table within it. The | |
| 344 // table argument will be updated if the table exists. Also note | |
| 345 // that this method will fail if the guess_box center is not | |
| 346 // mostly within the table. | |
| 347 bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table); | |
| 348 | |
| 349 // Finds the location of a horizontal split relative to y. | |
| 350 // This function is mostly unused now. If the SolveWhitespacedTable | |
| 351 // changes much, it can be removed. Note, it isn't really as reliable | |
| 352 // as I thought. I went with alternatives for most of the other uses. | |
| 353 int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom); | |
| 354 | |
| 355 // Input data, used as read only data to make decisions. | |
| 356 ColPartitionGrid *text_grid_ = nullptr; // Text ColPartitions | |
| 357 ColPartitionGrid *line_grid_ = nullptr; // Line ColPartitions | |
| 358 // Table constraints, a "good" table must satisfy these. | |
| 359 int min_height_ = 0; | |
| 360 int min_width_ = 0; | |
| 361 // Filters, used to prevent awkward partitions from destroying structure. | |
| 362 int max_text_height_ = INT32_MAX; // Horizontal lines may intersect taller text. | |
| 363 }; | |
| 364 | |
| 365 } // namespace tesseract | |
| 366 | |
| 367 #endif /* TABLERECOG_H_ */ |
