comparison mupdf-source/thirdparty/tesseract/src/textord/tablefind.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: tablefind.h
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author: Faisal Shafait (faisal.shafait@dfki.de)
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #ifndef TESSERACT_TEXTORD_TABLEFIND_H_
20 #define TESSERACT_TEXTORD_TABLEFIND_H_
21
22 #include "colpartitiongrid.h"
23 #include "elst.h"
24 #include "rect.h"
25
26 namespace tesseract {
27
28 // Possible types for a column segment.
29 enum ColSegType { COL_UNKNOWN, COL_TEXT, COL_TABLE, COL_MIXED, COL_COUNT };
30
31 class ColPartitionSet;
32
33 // ColSegment holds rectangular blocks that represent segmentation of a page
34 // into regions containing single column text/table.
35 class ColSegment;
36 ELISTIZEH(ColSegment)
37 CLISTIZEH(ColSegment)
38
39 class ColSegment : public ELIST_LINK {
40 public:
41 ColSegment();
42 ~ColSegment() = default;
43
44 // Simple accessors and mutators
45 const TBOX &bounding_box() const {
46 return bounding_box_;
47 }
48
49 void set_top(int y) {
50 bounding_box_.set_top(y);
51 }
52
53 void set_bottom(int y) {
54 bounding_box_.set_bottom(y);
55 }
56
57 void set_left(int x) {
58 bounding_box_.set_left(x);
59 }
60
61 void set_right(int x) {
62 bounding_box_.set_right(x);
63 }
64
65 void set_bounding_box(const TBOX &other) {
66 bounding_box_ = other;
67 }
68
69 int get_num_table_cells() const {
70 return num_table_cells_;
71 }
72
73 // set the number of table colpartitions covered by the bounding_box_
74 void set_num_table_cells(int n) {
75 num_table_cells_ = n;
76 }
77
78 int get_num_text_cells() const {
79 return num_text_cells_;
80 }
81
82 // set the number of text colpartitions covered by the bounding_box_
83 void set_num_text_cells(int n) {
84 num_text_cells_ = n;
85 }
86
87 ColSegType type() const {
88 return type_;
89 }
90
91 // set the type of the block based on the ratio of table to text
92 // colpartitions covered by it.
93 void set_type();
94
95 // Provides a color for BBGrid to draw the rectangle.
96 ScrollView::Color BoxColor() const;
97
98 // Insert a rectangle into bounding_box_
99 void InsertBox(const TBOX &other);
100
101 private:
102 TBOX bounding_box_; // bounding box
103 int num_table_cells_;
104 int num_text_cells_;
105 ColSegType type_;
106 };
107
108 // Typedef BBGrid of ColSegments
109 using ColSegmentGrid = BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT>;
110 using ColSegmentGridSearch =
111 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>;
112
113 // TableFinder is a utility class to find a set of tables given a set of
114 // ColPartitions and Columns. The TableFinder will mark candidate ColPartitions
115 // based on research in "Table Detection in Heterogeneous Documents".
116 // Usage flow is as follows:
117 // TableFinder finder;
118 // finder.InsertCleanPartitions(/* grid info */)
119 // finder.LocateTables(/* ColPartitions and Columns */);
120 // finder.Update TODO(nbeato)
121 class TESS_API TableFinder {
122 public:
123 // Constructor is simple initializations
124 TableFinder();
125 ~TableFinder();
126
127 // Set the resolution of the connected components in ppi.
128 void set_resolution(int resolution) {
129 resolution_ = resolution;
130 }
131 // Change the reading order. Initially it is left to right.
132 void set_left_to_right_language(bool order);
133
134 // Initialize
135 void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right);
136
137 // Copy cleaned partitions from ColumnFinder's part_grid_ to this
138 // clean_part_grid_ and insert dot-like noise into period_grid_.
139 // It resizes the grids in this object to the dimensions of grid.
140 void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block);
141
142 // High level function to perform table detection
143 // Finds tables and updates the grid object with new partitions for the
144 // tables. The columns and width callbacks are used to merge tables.
145 // The reskew argument is only used to write the tables to the out.png
146 // if that feature is enabled.
147 void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns,
148 WidthCallback width_cb, const FCOORD &reskew);
149
150 protected:
151 // Access for the grid dimensions.
152 // The results will not be correct until InsertCleanPartitions
153 // has been called. The values are taken from the grid passed as an argument
154 // to that function.
155 int gridsize() const;
156 int gridwidth() const;
157 int gridheight() const;
158 const ICOORD &bleft() const;
159 const ICOORD &tright() const;
160
161 // Makes a window for debugging, see BBGrid
162 ScrollView *MakeWindow(int x, int y, const char *window_name);
163
164 //////// Functions to insert objects from the grid into the table finder.
165 //////// In all cases, ownership is transferred to the table finder.
166 // Inserts text into the table finder.
167 void InsertTextPartition(ColPartition *part);
168 void InsertFragmentedTextPartition(ColPartition *part);
169 void InsertLeaderPartition(ColPartition *part);
170 void InsertRulingPartition(ColPartition *part);
171 void InsertImagePartition(ColPartition *part);
172 void SplitAndInsertFragmentedTextPartition(ColPartition *part);
173 bool AllowTextPartition(const ColPartition &part) const;
174 bool AllowBlob(const BLOBNBOX &blob) const;
175
176 //////// Functions that manipulate ColPartitions in the part_grid_ /////
177 //////// to find tables.
178 ////////
179
180 // Utility function to move segments to col_seg_grid
181 // Note: Move includes ownership,
182 // so segments will be be owned by col_seg_grid
183 void MoveColSegmentsToGrid(ColSegment_LIST *segments,
184 ColSegmentGrid *col_seg_grid);
185
186 //////// Set up code to run during table detection to correctly
187 //////// initialize variables on column partitions that are used later.
188 ////////
189
190 // Initialize the grid and partitions
191 void InitializePartitions(ColPartitionSet **all_columns);
192
193 // Set left, right and top, bottom spacings of each colpartition.
194 // Left/right spacings are w.r.t the column boundaries
195 // Top/bottom spacings are w.r.t. previous and next colpartitions
196 static void SetPartitionSpacings(ColPartitionGrid *grid,
197 ColPartitionSet **all_columns);
198
199 // Set spacing and closest neighbors above and below a given colpartition.
200 void SetVerticalSpacing(ColPartition *part);
201
202 // Set global spacing estimates. This function is dependent on the
203 // partition spacings. So make sure SetPartitionSpacings is called
204 // on the same grid before this.
205 void SetGlobalSpacings(ColPartitionGrid *grid);
206 // Access to the global median xheight. The xheight is the height
207 // of a lowercase 'x' character on the page. This can be viewed as the
208 // average height of a lowercase letter in a textline. As a result
209 // it is used to make assumptions about spacing between words and
210 // table cells.
211 void set_global_median_xheight(int xheight);
212 // Access to the global median blob width. The width is useful
213 // when deciding if a partition is noise.
214 void set_global_median_blob_width(int width);
215 // Access to the global median ledding. The ledding is the distance between
216 // two adjacent text lines. This value can be used to get a rough estimate
217 // for the amount of space between two lines of text. As a result, it
218 // is used to calculate appropriate spacing between adjacent rows of text.
219 void set_global_median_ledding(int ledding);
220
221 // Updates the nearest neighbors for each ColPartition in clean_part_grid_.
222 // The neighbors are most likely SingletonPartner calls after the neighbors
223 // are assigned. This is hear until it is decided to remove the
224 // nearest_neighbor code in ColPartition
225 void FindNeighbors();
226
227 //////// Functions to mark candidate column partitions as tables.
228 //////// Tables are marked as described in
229 //////// Table Detection in Heterogeneous Documents (2010, Shafait & Smith)
230 ////////
231
232 // High level function to mark partitions as table rows/cells.
233 // When this function is done, the column partitions in clean_part_grid_
234 // should mostly be marked as tables.
235 void MarkTablePartitions();
236 // Marks partitions given a local view of a single partition
237 void MarkPartitionsUsingLocalInformation();
238 /////// Heuristics for local marking
239 // Check if the partition has at least one large gap between words or no
240 // significant gap at all
241 // TODO(nbeato): Make const, prevented because blobnbox array access
242 bool HasWideOrNoInterWordGap(ColPartition *part) const;
243 // Checks if a partition is adjacent to leaders on the page
244 bool HasLeaderAdjacent(const ColPartition &part);
245 // Filter individual text partitions marked as table partitions
246 // consisting of paragraph endings, small section headings, and
247 // headers and footers.
248 void FilterFalseAlarms();
249 void FilterParagraphEndings();
250 void FilterHeaderAndFooter();
251 // Mark all ColPartitions as table cells that have a table cell above
252 // and below them
253 void SmoothTablePartitionRuns();
254
255 //////// Functions to create bounding boxes (ColSegment) objects for
256 //////// the columns on the page. The columns are not necessarily
257 //////// vertical lines, meaning if tab stops strongly suggests that
258 //////// a column changes horizontal position, as in the case below,
259 //////// The ColSegment objects will respect that after processing.
260 ////////
261 //////// _____________
262 //////// Ex. | | |
263 //////// |_____|______| 5 boxes: 2 on this line
264 //////// | | | | 3 on this line
265 //////// |___|____|___|
266 ////////
267
268 // Get Column segments from best_columns_
269 void GetColumnBlocks(ColPartitionSet **columns,
270 ColSegment_LIST *col_segments);
271
272 // Group Column segments into consecutive single column regions.
273 void GroupColumnBlocks(ColSegment_LIST *current_segments,
274 ColSegment_LIST *col_segments);
275
276 // Check if two boxes are consecutive within the same column
277 bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2);
278
279 // Set the ratio of candidate table partitions in each column
280 void SetColumnsType(ColSegment_LIST *col_segments);
281
282 // Merge Column Blocks that were split due to the presence of a table
283 void GridMergeColumnBlocks();
284
285 //////// Functions to turn marked ColPartitions into candidate tables
286 //////// using a modified T-Recs++ algorithm described in
287 //////// Applying The T-Recs Table Recognition System
288 //////// To The Business Letter Domain (2001, Kieninger & Dengel)
289 ////////
290
291 // Merge partititons cells into table columns
292 // Differs from paper by just looking at marked table partitions
293 // instead of similarity metric.
294 // Modified section 4.1 of paper.
295 void GetTableColumns(ColSegment_LIST *table_columns);
296
297 // Finds regions within a column that potentially contain a table.
298 // Ie, the table columns from GetTableColumns are turned into boxes
299 // that span the entire page column (using ColumnBlocks found in
300 // earlier functions) in the x direction and the min/max extent of
301 // overlapping table columns in the y direction.
302 // Section 4.2 of paper.
303 void GetTableRegions(ColSegment_LIST *table_columns,
304 ColSegment_LIST *table_regions);
305
306 //////// Functions to "patch up" found tables
307 ////////
308
309 // Merge table regions corresponding to tables spanning multiple columns
310 void GridMergeTableRegions();
311 bool BelongToOneTable(const TBOX &box1, const TBOX &box2);
312
313 // Adjust table boundaries by building a tight bounding box around all
314 // ColPartitions contained in it.
315 void AdjustTableBoundaries();
316
317 // Grows a table to include partitions that are partially covered
318 // by the table. This includes lines and text. It does not include
319 // noise or images.
320 // On entry, result_box is the minimum size of the result. The results of the
321 // function will union the actual result with result_box.
322 void GrowTableBox(const TBOX &table_box, TBOX *result_box);
323 // Grow a table by increasing the size of the box to include
324 // partitions with significant overlap with the table.
325 void GrowTableToIncludePartials(const TBOX &table_box,
326 const TBOX &search_range, TBOX *result_box);
327 // Grow a table by expanding to the extents of significantly
328 // overlapping lines.
329 void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range,
330 TBOX *result_box);
331 // Checks whether the horizontal line belong to the table by looking at the
332 // side spacing of extra ColPartitions that will be included in the table
333 // due to expansion
334 bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box);
335
336 // Look for isolated column headers above the given table box and
337 // include them in the table
338 void IncludeLeftOutColumnHeaders(TBOX *table_box);
339
340 // Remove false alarms consisting of a single column
341 void DeleteSingleColumnTables();
342
343 // Return true if at least one gap larger than the global x-height
344 // exists in the horizontal projection
345 bool GapInXProjection(int *xprojection, int length);
346
347 //////// Recognize the tables.
348 ////////
349 // This function will run the table recognizer and try to find better
350 // bounding boxes. The structures of the tables never leave this function
351 // right now. It just tries to prune and merge tables based on info it
352 // has available.
353 void RecognizeTables();
354
355 //////// Debugging functions. Render different structures to GUI
356 //////// for visual debugging / intuition.
357 ////////
358
359 // Displays Colpartitions marked as table row. Overlays them on top of
360 // part_grid_.
361 void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols,
362 ScrollView::Color color);
363
364 // Displays the colpartitions using a new coloring on an existing window.
365 // Note: This method is only for debug purpose during development and
366 // would not be part of checked in code
367 void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,
368 ScrollView::Color text_color,
369 ScrollView::Color table_color);
370 void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,
371 ScrollView::Color default_color);
372 void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid,
373 ScrollView::Color default_color);
374
375 // Merge all colpartitions in table regions to make them a single
376 // colpartition and revert types of isolated table cells not
377 // assigned to any table to their original types.
378 void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns,
379 const WidthCallback &width_cb);
380
381 /////////////////////////////////////////////////
382 // Useful objects used during table find process.
383 /////////////////////////////////////////////////
384 // Resolution of the connected components in ppi.
385 int resolution_;
386 // Estimate of median x-height over the page
387 int global_median_xheight_;
388 // Estimate of the median blob width on the page
389 int global_median_blob_width_;
390 // Estimate of median leading on the page
391 int global_median_ledding_;
392 // Grid to hold cleaned colpartitions after removing all
393 // colpartitions that consist of only noise blobs, and removing
394 // noise blobs from remaining colpartitions.
395 ColPartitionGrid clean_part_grid_;
396 // Grid contains the leaders and ruling lines.
397 ColPartitionGrid leader_and_ruling_grid_;
398 // Grid contains the broken down column partitions. It can be thought
399 // of as a "word" grid. However, it usually doesn't break apart text lines.
400 // It does break apart table data (most of the time).
401 ColPartitionGrid fragmented_text_grid_;
402 // Grid of page column blocks
403 ColSegmentGrid col_seg_grid_;
404 // Grid of detected tables
405 ColSegmentGrid table_grid_;
406 // The reading order of text. Defaults to true, for languages such as English.
407 bool left_to_right_language_;
408 };
409
410 } // namespace tesseract.
411
412 #endif // TESSERACT_TEXTORD_TABLEFIND_H_