Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/tabfind.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tabfind.h | |
| 3 // Description: Subclass of BBGrid to find tabstops. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2008, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_TEXTORD_TABFIND_H_ | |
| 20 #define TESSERACT_TEXTORD_TABFIND_H_ | |
| 21 | |
| 22 #include <functional> // for std::function | |
| 23 #include "alignedblob.h" | |
| 24 #include "linefind.h" | |
| 25 #include "tabvector.h" | |
| 26 | |
| 27 class BLOBNBOX; | |
| 28 class BLOBNBOX_LIST; | |
| 29 class TO_BLOCK; | |
| 30 class ScrollView; | |
| 31 struct Pix; | |
| 32 | |
| 33 namespace tesseract { | |
| 34 | |
| 35 using WidthCallback = std::function<bool(int)>; | |
| 36 | |
| 37 struct AlignedBlobParams; | |
| 38 class ColPartitionGrid; | |
| 39 | |
| 40 /** Pixel resolution of column width estimates. */ | |
| 41 const int kColumnWidthFactor = 20; | |
| 42 | |
| 43 /** | |
| 44 * The TabFind class contains code to find tab-stops and maintain the | |
| 45 * vectors_ list of tab vectors. | |
| 46 * Also provides an interface to find neighbouring blobs | |
| 47 * in the grid of BLOBNBOXes that is used by multiple subclasses. | |
| 48 * Searching is a complex operation because of the need to enforce | |
| 49 * rule/separator lines, and tabstop boundaries, (when available), so | |
| 50 * as the holder of the list of TabVectors this class provides the functions. | |
| 51 */ | |
| 52 class TESS_API TabFind : public AlignedBlob { | |
| 53 public: | |
| 54 TabFind(int gridsize, const ICOORD &bleft, const ICOORD &tright, TabVector_LIST *vlines, | |
| 55 int vertical_x, int vertical_y, int resolution); | |
| 56 ~TabFind() override; | |
| 57 | |
| 58 /** | |
| 59 * Insert a list of blobs into the given grid (not necessarily this). | |
| 60 * See InsertBlob for the other arguments. | |
| 61 * It would seem to make more sense to swap this and grid, but this way | |
| 62 * around allows grid to not be derived from TabFind, eg a ColPartitionGrid, | |
| 63 * while the grid that provides the tab stops(this) has to be derived from | |
| 64 * TabFind. | |
| 65 */ | |
| 66 void InsertBlobsToGrid(bool h_spread, bool v_spread, BLOBNBOX_LIST *blobs, | |
| 67 BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid); | |
| 68 | |
| 69 /** | |
| 70 * Insert a single blob into the given grid (not necessarily this). | |
| 71 * If h_spread, then all cells covered horizontally by the box are | |
| 72 * used, otherwise, just the bottom-left. Similarly for v_spread. | |
| 73 * A side effect is that the left and right rule edges of the blob are | |
| 74 * set according to the tab vectors in this (not grid). | |
| 75 */ | |
| 76 bool InsertBlob(bool h_spread, bool v_spread, BLOBNBOX *blob, | |
| 77 BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid); | |
| 78 // Calls SetBlobRuleEdges for all the blobs in the given block. | |
| 79 void SetBlockRuleEdges(TO_BLOCK *block); | |
| 80 // Sets the left and right rule and crossing_rules for the blobs in the given | |
| 81 // list by finding the next outermost tabvectors for each blob. | |
| 82 void SetBlobRuleEdges(BLOBNBOX_LIST *blobs); | |
| 83 | |
| 84 // Returns the gutter width of the given TabVector between the given y limits. | |
| 85 // Also returns x-shift to be added to the vector to clear any intersecting | |
| 86 // blobs. The shift is deducted from the returned gutter. | |
| 87 // If ignore_unmergeables is true, then blobs of UnMergeableType are | |
| 88 // ignored as if they don't exist. (Used for text on image.) | |
| 89 // max_gutter_width is used as the maximum width worth searching for in case | |
| 90 // there is nothing near the TabVector. | |
| 91 int GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables, | |
| 92 int max_gutter_width, int *required_shift); | |
| 93 /** | |
| 94 * Find the gutter width and distance to inner neighbour for the given blob. | |
| 95 */ | |
| 96 void GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left, | |
| 97 BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap); | |
| 98 | |
| 99 /** | |
| 100 * Return the x-coord that corresponds to the right edge for the given | |
| 101 * box. If there is a rule line to the right that vertically overlaps it, | |
| 102 * then return the x-coord of the rule line, otherwise return the right | |
| 103 * edge of the page. For details see RightTabForBox below. | |
| 104 */ | |
| 105 int RightEdgeForBox(const TBOX &box, bool crossing, bool extended); | |
| 106 /** | |
| 107 * As RightEdgeForBox, but finds the left Edge instead. | |
| 108 */ | |
| 109 int LeftEdgeForBox(const TBOX &box, bool crossing, bool extended); | |
| 110 | |
| 111 /** | |
| 112 * Return the TabVector that corresponds to the right edge for the given | |
| 113 * box. If there is a TabVector to the right that vertically overlaps it, | |
| 114 * then return it, otherwise return nullptr. Note that Right and Left refer | |
| 115 * to the position of the TabVector, not its type, ie RightTabForBox | |
| 116 * returns the nearest TabVector to the right of the box, regardless of | |
| 117 * its type. | |
| 118 * If a TabVector crosses right through the box (as opposed to grazing one | |
| 119 * edge or missing entirely), then crossing false will ignore such a line. | |
| 120 * Crossing true will return the line for BOTH left and right edges. | |
| 121 * If extended is true, then TabVectors are considered to extend to their | |
| 122 * extended_start/end_y, otherwise, just the startpt_ and endpt_. | |
| 123 * These functions make use of an internal iterator to the vectors_ list | |
| 124 * for speed when used repeatedly on neighbouring boxes. The caveat is | |
| 125 * that the iterator must be updated whenever the list is modified. | |
| 126 */ | |
| 127 TabVector *RightTabForBox(const TBOX &box, bool crossing, bool extended); | |
| 128 /** | |
| 129 * As RightTabForBox, but finds the left TabVector instead. | |
| 130 */ | |
| 131 TabVector *LeftTabForBox(const TBOX &box, bool crossing, bool extended); | |
| 132 | |
| 133 /** | |
| 134 * Return true if the given width is close to one of the common | |
| 135 * widths in column_widths_. | |
| 136 */ | |
| 137 bool CommonWidth(int width); | |
| 138 /** | |
| 139 * Return true if the sizes are more than a | |
| 140 * factor of 2 different. | |
| 141 */ | |
| 142 static bool DifferentSizes(int size1, int size2); | |
| 143 /** | |
| 144 * Return true if the sizes are more than a | |
| 145 * factor of 5 different. | |
| 146 */ | |
| 147 static bool VeryDifferentSizes(int size1, int size2); | |
| 148 | |
| 149 /** | |
| 150 * Return a callback for testing CommonWidth. | |
| 151 */ | |
| 152 WidthCallback WidthCB() { | |
| 153 return width_cb_; | |
| 154 } | |
| 155 | |
| 156 /** | |
| 157 * Return the coords at which to draw the image backdrop. | |
| 158 */ | |
| 159 const ICOORD &image_origin() const { | |
| 160 return image_origin_; | |
| 161 } | |
| 162 | |
| 163 protected: | |
| 164 /** | |
| 165 // Accessors | |
| 166 */ | |
| 167 TabVector_LIST *vectors() { | |
| 168 return &vectors_; | |
| 169 } | |
| 170 TabVector_LIST *dead_vectors() { | |
| 171 return &dead_vectors_; | |
| 172 } | |
| 173 | |
| 174 /** | |
| 175 * Top-level function to find TabVectors in an input page block. | |
| 176 * Returns false if the detected skew angle is impossible. | |
| 177 * Applies the detected skew angle to deskew the tabs, blobs and part_grid. | |
| 178 * tabfind_aligned_gap_fraction should be the value of parameter | |
| 179 * textord_tabfind_aligned_gap_fraction | |
| 180 */ | |
| 181 bool FindTabVectors(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block, | |
| 182 int min_gutter_width, double tabfind_aligned_gap_fraction, | |
| 183 ColPartitionGrid *part_grid, FCOORD *deskew, FCOORD *reskew); | |
| 184 | |
| 185 // Top-level function to not find TabVectors in an input page block, | |
| 186 // but setup for single column mode. | |
| 187 void DontFindTabVectors(BLOBNBOX_LIST *image_blobs, TO_BLOCK *block, FCOORD *deskew, | |
| 188 FCOORD *reskew); | |
| 189 | |
| 190 // Cleans up the lists of blobs in the block ready for use by TabFind. | |
| 191 // Large blobs that look like text are moved to the main blobs list. | |
| 192 // Main blobs that are superseded by the image blobs are deleted. | |
| 193 void TidyBlobs(TO_BLOCK *block); | |
| 194 | |
| 195 // Helper function to setup search limits for *TabForBox. | |
| 196 void SetupTabSearch(int x, int y, int *min_key, int *max_key); | |
| 197 | |
| 198 /** | |
| 199 * Display the tab vectors found in this grid. | |
| 200 */ | |
| 201 ScrollView *DisplayTabVectors(ScrollView *tab_win); | |
| 202 | |
| 203 // First part of FindTabVectors, which may be used twice if the text | |
| 204 // is mostly of vertical alignment. If find_vertical_text flag is | |
| 205 // true, this finds vertical textlines in possibly rotated blob space. | |
| 206 // In other words, when the page has mostly vertical lines and is rotated, | |
| 207 // setting this to true will find horizontal lines on the page. | |
| 208 // tabfind_aligned_gap_fraction should be the value of parameter | |
| 209 // textord_tabfind_aligned_gap_fraction | |
| 210 ScrollView *FindInitialTabVectors(BLOBNBOX_LIST *image_blobs, int min_gutter_width, | |
| 211 double tabfind_aligned_gap_fraction, TO_BLOCK *block); | |
| 212 | |
| 213 // Apply the given rotation to the given list of blobs. | |
| 214 static void RotateBlobList(const FCOORD &rotation, BLOBNBOX_LIST *blobs); | |
| 215 | |
| 216 // Flip the vertical and horizontal lines and rotate the grid ready | |
| 217 // for working on the rotated image. | |
| 218 // The min_gutter_width will be adjusted to the median gutter width between | |
| 219 // vertical tabs to set a better threshold for tabboxes in the 2nd pass. | |
| 220 void ResetForVerticalText(const FCOORD &rotate, const FCOORD &rerotate, | |
| 221 TabVector_LIST *horizontal_lines, int *min_gutter_width); | |
| 222 | |
| 223 // Clear the grid and get rid of the tab vectors, but not separators, | |
| 224 // ready to start again. | |
| 225 void Reset(); | |
| 226 | |
| 227 // Reflect the separator tab vectors and the grids in the y-axis. | |
| 228 // Can only be called after Reset! | |
| 229 void ReflectInYAxis(); | |
| 230 | |
| 231 private: | |
| 232 // For each box in the grid, decide whether it is a candidate tab-stop, | |
| 233 // and if so add it to the left and right tab boxes. | |
| 234 // tabfind_aligned_gap_fraction should be the value of parameter | |
| 235 // textord_tabfind_aligned_gap_fraction | |
| 236 ScrollView *FindTabBoxes(int min_gutter_width, double tabfind_aligned_gap_fraction); | |
| 237 | |
| 238 // Return true if this box looks like a candidate tab stop, and set | |
| 239 // the appropriate tab type(s) to TT_UNCONFIRMED. | |
| 240 // tabfind_aligned_gap_fraction should be the value of parameter | |
| 241 // textord_tabfind_aligned_gap_fraction | |
| 242 bool TestBoxForTabs(BLOBNBOX *bbox, int min_gutter_width, double tabfind_aligned_gap_fraction); | |
| 243 | |
| 244 // Returns true if there is nothing in the rectangle of width min_gutter to | |
| 245 // the left of bbox. | |
| 246 bool ConfirmRaggedLeft(BLOBNBOX *bbox, int min_gutter); | |
| 247 // Returns true if there is nothing in the rectangle of width min_gutter to | |
| 248 // the right of bbox. | |
| 249 bool ConfirmRaggedRight(BLOBNBOX *bbox, int min_gutter); | |
| 250 // Returns true if there is nothing in the given search_box that vertically | |
| 251 // overlaps target_box other than target_box itself. | |
| 252 bool NothingYOverlapsInBox(const TBOX &search_box, const TBOX &target_box); | |
| 253 | |
| 254 // Fills the list of TabVector with the tabstops found in the grid, | |
| 255 // and estimates the logical vertical direction. | |
| 256 void FindAllTabVectors(int min_gutter_width); | |
| 257 // Helper for FindAllTabVectors finds the vectors of a particular type. | |
| 258 int FindTabVectors(int search_size_multiple, TabAlignment alignment, int min_gutter_width, | |
| 259 TabVector_LIST *vectors, int *vertical_x, int *vertical_y); | |
| 260 // Finds a vector corresponding to a tabstop running through the | |
| 261 // given box of the given alignment type. | |
| 262 // search_size_multiple is a multiple of height used to control | |
| 263 // the size of the search. | |
| 264 // vertical_x and y are updated with an estimate of the real | |
| 265 // vertical direction. (skew finding.) | |
| 266 // Returns nullptr if no decent tabstop can be found. | |
| 267 TabVector *FindTabVector(int search_size_multiple, int min_gutter_width, TabAlignment alignment, | |
| 268 BLOBNBOX *bbox, int *vertical_x, int *vertical_y); | |
| 269 | |
| 270 // Set the vertical_skew_ member from the given vector and refit | |
| 271 // all vectors parallel to the skew vector. | |
| 272 void SetVerticalSkewAndParallelize(int vertical_x, int vertical_y); | |
| 273 | |
| 274 // Sort all the current vectors using the vertical_skew_ vector. | |
| 275 void SortVectors(); | |
| 276 | |
| 277 // Evaluate all the current tab vectors. | |
| 278 void EvaluateTabs(); | |
| 279 | |
| 280 // Trace textlines from one side to the other of each tab vector, saving | |
| 281 // the most frequent column widths found in a list so that a given width | |
| 282 // can be tested for being a common width with a simple callback function. | |
| 283 void ComputeColumnWidths(ScrollView *tab_win, ColPartitionGrid *part_grid); | |
| 284 | |
| 285 // Finds column width and: | |
| 286 // if col_widths is not null (pass1): | |
| 287 // pair-up tab vectors with existing ColPartitions and accumulate widths. | |
| 288 // else (pass2): | |
| 289 // find the largest real partition width for each recorded column width, | |
| 290 // to be used as the minimum acceptable width. | |
| 291 void ApplyPartitionsToColumnWidths(ColPartitionGrid *part_grid, STATS *col_widths); | |
| 292 | |
| 293 // Helper makes the list of common column widths in column_widths_ from the | |
| 294 // input col_widths. Destroys the content of col_widths by repeatedly | |
| 295 // finding the mode and erasing the peak. | |
| 296 void MakeColumnWidths(int col_widths_size, STATS *col_widths); | |
| 297 | |
| 298 // Mark blobs as being in a vertical text line where that is the case. | |
| 299 void MarkVerticalText(); | |
| 300 | |
| 301 // Returns the median gutter width between pairs of matching tab vectors | |
| 302 // assuming they are sorted left-to-right. If there are too few data | |
| 303 // points (< kMinLinesInColumn), then 0 is returned. | |
| 304 int FindMedianGutterWidth(TabVector_LIST *tab_vectors); | |
| 305 | |
| 306 // Find the next adjacent (to left or right) blob on this text line, | |
| 307 // with the constraint that it must vertically significantly overlap | |
| 308 // the [top_y, bottom_y] range. | |
| 309 // If ignore_images is true, then blobs with aligned_text() < 0 are treated | |
| 310 // as if they do not exist. | |
| 311 BLOBNBOX *AdjacentBlob(const BLOBNBOX *bbox, bool look_left, bool ignore_images, | |
| 312 double min_overlap_fraction, int gap_limit, int top_y, int bottom_y); | |
| 313 | |
| 314 // Add a bi-directional partner relationship between the left | |
| 315 // and the right. If one (or both) of the vectors is a separator, | |
| 316 // extend a nearby extendable vector or create a new one of the | |
| 317 // correct type, using the given left or right blob as a guide. | |
| 318 void AddPartnerVector(BLOBNBOX *left_blob, BLOBNBOX *right_blob, TabVector *left, | |
| 319 TabVector *right); | |
| 320 | |
| 321 /** | |
| 322 * Remove separators and unused tabs from the main vectors_ list | |
| 323 * to the dead_vectors_ list. | |
| 324 */ | |
| 325 void CleanupTabs(); | |
| 326 | |
| 327 /** | |
| 328 * Deskew the tab vectors and blobs, computing the rotation and resetting | |
| 329 * the storked vertical_skew_. The deskew inverse is returned in reskew. | |
| 330 * Returns false if the detected skew angle is impossible. | |
| 331 */ | |
| 332 bool Deskew(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block, FCOORD *deskew, | |
| 333 FCOORD *reskew); | |
| 334 | |
| 335 // Compute the rotation required to deskew, and its inverse rotation. | |
| 336 void ComputeDeskewVectors(FCOORD *deskew, FCOORD *reskew); | |
| 337 | |
| 338 /** | |
| 339 * Compute and apply constraints to the end positions of TabVectors so | |
| 340 * that where possible partners end at the same y coordinate. | |
| 341 */ | |
| 342 void ApplyTabConstraints(); | |
| 343 | |
| 344 protected: | |
| 345 ICOORD vertical_skew_; ///< Estimate of true vertical in this image. | |
| 346 int resolution_; ///< Of source image in pixels per inch. | |
| 347 private: | |
| 348 ICOORD image_origin_; ///< Top-left of image in deskewed coords | |
| 349 TabVector_LIST vectors_; ///< List of rule line and tabstops. | |
| 350 TabVector_IT v_it_; ///< Iterator for searching vectors_. | |
| 351 TabVector_LIST dead_vectors_; ///< Separators and unpartnered tab vectors. | |
| 352 // List of commonly occurring width ranges with x=min and y=max. | |
| 353 ICOORDELT_LIST column_widths_; ///< List of commonly occurring width ranges. | |
| 354 /** Callback to test an int for being a common width. */ | |
| 355 WidthCallback width_cb_; | |
| 356 // Sets of bounding boxes that are candidate tab stops. | |
| 357 std::vector<BLOBNBOX *> left_tab_boxes_; | |
| 358 std::vector<BLOBNBOX *> right_tab_boxes_; | |
| 359 }; | |
| 360 | |
| 361 } // namespace tesseract. | |
| 362 | |
| 363 #endif // TESSERACT_TEXTORD_TABFIND_H_ |
