Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/colfind.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: colfind.h | |
| 3 // Description: Class to find columns in the grid of BLOBNBOXes. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2008, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_TEXTORD_COLFIND_H_ | |
| 20 #define TESSERACT_TEXTORD_COLFIND_H_ | |
| 21 | |
| 22 #include "colpartitiongrid.h" | |
| 23 #include "colpartitionset.h" | |
| 24 #include "debugpixa.h" | |
| 25 #include "imagefind.h" | |
| 26 #include "ocrblock.h" | |
| 27 #include "tabfind.h" | |
| 28 #include "textlineprojection.h" | |
| 29 | |
| 30 class BLOCK_LIST; | |
| 31 struct Boxa; | |
| 32 struct Pixa; | |
| 33 class DENORM; | |
| 34 class ScrollView; | |
| 35 class STATS; | |
| 36 class TO_BLOCK; | |
| 37 | |
| 38 namespace tesseract { | |
| 39 | |
| 40 class ColPartitionSet; | |
| 41 class ColPartitionSet_LIST; | |
| 42 class ColSegment_LIST; | |
| 43 class ColumnGroup_LIST; | |
| 44 class LineSpacing; | |
| 45 class StrokeWidth; | |
| 46 class TempColumn_LIST; | |
| 47 class EquationDetectBase; | |
| 48 | |
| 49 // The ColumnFinder class finds columns in the grid. | |
| 50 class TESS_API ColumnFinder : public TabFind { | |
| 51 public: | |
| 52 // Gridsize is an estimate of the text size in the image. A suitable value | |
| 53 // is in TO_BLOCK::line_size after find_components has been used to make | |
| 54 // the blobs. | |
| 55 // bleft and tright are the bounds of the image (rectangle) being processed. | |
| 56 // vlines is a (possibly empty) list of TabVector; vertical_x and vertical_y are | |
| 57 // the sum logical vertical vector produced by LineFinder::FindVerticalLines. | |
| 58 // If cjk_script is true, then broken CJK characters are fixed during | |
| 59 // layout analysis to assist in detecting horizontal vs vertically written | |
| 60 // textlines. | |
| 61 ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &tright, int resolution, | |
| 62 bool cjk_script, double aligned_gap_fraction, TabVector_LIST *vlines, | |
| 63 TabVector_LIST *hlines, int vertical_x, int vertical_y); | |
| 64 ~ColumnFinder() override; | |
| 65 | |
| 66 // Accessors for testing | |
| 67 const DENORM *denorm() const { | |
| 68 return denorm_; | |
| 69 } | |
| 70 const TextlineProjection *projection() const { | |
| 71 return &projection_; | |
| 72 } | |
| 73 void set_cjk_script(bool is_cjk) { | |
| 74 cjk_script_ = is_cjk; | |
| 75 } | |
| 76 | |
| 77 // ====================================================================== | |
| 78 // The main function of ColumnFinder is broken into pieces to facilitate | |
| 79 // optional insertion of orientation and script detection in an efficient | |
| 80 // way. The calling sequence IS MANDATORY however, whether or not | |
| 81 // OSD is being used: | |
| 82 // 1. Construction. | |
| 83 // 2. SetupAndFilterNoise. | |
| 84 // 3. IsVerticallyAlignedText. | |
| 85 // 4. CorrectOrientation. | |
| 86 // 5. FindBlocks. | |
| 87 // 6. Destruction. Use of a single column finder for multiple images does not | |
| 88 // make sense. | |
| 89 // Throughout these steps, the ColPartitions are owned by part_grid_, which | |
| 90 // means that it must be kept correct. Exception: big_parts_ owns its | |
| 91 // own ColPartitions. | |
| 92 // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except | |
| 93 // for a phase in FindBlocks before TransformToBlocks, when they become | |
| 94 // owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX | |
| 95 // indicates more of a betrothal for the majority of layout analysis, ie | |
| 96 // which ColPartition will take ownership when the blobs are released from | |
| 97 // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that | |
| 98 // are part of the image regions, as they are not on any TO_BLOCK list. | |
| 99 // TODO(rays) break up column finder further into smaller classes, as | |
| 100 // there is a lot more to it than column finding now. | |
| 101 // ====================================================================== | |
| 102 | |
| 103 // Performs initial processing on the blobs in the input_block: | |
| 104 // Setup the part_grid, stroke_width_, nontext_map_. | |
| 105 // Obvious noise blobs are filtered out and used to mark the nontext_map_. | |
| 106 // Initial stroke-width analysis is used to get local text alignment | |
| 107 // direction, so the textline projection_ map can be setup. | |
| 108 // On return, IsVerticallyAlignedText may be called (now optionally) to | |
| 109 // determine the gross textline alignment of the page. | |
| 110 void SetupAndFilterNoise(PageSegMode pageseg_mode, Image photo_mask_pix, TO_BLOCK *input_block); | |
| 111 | |
| 112 // Tests for vertical alignment of text (returning true if so), and generates | |
| 113 // a list of blobs (in osd_blobs) for orientation and script detection. | |
| 114 // block is the single block for the whole page or rectangle to be OCRed. | |
| 115 // Note that the vertical alignment may be due to text whose writing direction | |
| 116 // is vertical, like say Japanese, or due to text whose writing direction is | |
| 117 // horizontal but whose text appears vertically aligned because the image is | |
| 118 // not the right way up. | |
| 119 // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio. | |
| 120 bool IsVerticallyAlignedText(double find_vertical_text_ratio, TO_BLOCK *block, | |
| 121 BLOBNBOX_CLIST *osd_blobs); | |
| 122 | |
| 123 // Rotates the blobs and the TabVectors so that the gross writing direction | |
| 124 // (text lines) is horizontal and lines are read down the page. | |
| 125 // Applied rotation stored in rotation_. | |
| 126 // A second rotation is calculated for application during recognition to | |
| 127 // make the rotated blobs upright for recognition. | |
| 128 // Subsequent rotation stored in text_rotation_. | |
| 129 // | |
| 130 // Arguments: | |
| 131 // vertical_text_lines is true if the text lines are vertical. | |
| 132 // recognition_rotation [0..3] is the number of anti-clockwise 90 degree | |
| 133 // rotations from osd required for the text to be upright and readable. | |
| 134 void CorrectOrientation(TO_BLOCK *block, bool vertical_text_lines, int recognition_rotation); | |
| 135 | |
| 136 // Finds blocks of text, image, rule line, table etc, returning them in the | |
| 137 // blocks and to_blocks. | |
| 138 // (Each TO_BLOCK points to the basic BLOCK and adds more information.) | |
| 139 // Image blocks are generated by a combination of photo_mask_pix (which may | |
| 140 // NOT be nullptr) and the rejected text found during preliminary textline | |
| 141 // finding. | |
| 142 // The input_block is the result of a call to find_components, and contains | |
| 143 // the blobs found in the image or rectangle to be OCRed. These blobs will be | |
| 144 // removed and placed in the output blocks, while unused ones will be deleted. | |
| 145 // If single_column is true, the input is treated as single column, but | |
| 146 // it is still divided into blocks of equal line spacing/text size. | |
| 147 // scaled_color is scaled down by scaled_factor from the input color image, | |
| 148 // and may be nullptr if the input was not color. | |
| 149 // grey_pix is optional, but if present must match the photo_mask_pix in size, | |
| 150 // and must be a *real* grey image instead of binary_pix * 255. | |
| 151 // thresholds_pix is expected to be present iff grey_pix is present and | |
| 152 // can be an integer factor reduction of the grey_pix. It represents the | |
| 153 // thresholds that were used to create the binary_pix from the grey_pix. | |
| 154 // Small blobs that confuse the segmentation into lines are placed into | |
| 155 // diacritic_blobs, with the intention that they be put into the most | |
| 156 // appropriate word after the rest of layout analysis. | |
| 157 // Returns -1 if the user hits the 'd' key in the blocks window while running | |
| 158 // in debug mode, which requests a retry with more debug info. | |
| 159 int FindBlocks(PageSegMode pageseg_mode, Image scaled_color, int scaled_factor, TO_BLOCK *block, | |
| 160 Image photo_mask_pix, Image thresholds_pix, Image grey_pix, DebugPixa *pixa_debug, | |
| 161 BLOCK_LIST *blocks, BLOBNBOX_LIST *diacritic_blobs, TO_BLOCK_LIST *to_blocks); | |
| 162 | |
| 163 // Get the rotation required to deskew, and its inverse rotation. | |
| 164 void GetDeskewVectors(FCOORD *deskew, FCOORD *reskew); | |
| 165 | |
| 166 // Set the equation detection pointer. | |
| 167 void SetEquationDetect(EquationDetectBase *detect); | |
| 168 | |
| 169 private: | |
| 170 // Displays the blob and block bounding boxes in a window called Blocks. | |
| 171 void DisplayBlocks(BLOCK_LIST *blocks); | |
| 172 // Displays the column edges at each grid y coordinate defined by | |
| 173 // best_columns_. | |
| 174 void DisplayColumnBounds(PartSetVector *sets); | |
| 175 | |
| 176 ////// Functions involved in determining the columns used on the page. ///// | |
| 177 | |
| 178 // Sets up column_sets_ (the determined column layout at each horizontal | |
| 179 // slice). Returns false if the page is empty. | |
| 180 bool MakeColumns(bool single_column); | |
| 181 // Attempt to improve the candidates in column_sets by expanding the columns | |
| 182 // and adding new partitions from the partition sets in src_sets. | |
| 183 // src_sets may be equal to column_sets, in which case the candidates | |
| 184 // are used as a source to improve themselves. | |
| 185 void ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets); | |
| 186 // Prints debug information on the column candidates. | |
| 187 void PrintColumnCandidates(const char *title); | |
| 188 // Finds the optimal set of columns that cover the entire image with as | |
| 189 // few changes in column partition as possible. | |
| 190 // Returns true if any part of the page is multi-column. | |
| 191 bool AssignColumns(const PartSetVector &part_sets); | |
| 192 // Finds the biggest range in part_sets_ that has no assigned column, but | |
| 193 // column assignment is possible. | |
| 194 bool BiggestUnassignedRange(int set_count, const bool *any_columns_possible, int *start, | |
| 195 int *end); | |
| 196 // Finds the modal compatible column_set_ index within the given range. | |
| 197 int RangeModalColumnSet(int **column_set_costs, const int *assigned_costs, int start, int end); | |
| 198 // Given that there are many column_set_id compatible columns in the range, | |
| 199 // shrinks the range to the longest contiguous run of compatibility, allowing | |
| 200 // gaps where no columns are possible, but not where competing columns are | |
| 201 // possible. | |
| 202 void ShrinkRangeToLongestRun(int **column_set_costs, const int *assigned_costs, | |
| 203 const bool *any_columns_possible, int column_set_id, int *best_start, | |
| 204 int *best_end); | |
| 205 // Moves start in the direction of step, up to, but not including end while | |
| 206 // the only incompatible regions are no more than kMaxIncompatibleColumnCount | |
| 207 // in size, and the compatible regions beyond are bigger. | |
| 208 void ExtendRangePastSmallGaps(int **column_set_costs, const int *assigned_costs, | |
| 209 const bool *any_columns_possible, int column_set_id, int step, | |
| 210 int end, int *start); | |
| 211 // Assigns the given column_set_id to the part_sets_ in the given range. | |
| 212 void AssignColumnToRange(int column_set_id, int start, int end, int **column_set_costs, | |
| 213 int *assigned_costs); | |
| 214 | |
| 215 // Computes the mean_column_gap_. | |
| 216 void ComputeMeanColumnGap(bool any_multi_column); | |
| 217 | |
| 218 //////// Functions that manipulate ColPartitions in the part_grid_ ///// | |
| 219 //////// to split, merge, find margins, and find types. ////////////// | |
| 220 | |
| 221 // Hoovers up all un-owned blobs and deletes them. | |
| 222 // The rest get released from the block so the ColPartitions can pass | |
| 223 // ownership to the output blocks. | |
| 224 void ReleaseBlobsAndCleanupUnused(TO_BLOCK *block); | |
| 225 // Splits partitions that cross columns where they have nothing in the gap. | |
| 226 void GridSplitPartitions(); | |
| 227 // Merges partitions where there is vertical overlap, within a single column, | |
| 228 // and the horizontal gap is small enough. | |
| 229 void GridMergePartitions(); | |
| 230 // Inserts remaining noise blobs into the most applicable partition if any. | |
| 231 // If there is no applicable partition, then the blobs are deleted. | |
| 232 void InsertRemainingNoise(TO_BLOCK *block); | |
| 233 // Remove partitions that come from horizontal lines that look like | |
| 234 // underlines, but are not part of a table. | |
| 235 void GridRemoveUnderlinePartitions(); | |
| 236 // Add horizontal line separators as partitions. | |
| 237 void GridInsertHLinePartitions(); | |
| 238 // Add vertical line separators as partitions. | |
| 239 void GridInsertVLinePartitions(); | |
| 240 // For every ColPartition in the grid, sets its type based on position | |
| 241 // in the columns. | |
| 242 void SetPartitionTypes(); | |
| 243 // Only images remain with multiple types in a run of partners. | |
| 244 // Sets the type of all in the group to the maximum of the group. | |
| 245 void SmoothPartnerRuns(); | |
| 246 | |
| 247 //////// Functions that make the final output blocks /////// | |
| 248 | |
| 249 // Helper functions for TransformToBlocks. | |
| 250 // Add the part to the temp list in the correct order. | |
| 251 void AddToTempPartList(ColPartition *part, ColPartition_CLIST *temp_list); | |
| 252 // Add everything from the temp list to the work_set assuming correct order. | |
| 253 void EmptyTempPartList(ColPartition_CLIST *temp_list, WorkingPartSet_LIST *work_set); | |
| 254 | |
| 255 // Transform the grid of partitions to the output blocks. | |
| 256 void TransformToBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks); | |
| 257 | |
| 258 // Reflect the blob boxes (but not the outlines) in the y-axis so that | |
| 259 // the blocks get created in the correct RTL order. Rotates the blobs | |
| 260 // in the input_block and the bblobs list. | |
| 261 // The reflection is undone in RotateAndReskewBlocks by | |
| 262 // reflecting the blocks themselves, and then recomputing the blob bounding | |
| 263 // boxes. | |
| 264 void ReflectForRtl(TO_BLOCK *input_block, BLOBNBOX_LIST *bblobs); | |
| 265 | |
| 266 // Undo the deskew that was done in FindTabVectors, as recognition is done | |
| 267 // without correcting blobs or blob outlines for skew. | |
| 268 // Reskew the completed blocks to put them back to the original rotated coords | |
| 269 // that were created by CorrectOrientation. | |
| 270 // If the input_is_rtl, then reflect the blocks in the y-axis to undo the | |
| 271 // reflection that was done before FindTabVectors. | |
| 272 // Blocks that were identified as vertical text (relative to the rotated | |
| 273 // coordinates) are further rotated so the text lines are horizontal. | |
| 274 // blob polygonal outlines are rotated to match the position of the blocks | |
| 275 // that they are in, and their bounding boxes are recalculated to be accurate. | |
| 276 // Record appropriate inverse transformations and required | |
| 277 // classifier transformation in the blocks. | |
| 278 void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST *to_blocks); | |
| 279 | |
| 280 // Computes the rotations for the block (to make textlines horizontal) and | |
| 281 // for the blobs (for classification) and sets the appropriate members | |
| 282 // of the given block. | |
| 283 // Returns the rotation that needs to be applied to the blobs to make | |
| 284 // them sit in the rotated block. | |
| 285 FCOORD ComputeBlockAndClassifyRotation(BLOCK *block); | |
| 286 | |
| 287 // If true then the page language is cjk, so it is safe to perform | |
| 288 // FixBrokenCJK. | |
| 289 bool cjk_script_; | |
| 290 // The minimum gutter width to apply for finding columns. | |
| 291 // Modified when vertical text is detected to prevent detection of | |
| 292 // vertical text lines as columns. | |
| 293 int min_gutter_width_; | |
| 294 // The mean gap between columns over the page. | |
| 295 int mean_column_gap_; | |
| 296 // Config param saved at construction time. Modifies min_gutter_width_ with | |
| 297 // vertical text to prevent detection of vertical text as columns. | |
| 298 double tabfind_aligned_gap_fraction_; | |
| 299 // The rotation vector needed to convert original coords to deskewed. | |
| 300 FCOORD deskew_; | |
| 301 // The rotation vector needed to convert deskewed back to original coords. | |
| 302 FCOORD reskew_; | |
| 303 // The rotation vector used to rotate vertically oriented pages. | |
| 304 FCOORD rotation_; | |
| 305 // The rotation vector needed to convert the rotated back to original coords. | |
| 306 FCOORD rerotate_; | |
| 307 // The additional rotation vector needed to rotate text for recognition. | |
| 308 FCOORD text_rotation_; | |
| 309 // The column_sets_ contain the ordered candidate ColPartitionSets that | |
| 310 // define the possible divisions of the page into columns. | |
| 311 PartSetVector column_sets_; | |
| 312 // A simple array of pointers to the best assigned column division at | |
| 313 // each grid y coordinate. | |
| 314 ColPartitionSet **best_columns_; | |
| 315 // The grid used for creating initial partitions with strokewidth. | |
| 316 StrokeWidth *stroke_width_; | |
| 317 // The grid used to hold ColPartitions after the columns have been determined. | |
| 318 ColPartitionGrid part_grid_; | |
| 319 // List of ColPartitions that are no longer needed after they have been | |
| 320 // turned into regions, but are kept around because they are referenced | |
| 321 // by the part_grid_. | |
| 322 ColPartition_LIST good_parts_; | |
| 323 // List of ColPartitions that are big and might be dropcap or vertically | |
| 324 // joined. | |
| 325 ColPartition_LIST big_parts_; | |
| 326 // List of ColPartitions that have been declared noise. | |
| 327 ColPartition_LIST noise_parts_; | |
| 328 // The fake blobs that are made from the images. | |
| 329 BLOBNBOX_LIST image_bblobs_; | |
| 330 // Horizontal line separators. | |
| 331 TabVector_LIST horizontal_lines_; | |
| 332 // Image map of photo/noise areas on the page. | |
| 333 Image nontext_map_; | |
| 334 // Textline projection map. | |
| 335 TextlineProjection projection_; | |
| 336 // Sequence of DENORMS that indicate how to get back to the original image | |
| 337 // coordinate space. The destructor must delete all the DENORMs in the chain. | |
| 338 DENORM *denorm_; | |
| 339 | |
| 340 // The equation region detector pointer. Note: This pointer is passed in by | |
| 341 // member function SetEquationDetect, and this class is NOT responsible for | |
| 342 // releasing it. | |
| 343 EquationDetectBase *equation_detect_; | |
| 344 | |
| 345 #ifndef GRAPHICS_DISABLED | |
| 346 // Various debug windows that automatically go away on completion. | |
| 347 ScrollView *input_blobs_win_ = nullptr; | |
| 348 | |
| 349 // Allow a subsequent instance to reuse the blocks window. | |
| 350 // Not thread-safe, but multiple threads shouldn't be using windows anyway. | |
| 351 static ScrollView *blocks_win_; | |
| 352 #endif | |
| 353 }; | |
| 354 | |
| 355 } // namespace tesseract. | |
| 356 | |
| 357 #endif // TESSERACT_TEXTORD_COLFIND_H_ |
