Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/equationdetect.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: equationdetect.h | |
| 3 // Description: The equation detection class that inherits equationdetectbase. | |
| 4 // Author: Zongyi (Joe) Liu (joeliu@google.com) | |
| 5 // | |
| 6 // (C) Copyright 2011, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_ | |
| 20 #define TESSERACT_CCMAIN_EQUATIONDETECT_H_ | |
| 21 | |
| 22 #include <tesseract/unichar.h> // for UNICHAR_ID | |
| 23 #include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText... | |
| 24 #include "equationdetectbase.h" // for EquationDetectBase | |
| 25 #include "tesseractclass.h" // for Tesseract | |
| 26 | |
| 27 class TBOX; | |
| 28 class UNICHARSET; | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
| 32 class Tesseract; | |
| 33 class ColPartition; | |
| 34 class ColPartitionGrid; | |
| 35 class ColPartitionSet; | |
| 36 | |
| 37 class TESS_API EquationDetect : public EquationDetectBase { | |
| 38 public: | |
| 39 EquationDetect(const char *equ_datapath, const char *equ_language); | |
| 40 ~EquationDetect() override; | |
| 41 | |
| 42 enum IndentType { NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT, INDENT_TYPE_COUNT }; | |
| 43 | |
| 44 // Reset the lang_tesseract_ pointer. This function should be called before | |
| 45 // any detector work is done. | |
| 46 void SetLangTesseract(Tesseract *lang_tesseract); | |
| 47 | |
| 48 // Iterate over the blobs inside to_block, and set the blobs that we want to | |
| 49 // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function | |
| 50 // returns 0 upon success. | |
| 51 int LabelSpecialText(TO_BLOCK *to_block) override; | |
| 52 | |
| 53 // Find possible equation partitions from part_grid. Should be called | |
| 54 // after the special_text_type of the blobs has been set. | |
| 55 // It returns 0 upon success. | |
| 56 int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) override; | |
| 57 | |
| 58 // Reset the resolution of the processing image. Test-only function. | |
| 59 void SetResolution(const int resolution); | |
| 60 | |
| 61 protected: | |
| 62 // Identify the special text type for one blob, and update its field. When | |
| 63 // height_th is set (> 0), we will label the blob as BSTT_NONE if its height | |
| 64 // is less than height_th. | |
| 65 void IdentifySpecialText(BLOBNBOX *blob, const int height_th); | |
| 66 | |
| 67 // Estimate the type for one unichar. | |
| 68 BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset, | |
| 69 const UNICHAR_ID id) const; | |
| 70 | |
| 71 // Compute the special text type for each blob in part_grid_. | |
| 72 void IdentifySpecialText(); | |
| 73 | |
| 74 // Identify blobs that we want to skip during special blob type | |
| 75 // classification. | |
| 76 void IdentifyBlobsToSkip(ColPartition *part); | |
| 77 | |
| 78 // The ColPartitions in part_grid_ may be over-segmented, particularly in the | |
| 79 // block equation regions. So we identify these partitions and merge | |
| 80 // them before we do the searching. | |
| 81 void MergePartsByLocation(); | |
| 82 | |
| 83 // Starting from the seed center, we do a radius search. Partitions that | |
| 84 // have large overlaps with seed are removed from part_grid_ and added to | |
| 85 // parts_overlap. Note: this function may update the part_grid_, so if the | |
| 86 // caller is also running ColPartitionGridSearch, use the RepositionIterator | |
| 87 // to continue. | |
| 88 void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap); | |
| 89 | |
| 90 // Insert part back into part_grid_, after it absorbs some other parts. | |
| 91 void InsertPartAfterAbsorb(ColPartition *part); | |
| 92 | |
| 93 // Identify the ColPartitions in part_grid_, label them as PT_EQUATION, and | |
| 94 // save them into cp_seeds_. | |
| 95 void IdentifySeedParts(); | |
| 96 | |
| 97 // Check the blobs count for a seed region candidate. | |
| 98 bool CheckSeedBlobsCount(ColPartition *part); | |
| 99 | |
| 100 // Compute the foreground pixel density for a tbox area. | |
| 101 float ComputeForegroundDensity(const TBOX &tbox); | |
| 102 | |
| 103 // Check if part qualifies for the seed2 label: low math density and left | |
| 104 // indented. We are using two checks: | |
| 105 // 1. If its left is aligned with any coordinates in indented_texts_left, | |
| 106 // which we assume have been sorted. | |
| 107 // 2. If its foreground density is over foreground_density_th. | |
| 108 bool CheckForSeed2(const std::vector<int> &indented_texts_left, | |
| 109 const float foreground_density_th, ColPartition *part); | |
| 110 | |
| 111 // Count the number of values in sorted_vec that are close to val, used to | |
| 112 // check if a partition is aligned with text partitions. | |
| 113 int CountAlignment(const std::vector<int> &sorted_vec, const int val) const; | |
| 114 | |
| 115 // Check for a seed candidate using the foreground pixel density. We | |
| 116 // return true if the density is below a certain threshold, because characters | |
| 117 // in equation regions are usually further apart, with more white space. | |
| 118 bool CheckSeedFgDensity(const float density_th, ColPartition *part); | |
| 119 | |
| 120 // A light version of SplitCPHor: instead of really doing the part split, we | |
| 121 // simply compute the union bounding box of each split part. | |
| 122 void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes); | |
| 123 | |
| 124 // Split the part (horizontally), and save the split result into | |
| 125 // parts_splitted. Note that it is the caller's responsibility to release the | |
| 126 // memory owned by parts_splitted. On the other hand, the part is unchanged | |
| 127 // during this process and still owns the blobs, so do NOT call DeleteBoxes | |
| 128 // when freeing the colpartitions in parts_splitted. | |
| 129 void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted); | |
| 130 | |
| 131 // Check the density for a seed candidate (part) using its math density and | |
| 132 // italic density, returns true if the check passed. | |
| 133 bool CheckSeedDensity(const float math_density_high, const float math_density_low, | |
| 134 const ColPartition *part) const; | |
| 135 | |
| 136 // Check if part is indented. | |
| 137 IndentType IsIndented(ColPartition *part); | |
| 138 | |
| 139 // Identify inline partitions from cp_seeds_, and re-label them. | |
| 140 void IdentifyInlineParts(); | |
| 141 | |
| 142 // Compute the super bounding box for all colpartitions inside part_grid_. | |
| 143 void ComputeCPsSuperBBox(); | |
| 144 | |
| 145 // Identify inline partitions from cp_seeds_ using the horizontal search. | |
| 146 void IdentifyInlinePartsHorizontal(); | |
| 147 | |
| 148 // Estimate the line spacing between two text partitions. Returns -1 if not | |
| 149 // enough data. | |
| 150 int EstimateTextPartLineSpacing(); | |
| 151 | |
| 152 // Identify inline partitions from cp_seeds_ using vertical search. | |
| 153 void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing); | |
| 154 | |
| 155 // Check if part is an inline equation zone. This should be called after we | |
| 156 // identified the seed regions. | |
| 157 bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part); | |
| 158 | |
| 159 // For a given seed partition, we search the part_grid_ and see if there is | |
| 160 // any partition that can be merged with it. It returns true if the seed has | |
| 161 // been expanded. | |
| 162 bool ExpandSeed(ColPartition *seed); | |
| 163 | |
| 164 // Starting from the seed position, we search the part_grid_ | |
| 165 // horizontally/vertically, find all partitions that can be | |
| 166 // merged with seed, remove them from part_grid_, and put them into | |
| 167 // parts_to_merge. | |
| 168 void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, | |
| 169 std::vector<ColPartition *> *parts_to_merge); | |
| 170 void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, | |
| 171 std::vector<ColPartition *> *parts_to_merge); | |
| 172 | |
| 173 // Check if a part_box is the small neighbor of seed_box. | |
| 174 bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const; | |
| 175 | |
| 176 // Perform the density check for part, which we assume is near a seed | |
| 177 // partition. It returns true if the check passed. | |
| 178 bool CheckSeedNeighborDensity(const ColPartition *part) const; | |
| 179 | |
| 180 // After identifying the math blocks, we do one more scan over all text | |
| 181 // partitions, and check if any of them is the satellite of | |
| 182 // math blocks. Here, p is the satellite of q if: | |
| 183 // 1. q is the nearest vertical neighbor of p, and | |
| 184 // 2. y_gap(p, q) is less than a threshold, and | |
| 185 // 3. x_overlap(p, q) is over a threshold. | |
| 186 // Note that p can be the satellite of two blocks: its top neighbor and | |
| 187 // bottom neighbor. | |
| 188 void ProcessMathBlockSatelliteParts(); | |
| 189 | |
| 190 // Check if part is the satellite of one/two math blocks. If it is, we return | |
| 191 // true, and save the blocks into math_blocks. | |
| 192 bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks); | |
| 193 | |
| 194 // Search the nearest neighbor of part in one vertical direction as defined in | |
| 195 // search_bottom. It returns the neighbor found that has major x overlap with | |
| 196 // it, or nullptr when not found. | |
| 197 ColPartition *SearchNNVertical(const bool search_bottom, const ColPartition *part); | |
| 198 | |
| 199 // Check if the neighbor with vertical distance of y_gap is a near and math | |
| 200 // block partition. | |
| 201 bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const; | |
| 202 | |
| 203 // Generate the tiff file name for output/debug file. | |
| 204 void GetOutputTiffName(const char *name, std::string &image_name) const; | |
| 205 | |
| 206 // Debugger function that renders ColPartitions on the input image, where: | |
| 207 // parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION | |
| 208 // will be painted in green, and other parts will be painted in blue. | |
| 209 void PaintColParts(const std::string &outfile) const; | |
| 210 | |
| 211 // Debugger function that renders the blobs in part_grid_ over the input | |
| 212 // image. | |
| 213 void PaintSpecialTexts(const std::string &outfile) const; | |
| 214 | |
| 215 // Debugger function that prints the math blobs density values for a | |
| 216 // ColPartition object. | |
| 217 void PrintSpecialBlobsDensity(const ColPartition *part) const; | |
| 218 | |
| 219 // The tesseract engine initialized from equation training data. | |
| 220 Tesseract equ_tesseract_; | |
| 221 | |
| 222 // The tesseract engine used for OCR. This pointer is passed in by the caller, | |
| 223 // so do NOT destroy it in this class. | |
| 224 Tesseract *lang_tesseract_ = nullptr; | |
| 225 | |
| 226 // The ColPartitionGrid that we are processing. This pointer is passed in from | |
| 227 // the caller, so do NOT destroy it in the class. | |
| 228 ColPartitionGrid *part_grid_ = nullptr; | |
| 229 | |
| 230 // A simple array of pointers to the best assigned column division at | |
| 231 // each grid y coordinate. This pointer is passed in from the caller, so do | |
| 232 // NOT destroy it in the class. | |
| 233 ColPartitionSet **best_columns_ = nullptr; | |
| 234 | |
| 235 // The super bounding box of all cps in the part_grid_. | |
| 236 TBOX *cps_super_bbox_ = nullptr; | |
| 237 | |
| 238 // The seed ColPartition for equation region. | |
| 239 std::vector<ColPartition *> cp_seeds_; | |
| 240 | |
| 241 // The resolution (dpi) of the processing image. | |
| 242 int resolution_ = 0; | |
| 243 | |
| 244 // The number of pages we have processed. | |
| 245 int page_count_ = 0; | |
| 246 }; | |
| 247 | |
| 248 } // namespace tesseract | |
| 249 | |
| 250 #endif // TESSERACT_CCMAIN_EQUATIONDETECT_H_ |
