Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/werd.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: werd.h | |
| 3 * Description: Code for the WERD class. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1991, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #ifndef WERD_H | |
| 20 #define WERD_H | |
| 21 | |
| 22 #include "elst2.h" | |
| 23 #include "params.h" | |
| 24 #include "stepblob.h" | |
| 25 | |
| 26 #include <bitset> | |
| 27 | |
| 28 namespace tesseract { | |
| 29 | |
| 30 enum WERD_FLAGS { | |
| 31 W_SEGMENTED, ///< correctly segmented (value 0; each enumerator is a bit index into WERD::flags) | |
| 32 W_ITALIC, ///< italic text | |
| 33 W_BOLD, ///< bold text | |
| 34 W_BOL, ///< start of line | |
| 35 W_EOL, ///< end of line | |
| 36 W_NORMALIZED, ///< normalized - NOTE(review): original comment only said "flags"; exact meaning not evident from this header, confirm | |
| 37 W_SCRIPT_HAS_XHEIGHT, ///< x-height concept makes sense. | |
| 38 W_SCRIPT_IS_LATIN, ///< Special case latin for y. splitting. | |
| 39 W_DONT_CHOP, ///< fixed pitch chopped | |
| 40 W_REP_CHAR, ///< repeated character | |
| 41 W_FUZZY_SP, ///< fuzzy space | |
| 42 W_FUZZY_NON, ///< fuzzy nonspace | |
| 43 W_INVERSE ///< white on black (value 12, so all enumerators fit the 16-bit WERD::flags bitset) | |
| 44 }; | |
| 45 | |
| 46 enum DISPLAY_FLAGS { | |
| 47 /* Display flags bit number allocations (presumably the indices passed to WERD::display_flag()/set_display_flag() - confirm against callers) */ | |
| 48 DF_BOX, ///< Bounding box | |
| 49 DF_TEXT, ///< Correct ascii | |
| 50 DF_POLYGONAL, ///< Polygonal approximation | |
| 51 DF_EDGE_STEP, ///< Edge steps | |
| 52 DF_BN_POLYGONAL, ///< BL-normalised polygonal approximation | |
| 53 DF_BLAMER ///< Blamer information | |
| 54 }; | |
| 55 | |
| 56 class ROW; // forward decl | |
| 57 | |
| 58 class TESS_API WERD : public ELIST2_LINK { | |
| 59 public: | |
| 60 WERD() = default; | |
| 61 // WERD constructed with: | |
| 62 // blob_list - blobs of the word (we take this list's contents) | |
| 63 // blanks - number of blanks before the word | |
| 64 // text - correct text (outlives WERD) - NOTE(review): the member `correct` is a std::string, so this lifetime requirement may be stale; confirm in the constructor definition | |
| 65 WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text); | |
| 66 | |
| 67 // WERD constructed from: | |
| 68 // blob_list - blobs in the word | |
| 69 // clone - werd to clone flags, etc from. | |
| 70 WERD(C_BLOB_LIST *blob_list, WERD *clone); | |
| 71 | |
| 72 // Construct a WERD from a single_blob and clone the flags from this. | |
| 73 // W_BOL and W_EOL flags are set according to the given values. | |
| 74 WERD *ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob); | |
| 75 | |
| 76 ~WERD() = default; | |
| 77 | |
| 78 // copy-assignment operator (defined out of line) | |
| 79 WERD &operator=(const WERD &source); | |
| 80 | |
| 81 // This method returns a new werd constructed using the blobs in the input | |
| 82 // all_blobs list, which correspond to the blobs in this werd object. The | |
| 83 // blobs used to construct the new word are consumed and removed from the | |
| 84 // input all_blobs list. | |
| 85 // Returns nullptr if the word couldn't be constructed. | |
| 86 // Returns original blobs for which no matches were found in the output list | |
| 87 // orphan_blobs (appends). | |
| 88 WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs); | |
| 89 | |
| 90 // Accessor for the rejected (DUFF) blobs. | |
| 91 C_BLOB_LIST *rej_cblob_list() { // mutable pointer to the rej_cblobs member | |
| 92 return &rej_cblobs; | |
| 93 } | |
| 94 | |
| 95 // Accessor for the good blobs. | |
| 96 C_BLOB_LIST *cblob_list() { // mutable pointer to the cblobs member | |
| 97 return &cblobs; | |
| 98 } | |
| 99 | |
| 100 uint8_t space() const { // returns the `blanks` member (getter paired with set_blanks) | |
| 101 return blanks; | |
| 102 } | |
| 103 void set_blanks(uint8_t new_blanks) { | |
| 104 blanks = new_blanks; | |
| 105 } | |
| 106 int script_id() const { | |
| 107 return script_id_; | |
| 108 } | |
| 109 void set_script_id(int id) { | |
| 110 script_id_ = id; | |
| 111 } | |
| 112 | |
| 113 // Returns the (default) bounding box including all the dots. | |
| 114 TBOX bounding_box() const; // compute bounding box | |
| 115 // Returns the bounding box including the desired combination of upper and | |
| 116 // lower noise/diacritic elements. | |
| 117 TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; | |
| 118 // Returns the bounding box of only the good blobs. | |
| 119 TBOX true_bounding_box() const; | |
| 120 | |
| 121 const char *text() const { | |
| 122 return correct.c_str(); | |
| 123 } | |
| 124 void set_text(const char *new_text) { | |
| 125 correct = new_text; | |
| 126 } | |
| 127 | |
| 128 bool flag(WERD_FLAGS mask) const { | |
| 129 return flags[mask]; | |
| 130 } | |
| 131 void set_flag(WERD_FLAGS mask, bool value) { | |
| 132 flags.set(mask, value); | |
| 133 } | |
| 134 | |
| 135 bool display_flag(uint8_t flag) const { | |
| 136 return disp_flags[flag]; | |
| 137 } | |
| 138 void set_display_flag(uint8_t flag, bool value) { | |
| 139 disp_flags.set(flag, value); | |
| 140 } | |
| 141 | |
| 142 WERD *shallow_copy(); // shallow copy word | |
| 143 | |
| 144 // reposition word by vector | |
| 145 void move(const ICOORD vec); | |
| 146 | |
| 147 // join other's blobs onto this werd, emptying out other. | |
| 148 void join_on(WERD *other); | |
| 149 | |
| 150 // copy other's blobs onto this word, leaving other intact. | |
| 151 void copy_on(WERD *other); | |
| 152 | |
| 153 // tprintf word metadata (but not blob innards) | |
| 154 void print() const; | |
| 155 | |
| 156 #ifndef GRAPHICS_DISABLED | |
| 157 // plot word on window in a uniform colour | |
| 158 void plot(ScrollView *window, ScrollView::Color colour); | |
| 159 | |
| 160 // Get the next color in the (looping) rainbow. | |
| 161 static ScrollView::Color NextColor(ScrollView::Color colour); | |
| 162 | |
| 163 // plot word on window in a rainbow of colours | |
| 164 void plot(ScrollView *window); | |
| 165 | |
| 166 // plot rejected blobs in a rainbow of colours | |
| 167 void plot_rej_blobs(ScrollView *window); | |
| 168 #endif // !GRAPHICS_DISABLED | |
| 169 | |
| 170 // Removes noise from the word by moving small outlines to the rej_cblobs | |
| 171 // list, based on the size_threshold. | |
| 172 void CleanNoise(float size_threshold); | |
| 173 | |
| 174 // Extracts all the noise outlines and stuffs the pointers into the given | |
| 175 // vector of outlines. Afterwards, the outlines vector owns the pointers. | |
| 176 void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines); | |
| 177 // Adds the selected outlines to the indicated real blobs, and puts the rest | |
| 178 // back in rej_cblobs where they came from. Where the target_blobs entry is | |
| 179 // nullptr, a run of wanted outlines is put into a single new blob. | |
| 180 // Ownership of the outlines is transferred back to the word. (Hence | |
| 181 // vector and not PointerVector.) | |
| 182 // Returns true if any new blob was added to the start of the word, which | |
| 183 // suggests that it might need joining to the word before it, and likewise | |
| 184 // sets make_next_word_fuzzy true if any new blob was added to the end. | |
| 185 bool AddSelectedOutlines(const std::vector<bool> &wanted, | |
| 186 const std::vector<C_BLOB *> &target_blobs, | |
| 187 const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy); | |
| 188 | |
| 189 private: | |
| 190 uint8_t blanks = 0; // number of blanks before the word; read by space(), written by set_blanks() | |
| 191 std::bitset<16> flags; // flags about word, one bit per WERD_FLAGS enumerator (see flag()/set_flag()) | |
| 192 std::bitset<16> disp_flags; // display flags; display_flag() reads via unchecked operator[], so the index must be < 16 | |
| 193 int16_t script_id_ = 0; // From unicharset. Stored as int16_t although script_id()/set_script_id() use int, so the setter narrows | |
| 194 std::string correct; // correct text; text() returns its c_str(), set_text() assigns from a const char* (which must be non-null) | |
| 195 C_BLOB_LIST cblobs; // compacted blobs (exposed by cblob_list()) | |
| 196 C_BLOB_LIST rej_cblobs; // DUFF blobs (exposed by rej_cblob_list()) | |
| 197 }; | |
| 198 | |
| 199 ELIST2IZEH(WERD) | |
| 200 | |
| 201 } // namespace tesseract | |
| 202 | |
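| 203 #include "ocrrow.h" // placed here, after the WERD declarations (original comment is truncated; presumably ocrrow.h depends on them - confirm) | |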
| 204 | |
| 205 namespace tesseract { | |
| 206 | |
| 207 // Comparison callback with the qsort(3) signature: orders words by increasing left edge. NOTE(review): whether word1p/word2p point to WERD or WERD* is not visible here - confirm in the definition | |
| 208 int word_comparator(const void *word1p, const void *word2p); | |
| 209 | |
| 210 } // namespace tesseract | |
| 211 | |
| 212 #endif |
