Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/rejctmap.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: rejctmap.h (Formerly rejmap.h) | |
| 3 * Description: REJ and REJMAP class functions. | |
| 4 * Author: Phil Cheatle | |
| 5 * | |
| 6 * (C) Copyright 1994, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 | |
| 18 This module may look unnecessarily verbose, but here's the philosophy... | |
| 19 | |
| 20 ALL processing of the reject map is done in this module. There are lots of | |
| 21 separate calls to set reject/accept flags. These have DELIBERATELY been kept | |
| 22 distinct so that this module can decide what to do. | |
| 23 | |
| 24 Basically, there is a flag for each sort of rejection or acceptance. This | |
| 25 provides a history of what has happened to EACH character. | |
| 26 | |
| 27 Determining whether a character is CURRENTLY rejected depends on implicit | |
| 28 understanding of the SEQUENCE of possible calls. The flags are defined and | |
| 29 grouped in the REJ_FLAGS enum. These groupings are used in determining a | |
| 30 character's CURRENT rejection status. Basically, a character is ACCEPTED if | |
| 31 | |
| 32 none of the permanent rej flags are set | |
| 33 AND ( the character has never been rejected | |
| 34 OR an accept flag is set which is LATER than the latest reject flag ) | |
| 35 | |
| 36 IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE | |
| 37 OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! | |
| 38 **********************************************************************/ | |
| 39 | |
| 40 #ifndef REJCTMAP_H | |
| 41 #define REJCTMAP_H | |
| 42 | |
| 43 #include "errcode.h" | |
| 44 #include "params.h" | |
| 45 | |
| 46 #include <bitset> | |
| 47 #include <memory> | |
| 48 | |
| 49 namespace tesseract { | |
| 50 | |
// One flag per kind of rejection or acceptance event. The enumerator value is
// used directly as the bit position inside REJ's flag set, so the order and
// grouping below are significant: REJ's predicates test whole groups.
enum REJ_FLAGS {
  /* Reject modes which are NEVER overridden (all tested by REJ::perm_rejected) */
  R_TESS_FAILURE,   // PERM Tess didn't classify
  R_SMALL_XHT,      // PERM Xht too small
  R_EDGE_CHAR,      // PERM Too close to edge of image
  R_1IL_CONFLICT,   // PERM 1Il confusion
  R_POSTNN_1IL,     // PERM 1Il unrejected by NN
  R_REJ_CBLOB,      // PERM Odd blob
  R_MM_REJECT,      // PERM Matrix match rejection (m's)
  R_BAD_REPETITION, // PERM Repeated char which doesn't match trend
                    //      (treated as permanent by REJ::perm_rejected)

  /* Initial reject modes (pre NN_ACCEPT) */
  R_POOR_MATCH,        // TEMP Ray's original heuristic (Not used)
  R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD
  R_CONTAINS_BLANKS,   // TEMP Tess failed on other chs in WERD
  R_BAD_PERMUTER,      // POTENTIAL Bad permuter for WERD

  /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */
  R_HYPHEN,       // TEMP Post NN dodgy hyphen or full stop
  R_DUBIOUS,      // TEMP Post NN dodgy chars
  R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN
  R_MOSTLY_REJ,   // TEMP Most of word rejected so rej the rest
  R_XHT_FIXUP,    // TEMP Xht tests unsure

  /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */
  R_BAD_QUALITY, // TEMP Quality metrics bad for WERD

  /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ_ACCEPT */
  R_DOC_REJ,   // TEMP Document rejection
  R_BLOCK_REJ, // TEMP Block rejection
  R_ROW_REJ,   // TEMP Row rejection
  R_UNLV_REJ,  // TEMP ~ turned to - or ^ turned to space

  /* Accept modes which occur between the above rejection groups */
  R_NN_ACCEPT,         // NN acceptance
  R_HYPHEN_ACCEPT,     // Hyphen acceptance
  R_MM_ACCEPT,         // Matrix match acceptance
  R_QUALITY_ACCEPT,    // Accept word in good quality doc
  R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING except tess failures
};

/* REJECT MAP VALUES: the characters returned by REJ::display_char() */

#define MAP_ACCEPT '1'
#define MAP_REJECT_PERM '0'
#define MAP_REJECT_TEMP '2'
#define MAP_REJECT_POTENTIAL '3'

// Reject/accept history of a single character.
//
// Every setrej_*() call records one REJ_FLAGS bit; bits are never cleared by
// this class. The current status is derived from the recorded bits by the
// query methods, which encode the expected temporal order of the flag groups:
// an accept flag cancels only the temporary reject groups that precede it.
class REJ {
  std::bitset<32> flags;

  // Every enumerator is used as a bit index into `flags`, so the last one must
  // stay below the bitset width.
  static_assert(R_MINIMAL_REJ_ACCEPT < 32,
                "REJ_FLAGS no longer fits in the 32-bit REJ flag set");

  // Records that the given event happened to this character.
  void set_flag(REJ_FLAGS rej_flag) {
    flags.set(rej_flag);
  }

public:
  REJ() = default;

  // The only state is `flags`, so member-wise copy and assignment are exactly
  // what is needed; both are left to the compiler.
  REJ(const REJ &source) = default;

  REJ &operator=(const REJ &source) = default;

  // Returns true if the given flag has been recorded.
  bool flag(REJ_FLAGS rej_flag) const {
    return flags[rej_flag];
  }

  // Returns the one-character map code for the current status.
  // Permanent rejection is tested first, then "potential" rejection, then any
  // other rejection; MAP_ACCEPT is returned when none of them applies.
  char display_char() const {
    if (perm_rejected()) {
      return MAP_REJECT_PERM;
    } else if (accept_if_good_quality()) {
      return MAP_REJECT_POTENTIAL;
    } else if (rejected()) {
      return MAP_REJECT_TEMP;
    } else {
      return MAP_ACCEPT;
    }
  }

  // Is char perm reject? True if any flag of the "never overridden" group
  // (R_TESS_FAILURE .. R_BAD_REPETITION) is set.
  bool perm_rejected() const {
    return (flag(R_TESS_FAILURE) || flag(R_SMALL_XHT) || flag(R_EDGE_CHAR) ||
            flag(R_1IL_CONFLICT) || flag(R_POSTNN_1IL) || flag(R_REJ_CBLOB) ||
            flag(R_BAD_REPETITION) || flag(R_MM_REJECT));
  }

private:
  // Any reject flag from the initial (pre NN_ACCEPT) group?
  bool rej_before_nn_accept() const {
    return flag(R_POOR_MATCH) || flag(R_NOT_TESS_ACCEPTED) ||
           flag(R_CONTAINS_BLANKS) || flag(R_BAD_PERMUTER);
  }

  // Any reject flag from the group between NN_ACCEPT and MM_ACCEPT?
  bool rej_between_nn_and_mm() const {
    return flag(R_HYPHEN) || flag(R_DUBIOUS) || flag(R_NO_ALPHANUMS) ||
           flag(R_MOSTLY_REJ) || flag(R_XHT_FIXUP);
  }

  // Any reject flag from the group between MM_ACCEPT and QUALITY_ACCEPT?
  bool rej_between_mm_and_quality_accept() const {
    return flag(R_BAD_QUALITY);
  }

  // Any reject flag from the group between QUALITY_ACCEPT and
  // MINIMAL_REJ_ACCEPT?
  bool rej_between_quality_and_minimal_rej_accept() const {
    return flag(R_DOC_REJ) || flag(R_BLOCK_REJ) || flag(R_ROW_REJ) ||
           flag(R_UNLV_REJ);
  }

  // Still rejected at the point where MM_ACCEPT would apply: either a post-NN
  // reject is set, or an initial reject is set that neither R_NN_ACCEPT nor
  // R_HYPHEN_ACCEPT has cancelled.
  bool rej_before_mm_accept() const {
    return rej_between_nn_and_mm() ||
           (rej_before_nn_accept() && !flag(R_NN_ACCEPT) &&
            !flag(R_HYPHEN_ACCEPT));
  }

  // Still rejected at the point where QUALITY_ACCEPT would apply: either a
  // post-MM reject is set, or an earlier rejection that R_MM_ACCEPT has not
  // cancelled.
  bool rej_before_quality_accept() const {
    return rej_between_mm_and_quality_accept() ||
           (!flag(R_MM_ACCEPT) && rej_before_mm_accept());
  }

public:
  // Is char rejected?
  // R_MINIMAL_REJ_ACCEPT is checked first and makes this return false
  // regardless of every other flag, permanent ones included. Otherwise the
  // character is rejected if a permanent flag is set, a post-QUALITY_ACCEPT
  // reject is set, or an earlier rejection is not cancelled by
  // R_QUALITY_ACCEPT.
  // NOTE(review): the enum comment says MINIMAL_REJ_ACCEPT excludes tess
  // failures; that exclusion is not enforced here - confirm callers never set
  // it on such characters.
  bool rejected() const {
    if (flag(R_MINIMAL_REJ_ACCEPT)) {
      return false;
    } else {
      return (perm_rejected() || rej_between_quality_and_minimal_rej_accept() ||
              (!flag(R_QUALITY_ACCEPT) && rej_before_quality_accept()));
    }
  }

  // Potential rej? True when the character is currently rejected, not
  // permanently, and R_BAD_PERMUTER is the only reject flag recorded from any
  // of the temporary groups.
  bool accept_if_good_quality() const {
    return (rejected() && !perm_rejected() && flag(R_BAD_PERMUTER) &&
            !flag(R_POOR_MATCH) && !flag(R_NOT_TESS_ACCEPTED) &&
            !flag(R_CONTAINS_BLANKS) &&
            (!rej_between_nn_and_mm() && !rej_between_mm_and_quality_accept() &&
             !rej_between_quality_and_minimal_rej_accept()));
  }

  // ---- Setters for the permanent reject group ----

  void setrej_tess_failure() { // Tess generated blank
    set_flag(R_TESS_FAILURE);
  }

  void setrej_small_xht() { // Small xht char/wd
    set_flag(R_SMALL_XHT);
  }

  void setrej_edge_char() { // Close to image edge
    set_flag(R_EDGE_CHAR);
  }

  void setrej_1Il_conflict() { // Initial reject map
    set_flag(R_1IL_CONFLICT);
  }

  void setrej_postNN_1Il() { // 1Il after NN
    set_flag(R_POSTNN_1IL);
  }

  void setrej_rej_cblob() { // Insert duff blob
    set_flag(R_REJ_CBLOB);
  }

  void setrej_mm_reject() { // Matrix matcher
    set_flag(R_MM_REJECT);
  }

  void setrej_bad_repetition() { // Odd repeated char
    set_flag(R_BAD_REPETITION);
  }

  // ---- Setters for the initial (pre NN_ACCEPT) reject group ----

  void setrej_poor_match() { // Failed Rays heuristic
    set_flag(R_POOR_MATCH);
  }

  void setrej_not_tess_accepted() { // TEMP reject_word
    set_flag(R_NOT_TESS_ACCEPTED);
  }

  void setrej_contains_blanks() { // TEMP reject_word
    set_flag(R_CONTAINS_BLANKS);
  }

  void setrej_bad_permuter() { // POTENTIAL reject_word
    set_flag(R_BAD_PERMUTER);
  }

  // ---- Setters for rejects between NN_ACCEPT and MM_ACCEPT ----

  void setrej_hyphen() { // PostNN dubious hyphen or .
    set_flag(R_HYPHEN);
  }

  void setrej_dubious() { // PostNN dubious limit
    set_flag(R_DUBIOUS);
  }

  void setrej_no_alphanums() { // TEMP reject_word
    set_flag(R_NO_ALPHANUMS);
  }

  void setrej_mostly_rej() { // TEMP reject_word
    set_flag(R_MOSTLY_REJ);
  }

  void setrej_xht_fixup() { // xht fixup
    set_flag(R_XHT_FIXUP);
  }

  // ---- Setter for rejects between MM_ACCEPT and QUALITY_ACCEPT ----

  void setrej_bad_quality() { // TEMP reject_word
    set_flag(R_BAD_QUALITY);
  }

  // ---- Setters for rejects between QUALITY_ACCEPT and MINIMAL_REJ_ACCEPT ----

  void setrej_doc_rej() { // TEMP reject_word
    set_flag(R_DOC_REJ);
  }

  void setrej_block_rej() { // TEMP reject_word
    set_flag(R_BLOCK_REJ);
  }

  void setrej_row_rej() { // TEMP reject_word
    set_flag(R_ROW_REJ);
  }

  void setrej_unlv_rej() { // TEMP reject_word
    set_flag(R_UNLV_REJ);
  }

  // ---- Setters for the accept flags ----

  void setrej_hyphen_accept() { // Hyphen acceptance
    set_flag(R_HYPHEN_ACCEPT);
  }

  void setrej_nn_accept() { // NN Flipped a char
    set_flag(R_NN_ACCEPT);
  }

  void setrej_mm_accept() { // Matrix matcher
    set_flag(R_MM_ACCEPT);
  }

  void setrej_quality_accept() { // Quality flip a char
    set_flag(R_QUALITY_ACCEPT);
  }

  void setrej_minimal_rej_accept() { // Accept all except blank
    set_flag(R_MINIMAL_REJ_ACCEPT);
  }

  // Is char accepted? Exact negation of rejected().
  bool accepted() const {
    return !rejected();
  }

  // True when the character is currently rejected but no permanent flag is
  // set.
  bool recoverable() const {
    return (rejected() && !perm_rejected());
  }

  // Defined out of line; not visible in this header.
  void full_print(FILE *fp) const;
};
| 309 | |
| 310 class REJMAP { | |
| 311 std::unique_ptr<REJ[]> ptr; // ptr to the chars | |
| 312 uint16_t len = 0; // Number of chars | |
| 313 | |
| 314 public: | |
| 315 REJMAP() = default; | |
| 316 | |
| 317 REJMAP(const REJMAP &rejmap) { | |
| 318 *this = rejmap; | |
| 319 } | |
| 320 | |
| 321 REJMAP &operator=(const REJMAP &source); | |
| 322 | |
| 323 // Sets up the ptr array to length, whatever it was before. | |
| 324 void initialise(uint16_t length); | |
| 325 | |
| 326 REJ &operator[]( // access function | |
| 327 uint16_t index) const // map index | |
| 328 { | |
| 329 ASSERT_HOST(index < len); | |
| 330 return ptr[index]; // no bounds checks | |
| 331 } | |
| 332 | |
| 333 uint16_t length() const { // map length | |
| 334 return len; | |
| 335 } | |
| 336 | |
| 337 int16_t accept_count() const; // How many accepted? | |
| 338 | |
| 339 int16_t reject_count() const { // How many rejects? | |
| 340 return len - accept_count(); | |
| 341 } | |
| 342 | |
| 343 // Cut out an element. | |
| 344 void remove_pos(uint16_t pos); | |
| 345 | |
| 346 void print(FILE *fp) const; | |
| 347 | |
| 348 void full_print(FILE *fp) const; | |
| 349 | |
| 350 bool recoverable_rejects() const; // Any non perm rejs? | |
| 351 | |
| 352 bool quality_recoverable_rejects() const; | |
| 353 // Any potential rejs? | |
| 354 | |
| 355 void rej_word_small_xht(); // Reject whole word | |
| 356 // Reject whole word | |
| 357 void rej_word_tess_failure(); | |
| 358 void rej_word_not_tess_accepted(); | |
| 359 // Reject whole word | |
| 360 // Reject whole word | |
| 361 void rej_word_contains_blanks(); | |
| 362 // Reject whole word | |
| 363 void rej_word_bad_permuter(); | |
| 364 void rej_word_xht_fixup(); // Reject whole word | |
| 365 // Reject whole word | |
| 366 void rej_word_no_alphanums(); | |
| 367 void rej_word_mostly_rej(); // Reject whole word | |
| 368 void rej_word_bad_quality(); // Reject whole word | |
| 369 void rej_word_doc_rej(); // Reject whole word | |
| 370 void rej_word_block_rej(); // Reject whole word | |
| 371 void rej_word_row_rej(); // Reject whole word | |
| 372 }; | |
| 373 | |
| 374 } // namespace tesseract | |
| 375 | |
| 376 #endif |
