Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/wordrec/lm_consistency.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: lm_consistency.h | |
| 3 // Description: Struct for recording consistency of the paths representing | |
| 4 // OCR hypotheses. | |
| 5 // Author: Rika Antonova | |
| 6 // | |
| 7 // (C) Copyright 2012, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 //////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 #ifndef TESSERACT_WORDREC_LM_CONSISTENCY_H_ | |
| 21 #define TESSERACT_WORDREC_LM_CONSISTENCY_H_ | |
| 22 | |
| 23 #include <cstdint> // for INT16_MAX | |
| 24 #include "dawg.h" // for EDGE_REF, NO_EDGE | |
| 25 #include "dict.h" // for XH_GOOD, XH_INCONSISTENT, XHeightConsi... | |
| 26 | |
| 27 class BLOB_CHOICE; | |
| 28 | |
| 29 namespace tesseract { | |
| 30 | |
| 31 static const char *const XHeightConsistencyEnumName[] = { /* Printable names for the x-height consistency states; presumably indexed by XHeightConsistencyEnum value, so the order must match the enum declared in dict.h (TODO confirm). */ | |
| 32 "XH_GOOD", | |
| 33 "XH_SUBNORMAL", | |
| 34 "XH_INCONSISTENT", | |
| 35 }; | |
| 36 | |
| 37 // Struct for keeping track of the consistency of the path. Holds character-type, case, punctuation, script, font, spacing and x-height counters and flags; an instance is either initialized from scratch or copied from a parent instance. | |
| 38 struct LMConsistencyInfo { | |
| 39 enum ChartypeEnum { CT_NONE, CT_ALPHA, CT_DIGIT, CT_OTHER }; /* Type of the chartype member; CT_NONE is the value assigned when initializing from scratch. */ | |
| 40 | |
| 41 // How much do characters have to be shifted away from normal parameters | |
| 42 // before we say they're not normal? | |
| 43 static const int kShiftThresh = 1; | |
| 44 | |
| 45 // How much shifting from subscript to superscript and back | |
| 46 // before we declare shenanigans? | |
| 47 static const int kMaxEntropy = 1; | |
| 48 | |
| 49 // Script positions - order important for entropy calculation. | |
| 50 static const int kSUB = 0, kNORM = 1, kSUP = 2; /* kNORM is used below as the index into xht_lo and xht_hi; presumably kSUB and kSUP index the same per-position arrays (confirm in the implementation file). */ | |
| 51 static const int kNumPos = 3; /* Length of each per-position xht_* array declared below. */ | |
| 52 | |
| 53 explicit LMConsistencyInfo(const LMConsistencyInfo *parent_info) { /* With a null parent_info every member is set to its starting value; otherwise the whole parent record is copied. */ | |
| 54 if (parent_info == nullptr) { | |
| 55 // Initialize from scratch. | |
| 56 num_alphas = 0; | |
| 57 num_digits = 0; | |
| 58 num_punc = 0; | |
| 59 num_other = 0; | |
| 60 chartype = CT_NONE; | |
| 61 punc_ref = NO_EDGE; | |
| 62 invalid_punc = false; | |
| 63 num_non_first_upper = 0; | |
| 64 num_lower = 0; | |
| 65 script_id = 0; | |
| 66 inconsistent_script = false; | |
| 67 num_inconsistent_spaces = 0; | |
| 68 inconsistent_font = false; | |
| 69 // Initialize XHeight stats. | |
| 70 for (int i = 0; i < kNumPos; i++) { | |
| 71 xht_count[i] = 0; | |
| 72 xht_count_punc[i] = 0; | |
| 73 xht_lo[i] = 0; | |
| 74 xht_hi[i] = 256; // kBlnCellHeight | |
| 75 } | |
| 76 xht_sp = -1; // This invalid value indicates that there was no parent. | |
| 77 xpos_entropy = 0; | |
| 78 xht_decision = XH_GOOD; | |
| 79 } else { | |
| 80 // Copy parent info | |
| 81 *this = *parent_info; | |
| 82 } | |
| 83 } | |
| 84 inline int NumInconsistentPunc() const { /* Returns num_punc when invalid_punc is set, otherwise 0. */ | |
| 85 return invalid_punc ? num_punc : 0; | |
| 86 } | |
| 87 inline int NumInconsistentCase() const { /* Returns the smaller of num_non_first_upper and num_lower. */ | |
| 88 return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper; | |
| 89 } | |
| 90 inline int NumInconsistentChartype() const { /* Returns NumInconsistentPunc() plus num_other plus the smaller of num_alphas and num_digits. */ | |
| 91 return (NumInconsistentPunc() + num_other + | |
| 92 ((num_alphas > num_digits) ? num_digits : num_alphas)); | |
| 93 } | |
| 94 inline bool Consistent() const { /* True only when the punctuation, case and chartype counts are all zero and none of the script, font or x-height flags is raised; num_inconsistent_spaces is not consulted. */ | |
| 95 return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 && | |
| 96 NumInconsistentChartype() == 0 && !inconsistent_script && !inconsistent_font && | |
| 97 !InconsistentXHeight()); | |
| 98 } | |
| 99 inline int NumInconsistentSpaces() const { /* Plain accessor for num_inconsistent_spaces. */ | |
| 100 return num_inconsistent_spaces; | |
| 101 } | |
| 102 inline int InconsistentXHeight() const { /* Returns 1 when xht_decision equals XH_INCONSISTENT, otherwise 0 (a comparison result converted to int). */ | |
| 103 return xht_decision == XH_INCONSISTENT; | |
| 104 } | |
| 105 void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc); /* Declared only; the definition is not part of this header. */ | |
| 106 float BodyMinXHeight() const { /* Returns xht_lo[kNORM], or 0.0f when InconsistentXHeight() reports an inconsistency. */ | |
| 107 if (InconsistentXHeight()) { | |
| 108 return 0.0f; | |
| 109 } | |
| 110 return xht_lo[kNORM]; | |
| 111 } | |
| 112 float BodyMaxXHeight() const { /* Returns xht_hi[kNORM], or INT16_MAX converted to float when InconsistentXHeight() reports an inconsistency. */ | |
| 113 if (InconsistentXHeight()) { | |
| 114 return static_cast<float>(INT16_MAX); | |
| 115 } | |
| 116 return xht_hi[kNORM]; | |
| 117 } | |
| 118 | |
| 119 EDGE_REF punc_ref; | |
| 120 int num_alphas; | |
| 121 int num_digits; | |
| 122 int num_punc; | |
| 123 int num_other; | |
| 124 ChartypeEnum chartype; | |
| 125 XHeightConsistencyEnum xht_decision; | |
| 126 int num_non_first_upper; | |
| 127 int num_lower; | |
| 128 int script_id; | |
| 129 int num_inconsistent_spaces; | |
| 130 // Metrics clumped by position. | |
| 131 float xht_lo[kNumPos]; | |
| 132 float xht_hi[kNumPos]; | |
| 133 int16_t xht_count[kNumPos]; | |
| 134 int16_t xht_count_punc[kNumPos]; | |
| 135 int16_t xht_sp; | |
| 136 int16_t xpos_entropy; | |
| 137 bool invalid_punc; | |
| 138 bool inconsistent_script; | |
| 139 bool inconsistent_font; | |
| 140 }; | |
| 141 | |
| 142 } // namespace tesseract | |
| 143 | |
| 144 #endif // TESSERACT_WORDREC_LM_CONSISTENCY_H_ |
