Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/wordrec/lm_state.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: lm_state.h | |
| 3 // Description: Structures and functionality for capturing the state of | |
| 4 // segmentation search guided by the language model. | |
| 5 // Author: Rika Antonova | |
| 6 // | |
| 7 // (C) Copyright 2012, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 /////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_ | |
| 21 #define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_ | |
| 22 | |
| 23 #include <tesseract/unichar.h> // for UNICHAR_ID | |
| 24 #include "associate.h" // for AssociateStats | |
| 25 #include "dawg.h" // for DawgPositionVector | |
| 26 #include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK | |
| 27 #include "lm_consistency.h" // for LMConsistencyInfo | |
| 28 #include "ratngs.h" // for BLOB_CHOICE, PermuterType | |
| 29 #include "stopper.h" // for DANGERR | |
| 30 #include "unicharset.h" // for UNICHARSET | |
| 31 | |
| 32 namespace tesseract { | |
| 33 | |
| 34 /// Used for expressing various language model flags. | |
| 35 using LanguageModelFlagsType = unsigned char; | |
| 36 | |
| 37 /// The following structs are used for storing the state of the language model | |
| 38 /// in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs | |
| 39 /// and the links are the relationships between the underlying blobs (see | |
| 40 /// segsearch.h for a more detailed description). | |
| 41 /// | |
| 42 /// Each of the BLOB_CHOICEs contains LanguageModelState struct, which has | |
| 43 /// a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi | |
| 44 /// search leading up to and including this BLOB_CHOICE. | |
| 45 /// | |
| 46 /// Each ViterbiStateEntry contains information from various components of the | |
| 47 /// language model: dawgs in which the path is found, character ngram model | |
| 48 /// probability of the path, script/chartype/font consistency info, state for | |
| 49 /// language-specific heuristics (e.g. hyphenated and compound words, | |
| 50 /// lower/upper case preferences, etc). | |
| 51 /// | |
| 52 /// Each ViterbiStateEntry also contains the parent pointer, so that the path | |
| 53 /// that it represents (WERD_CHOICE) can be constructed by following these | |
| 54 /// parent pointers. | |
| 55 | |
| 56 /// Struct for storing additional information used by Dawg language model | |
| 57 /// component. It stores the set of active dawgs in which the sequence of | |
| 58 /// letters on a path can be found. | |
| 59 struct LanguageModelDawgInfo { | |
| 60 LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt) | |
| 61 : active_dawgs(*a), permuter(pt) {} | |
| 62 DawgPositionVector active_dawgs; | |
| 63 PermuterType permuter; | |
| 64 }; | |
| 65 | |
| 66 /// Struct for storing additional information used by Ngram language model | |
| 67 /// component. | |
| 68 struct LanguageModelNgramInfo { | |
| 69 LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc) | |
| 70 : context(c) | |
| 71 , context_unichar_step_len(l) | |
| 72 , pruned(p) | |
| 73 , ngram_cost(nc) | |
| 74 , ngram_and_classifier_cost(ncc) {} | |
| 75 std::string context; ///< context string | |
| 76 /// Length of the context measured by advancing using UNICHAR::utf8_step() | |
| 77 /// (should be at most the order of the character ngram model used). | |
| 78 int context_unichar_step_len; | |
| 79 /// The paths with pruned set are pruned out from the perspective of the | |
| 80 /// character ngram model. They are explored further because they represent | |
| 81 /// a dictionary match or a top choice. Thus ngram_info is still computed | |
| 82 /// for them in order to calculate the combined cost. | |
| 83 bool pruned; | |
| 84 /// -ln(P_ngram_model(path)) | |
| 85 float ngram_cost; | |
| 86 /// -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ] | |
| 87 float ngram_and_classifier_cost; | |
| 88 }; | |
| 89 | |
| 90 /// Struct for storing the information about a path in the segmentation graph | |
| 91 /// explored by Viterbi search. | |
| 92 struct ViterbiStateEntry : public ELIST_LINK { | |
| 93 ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol, | |
| 94 const LMConsistencyInfo &ci, const AssociateStats &as, | |
| 95 LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n, | |
| 96 const char *debug_uch) | |
| 97 : curr_b(b) | |
| 98 , parent_vse(pe) | |
| 99 , competing_vse(nullptr) | |
| 100 , dawg_info(d) | |
| 101 , ngram_info(n) | |
| 102 , cost(c) | |
| 103 , ratings_sum(b->rating()) | |
| 104 , min_certainty(b->certainty()) | |
| 105 , adapted(b->IsAdapted()) | |
| 106 , length(1) | |
| 107 , outline_length(ol) | |
| 108 , consistency_info(ci) | |
| 109 , associate_stats(as) | |
| 110 , top_choice_flags(tcf) | |
| 111 , updated(true) { | |
| 112 debug_str = (debug_uch == nullptr) ? nullptr : new std::string(); | |
| 113 if (pe != nullptr) { | |
| 114 ratings_sum += pe->ratings_sum; | |
| 115 if (pe->min_certainty < min_certainty) { | |
| 116 min_certainty = pe->min_certainty; | |
| 117 } | |
| 118 adapted += pe->adapted; | |
| 119 length += pe->length; | |
| 120 outline_length += pe->outline_length; | |
| 121 if (debug_uch != nullptr) { | |
| 122 *debug_str += *(pe->debug_str); | |
| 123 } | |
| 124 } | |
| 125 if (debug_str != nullptr && debug_uch != nullptr) { | |
| 126 *debug_str += debug_uch; | |
| 127 } | |
| 128 } | |
| 129 ~ViterbiStateEntry() { | |
| 130 delete dawg_info; | |
| 131 delete ngram_info; | |
| 132 delete debug_str; | |
| 133 } | |
| 134 /// Comparator function for sorting ViterbiStateEntry_LISTs in | |
| 135 /// non-increasing order of costs. | |
| 136 static int Compare(const void *e1, const void *e2) { | |
| 137 const ViterbiStateEntry *ve1 = *static_cast<const ViterbiStateEntry *const *>(e1); | |
| 138 const ViterbiStateEntry *ve2 = *static_cast<const ViterbiStateEntry *const *>(e2); | |
| 139 return (ve1->cost < ve2->cost) ? -1 : 1; | |
| 140 } | |
| 141 inline bool Consistent() const { | |
| 142 if (dawg_info != nullptr && consistency_info.NumInconsistentCase() == 0) { | |
| 143 return true; | |
| 144 } | |
| 145 return consistency_info.Consistent(); | |
| 146 } | |
| 147 /// Returns true if this VSE has an alphanumeric character as its classifier | |
| 148 /// result. | |
| 149 bool HasAlnumChoice(const UNICHARSET &unicharset) { | |
| 150 if (curr_b == nullptr) { | |
| 151 return false; | |
| 152 } | |
| 153 UNICHAR_ID unichar_id = curr_b->unichar_id(); | |
| 154 if (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id)) { | |
| 155 return true; | |
| 156 } | |
| 157 return false; | |
| 158 } | |
| 159 void Print(const char *msg) const; | |
| 160 | |
| 161 /// Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this). | |
| 162 BLOB_CHOICE *curr_b; | |
| 163 ViterbiStateEntry *parent_vse; | |
| 164 /// Pointer to a case-competing ViterbiStateEntry in the same list that | |
| 165 /// represents a path ending in the same letter of the opposite case. | |
| 166 ViterbiStateEntry *competing_vse; | |
| 167 | |
| 168 /// Extra information maintained by Dawg language model component | |
| 169 /// (owned by ViterbiStateEntry). | |
| 170 LanguageModelDawgInfo *dawg_info; | |
| 171 | |
| 172 /// Extra information maintained by Ngram language model component | |
| 173 /// (owned by ViterbiStateEntry). | |
| 174 LanguageModelNgramInfo *ngram_info; | |
| 175 | |
| 176 /// UTF8 string representing the path corresponding to this vse. | |
| 177 /// Populated only in when language_model_debug_level > 0. | |
| 178 std::string *debug_str; | |
| 179 | |
| 180 /// The cost is an adjusted ratings sum, that is adjusted by all the language | |
| 181 /// model components that use Viterbi search. | |
| 182 float cost; | |
| 183 | |
| 184 /// Various information about the characters on the path represented | |
| 185 /// by this ViterbiStateEntry. | |
| 186 float ratings_sum; ///< sum of ratings of character on the path | |
| 187 float min_certainty; ///< minimum certainty on the path | |
| 188 int adapted; ///< number of BLOB_CHOICES from adapted templates | |
| 189 int length; ///< number of characters on the path | |
| 190 float outline_length; ///< length of the outline so far | |
| 191 LMConsistencyInfo consistency_info; ///< path consistency info | |
| 192 AssociateStats associate_stats; ///< character widths/gaps/seams | |
| 193 | |
| 194 /// Flags for marking the entry as a top choice path with | |
| 195 /// the smallest rating or lower/upper case letters). | |
| 196 LanguageModelFlagsType top_choice_flags; | |
| 197 | |
| 198 bool updated; ///< set to true if the entry has just been created/updated | |
| 199 }; | |
| 200 | |
| 201 ELISTIZEH(ViterbiStateEntry) | |
| 202 | |
| 203 /// Struct to store information maintained by various language model components. | |
| 204 struct LanguageModelState { | |
| 205 LanguageModelState() | |
| 206 : viterbi_state_entries_prunable_length(0) | |
| 207 , viterbi_state_entries_prunable_max_cost(FLT_MAX) | |
| 208 , viterbi_state_entries_length(0) {} | |
| 209 ~LanguageModelState() = default; | |
| 210 | |
| 211 /// Clears the viterbi search state back to its initial conditions. | |
| 212 void Clear(); | |
| 213 | |
| 214 void Print(const char *msg); | |
| 215 | |
| 216 /// Storage for the Viterbi state. | |
| 217 ViterbiStateEntry_LIST viterbi_state_entries; | |
| 218 /// Number and max cost of prunable paths in viterbi_state_entries. | |
| 219 int viterbi_state_entries_prunable_length; | |
| 220 float viterbi_state_entries_prunable_max_cost; | |
| 221 /// Total number of entries in viterbi_state_entries. | |
| 222 int viterbi_state_entries_length; | |
| 223 }; | |
| 224 | |
| 225 /// Bundle together all the things pertaining to the best choice/state. | |
| 226 struct BestChoiceBundle { | |
| 227 explicit BestChoiceBundle(int matrix_dimension) : updated(false), best_vse(nullptr) { | |
| 228 beam.reserve(matrix_dimension); | |
| 229 for (int i = 0; i < matrix_dimension; ++i) { | |
| 230 beam.push_back(new LanguageModelState); | |
| 231 } | |
| 232 } | |
| 233 ~BestChoiceBundle() { | |
| 234 for (auto &state : beam) { | |
| 235 delete state; | |
| 236 } | |
| 237 } | |
| 238 | |
| 239 /// Flag to indicate whether anything was changed. | |
| 240 bool updated; | |
| 241 /// Places to try to fix the word suggested by ambiguity checking. | |
| 242 DANGERR fixpt; | |
| 243 /// The beam. One LanguageModelState containing a list of ViterbiStateEntry | |
| 244 /// per row in the ratings matrix containing all VSEs whose BLOB_CHOICE is | |
| 245 /// somewhere in the corresponding row. | |
| 246 std::vector<LanguageModelState *> beam; | |
| 247 /// Best ViterbiStateEntry and BLOB_CHOICE. | |
| 248 ViterbiStateEntry *best_vse; | |
| 249 }; | |
| 250 | |
| 251 } // namespace tesseract | |
| 252 | |
| 253 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_ |
