comparison mupdf-source/thirdparty/tesseract/src/wordrec/lm_state.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: lm_state.h
3 // Description: Structures and functionality for capturing the state of
4 // segmentation search guided by the language model.
5 // Author: Rika Antonova
6 //
7 // (C) Copyright 2012, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19
20 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
21 #define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
22
#include <tesseract/unichar.h> // for UNICHAR_ID

#include "associate.h"      // for AssociateStats
#include "dawg.h"           // for DawgPositionVector
#include "elst.h"           // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
#include "lm_consistency.h" // for LMConsistencyInfo
#include "ratngs.h"         // for BLOB_CHOICE, PermuterType
#include "stopper.h"        // for DANGERR
#include "unicharset.h"     // for UNICHARSET

#include <cfloat> // for FLT_MAX
#include <limits> // for std::numeric_limits
#include <string> // for std::string
#include <vector> // for std::vector
31
32 namespace tesseract {
33
/// Used for expressing various language model flags.
/// NOTE(review): stored per path as ViterbiStateEntry::top_choice_flags;
/// presumably kept as unsigned char to keep each entry compact — confirm
/// no user needs more than 8 flag bits before widening.
using LanguageModelFlagsType = unsigned char;
36
37 /// The following structs are used for storing the state of the language model
38 /// in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs
39 /// and the links are the relationships between the underlying blobs (see
40 /// segsearch.h for a more detailed description).
41 ///
42 /// Each of the BLOB_CHOICEs contains LanguageModelState struct, which has
43 /// a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi
44 /// search leading up to and including this BLOB_CHOICE.
45 ///
46 /// Each ViterbiStateEntry contains information from various components of the
47 /// language model: dawgs in which the path is found, character ngram model
48 /// probability of the path, script/chartype/font consistency info, state for
49 /// language-specific heuristics (e.g. hyphenated and compound words,
50 /// lower/upper case preferences, etc).
51 ///
52 /// Each ViterbiStateEntry also contains the parent pointer, so that the path
53 /// that it represents (WERD_CHOICE) can be constructed by following these
54 /// parent pointers.
55
56 /// Struct for storing additional information used by Dawg language model
57 /// component. It stores the set of active dawgs in which the sequence of
58 /// letters on a path can be found.
59 struct LanguageModelDawgInfo {
60 LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt)
61 : active_dawgs(*a), permuter(pt) {}
62 DawgPositionVector active_dawgs;
63 PermuterType permuter;
64 };
65
/// Ngram-component bookkeeping attached to one search path: the textual
/// context, its length, pruning status, and the ngram-model costs.
struct LanguageModelNgramInfo {
  LanguageModelNgramInfo(const char *ctx, int ctx_step_len, bool is_pruned, float cost_ngram,
                         float cost_combined)
      : context(ctx)
      , context_unichar_step_len(ctx_step_len)
      , pruned(is_pruned)
      , ngram_cost(cost_ngram)
      , ngram_and_classifier_cost(cost_combined) {}
  std::string context; ///< context string
  /// Length of the context measured by advancing using UNICHAR::utf8_step()
  /// (should be at most the order of the character ngram model used).
  int context_unichar_step_len;
  /// Paths marked pruned are cut off from the character ngram model's point
  /// of view, but are still explored because they represent a dictionary
  /// match or a top choice; ngram_info is therefore still computed for them
  /// so the combined cost can be calculated.
  bool pruned;
  /// -ln(P_ngram_model(path))
  float ngram_cost;
  /// -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
  float ngram_and_classifier_cost;
};
89
90 /// Struct for storing the information about a path in the segmentation graph
91 /// explored by Viterbi search.
/// Struct for storing the information about a path in the segmentation graph
/// explored by Viterbi search.
///
/// Entries live in the list held by a BLOB_CHOICE's LanguageModelState; the
/// chain of parent_vse pointers reconstructs the full path (WERD_CHOICE)
/// ending at that choice.
struct ViterbiStateEntry : public ELIST_LINK {
  /// Builds an entry extending parent path pe (nullptr for a path start)
  /// with blob choice b at cost c and outline length ol.
  ///
  /// Takes ownership of d (dawg info) and n (ngram info); both are deleted
  /// by the destructor. The path aggregates (ratings_sum, min_certainty,
  /// adapted, length, outline_length) are seeded from b and then folded
  /// together with the parent's totals. debug_str is allocated — and the
  /// parent's debug string plus debug_uch appended to it — only when
  /// debug_uch is non-null.
  ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol,
                    const LMConsistencyInfo &ci, const AssociateStats &as,
                    LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n,
                    const char *debug_uch)
      : curr_b(b)
      , parent_vse(pe)
      , competing_vse(nullptr)
      , dawg_info(d)
      , ngram_info(n)
      , cost(c)
      , ratings_sum(b->rating())
      , min_certainty(b->certainty())
      , adapted(b->IsAdapted())
      , length(1)
      , outline_length(ol)
      , consistency_info(ci)
      , associate_stats(as)
      , top_choice_flags(tcf)
      , updated(true) {
    debug_str = (debug_uch == nullptr) ? nullptr : new std::string();
    if (pe != nullptr) {
      // Fold the parent's path totals into this entry's aggregates.
      ratings_sum += pe->ratings_sum;
      if (pe->min_certainty < min_certainty) {
        min_certainty = pe->min_certainty;
      }
      adapted += pe->adapted;
      length += pe->length;
      outline_length += pe->outline_length;
      if (debug_uch != nullptr) {
        // NOTE(review): assumes the parent also carries a debug string,
        // i.e. debugging was enabled for the whole path — verify callers
        // never mix debug and non-debug entries on one path.
        *debug_str += *(pe->debug_str);
      }
    }
    if (debug_str != nullptr && debug_uch != nullptr) {
      *debug_str += debug_uch;
    }
  }
  /// Releases the owned dawg/ngram info and debug string; curr_b,
  /// parent_vse and competing_vse are non-owning and left alone.
  ~ViterbiStateEntry() {
    delete dawg_info;
    delete ngram_info;
    delete debug_str;
  }
  /// qsort-style comparator on cost for sorting ViterbiStateEntry_LISTs:
  /// returns -1 when e1 is cheaper, otherwise 1 — so under a conventional
  /// qsort the cheapest entry sorts first. It never returns 0; entries with
  /// equal cost keep an arbitrary relative order.
  /// NOTE(review): the original comment described the result as
  /// "non-increasing order of costs", which contradicts the -1-when-smaller
  /// convention — confirm against ELIST::sort's contract.
  static int Compare(const void *e1, const void *e2) {
    const ViterbiStateEntry *ve1 = *static_cast<const ViterbiStateEntry *const *>(e1);
    const ViterbiStateEntry *ve2 = *static_cast<const ViterbiStateEntry *const *>(e2);
    return (ve1->cost < ve2->cost) ? -1 : 1;
  }
  /// Returns true if the path is consistent: case inconsistencies are
  /// forgiven when the path is a dictionary (dawg) match, otherwise
  /// consistency_info decides.
  inline bool Consistent() const {
    if (dawg_info != nullptr && consistency_info.NumInconsistentCase() == 0) {
      return true;
    }
    return consistency_info.Consistent();
  }
  /// Returns true if this VSE has an alphanumeric character as its classifier
  /// result (false when there is no current blob choice).
  bool HasAlnumChoice(const UNICHARSET &unicharset) {
    if (curr_b == nullptr) {
      return false;
    }
    UNICHAR_ID unichar_id = curr_b->unichar_id();
    if (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id)) {
      return true;
    }
    return false;
  }
  /// Prints this entry prefixed by msg (defined in the .cpp).
  void Print(const char *msg) const;

  /// Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
  BLOB_CHOICE *curr_b;
  ViterbiStateEntry *parent_vse;
  /// Pointer to a case-competing ViterbiStateEntry in the same list that
  /// represents a path ending in the same letter of the opposite case
  /// (not owned by this).
  ViterbiStateEntry *competing_vse;

  /// Extra information maintained by Dawg language model component
  /// (owned by ViterbiStateEntry).
  LanguageModelDawgInfo *dawg_info;

  /// Extra information maintained by Ngram language model component
  /// (owned by ViterbiStateEntry).
  LanguageModelNgramInfo *ngram_info;

  /// UTF8 string representing the path corresponding to this vse.
  /// Populated only in when language_model_debug_level > 0
  /// (owned by ViterbiStateEntry; nullptr otherwise).
  std::string *debug_str;

  /// The cost is an adjusted ratings sum, that is adjusted by all the language
  /// model components that use Viterbi search.
  float cost;

  /// Various information about the characters on the path represented
  /// by this ViterbiStateEntry.
  float ratings_sum; ///< sum of ratings of character on the path
  float min_certainty; ///< minimum certainty on the path
  int adapted; ///< number of BLOB_CHOICES from adapted templates
  int length; ///< number of characters on the path
  float outline_length; ///< length of the outline so far
  LMConsistencyInfo consistency_info; ///< path consistency info
  AssociateStats associate_stats; ///< character widths/gaps/seams

  /// Flags for marking the entry as a top choice path with
  /// the smallest rating or lower/upper case letters).
  LanguageModelFlagsType top_choice_flags;

  bool updated; ///< set to true if the entry has just been created/updated
};

// ELIST boilerplate: declares ViterbiStateEntry_LIST and ViterbiStateEntry_IT.
ELISTIZEH(ViterbiStateEntry)
202
203 /// Struct to store information maintained by various language model components.
204 struct LanguageModelState {
205 LanguageModelState()
206 : viterbi_state_entries_prunable_length(0)
207 , viterbi_state_entries_prunable_max_cost(FLT_MAX)
208 , viterbi_state_entries_length(0) {}
209 ~LanguageModelState() = default;
210
211 /// Clears the viterbi search state back to its initial conditions.
212 void Clear();
213
214 void Print(const char *msg);
215
216 /// Storage for the Viterbi state.
217 ViterbiStateEntry_LIST viterbi_state_entries;
218 /// Number and max cost of prunable paths in viterbi_state_entries.
219 int viterbi_state_entries_prunable_length;
220 float viterbi_state_entries_prunable_max_cost;
221 /// Total number of entries in viterbi_state_entries.
222 int viterbi_state_entries_length;
223 };
224
225 /// Bundle together all the things pertaining to the best choice/state.
226 struct BestChoiceBundle {
227 explicit BestChoiceBundle(int matrix_dimension) : updated(false), best_vse(nullptr) {
228 beam.reserve(matrix_dimension);
229 for (int i = 0; i < matrix_dimension; ++i) {
230 beam.push_back(new LanguageModelState);
231 }
232 }
233 ~BestChoiceBundle() {
234 for (auto &state : beam) {
235 delete state;
236 }
237 }
238
239 /// Flag to indicate whether anything was changed.
240 bool updated;
241 /// Places to try to fix the word suggested by ambiguity checking.
242 DANGERR fixpt;
243 /// The beam. One LanguageModelState containing a list of ViterbiStateEntry
244 /// per row in the ratings matrix containing all VSEs whose BLOB_CHOICE is
245 /// somewhere in the corresponding row.
246 std::vector<LanguageModelState *> beam;
247 /// Best ViterbiStateEntry and BLOB_CHOICE.
248 ViterbiStateEntry *best_vse;
249 };
250
251 } // namespace tesseract
252
253 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_