Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/wordrec/language_model.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: language_model.h | |
| 3 // Description: Functions that utilize the knowledge about the properties, | |
| 4 // structure and statistics of the language to help segmentation | |
| 5 // search. | |
| 6 // Author: Daria Antonova | |
| 7 // | |
| 8 // (C) Copyright 2009, Google Inc. | |
| 9 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 // you may not use this file except in compliance with the License. | |
| 11 // You may obtain a copy of the License at | |
| 12 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 // Unless required by applicable law or agreed to in writing, software | |
| 14 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 // See the License for the specific language governing permissions and | |
| 17 // limitations under the License. | |
| 18 // | |
| 19 /////////////////////////////////////////////////////////////////////// | |
| 20 | |
| 21 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_ | |
| 22 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_ | |
| 23 | |
| 24 #include "associate.h" // for AssociateStats (ptr only), AssociateUtils | |
| 25 #include "dawg.h" // for DawgPositionVector | |
| 26 #include "dict.h" // for DawgArgs, Dict | |
| 27 #include "lm_consistency.h" // for LMConsistencyInfo | |
| 28 #include "lm_state.h" // for ViterbiStateEntry, LanguageModelFlagsType | |
| 29 #include "params.h" // for DoubleParam, double_VAR_H, IntParam, Boo... | |
| 30 #include "params_model.h" // for ParamsModel | |
| 31 #include "ratngs.h" // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST... | |
| 32 #include "stopper.h" // for DANGERR | |
| 33 | |
| 34 #include <cmath> // for exp | |
| 35 | |
| 36 namespace tesseract { | |
| 37 | |
| 38 class UNICHARSET; | |
| 39 class WERD_RES; | |
| 40 | |
| 41 struct BlamerBundle; | |
| 42 | |
| 43 template <typename T> | |
| 44 class UnicityTable; | |
| 45 | |
| 46 class LMPainPoints; | |
| 47 struct FontInfo; | |
| 48 | |
| 49 // This class that contains the data structures and functions necessary | |
| 50 // to represent and use the knowledge about the language. | |
| 51 class LanguageModel { | |
| 52 public: | |
| 53 // Masks for keeping track of top choices that should not be pruned out. | |
| 54 static const LanguageModelFlagsType kSmallestRatingFlag = 0x1; | |
| 55 static const LanguageModelFlagsType kLowerCaseFlag = 0x2; | |
| 56 static const LanguageModelFlagsType kUpperCaseFlag = 0x4; | |
| 57 static const LanguageModelFlagsType kDigitFlag = 0x8; | |
| 58 static const LanguageModelFlagsType kXhtConsistentFlag = 0x10; | |
| 59 | |
| 60 // Denominator for normalizing per-letter ngram cost when deriving | |
| 61 // penalty adjustments. | |
| 62 static const float kMaxAvgNgramCost; | |
| 63 | |
| 64 LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict); | |
| 65 ~LanguageModel(); | |
| 66 | |
| 67 // Fills the given floats array with features extracted from path represented | |
| 68 // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h | |
| 69 // for feature information. | |
| 70 // Note: the function assumes that features points to an array of size | |
| 71 // PTRAIN_NUM_FEATURE_TYPES. | |
| 72 static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[]); | |
| 73 | |
| 74 // Updates data structures that are used for the duration of the segmentation | |
| 75 // search on the current word; | |
| 76 void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, | |
| 77 float rating_cert_scale); | |
| 78 | |
| 79 // Updates language model state of the given BLOB_CHOICE_LIST (from | |
| 80 // the ratings matrix) and its parent. Updates pain_points if new | |
| 81 // problematic points are found in the segmentation graph. | |
| 82 // | |
| 83 // At most language_model_viterbi_list_size are kept in each | |
| 84 // LanguageModelState.viterbi_state_entries list. | |
| 85 // At most language_model_viterbi_list_max_num_prunable of those are prunable | |
| 86 // (non-dictionary) paths. | |
| 87 // The entries that represent dictionary word paths are kept at the front | |
| 88 // of the list. | |
| 89 // The list ordered by cost that is computed collectively by several | |
| 90 // language model components (currently dawg and ngram components). | |
| 91 bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, | |
| 92 LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, | |
| 93 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); | |
| 94 | |
| 95 // Returns true if an acceptable best choice was discovered. | |
| 96 inline bool AcceptableChoiceFound() { | |
| 97 return acceptable_choice_found_; | |
| 98 } | |
| 99 inline void SetAcceptableChoiceFound(bool val) { | |
| 100 acceptable_choice_found_ = val; | |
| 101 } | |
| 102 // Returns the reference to ParamsModel. | |
| 103 inline ParamsModel &getParamsModel() { | |
| 104 return params_model_; | |
| 105 } | |
| 106 | |
| 107 protected: | |
| 108 inline float CertaintyScore(float cert) { | |
| 109 if (language_model_use_sigmoidal_certainty) { | |
| 110 // cert is assumed to be between 0 and -dict_->certainty_scale. | |
| 111 // If you enable language_model_use_sigmoidal_certainty, you | |
| 112 // need to adjust language_model_ngram_nonmatch_score as well. | |
| 113 cert = -cert / dict_->certainty_scale; | |
| 114 return 1.0f / (1.0f + exp(10.0f * cert)); | |
| 115 } else { | |
| 116 return (-1.0f / cert); | |
| 117 } | |
| 118 } | |
| 119 | |
| 120 inline float ComputeAdjustment(int num_problems, float penalty) { | |
| 121 if (num_problems == 0) { | |
| 122 return 0.0f; | |
| 123 } | |
| 124 if (num_problems == 1) { | |
| 125 return penalty; | |
| 126 } | |
| 127 return (penalty + (language_model_penalty_increment * static_cast<float>(num_problems - 1))); | |
| 128 } | |
| 129 | |
| 130 // Computes the adjustment to the ratings sum based on the given | |
| 131 // consistency_info. The paths with invalid punctuation, inconsistent | |
| 132 // case and character type are penalized proportionally to the number | |
| 133 // of inconsistencies on the path. | |
| 134 inline float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, | |
| 135 const LMConsistencyInfo &consistency_info) { | |
| 136 if (dawg_info != nullptr) { | |
| 137 return ComputeAdjustment(consistency_info.NumInconsistentCase(), | |
| 138 language_model_penalty_case) + | |
| 139 (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f); | |
| 140 } | |
| 141 return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), language_model_penalty_punc) + | |
| 142 ComputeAdjustment(consistency_info.NumInconsistentCase(), language_model_penalty_case) + | |
| 143 ComputeAdjustment(consistency_info.NumInconsistentChartype(), | |
| 144 language_model_penalty_chartype) + | |
| 145 ComputeAdjustment(consistency_info.NumInconsistentSpaces(), | |
| 146 language_model_penalty_spacing) + | |
| 147 (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f) + | |
| 148 (consistency_info.inconsistent_font ? language_model_penalty_font : 0.0f)); | |
| 149 } | |
| 150 | |
| 151 // Returns an adjusted ratings sum that includes inconsistency penalties, | |
| 152 // penalties for non-dictionary paths and paths with dips in ngram | |
| 153 // probability. | |
| 154 float ComputeAdjustedPathCost(ViterbiStateEntry *vse); | |
| 155 | |
| 156 // Finds the first lower and upper case letter and first digit in curr_list. | |
| 157 // Uses the first character in the list in place of empty results. | |
| 158 // Returns true if both alpha and digits are found. | |
| 159 bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, | |
| 160 BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const; | |
| 161 // Forces there to be at least one entry in the overall set of the | |
| 162 // viterbi_state_entries of each element of parent_node that has the | |
| 163 // top_choice_flag set for lower, upper and digit using the same rules as | |
| 164 // GetTopLowerUpperDigit, setting the flag on the first found suitable | |
| 165 // candidate, whether or not the flag is set on some other parent. | |
| 166 // Returns 1 if both alpha and digits are found among the parents, -1 if no | |
| 167 // parents are found at all (a legitimate case), and 0 otherwise. | |
| 168 int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const; | |
| 169 | |
| 170 // Finds the next ViterbiStateEntry with which the given unichar_id can | |
| 171 // combine sensibly, taking into account any mixed alnum/mixed case | |
| 172 // situation, and whether this combination has been inspected before. | |
| 173 ViterbiStateEntry *GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, | |
| 174 LanguageModelFlagsType blob_choice_flags, | |
| 175 const UNICHARSET &unicharset, WERD_RES *word_res, | |
| 176 ViterbiStateEntry_IT *vse_it, | |
| 177 LanguageModelFlagsType *top_choice_flags) const; | |
| 178 // Helper function that computes the cost of the path composed of the | |
| 179 // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE. | |
| 180 // If the new path looks good enough, adds a new ViterbiStateEntry to the | |
| 181 // list of viterbi entries in the given BLOB_CHOICE and returns true. | |
| 182 bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, | |
| 183 int curr_col, int curr_row, BLOB_CHOICE *b, | |
| 184 LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, | |
| 185 LMPainPoints *pain_points, WERD_RES *word_res, | |
| 186 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); | |
| 187 | |
| 188 // Determines whether a potential entry is a true top choice and | |
| 189 // updates changed accordingly. | |
| 190 // | |
| 191 // Note: The function assumes that b, top_choice_flags and changed | |
| 192 // are not nullptr. | |
| 193 void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, | |
| 194 LanguageModelState *lms); | |
| 195 | |
| 196 // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and | |
| 197 // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo | |
| 198 // with updated active dawgs, constraints and permuter. | |
| 199 // | |
| 200 // Note: the caller is responsible for deleting the returned pointer. | |
| 201 LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int curr_col, int curr_row, | |
| 202 const BLOB_CHOICE &b, | |
| 203 const ViterbiStateEntry *parent_vse); | |
| 204 | |
| 205 // Computes p(unichar | parent context) and records it in ngram_cost. | |
| 206 // If b.unichar_id() is an unlikely continuation of the parent context | |
| 207 // sets found_small_prob to true and returns nullptr. | |
| 208 // Otherwise creates a new LanguageModelNgramInfo entry containing the | |
| 209 // updated context (that includes b.unichar_id() at the end) and returns it. | |
| 210 // | |
| 211 // Note: the caller is responsible for deleting the returned pointer. | |
| 212 LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar, float certainty, float denom, | |
| 213 int curr_col, int curr_row, float outline_length, | |
| 214 const ViterbiStateEntry *parent_vse); | |
| 215 | |
| 216 // Computes -(log(prob(classifier)) + log(prob(ngram model))) | |
| 217 // for the given unichar in the given context. If there are multiple | |
| 218 // unichars at one position - takes the average of their probabilities. | |
| 219 // UNICHAR::utf8_step() is used to separate out individual UTF8 characters, | |
| 220 // since probability_in_context() can only handle one at a time (while | |
| 221 // unicharset might contain ngrams and glyphs composed from multiple UTF8 | |
| 222 // characters). | |
| 223 float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, | |
| 224 int *unichar_step_len, bool *found_small_prob, float *ngram_prob); | |
| 225 | |
| 226 // Computes the normalization factors for the classifier confidences | |
| 227 // (used by ComputeNgramCost()). | |
| 228 float ComputeDenom(BLOB_CHOICE_LIST *curr_list); | |
| 229 | |
| 230 // Fills the given consistency_info based on parent_vse.consistency_info | |
| 231 // and on the consistency of the given unichar_id with parent_vse. | |
| 232 void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, | |
| 233 ViterbiStateEntry *parent_vse, WERD_RES *word_res, | |
| 234 LMConsistencyInfo *consistency_info); | |
| 235 | |
| 236 // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs | |
| 237 // on the path represented by the given BLOB_CHOICE and language model | |
| 238 // state entries (lmse, dse). The path is re-constructed by following | |
| 239 // the parent pointers in the lang model state entries). If the | |
| 240 // constructed WERD_CHOICE is better than the best/raw choice recorded | |
| 241 // in the best_choice_bundle, this function updates the corresponding | |
| 242 // fields and sets best_choice_bunldle->updated to true. | |
| 243 void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, | |
| 244 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); | |
| 245 | |
| 246 // Constructs a WERD_CHOICE by tracing parent pointers starting with | |
| 247 // the given LanguageModelStateEntry. Returns the constructed word. | |
| 248 // Updates best_char_choices, certainties and state if they are not | |
| 249 // nullptr (best_char_choices and certainties are assumed to have the | |
| 250 // length equal to lmse->length). | |
| 251 // The caller is responsible for freeing memory associated with the | |
| 252 // returned WERD_CHOICE. | |
| 253 WERD_CHOICE *ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, | |
| 254 BlamerBundle *blamer_bundle, bool *truth_path); | |
| 255 | |
| 256 // Wrapper around AssociateUtils::ComputeStats(). | |
| 257 inline void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, | |
| 258 ViterbiStateEntry *parent_vse, WERD_RES *word_res, | |
| 259 AssociateStats *associate_stats) { | |
| 260 AssociateUtils::ComputeStats( | |
| 261 col, row, (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr, | |
| 262 (parent_vse != nullptr) ? parent_vse->length : 0, fixed_pitch_, max_char_wh_ratio, word_res, | |
| 263 language_model_debug_level > 2, associate_stats); | |
| 264 } | |
| 265 | |
| 266 // Returns true if the path with such top_choice_flags and dawg_info | |
| 267 // could be pruned out (i.e. is neither a system/user/frequent dictionary | |
| 268 // nor a top choice path). | |
| 269 // In non-space delimited languages all paths can be "somewhat" dictionary | |
| 270 // words. In such languages we cannot do dictionary-driven path pruning, | |
| 271 // so paths with non-empty dawg_info are considered prunable. | |
| 272 inline bool PrunablePath(const ViterbiStateEntry &vse) { | |
| 273 if (vse.top_choice_flags) { | |
| 274 return false; | |
| 275 } | |
| 276 if (vse.dawg_info != nullptr && | |
| 277 (vse.dawg_info->permuter == SYSTEM_DAWG_PERM || vse.dawg_info->permuter == USER_DAWG_PERM || | |
| 278 vse.dawg_info->permuter == FREQ_DAWG_PERM)) { | |
| 279 return false; | |
| 280 } | |
| 281 return true; | |
| 282 } | |
| 283 | |
| 284 // Returns true if the given ViterbiStateEntry represents an acceptable path. | |
| 285 inline bool AcceptablePath(const ViterbiStateEntry &vse) { | |
| 286 return (vse.dawg_info != nullptr || vse.Consistent() || | |
| 287 (vse.ngram_info != nullptr && !vse.ngram_info->pruned)); | |
| 288 } | |
| 289 | |
| 290 public: | |
| 291 // Parameters. | |
| 292 INT_VAR_H(language_model_debug_level); | |
| 293 BOOL_VAR_H(language_model_ngram_on); | |
| 294 INT_VAR_H(language_model_ngram_order); | |
| 295 INT_VAR_H(language_model_viterbi_list_max_num_prunable); | |
| 296 INT_VAR_H(language_model_viterbi_list_max_size); | |
| 297 double_VAR_H(language_model_ngram_small_prob); | |
| 298 double_VAR_H(language_model_ngram_nonmatch_score); | |
| 299 BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step); | |
| 300 double_VAR_H(language_model_ngram_scale_factor); | |
| 301 double_VAR_H(language_model_ngram_rating_factor); | |
| 302 BOOL_VAR_H(language_model_ngram_space_delimited_language); | |
| 303 INT_VAR_H(language_model_min_compound_length); | |
| 304 // Penalties used for adjusting path costs and final word rating. | |
| 305 double_VAR_H(language_model_penalty_non_freq_dict_word); | |
| 306 double_VAR_H(language_model_penalty_non_dict_word); | |
| 307 double_VAR_H(language_model_penalty_punc); | |
| 308 double_VAR_H(language_model_penalty_case); | |
| 309 double_VAR_H(language_model_penalty_script); | |
| 310 double_VAR_H(language_model_penalty_chartype); | |
| 311 double_VAR_H(language_model_penalty_font); | |
| 312 double_VAR_H(language_model_penalty_spacing); | |
| 313 double_VAR_H(language_model_penalty_increment); | |
| 314 INT_VAR_H(wordrec_display_segmentations); | |
| 315 BOOL_VAR_H(language_model_use_sigmoidal_certainty); | |
| 316 | |
| 317 protected: | |
| 318 // Member Variables. | |
| 319 | |
| 320 // Temporary DawgArgs struct that is re-used across different words to | |
| 321 // avoid dynamic memory re-allocation (should be cleared before each use). | |
| 322 DawgArgs dawg_args_; | |
| 323 // Scaling for recovering blob outline length from rating and certainty. | |
| 324 float rating_cert_scale_ = 0.0f; | |
| 325 | |
| 326 // The following variables are set at construction time. | |
| 327 | |
| 328 // Pointer to fontinfo table (not owned by LanguageModel). | |
| 329 const UnicityTable<FontInfo> *fontinfo_table_ = nullptr; | |
| 330 | |
| 331 // Pointer to Dict class, that is used for querying the dictionaries | |
| 332 // (the pointer is not owned by LanguageModel). | |
| 333 Dict *dict_ = nullptr; | |
| 334 | |
| 335 // TODO(daria): the following variables should become LanguageModel params | |
| 336 // when the old code in bestfirst.cpp and heuristic.cpp is deprecated. | |
| 337 // | |
| 338 // Set to true if we are dealing with fixed pitch text | |
| 339 // (set to assume_fixed_pitch_char_segment). | |
| 340 bool fixed_pitch_ = false; | |
| 341 // Max char width-to-height ratio allowed | |
| 342 // (set to segsearch_max_char_wh_ratio). | |
| 343 float max_char_wh_ratio_ = 0.0f; | |
| 344 | |
| 345 // The following variables are initialized with InitForWord(). | |
| 346 | |
| 347 // String representation of the classification of the previous word | |
| 348 // (since this is only used by the character ngram model component, | |
| 349 // only the last language_model_ngram_order of the word are stored). | |
| 350 std::string prev_word_str_; | |
| 351 int prev_word_unichar_step_len_ = 0; | |
| 352 // Active dawg vector. | |
| 353 DawgPositionVector very_beginning_active_dawgs_; // includes continuation | |
| 354 DawgPositionVector beginning_active_dawgs_; | |
| 355 // Set to true if acceptable choice was discovered. | |
| 356 // Note: it would be nice to use this to terminate the search once an | |
| 357 // acceptable choices is found. However we do not do that and once an | |
| 358 // acceptable choice is found we finish looking for alternative choices | |
| 359 // in the current segmentation graph and then exit the search (no more | |
| 360 // classifications are done after an acceptable choice is found). | |
| 361 // This is needed in order to let the search find the words very close to | |
| 362 // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these | |
| 363 // choices. This way the stopper will know that the best choice is not | |
| 364 // ambiguous (i.e. there are best choices in the best choice list that have | |
| 365 // ratings close to the very best one) and will be less likely to mis-adapt. | |
| 366 bool acceptable_choice_found_ = false; | |
| 367 // Set to true if a choice representing correct segmentation was explored. | |
| 368 bool correct_segmentation_explored_ = false; | |
| 369 | |
| 370 // Params models containing weights for computing ViterbiStateEntry costs. | |
| 371 ParamsModel params_model_; | |
| 372 }; | |
| 373 | |
| 374 } // namespace tesseract | |
| 375 | |
| 376 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_ |
