diff mupdf-source/thirdparty/tesseract/src/wordrec/language_model.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer contains a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/wordrec/language_model.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,376 @@
+///////////////////////////////////////////////////////////////////////
+// File:        language_model.h
+// Description: Functions that utilize the knowledge about the properties,
+//              structure and statistics of the language to help segmentation
+//              search.
+// Author:      Daria Antonova
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
+#define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
+
+#include "associate.h"      // for AssociateStats (ptr only), AssociateUtils
+#include "dawg.h"           // for DawgPositionVector
+#include "dict.h"           // for DawgArgs, Dict
+#include "lm_consistency.h" // for LMConsistencyInfo
+#include "lm_state.h"       // for ViterbiStateEntry, LanguageModelFlagsType
+#include "params.h"         // for DoubleParam, double_VAR_H, IntParam, Boo...
+#include "params_model.h"   // for ParamsModel
+#include "ratngs.h"         // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST...
+#include "stopper.h"        // for DANGERR
+
+#include <cmath> // for exp
+
+namespace tesseract {
+
+class UNICHARSET;
+class WERD_RES;
+
+struct BlamerBundle;
+
+template <typename T>
+class UnicityTable;
+
+class LMPainPoints;
+struct FontInfo;
+
+// This class contains the data structures and functions necessary
+// to represent and use the knowledge about the language.
+class LanguageModel {
+public:
+  // Masks for keeping track of top choices that should not be pruned out.
+  static const LanguageModelFlagsType kSmallestRatingFlag = 0x1;
+  static const LanguageModelFlagsType kLowerCaseFlag = 0x2;
+  static const LanguageModelFlagsType kUpperCaseFlag = 0x4;
+  static const LanguageModelFlagsType kDigitFlag = 0x8;
+  static const LanguageModelFlagsType kXhtConsistentFlag = 0x10;
+
+  // Denominator for normalizing per-letter ngram cost when deriving
+  // penalty adjustments.
+  static const float kMaxAvgNgramCost;
+
+  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
+  ~LanguageModel();
+
+  // Fills the given floats array with features extracted from path represented
+  // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h
+  // for feature information.
+  // Note: the function assumes that features points to an array of size
+  // PTRAIN_NUM_FEATURE_TYPES.
+  static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[]);
+
+  // Updates data structures that are used for the duration of the segmentation
+  // search on the current word.
+  void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio,
+                   float rating_cert_scale);
+
+  // Updates language model state of the given BLOB_CHOICE_LIST (from
+  // the ratings matrix) and its parent. Updates pain_points if new
+  // problematic points are found in the segmentation graph.
+  //
+  // At most language_model_viterbi_list_max_size entries are kept in each
+  // LanguageModelState.viterbi_state_entries list.
+  // At most language_model_viterbi_list_max_num_prunable of those are prunable
+  // (non-dictionary) paths.
+  // The entries that represent dictionary word paths are kept at the front
+  // of the list.
+  // The list is ordered by cost that is computed collectively by several
+  // language model components (currently dawg and ngram components).
+  bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list,
+                   LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res,
+                   BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
+
+  // Returns true if an acceptable best choice was discovered.
+  inline bool AcceptableChoiceFound() {
+    return acceptable_choice_found_;
+  }
+  inline void SetAcceptableChoiceFound(bool val) {
+    acceptable_choice_found_ = val;
+  }
+  // Returns the reference to ParamsModel.
+  inline ParamsModel &getParamsModel() {
+    return params_model_;
+  }
+
+protected:
+  inline float CertaintyScore(float cert) {
+    if (language_model_use_sigmoidal_certainty) {
+      // cert is assumed to be between 0 and -dict_->certainty_scale.
+      // If you enable language_model_use_sigmoidal_certainty, you
+      // need to adjust language_model_ngram_nonmatch_score as well.
+      cert = -cert / dict_->certainty_scale;
+      return 1.0f / (1.0f + exp(10.0f * cert));
+    } else {
+      return (-1.0f / cert);
+    }
+  }
+
+  inline float ComputeAdjustment(int num_problems, float penalty) {
+    if (num_problems == 0) {
+      return 0.0f;
+    }
+    if (num_problems == 1) {
+      return penalty;
+    }
+    return (penalty + (language_model_penalty_increment * static_cast<float>(num_problems - 1)));
+  }
+
+  // Computes the adjustment to the ratings sum based on the given
+  // consistency_info. The paths with invalid punctuation, inconsistent
+  // case and character type are penalized proportionally to the number
+  // of inconsistencies on the path.
+  inline float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info,
+                                            const LMConsistencyInfo &consistency_info) {
+    if (dawg_info != nullptr) {
+      return ComputeAdjustment(consistency_info.NumInconsistentCase(),
+                               language_model_penalty_case) +
+             (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f);
+    }
+    return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), language_model_penalty_punc) +
+            ComputeAdjustment(consistency_info.NumInconsistentCase(), language_model_penalty_case) +
+            ComputeAdjustment(consistency_info.NumInconsistentChartype(),
+                              language_model_penalty_chartype) +
+            ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
+                              language_model_penalty_spacing) +
+            (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f) +
+            (consistency_info.inconsistent_font ? language_model_penalty_font : 0.0f));
+  }
+
+  // Returns an adjusted ratings sum that includes inconsistency penalties,
+  // penalties for non-dictionary paths and paths with dips in ngram
+  // probability.
+  float ComputeAdjustedPathCost(ViterbiStateEntry *vse);
+
+  // Finds the first lower and upper case letter and first digit in curr_list.
+  // Uses the first character in the list in place of empty results.
+  // Returns true if both alpha and digits are found.
+  bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower,
+                             BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const;
+
+  // Forces there to be at least one entry in the overall set of the
+  // viterbi_state_entries of each element of parent_node that has the
+  // top_choice_flag set for lower, upper and digit using the same rules as
+  // GetTopLowerUpperDigit, setting the flag on the first found suitable
+  // candidate, whether or not the flag is set on some other parent.
+  // Returns 1 if both alpha and digits are found among the parents, -1 if no
+  // parents are found at all (a legitimate case), and 0 otherwise.
+  int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;
+
+  // Finds the next ViterbiStateEntry with which the given unichar_id can
+  // combine sensibly, taking into account any mixed alnum/mixed case
+  // situation, and whether this combination has been inspected before.
+  ViterbiStateEntry *GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc,
+                                      LanguageModelFlagsType blob_choice_flags,
+                                      const UNICHARSET &unicharset, WERD_RES *word_res,
+                                      ViterbiStateEntry_IT *vse_it,
+                                      LanguageModelFlagsType *top_choice_flags) const;
+
+  // Helper function that computes the cost of the path composed of the
+  // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
+  // If the new path looks good enough, adds a new ViterbiStateEntry to the
+  // list of viterbi entries in the given BLOB_CHOICE and returns true.
+  bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end,
+                            int curr_col, int curr_row, BLOB_CHOICE *b,
+                            LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,
+                            LMPainPoints *pain_points, WERD_RES *word_res,
+                            BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
+
+  // Determines whether a potential entry is a true top choice and
+  // updates changed accordingly.
+  //
+  // Note: The function assumes that b, top_choice_flags and changed
+  // are not nullptr.
+  void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse,
+                             LanguageModelState *lms);
+
+  // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
+  // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
+  // with updated active dawgs, constraints and permuter.
+  //
+  // Note: the caller is responsible for deleting the returned pointer.
+  LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int curr_col, int curr_row,
+                                          const BLOB_CHOICE &b,
+                                          const ViterbiStateEntry *parent_vse);
+
+  // Computes p(unichar | parent context) and records it in ngram_cost.
+  // If b.unichar_id() is an unlikely continuation of the parent context,
+  // sets found_small_prob to true and returns nullptr.
+  // Otherwise creates a new LanguageModelNgramInfo entry containing the
+  // updated context (that includes b.unichar_id() at the end) and returns it.
+  //
+  // Note: the caller is responsible for deleting the returned pointer.
+  LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar, float certainty, float denom,
+                                            int curr_col, int curr_row, float outline_length,
+                                            const ViterbiStateEntry *parent_vse);
+
+  // Computes -(log(prob(classifier)) + log(prob(ngram model)))
+  // for the given unichar in the given context. If there are multiple
+  // unichars at one position, takes the average of their probabilities.
+  // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
+  // since probability_in_context() can only handle one at a time (while
+  // unicharset might contain ngrams and glyphs composed from multiple UTF8
+  // characters).
+  float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context,
+                         int *unichar_step_len, bool *found_small_prob, float *ngram_prob);
+
+  // Computes the normalization factors for the classifier confidences
+  // (used by ComputeNgramCost()).
+  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
+
+  // Fills the given consistency_info based on parent_vse.consistency_info
+  // and on the consistency of the given unichar_id with parent_vse.
+  void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b,
+                           ViterbiStateEntry *parent_vse, WERD_RES *word_res,
+                           LMConsistencyInfo *consistency_info);
+
+  // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
+  // on the path represented by the given BLOB_CHOICE and language model
+  // state entries (lmse, dse). The path is re-constructed by following
+  // the parent pointers in the lang model state entries. If the
+  // constructed WERD_CHOICE is better than the best/raw choice recorded
+  // in the best_choice_bundle, this function updates the corresponding
+  // fields and sets best_choice_bundle->updated to true.
+  void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res,
+                        BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
+
+  // Constructs a WERD_CHOICE by tracing parent pointers starting with
+  // the given LanguageModelStateEntry. Returns the constructed word.
+  // Updates best_char_choices, certainties and state if they are not
+  // nullptr (best_char_choices and certainties are assumed to have the
+  // length equal to lmse->length).
+  // The caller is responsible for freeing memory associated with the
+  // returned WERD_CHOICE.
+  WERD_CHOICE *ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt,
+                             BlamerBundle *blamer_bundle, bool *truth_path);
+
+  // Wrapper around AssociateUtils::ComputeStats().
+  inline void ComputeAssociateStats(int col, int row, float max_char_wh_ratio,
+                                    ViterbiStateEntry *parent_vse, WERD_RES *word_res,
+                                    AssociateStats *associate_stats) {
+    AssociateUtils::ComputeStats(
+        col, row, (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
+        (parent_vse != nullptr) ? parent_vse->length : 0, fixed_pitch_, max_char_wh_ratio, word_res,
+        language_model_debug_level > 2, associate_stats);
+  }
+
+  // Returns true if the path with such top_choice_flags and dawg_info
+  // could be pruned out (i.e. is neither a system/user/frequent dictionary
+  // nor a top choice path).
+  // In non-space delimited languages all paths can be "somewhat" dictionary
+  // words. In such languages we cannot do dictionary-driven path pruning,
+  // so paths with non-empty dawg_info are considered prunable.
+  inline bool PrunablePath(const ViterbiStateEntry &vse) {
+    if (vse.top_choice_flags) {
+      return false;
+    }
+    if (vse.dawg_info != nullptr &&
+        (vse.dawg_info->permuter == SYSTEM_DAWG_PERM || vse.dawg_info->permuter == USER_DAWG_PERM ||
+         vse.dawg_info->permuter == FREQ_DAWG_PERM)) {
+      return false;
+    }
+    return true;
+  }
+
+  // Returns true if the given ViterbiStateEntry represents an acceptable path.
+  inline bool AcceptablePath(const ViterbiStateEntry &vse) {
+    return (vse.dawg_info != nullptr || vse.Consistent() ||
+            (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
+  }
+
+public:
+  // Parameters.
+  INT_VAR_H(language_model_debug_level);
+  BOOL_VAR_H(language_model_ngram_on);
+  INT_VAR_H(language_model_ngram_order);
+  INT_VAR_H(language_model_viterbi_list_max_num_prunable);
+  INT_VAR_H(language_model_viterbi_list_max_size);
+  double_VAR_H(language_model_ngram_small_prob);
+  double_VAR_H(language_model_ngram_nonmatch_score);
+  BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step);
+  double_VAR_H(language_model_ngram_scale_factor);
+  double_VAR_H(language_model_ngram_rating_factor);
+  BOOL_VAR_H(language_model_ngram_space_delimited_language);
+  INT_VAR_H(language_model_min_compound_length);
+  // Penalties used for adjusting path costs and final word rating.
+  double_VAR_H(language_model_penalty_non_freq_dict_word);
+  double_VAR_H(language_model_penalty_non_dict_word);
+  double_VAR_H(language_model_penalty_punc);
+  double_VAR_H(language_model_penalty_case);
+  double_VAR_H(language_model_penalty_script);
+  double_VAR_H(language_model_penalty_chartype);
+  double_VAR_H(language_model_penalty_font);
+  double_VAR_H(language_model_penalty_spacing);
+  double_VAR_H(language_model_penalty_increment);
+  INT_VAR_H(wordrec_display_segmentations);
+  BOOL_VAR_H(language_model_use_sigmoidal_certainty);
+
+protected:
+  // Member Variables.
+
+  // Temporary DawgArgs struct that is re-used across different words to
+  // avoid dynamic memory re-allocation (should be cleared before each use).
+  DawgArgs dawg_args_;
+  // Scaling for recovering blob outline length from rating and certainty.
+  float rating_cert_scale_ = 0.0f;
+
+  // The following variables are set at construction time.
+
+  // Pointer to fontinfo table (not owned by LanguageModel).
+  const UnicityTable<FontInfo> *fontinfo_table_ = nullptr;
+
+  // Pointer to Dict class, that is used for querying the dictionaries
+  // (the pointer is not owned by LanguageModel).
+  Dict *dict_ = nullptr;
+
+  // TODO(daria): the following variables should become LanguageModel params
+  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
+  //
+  // Set to true if we are dealing with fixed pitch text
+  // (set to assume_fixed_pitch_char_segment).
+  bool fixed_pitch_ = false;
+  // Max char width-to-height ratio allowed
+  // (set to segsearch_max_char_wh_ratio).
+  float max_char_wh_ratio_ = 0.0f;
+
+  // The following variables are initialized with InitForWord().
+
+  // String representation of the classification of the previous word
+  // (since this is only used by the character ngram model component,
+  // only the last language_model_ngram_order characters of the word are stored).
+  std::string prev_word_str_;
+  int prev_word_unichar_step_len_ = 0;
+  // Active dawg vector.
+  DawgPositionVector very_beginning_active_dawgs_; // includes continuation
+  DawgPositionVector beginning_active_dawgs_;
+  // Set to true if acceptable choice was discovered.
+  // Note: it would be nice to use this to terminate the search once an
+  // acceptable choice is found. However we do not do that and once an
+  // acceptable choice is found we finish looking for alternative choices
+  // in the current segmentation graph and then exit the search (no more
+  // classifications are done after an acceptable choice is found).
+  // This is needed in order to let the search find the words very close to
+  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
+  // choices. This way the stopper will know that the best choice is not
+  // ambiguous (i.e. there are best choices in the best choice list that have
+  // ratings close to the very best one) and will be less likely to mis-adapt.
+  bool acceptable_choice_found_ = false;
+  // Set to true if a choice representing correct segmentation was explored.
+  bool correct_segmentation_explored_ = false;
+
+  // Params models containing weights for computing ViterbiStateEntry costs.
+  ParamsModel params_model_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
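
A few of the declarations above are easier to absorb with small standalone sketches. First, the beam policy documented on UpdateState(): each LanguageModelState keeps a cost-ordered viterbi_state_entries list with dictionary-word paths at the front, and only a bounded number of prunable (non-dictionary) paths. The following is a schematic illustration of that policy, not code from this commit; EntrySketch and InsertBounded are made-up names:

```cpp
#include <algorithm>
#include <cstddef>
#include <list>

// Schematic illustration (not Tesseract code) of the documented beam policy:
// entries sit in cost order, dictionary-word paths are kept at the front and
// are never pruned, and at most max_num_prunable non-dictionary paths survive.
struct EntrySketch {
  float cost;
  bool is_dawg_path; // true for system/user/frequent dictionary paths
};

void InsertBounded(std::list<EntrySketch> &beam, const EntrySketch &e,
                   std::size_t max_num_prunable) {
  // Find the insertion point: dictionary paths first, then ascending cost.
  auto pos = std::find_if(beam.begin(), beam.end(), [&](const EntrySketch &x) {
    if (x.is_dawg_path != e.is_dawg_path) {
      // Insert a dawg path before the first non-dawg entry; skip past
      // dawg entries when inserting a non-dawg path.
      return !x.is_dawg_path;
    }
    return x.cost > e.cost;
  });
  beam.insert(pos, e);
  // Erase prunable entries beyond the limit; since the list is ordered,
  // the ones dropped are the costliest non-dictionary paths.
  std::size_t prunable = 0;
  for (auto it = beam.begin(); it != beam.end();) {
    if (!it->is_dawg_path && ++prunable > max_num_prunable) {
      it = beam.erase(it);
    } else {
      ++it;
    }
  }
}
```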
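Second, CertaintyScore() maps Tesseract's negative classifier certainties onto a positive score in one of two ways. A minimal self-contained sketch of both mappings; the certainty_scale of 20.0f stands in for dict_->certainty_scale and is an assumed demo value:

```cpp
#include <cmath>
#include <cstdio>
#include <initializer_list>

// Standalone version of the two mappings in LanguageModel::CertaintyScore().
static float CertaintyScoreSketch(float cert, bool use_sigmoid,
                                  float certainty_scale = 20.0f) {
  if (use_sigmoid) {
    // cert is expected in [-certainty_scale, 0]; a certainty of 0 maps to
    // 0.5 and strongly negative certainties approach 0.
    cert = -cert / certainty_scale;
    return 1.0f / (1.0f + std::exp(10.0f * cert));
  }
  // Reciprocal mapping used when sigmoidal certainty is disabled.
  return -1.0f / cert;
}

int main() {
  for (float cert : {-0.5f, -2.0f, -10.0f}) {
    std::printf("cert=%5.1f  sigmoid=%.5f  reciprocal=%.5f\n", cert,
                CertaintyScoreSketch(cert, true),
                CertaintyScoreSketch(cert, false));
  }
}
```

In both branches a certainty nearer zero (a more confident classification) yields a higher score, which is why the header warns that enabling the sigmoid requires retuning language_model_ngram_nonmatch_score.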
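Third, ComputeAdjustment() charges the full base penalty for the first inconsistency and only language_model_penalty_increment for each one after it, so penalties grow linearly rather than per-problem. A worked sketch with assumed values (base 0.1, increment 0.01; the real defaults live in language_model.cpp, not in this header):

```cpp
#include <cstdio>

// Mirrors LanguageModel::ComputeAdjustment(): zero problems cost nothing,
// the first costs the full base penalty, each further one adds the increment.
static float ComputeAdjustmentSketch(int num_problems, float penalty,
                                     float penalty_increment) {
  if (num_problems == 0) {
    return 0.0f;
  }
  if (num_problems == 1) {
    return penalty;
  }
  return penalty + penalty_increment * static_cast<float>(num_problems - 1);
}

int main() {
  const float penalty_case = 0.1f;       // assumed base penalty
  const float penalty_increment = 0.01f; // assumed increment
  for (int n = 0; n <= 3; ++n) {
    std::printf("%d problems -> adjustment %.2f\n", n,
                ComputeAdjustmentSketch(n, penalty_case, penalty_increment));
  }
  // Prints 0.00, 0.10, 0.11, 0.12.
}
```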
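Finally, ComputeNgramCost() combines the classifier and character-ngram probabilities as -(log(p_classifier) + log(p_ngram)). A toy sketch with made-up probabilities showing how an unlikely continuation inflates the path cost, which is what the language_model_ngram_small_prob threshold guards against:

```cpp
#include <cmath>
#include <cstdio>

// Sketch of the cost formula documented on ComputeNgramCost():
// cost = -(log(p_classifier) + log(p_ngram)). Both probabilities here are
// invented; in Tesseract they come from the shape classifier and from the
// Dict character-ngram model respectively.
static float NgramCostSketch(float classifier_prob, float ngram_prob) {
  return -(std::log(classifier_prob) + std::log(ngram_prob));
}

int main() {
  // A confident classification in a plausible context stays cheap...
  std::printf("p_cls=0.9  p_ngram=0.3   -> cost %.3f\n",
              NgramCostSketch(0.9f, 0.3f));
  // ...while an improbable continuation drives the cost up sharply.
  std::printf("p_cls=0.9  p_ngram=0.001 -> cost %.3f\n",
              NgramCostSketch(0.9f, 0.001f));
}
```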
