diff mupdf-source/thirdparty/tesseract/src/wordrec/lm_state.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/wordrec/lm_state.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,253 @@
+///////////////////////////////////////////////////////////////////////
+// File:        lm_state.h
+// Description: Structures and functionality for capturing the state of
+//              segmentation search guided by the language model.
+// Author:      Rika Antonova
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
+#define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
+
+#include <tesseract/unichar.h> // for UNICHAR_ID
+#include "associate.h"      // for AssociateStats
+#include "dawg.h"           // for DawgPositionVector
+#include "elst.h"           // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
+#include "lm_consistency.h" // for LMConsistencyInfo
+#include "ratngs.h"         // for BLOB_CHOICE, PermuterType
+#include "stopper.h"        // for DANGERR
+#include "unicharset.h"     // for UNICHARSET
+
+namespace tesseract {
+
+/// Used for expressing various language model flags.
+using LanguageModelFlagsType = unsigned char;
+
+/// The following structs are used for storing the state of the language model
+/// in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs
+/// and the links are the relationships between the underlying blobs (see
+/// segsearch.h for a more detailed description).
+///
+/// Each of the BLOB_CHOICEs contains LanguageModelState struct, which has
+/// a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi
+/// search leading up to and including this BLOB_CHOICE.
+///
+/// Each ViterbiStateEntry contains information from various components of the
+/// language model: dawgs in which the path is found, character ngram model
+/// probability of the path, script/chartype/font consistency info, state for
+/// language-specific heuristics (e.g. hyphenated and compound words,
+/// lower/upper case preferences, etc).
+///
+/// Each ViterbiStateEntry also contains the parent pointer, so that the path
+/// that it represents (WERD_CHOICE) can be constructed by following these
+/// parent pointers.
+
+/// Struct for storing additional information used by Dawg language model
+/// component. It stores the set of active dawgs in which the sequence of
+/// letters on a path can be found.
+struct LanguageModelDawgInfo {
+  LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt)
+      : active_dawgs(*a), permuter(pt) {}
+  DawgPositionVector active_dawgs;
+  PermuterType permuter;
+};
+
+/// Struct for storing additional information used by Ngram language model
+/// component.
+struct LanguageModelNgramInfo {
+  LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc)
+      : context(c)
+      , context_unichar_step_len(l)
+      , pruned(p)
+      , ngram_cost(nc)
+      , ngram_and_classifier_cost(ncc) {}
+  std::string context; ///< context string
+  /// Length of the context measured by advancing using UNICHAR::utf8_step()
+  /// (should be at most the order of the character ngram model used).
+  int context_unichar_step_len;
+  /// The paths with pruned set are pruned out from the perspective of the
+  /// character ngram model. They are explored further because they represent
+  /// a dictionary match or a top choice. Thus ngram_info is still computed
+  /// for them in order to calculate the combined cost.
+  bool pruned;
+  /// -ln(P_ngram_model(path))
+  float ngram_cost;
+  /// -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
+  float ngram_and_classifier_cost;
+};
+
+/// Struct for storing the information about a path in the segmentation graph
+/// explored by Viterbi search.
+struct ViterbiStateEntry : public ELIST_LINK {
+  ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol,
+                    const LMConsistencyInfo &ci, const AssociateStats &as,
+                    LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n,
+                    const char *debug_uch)
+      : curr_b(b)
+      , parent_vse(pe)
+      , competing_vse(nullptr)
+      , dawg_info(d)
+      , ngram_info(n)
+      , cost(c)
+      , ratings_sum(b->rating())
+      , min_certainty(b->certainty())
+      , adapted(b->IsAdapted())
+      , length(1)
+      , outline_length(ol)
+      , consistency_info(ci)
+      , associate_stats(as)
+      , top_choice_flags(tcf)
+      , updated(true) {
+    debug_str = (debug_uch == nullptr) ? nullptr : new std::string();
+    if (pe != nullptr) {
+      ratings_sum += pe->ratings_sum;
+      if (pe->min_certainty < min_certainty) {
+        min_certainty = pe->min_certainty;
+      }
+      adapted += pe->adapted;
+      length += pe->length;
+      outline_length += pe->outline_length;
+      if (debug_uch != nullptr) {
+        *debug_str += *(pe->debug_str);
+      }
+    }
+    if (debug_str != nullptr && debug_uch != nullptr) {
+      *debug_str += debug_uch;
+    }
+  }
+  ~ViterbiStateEntry() {
+    delete dawg_info;
+    delete ngram_info;
+    delete debug_str;
+  }
+  /// Comparator function for sorting ViterbiStateEntry_LISTs in
+  /// non-increasing order of costs.
+  static int Compare(const void *e1, const void *e2) {
+    const ViterbiStateEntry *ve1 = *static_cast<const ViterbiStateEntry *const *>(e1);
+    const ViterbiStateEntry *ve2 = *static_cast<const ViterbiStateEntry *const *>(e2);
+    return (ve1->cost < ve2->cost) ? -1 : 1;
+  }
+  inline bool Consistent() const {
+    if (dawg_info != nullptr && consistency_info.NumInconsistentCase() == 0) {
+      return true;
+    }
+    return consistency_info.Consistent();
+  }
+  /// Returns true if this VSE has an alphanumeric character as its classifier
+  /// result.
+  bool HasAlnumChoice(const UNICHARSET &unicharset) {
+    if (curr_b == nullptr) {
+      return false;
+    }
+    UNICHAR_ID unichar_id = curr_b->unichar_id();
+    if (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id)) {
+      return true;
+    }
+    return false;
+  }
+  void Print(const char *msg) const;
+
+  /// Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
+  BLOB_CHOICE *curr_b;
+  ViterbiStateEntry *parent_vse;
+  /// Pointer to a case-competing ViterbiStateEntry in the same list that
+  /// represents a path ending in the same letter of the opposite case.
+  ViterbiStateEntry *competing_vse;
+
+  /// Extra information maintained by Dawg language model component
+  /// (owned by ViterbiStateEntry).
+  LanguageModelDawgInfo *dawg_info;
+
+  /// Extra information maintained by Ngram language model component
+  /// (owned by ViterbiStateEntry).
+  LanguageModelNgramInfo *ngram_info;
+
+  /// UTF8 string representing the path corresponding to this vse.
+  /// Populated only in when language_model_debug_level > 0.
+  std::string *debug_str;
+
+  /// The cost is an adjusted ratings sum, that is adjusted by all the language
+  /// model components that use Viterbi search.
+  float cost;
+
+  /// Various information about the characters on the path represented
+  /// by this ViterbiStateEntry.
+  float ratings_sum;    ///< sum of ratings of character on the path
+  float min_certainty;  ///< minimum certainty on the path
+  int adapted;          ///< number of BLOB_CHOICES from adapted templates
+  int length;           ///< number of characters on the path
+  float outline_length; ///< length of the outline so far
+  LMConsistencyInfo consistency_info; ///< path consistency info
+  AssociateStats associate_stats;     ///< character widths/gaps/seams
+
+  /// Flags for marking the entry as a top choice path with
+  /// the smallest rating or lower/upper case letters).
+  LanguageModelFlagsType top_choice_flags;
+
+  bool updated; ///< set to true if the entry has just been created/updated
+};
+
+ELISTIZEH(ViterbiStateEntry)
+
+/// Struct to store information maintained by various language model components.
+struct LanguageModelState {
+  LanguageModelState()
+      : viterbi_state_entries_prunable_length(0)
+      , viterbi_state_entries_prunable_max_cost(FLT_MAX)
+      , viterbi_state_entries_length(0) {}
+  ~LanguageModelState() = default;
+
+  /// Clears the viterbi search state back to its initial conditions.
+  void Clear();
+
+  void Print(const char *msg);
+
+  /// Storage for the Viterbi state.
+  ViterbiStateEntry_LIST viterbi_state_entries;
+  /// Number and max cost of prunable paths in viterbi_state_entries.
+  int viterbi_state_entries_prunable_length;
+  float viterbi_state_entries_prunable_max_cost;
+  /// Total number of entries in viterbi_state_entries.
+  int viterbi_state_entries_length;
+};
+
+/// Bundle together all the things pertaining to the best choice/state.
+struct BestChoiceBundle {
+  explicit BestChoiceBundle(int matrix_dimension) : updated(false), best_vse(nullptr) {
+    beam.reserve(matrix_dimension);
+    for (int i = 0; i < matrix_dimension; ++i) {
+      beam.push_back(new LanguageModelState);
+    }
+  }
+  ~BestChoiceBundle() {
+    for (auto &state : beam) {
+      delete state;
+    }
+  }
+
+  /// Flag to indicate whether anything was changed.
+  bool updated;
+  /// Places to try to fix the word suggested by ambiguity checking.
+  DANGERR fixpt;
+  /// The beam. One LanguageModelState containing a list of ViterbiStateEntry
+  /// per row in the ratings matrix containing all VSEs whose BLOB_CHOICE is
+  /// somewhere in the corresponding row.
+  std::vector<LanguageModelState *> beam;
+  /// Best ViterbiStateEntry and BLOB_CHOICE.
+  ViterbiStateEntry *best_vse;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
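
The header's comments describe how each ViterbiStateEntry stores only its own BLOB_CHOICE plus a `parent_vse` pointer, so the word a path represents is recovered by walking the parent chain back to the start, while cumulative fields such as `ratings_sum` and `length` are accumulated in the constructor. The standalone sketch below is not part of this changeset and uses no Tesseract API; `MiniEntry` and `CollectPath` are hypothetical, simplified stand-ins meant only to illustrate that parent-pointer mechanism.

```cpp
// Illustrative sketch only: a simplified stand-in for ViterbiStateEntry showing
// how a path is reconstructed by following parent pointers and how cumulative
// fields (ratings_sum, length) grow along the chain, mirroring the real ctor.
#include <iostream>
#include <string>
#include <vector>

struct MiniEntry {
  MiniEntry *parent;   // analogous to ViterbiStateEntry::parent_vse
  std::string unichar; // analogous to the character of curr_b
  float rating;        // analogous to curr_b->rating()
  float ratings_sum;   // cumulative sum along the path
  int length;          // number of characters on the path so far

  MiniEntry(MiniEntry *pe, std::string uc, float r)
      : parent(pe), unichar(std::move(uc)), rating(r), ratings_sum(r), length(1) {
    if (pe != nullptr) { // accumulate from the parent, as the real constructor does
      ratings_sum += pe->ratings_sum;
      length += pe->length;
    }
  }
};

// Walks parent pointers back to the first entry and returns the characters
// in reading order (the chain itself is stored last-character-first).
std::string CollectPath(const MiniEntry *last) {
  std::vector<std::string> pieces;
  for (const MiniEntry *e = last; e != nullptr; e = e->parent) {
    pieces.push_back(e->unichar);
  }
  std::string word;
  for (auto it = pieces.rbegin(); it != pieces.rend(); ++it) {
    word += *it;
  }
  return word;
}

int main() {
  MiniEntry c(nullptr, "c", 1.0f);
  MiniEntry a(&c, "a", 0.5f);
  MiniEntry t(&a, "t", 0.8f);
  std::cout << CollectPath(&t) << " ratings_sum=" << t.ratings_sum
            << " length=" << t.length << "\n"; // prints: cat ratings_sum=2.3 length=3
  return 0;
}
```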
