comparison mupdf-source/thirdparty/tesseract/src/wordrec/lm_state.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: lm_state.h
3 // Description: Structures and functionality for capturing the state of
4 // segmentation search guided by the language model.
5 // Author: Rika Antonova
6 //
7 // (C) Copyright 2012, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19
20 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
21 #define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
22
#include <tesseract/unichar.h> // for UNICHAR_ID

#include "associate.h"      // for AssociateStats
#include "dawg.h"           // for DawgPositionVector
#include "elst.h"           // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
#include "lm_consistency.h" // for LMConsistencyInfo
#include "ratngs.h"         // for BLOB_CHOICE, PermuterType
#include "stopper.h"        // for DANGERR
#include "unicharset.h"     // for UNICHARSET

#include <cfloat> // for FLT_MAX
#include <limits> // for std::numeric_limits
#include <string> // for std::string
#include <vector> // for std::vector
31
32 namespace tesseract {
33
/// Used for expressing various language model flags.
/// NOTE(review): stored per path as ViterbiStateEntry::top_choice_flags;
/// presumably kept as unsigned char to keep each entry compact — confirm
/// no user needs more than 8 flag bits before widening.
using LanguageModelFlagsType = unsigned char;
36
37 /// The following structs are used for storing the state of the language model
38 /// in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs
39 /// and the links are the relationships between the underlying blobs (see
40 /// segsearch.h for a more detailed description).
41 ///
42 /// Each of the BLOB_CHOICEs contains LanguageModelState struct, which has
43 /// a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi
44 /// search leading up to and including this BLOB_CHOICE.
45 ///
46 /// Each ViterbiStateEntry contains information from various components of the
47 /// language model: dawgs in which the path is found, character ngram model
48 /// probability of the path, script/chartype/font consistency info, state for
49 /// language-specific heuristics (e.g. hyphenated and compound words,
50 /// lower/upper case preferences, etc).
51 ///
52 /// Each ViterbiStateEntry also contains the parent pointer, so that the path
53 /// that it represents (WERD_CHOICE) can be constructed by following these
54 /// parent pointers.
55
56 /// Struct for storing additional information used by Dawg language model
57 /// component. It stores the set of active dawgs in which the sequence of
58 /// letters on a path can be found.
59 struct LanguageModelDawgInfo {
60 LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt)
61 : active_dawgs(*a), permuter(pt) {}
62 DawgPositionVector active_dawgs;
63 PermuterType permuter;
64 };
65
/// Ngram-component bookkeeping attached to one search path: the textual
/// context, its length, pruning status, and the ngram-model costs.
struct LanguageModelNgramInfo {
  LanguageModelNgramInfo(const char *ctx, int ctx_step_len, bool is_pruned, float cost_ngram,
                         float cost_combined)
      : context(ctx)
      , context_unichar_step_len(ctx_step_len)
      , pruned(is_pruned)
      , ngram_cost(cost_ngram)
      , ngram_and_classifier_cost(cost_combined) {}
  std::string context; ///< context string
  /// Length of the context measured by advancing using UNICHAR::utf8_step()
  /// (should be at most the order of the character ngram model used).
  int context_unichar_step_len;
  /// Paths marked pruned are cut off from the character ngram model's point
  /// of view, but are still explored because they represent a dictionary
  /// match or a top choice; ngram_info is therefore still computed for them
  /// so the combined cost can be calculated.
  bool pruned;
  /// -ln(P_ngram_model(path))
  float ngram_cost;
  /// -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
  float ngram_and_classifier_cost;
};
89
90 /// Struct for storing the information about a path in the segmentation graph
91 /// explored by Viterbi search.
/// Struct for storing the information about a path in the segmentation graph
/// explored by Viterbi search.
///
/// Entries live in the list held by a BLOB_CHOICE's LanguageModelState; the
/// chain of parent_vse pointers reconstructs the full path (WERD_CHOICE)
/// ending at that choice.
struct ViterbiStateEntry : public ELIST_LINK {
  /// Builds an entry extending parent path pe (nullptr for a path start)
  /// with blob choice b at cost c and outline length ol.
  ///
  /// Takes ownership of d (dawg info) and n (ngram info); both are deleted
  /// by the destructor. The path aggregates (ratings_sum, min_certainty,
  /// adapted, length, outline_length) are seeded from b and then folded
  /// together with the parent's totals. debug_str is allocated — and the
  /// parent's debug string plus debug_uch appended to it — only when
  /// debug_uch is non-null.
  ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol,
                    const LMConsistencyInfo &ci, const AssociateStats &as,
                    LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n,
                    const char *debug_uch)
      : curr_b(b)
      , parent_vse(pe)
      , competing_vse(nullptr)
      , dawg_info(d)
      , ngram_info(n)
      , cost(c)
      , ratings_sum(b->rating())
      , min_certainty(b->certainty())
      , adapted(b->IsAdapted())
      , length(1)
      , outline_length(ol)
      , consistency_info(ci)
      , associate_stats(as)
      , top_choice_flags(tcf)
      , updated(true) {
    debug_str = (debug_uch == nullptr) ? nullptr : new std::string();
    if (pe != nullptr) {
      // Fold the parent's path totals into this entry's aggregates.
      ratings_sum += pe->ratings_sum;
      if (pe->min_certainty < min_certainty) {
        min_certainty = pe->min_certainty;
      }
      adapted += pe->adapted;
      length += pe->length;
      outline_length += pe->outline_length;
      if (debug_uch != nullptr) {
        // NOTE(review): assumes the parent also carries a debug string,
        // i.e. debugging was enabled for the whole path — verify callers
        // never mix debug and non-debug entries on one path.
        *debug_str += *(pe->debug_str);
      }
    }
    if (debug_str != nullptr && debug_uch != nullptr) {
      *debug_str += debug_uch;
    }
  }
  /// Releases the owned dawg/ngram info and debug string; curr_b,
  /// parent_vse and competing_vse are non-owning and left alone.
  ~ViterbiStateEntry() {
    delete dawg_info;
    delete ngram_info;
    delete debug_str;
  }
  /// qsort-style comparator on cost for sorting ViterbiStateEntry_LISTs:
  /// returns -1 when e1 is cheaper, otherwise 1 — so under a conventional
  /// qsort the cheapest entry sorts first. It never returns 0; entries with
  /// equal cost keep an arbitrary relative order.
  /// NOTE(review): the original comment described the result as
  /// "non-increasing order of costs", which contradicts the -1-when-smaller
  /// convention — confirm against ELIST::sort's contract.
  static int Compare(const void *e1, const void *e2) {
    const ViterbiStateEntry *ve1 = *static_cast<const ViterbiStateEntry *const *>(e1);
    const ViterbiStateEntry *ve2 = *static_cast<const ViterbiStateEntry *const *>(e2);
    return (ve1->cost < ve2->cost) ? -1 : 1;
  }
  /// Returns true if the path is consistent: case inconsistencies are
  /// forgiven when the path is a dictionary (dawg) match, otherwise
  /// consistency_info decides.
  inline bool Consistent() const {
    if (dawg_info != nullptr && consistency_info.NumInconsistentCase() == 0) {
      return true;
    }
    return consistency_info.Consistent();
  }
  /// Returns true if this VSE has an alphanumeric character as its classifier
  /// result (false when there is no current blob choice).
  bool HasAlnumChoice(const UNICHARSET &unicharset) {
    if (curr_b == nullptr) {
      return false;
    }
    UNICHAR_ID unichar_id = curr_b->unichar_id();
    if (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id)) {
      return true;
    }
    return false;
  }
  /// Prints this entry prefixed by msg (defined in the .cpp).
  void Print(const char *msg) const;

  /// Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
  BLOB_CHOICE *curr_b;
  ViterbiStateEntry *parent_vse;
  /// Pointer to a case-competing ViterbiStateEntry in the same list that
  /// represents a path ending in the same letter of the opposite case
  /// (not owned by this).
  ViterbiStateEntry *competing_vse;

  /// Extra information maintained by Dawg language model component
  /// (owned by ViterbiStateEntry).
  LanguageModelDawgInfo *dawg_info;

  /// Extra information maintained by Ngram language model component
  /// (owned by ViterbiStateEntry).
  LanguageModelNgramInfo *ngram_info;

  /// UTF8 string representing the path corresponding to this vse.
  /// Populated only in when language_model_debug_level > 0
  /// (owned by ViterbiStateEntry; nullptr otherwise).
  std::string *debug_str;

  /// The cost is an adjusted ratings sum, that is adjusted by all the language
  /// model components that use Viterbi search.
  float cost;

  /// Various information about the characters on the path represented
  /// by this ViterbiStateEntry.
  float ratings_sum; ///< sum of ratings of character on the path
  float min_certainty; ///< minimum certainty on the path
  int adapted; ///< number of BLOB_CHOICES from adapted templates
  int length; ///< number of characters on the path
  float outline_length; ///< length of the outline so far
  LMConsistencyInfo consistency_info; ///< path consistency info
  AssociateStats associate_stats; ///< character widths/gaps/seams

  /// Flags for marking the entry as a top choice path with
  /// the smallest rating or lower/upper case letters).
  LanguageModelFlagsType top_choice_flags;

  bool updated; ///< set to true if the entry has just been created/updated
};

// ELIST boilerplate: declares ViterbiStateEntry_LIST and ViterbiStateEntry_IT.
ELISTIZEH(ViterbiStateEntry)
202
203 /// Struct to store information maintained by various language model components.
204 struct LanguageModelState {
205 LanguageModelState()
206 : viterbi_state_entries_prunable_length(0)
207 , viterbi_state_entries_prunable_max_cost(FLT_MAX)
208 , viterbi_state_entries_length(0) {}
209 ~LanguageModelState() = default;
210
211 /// Clears the viterbi search state back to its initial conditions.
212 void Clear();
213
214 void Print(const char *msg);
215
216 /// Storage for the Viterbi state.
217 ViterbiStateEntry_LIST viterbi_state_entries;
218 /// Number and max cost of prunable paths in viterbi_state_entries.
219 int viterbi_state_entries_prunable_length;
220 float viterbi_state_entries_prunable_max_cost;
221 /// Total number of entries in viterbi_state_entries.
222 int viterbi_state_entries_length;
223 };
224
225 /// Bundle together all the things pertaining to the best choice/state.
226 struct BestChoiceBundle {
227 explicit BestChoiceBundle(int matrix_dimension) : updated(false), best_vse(nullptr) {
228 beam.reserve(matrix_dimension);
229 for (int i = 0; i < matrix_dimension; ++i) {
230 beam.push_back(new LanguageModelState);
231 }
232 }
233 ~BestChoiceBundle() {
234 for (auto &state : beam) {
235 delete state;
236 }
237 }
238
239 /// Flag to indicate whether anything was changed.
240 bool updated;
241 /// Places to try to fix the word suggested by ambiguity checking.
242 DANGERR fixpt;
243 /// The beam. One LanguageModelState containing a list of ViterbiStateEntry
244 /// per row in the ratings matrix containing all VSEs whose BLOB_CHOICE is
245 /// somewhere in the corresponding row.
246 std::vector<LanguageModelState *> beam;
247 /// Best ViterbiStateEntry and BLOB_CHOICE.
248 ViterbiStateEntry *best_vse;
249 };
250
251 } // namespace tesseract
252
253 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_