Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/dict/dict.h comparison

comparison mupdf-source/thirdparty/tesseract/src/dict/dict.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+///////////////////////////////////////////////////////////////////////
+// File:        dict.h
+// Description: dict class.
+// Author:      Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+#ifndef TESSERACT_DICT_DICT_H_
+#define TESSERACT_DICT_DICT_H_
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "ambigs.h"
+#endif
+#include "dawg.h"
+#include "dawg_cache.h"
+#include "ratngs.h"
+#include "stopper.h"
+#include "trie.h"
+#include "unicharset.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "params_training_featdef.h"
+#endif // ndef DISABLED_LEGACY_ENGINE
+namespace tesseract {
+class MATRIX;
+class WERD_RES;
+#define CHARS_PER_LINE 500
+#define MAX_WERD_LENGTH (int64_t)128
+#define NO_RATING -1
+/** Struct used to hold temporary information about fragments. */
+struct CHAR_FRAGMENT_INFO {
+// Plain record passed between the permuter helpers declared in Dict
+// (e.g. as prev_char_frag_info / char_frag_info of fragment_state_okay()).
+// Unichar id this record describes.
+UNICHAR_ID unichar_id;
+// Fragment descriptor for the unichar; this struct declares no destructor,
+// so the pointer is presumably non-owning -- TODO confirm.
+const CHAR_FRAGMENT *fragment;
+// NOTE(review): presumably the number of fragments accumulated so far --
+// confirm against permdawg.cpp.
+int num_fragments;
+// Rating and certainty carried along with the fragment state.
+float rating;
+float certainty;
+};
+using DawgVector = std::vector<Dawg *>;
+//
+// Constants
+//
+// NOTE(review): the use of kRatingPad is not visible in this header.
+static const int kRatingPad = 4;
+static const int kDictMaxWildcards = 2; // max wildcards for a word
+// TODO(daria): If hyphens are different in different languages and can be
+// inferred from training data we should load their values dynamically.
+// Symbol strings; Dict keeps a cached UNICHAR_ID for each of them
+// (hyphen_unichar_id_, slash_unichar_id_, question_unichar_id_,
+// apostrophe_unichar_id_).
+static const char kHyphenSymbol[] = "-";
+static const char kSlashSymbol[] = "/";
+static const char kQuestionSymbol[] = "?";
+static const char kApostropheSymbol[] = "'";
+static const float kSimCertaintyScale = -10.0;  // similarity matcher scaling
+static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
+static const float kSimilarityFloor = 100.0;    // worst E*L product to stop on
+// NOTE(review): presumably the longest run of repeated characters accepted
+// for a document-dictionary word -- confirm in dict.cpp.
+static const int kDocDictMaxRepChars = 4;
+// Enum for describing whether the x-height for the word is consistent:
+//  0 - everything is good.
+//  1 - there are one or two secondary (but consistent) baselines
+//      [think subscript and superscript], or there is an oversized
+//      first character.
+//  2 - the word is inconsistent.
+enum XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT };
+struct DawgArgs {
+// Bundles the dawg position vectors and the permuter type handed to the
+// letter-checking functions; valid_end always starts out false.
+DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
+: active_dawgs{d}, updated_dawgs{up}, permuter{p}, valid_end{false} {}
+// Dawg positions supplied by the caller for the current letter.
+DawgPositionVector *active_dawgs;
+// Dawg positions filled in for the next letter (see the Output note on
+// Dict::def_letter_is_okay).
+DawgPositionVector *updated_dawgs;
+PermuterType permuter;
+// Set when the current position is a valid end of a word.
+bool valid_end;
+};
+class TESS_API Dict {
+public:
+Dict(CCUtil *image_ptr);
+~Dict();
+// Read-only access to the CCUtil pointer stored in ccutil_.
+const CCUtil *getCCUtil() const { return ccutil_; }
+// Mutable access to the same CCUtil pointer.
+CCUtil *getCCUtil() { return ccutil_; }
+// Read-only view of the unicharset member of the CCUtil object.
+const UNICHARSET &getUnicharset() const {
+const CCUtil *util = getCCUtil();
+return util->unicharset;
+}
+// Mutable view of the same unicharset member.
+UNICHARSET &getUnicharset() {
+CCUtil *util = getCCUtil();
+return util->unicharset;
+}
+#ifndef DISABLED_LEGACY_ENGINE
+const UnicharAmbigs &getUnicharAmbigs() const {
+// Compiled only when DISABLED_LEGACY_ENGINE is not defined.
+const CCUtil *util = getCCUtil();
+return util->unichar_ambigs;
+}
+#endif
+// Returns true if unichar_id is a word compounding character like - or /.
+inline bool compound_marker(UNICHAR_ID unichar_id) {
+const UNICHARSET &unicharset = getUnicharset();
+ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+const auto &normalized = unicharset.normed_ids(unichar_id);
+// Only a unichar that normalizes to exactly one id can be a marker.
+if (normalized.size() != 1) {
+return false;
+}
+const auto &only_id = normalized[0];
+return only_id == hyphen_unichar_id_ || only_id == slash_unichar_id_;
+}
+// Returns true if unichar_id is an apostrophe-like character that may
+// separate prefix/suffix words from a main body word.
+inline bool is_apostrophe(UNICHAR_ID unichar_id) {
+const UNICHARSET &unicharset = getUnicharset();
+ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+const auto &normalized = unicharset.normed_ids(unichar_id);
+// The unichar must normalize to the single cached apostrophe id.
+if (normalized.size() != 1) {
+return false;
+}
+return normalized[0] == apostrophe_unichar_id_;
+}
+/* hyphen.cpp ************************************************************/
+/// Returns true if we've recorded the beginning of a hyphenated word.
+inline bool hyphenated() const {
+// No hyphenated word is reported while last_word_on_line_ is set.
+if (last_word_on_line_) {
+return false;
+}
+return hyphen_word_ != nullptr;
+}
+/// Size of the base word (the part on the line before) of a hyphenated word.
+inline int hyphen_base_size() const {
+// Without a recorded hyphenated word there is no base part.
+if (!hyphenated()) {
+return 0;
+}
+return hyphen_word_->length();
+}
+/// If this word is hyphenated copy the base word (the part on
+/// the line before) of a hyphenated word into the given word.
+/// This function assumes that word is not nullptr.
+inline void copy_hyphen_info(WERD_CHOICE *word) const {
+// Nothing to copy unless a hyphenated base word has been recorded.
+if (!hyphenated()) {
+return;
+}
+*word = *hyphen_word_;
+if (hyphen_debug_level) {
+word->print("copy_hyphen_info: ");
+}
+}
+/// Check whether the word has a hyphen at the end.
+inline bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id,
+bool first_pos) const {
+// Only a non-initial unichar of the last word on the line can qualify.
+const bool candidate = last_word_on_line_ && !first_pos;
+if (!candidate) {
+return false;
+}
+ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
+const auto &normalized = unicharset->normed_ids(unichar_id);
+if (normalized.size() != 1) {
+return false;
+}
+return normalized[0] == hyphen_unichar_id_;
+}
+/// Same as above, but check the unichar at the end of the word.
+inline bool has_hyphen_end(const WERD_CHOICE &word) const {
+// An empty word has no last unichar: bail out before unichar_id() would be
+// asked for index -1 (the argument is evaluated before any guard in the
+// three-argument overload can run).
+if (word.length() == 0) {
+return false;
+}
+const int last_index = static_cast<int>(word.length()) - 1;
+// first_pos is true when the word consists of a single unichar.
+return has_hyphen_end(word.unicharset(), word.unichar_id(last_index), last_index == 0);
+}
+/// Unless the previous word was the last one on the line, and the current
+/// one is not (thus it is the first one on the line), erase hyphen_word_,
+/// clear hyphen_active_dawgs_, update last_word_on_line_.
+void reset_hyphen_vars(bool last_word_on_line);
+/// Update hyphen_word_, and copy the given DawgPositionVectors into
+/// hyphen_active_dawgs_ .
+void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs);
+/* permdawg.cpp ************************************************************/
+// Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
+// When this function is refactored, permdawg.cpp can be removed.
+/// Copies word into best_choice if its rating is smaller
+/// than that of best_choice.
+inline void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
+// Replace the stored choice only when the candidate's rating is strictly smaller.
+const bool improves = word.rating() < best_choice->rating();
+if (!improves) {
+return;
+}
+*best_choice = word;
+}
+/// Fill the given active_dawgs vector with dawgs that could contain the
+/// beginning of the word. If hyphenated() returns true, copy the entries
+/// from hyphen_active_dawgs_ instead.
+void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const;
+// Fill the given vector with the default collection of any-length dawgs
+void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const;
+/// Recursively explore all the possible character combinations in
+/// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
+/// dawgs in the dawgs_ vector in parallel and discard invalid words.
+///
+/// Allocate and return a WERD_CHOICE with the best valid word found.
+WERD_CHOICE *dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
+float rating_limit);
+/// If the choice being composed so far could be a dictionary word
+/// and we have not reached the end of the word keep exploring the
+/// char_choices further.
+void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
+WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args);
+/// Pointer to go_deeper function.
+void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+bool word_ending, WERD_CHOICE *word, float certainties[],
+float *limit, WERD_CHOICE *best_choice, int *attempts_left,
+void *void_more_args);
+//
+// Helper functions for dawg_permute_and_select().
+//
+void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+WERD_CHOICE *word, float certainties[], float *limit,
+WERD_CHOICE *best_choice, int *attempts_left, void *more_args);
+void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+const BLOB_CHOICE &blob_choice, int char_choice_index,
+const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
+float certainties[], float *limit, WERD_CHOICE *best_choice,
+int *attempts_left, void *more_args);
+bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
+const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
+int word_ending, CHAR_FRAGMENT_INFO *char_frag_info);
+/* stopper.cpp *************************************************************/
+#if !defined(DISABLED_LEGACY_ENGINE)
+bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable,
+MATRIX *ratings);
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+// Replaces the corresponding wrong ngram in werd_choice with the correct
+// one. The whole correct n-gram is inserted into the ratings matrix and
+// the werd_choice: no more fragments!. Rating and certainty of new entries
+// in matrix and werd_choice are the sum and mean of the wrong ngram
+// respectively.
+// E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
+// mystring", with a new entry in the ratings matrix for ".
+void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id,
+WERD_CHOICE *werd_choice, MATRIX *ratings);
+/// Returns the length of the shortest alpha run in WordChoice.
+int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
+/// Returns true if the certainty of the BestChoice word is within a
+/// reasonable range of the average certainties for the best choices for
+/// each character in the segmentation.  This test is used to catch words
+/// in which one character is much worse than the other characters in the
+/// word (i.e. false will be returned in that case). The algorithm computes
+/// the mean and std deviation of the certainties in the word with the worst
+/// certainty thrown out.
+int UniformCertainties(const WERD_CHOICE &word);
+/// Returns true if the given best_choice is good enough to stop.
+bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency);
+/// Returns false if the best choice for the current word is questionable
+/// and should be tried again on the second pass or should be flagged to
+/// the user.
+bool AcceptableResult(WERD_RES *word) const;
+#if !defined(DISABLED_LEGACY_ENGINE)
+void EndDangerousAmbigs();
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+/// Prints the current choices for this word to stdout.
+void DebugWordChoices();
+/// Sets up stopper variables in preparation for the first pass.
+void SetupStopperPass1();
+/// Sets up stopper variables in preparation for the second pass.
+void SetupStopperPass2();
+/* context.cpp *************************************************************/
+/// Check a string to see if it matches a set of lexical rules.
+int case_ok(const WERD_CHOICE &word) const;
+/// Returns true if the word looks like an absolute garbage
+/// (e.g. image mistakenly recognized as text).
+bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
+/* dict.cpp ****************************************************************/
+/// Initialize Dict class - load dawgs from [lang].traineddata and
+/// user-specified wordlist and pattern list.
+static DawgCache *GlobalDawgCache();
+// Sets up ready for a Load or LoadLSTM.
+void SetupForLoad(DawgCache *dawg_cache);
+// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
+void Load(const std::string &lang, TessdataManager *data_file);
+// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
+void LoadLSTM(const std::string &lang, TessdataManager *data_file);
+// Completes the loading process after Load() and/or LoadLSTM().
+// Returns false if no dictionaries were loaded.
+bool FinishLoad();
+void End();
+// Resets the document dictionary analogous to ResetAdaptiveClassifier.
+void ResetDocumentDictionary() {
+// Either trie may be absent; clear whichever one exists, pending words first.
+if (pending_words_) {
+pending_words_->clear();
+}
+if (document_words_) {
+document_words_->clear();
+}
+}
+/**
+* Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
+* of the current state the letter at word_index in the given word
+* is allowed according to at least one of the dawgs in dawgs_,
+* otherwise returns NO_PERM.
+*
+* The state is described by void_dawg_args, which are interpreted as
+* DawgArgs and contain relevant active dawg positions.
+* Each entry in the active_dawgs vector contains an index
+* into the dawgs_ vector and an EDGE_REF that indicates the last edge
+* followed in the dawg.  It also may contain a position in the punctuation
+* dawg which describes surrounding punctuation (see struct DawgPosition).
+*
+* Input:
+* At word_index 0 dawg_args->active_dawgs should contain an entry for each
+* dawg that may start at the beginning of a word, with punc_ref and edge_ref
+* initialized to NO_EDGE.  Since the punctuation dawg includes the empty
+* pattern " " (meaning anything without surrounding punctuation), having a
+* single entry for the punctuation dawg will cover all dawgs reachable
+* there from -- that includes all number and word dawgs. The only dawg
+* non-reachable from the punctuation_dawg is the pattern dawg.
+* If hyphen state needs to be applied, initial dawg_args->active_dawgs can
+* be copied from the saved hyphen state (maintained by Dict).
+* For word_index > 0 the corresponding state (active_dawgs and punc position)
+* can be obtained from dawg_args->updated_dawgs passed to
+* def_letter_is_okay for word_index-1.
+* Note: the function assumes that active_dawgs, and updated_dawgs
+* member variables of dawg_args are not nullptr.
+*
+* Output:
+* The function fills in dawg_args->updated_dawgs vector with the
+* entries for dawgs that contain the word up to the letter at word_index.
+*
+*/
+//
+int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
+bool word_end) const;
+int (Dict::*letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset,
+UNICHAR_ID unichar_id, bool word_end) const;
+/// Calls letter_is_okay_ member function.
+int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
+bool word_end) const {
+// Dispatch through the member-function pointer stored in letter_is_okay_.
+const auto checker = letter_is_okay_;
+return (this->*checker)(void_dawg_args, unicharset, unichar_id, word_end);
+}
+/// Probability in context function used by the ngram permuter.
+double (Dict::*probability_in_context_)(const char *lang, const char *context, int context_bytes,
+const char *character, int character_bytes);
+/// Calls probability_in_context_ member function.
+double ProbabilityInContext(const char *context, int context_bytes, const char *character,
+int character_bytes) {
+// The language string comes from the CCUtil object; the rest is forwarded
+// unchanged to the function stored in probability_in_context_.
+const char *lang = getCCUtil()->lang.c_str();
+return (this->*probability_in_context_)(lang, context, context_bytes, character,
+character_bytes);
+}
+/// Default (no-op) implementation of probability in context function.
+double def_probability_in_context(const char *lang, const char *context, int context_bytes,
+const char *character, int character_bytes) {
+// Every argument is deliberately ignored; the casts only mark them as used.
+static_cast<void>(lang);
+static_cast<void>(context);
+static_cast<void>(context_bytes);
+static_cast<void>(character);
+static_cast<void>(character_bytes);
+return 0.0;
+}
+// Stores the unichar id to be treated as the wildcard.
+inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
+// Returns the unichar id last stored by SetWildcardID().
+inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; }
+/// Return the number of dawgs in the dawgs_ vector.
+inline int NumDawgs() const {
+// The vector's size is narrowed to int, as the return type requires.
+return static_cast<int>(dawgs_.size());
+}
+/// Return i-th dawg pointer recorded in the dawgs_ vector.
+inline const Dawg *GetDawg(int index) const {
+// Unchecked element access: index must be within [0, NumDawgs()).
+const Dawg *dawg = dawgs_[index];
+return dawg;
+}
+/// Return the pointer to the punctuation dawg.
+inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
+/// Return the pointer to the unambiguous words dawg.
+inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
+/// Returns the appropriate next node given the EDGE_REF.
+static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
+// No edge followed yet: exploration of the dawg begins at node 0.
+if (edge_ref == NO_EDGE) {
+return 0;
+}
+const NODE_REF next = dawg->next_node(edge_ref);
+// A next node of 0 marks the end of a word and is reported as NO_EDGE.
+if (next == 0) {
+return NO_EDGE;
+}
+return next;
+}
+// Given a unichar from a string and a given dawg, return the unichar
+// we should use to match in that dawg type.  (for example, in the number
+// dawg, all numbers are transformed to kPatternUnicharId).
+UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const {
+// Only digits looked up in a number dawg are remapped to the pattern
+// unichar; a null dawg or any other dawg type passes ch through unchanged.
+const bool number_dawg = dawg != nullptr && dawg->type() == DAWG_TYPE_NUMBER;
+if (number_dawg && unicharset.get_isdigit(ch)) {
+return Dawg::kPatternUnicharID;
+}
+return ch;
+}
+/// For each of the character classes of the given unichar_id (and the
+/// unichar_id itself) finds the corresponding outgoing node or self-loop
+/// in the given dawg and (after checking that it is valid) records it in
+/// dawg_args->updated_dawgs. Updates current_permuter if any valid
+/// edges were found.
+void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id,
+bool word_end, DawgArgs *dawg_args,
+PermuterType *current_permuter) const;
+/// Read/Write/Access special purpose dawgs which contain words
+/// only of a certain length (used for phrase search for
+/// non-space-delimited languages).
+/// Check all the DAWGs to see if this word is in any of them.
+inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
+// Permuter codes that always count as a valid word.
+const bool dictionary_perm =
+perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM ||
+perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM;
+if (dictionary_perm) {
+return true;
+}
+// NUMBER_PERM is accepted only when the caller allows numbers.
+return numbers_ok && perm == NUMBER_PERM;
+}
+int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
+int valid_word(const WERD_CHOICE &word) const {
+// Numbers are not accepted here: NO_PERM is returned for words with digits.
+const bool numbers_ok = false;
+return valid_word(word, numbers_ok);
+}
+int valid_word_or_number(const WERD_CHOICE &word) const {
+// Numbers are accepted here: NUMBER_PERM is returned for valid numbers.
+const bool numbers_ok = true;
+return valid_word(word, numbers_ok);
+}
+/// This function is used by api/tesseract_cube_combiner.cpp
+int valid_word(const char *string) const {
+// Build a temporary WERD_CHOICE from the text using this dictionary's
+// unicharset, then defer to the WERD_CHOICE overload.
+WERD_CHOICE parsed(string, getUnicharset());
+return valid_word(parsed);
+}
+// Do the two WERD_CHOICEs form a meaningful bigram?
+bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
+/// Returns true if the word contains a valid punctuation pattern.
+/// Note: Since the domains of punctuation symbols and symbols
+/// used in numbers are not disjoint, a valid number might contain
+/// an invalid punctuation pattern (e.g. .99).
+bool valid_punctuation(const WERD_CHOICE &word);
+/// Returns true if a good answer is found for the unknown blob rating.
+int good_choice(const WERD_CHOICE &choice);
+/// Adds a word found on this document to the document specific dictionary.
+void add_document_word(const WERD_CHOICE &best_choice);
+/// Adjusts the rating of the given word.
+void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,
+float additional_adjust, bool modify_rating, bool debug);
+/// Set wordseg_rating_adjust_factor_ to the given value.
+inline void SetWordsegRatingAdjustFactor(float f) { wordseg_rating_adjust_factor_ = f; }
+/// Returns true if the language is space-delimited (not CJ, or T).
+bool IsSpaceDelimitedLang() const;
+private:
+/** Private member variables. */
+CCUtil *ccutil_;
+/**
+* Table that stores ambiguities computed during training
+* (loaded when NoDangerousAmbig() is called for the first time).
+* Each entry i in the table stores a set of ambiguities whose
+* wrong ngram starts with unichar id i.
+*/
+#ifndef DISABLED_LEGACY_ENGINE
+UnicharAmbigs *dang_ambigs_table_ = nullptr;
+/** Same as above, but for ambiguities with replace flag set. */
+UnicharAmbigs *replace_ambigs_table_ = nullptr;
+#endif
+/** Additional certainty padding allowed before a word is rejected. */
+float reject_offset_;
+// Cached UNICHAR_IDs:
+UNICHAR_ID wildcard_unichar_id_;   // kDictWildcard.
+UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
+UNICHAR_ID question_unichar_id_;   // kQuestionSymbol.
+UNICHAR_ID slash_unichar_id_;      // kSlashSymbol.
+UNICHAR_ID hyphen_unichar_id_;     // kHyphenSymbol.
+// Hyphen-related variables.
+WERD_CHOICE *hyphen_word_;
+DawgPositionVector hyphen_active_dawgs_;
+bool last_word_on_line_;
+// List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
+// matching.  The first member of each list is taken as canonical.  For
+// example, the first list contains hyphens and dashes with the first symbol
+// being the ASCII hyphen minus.
+std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
+// Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
+DawgCache *dawg_cache_;
+bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
+// Dawgs.
+DawgVector dawgs_;
+SuccessorListsVector successors_;
+Trie *pending_words_;
+/// The following pointers are only cached for convenience.
+/// The dawgs will be deleted when dawgs_ vector is destroyed.
+// bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
+// any of them are present on the best choices list for a word pair.
+// the bigrams are stored as space-separated words where:
+// (1) leading and trailing punctuation has been removed from each word and
+// (2) any digits have been replaced with '?' marks.
+Dawg *bigram_dawg_;
+// TODO(daria): need to support multiple languages in the future,
+// so maybe will need to maintain a list of dawgs of each kind.
+Dawg *freq_dawg_;
+Dawg *unambig_dawg_;
+Dawg *punc_dawg_;
+Trie *document_words_;
+/// Current segmentation cost adjust factor for word rating.
+/// See comments in incorporate_segcost.
+float wordseg_rating_adjust_factor_;
+// File for recording ambiguities discovered during dictionary search.
+FILE *output_ambig_words_file_;
+public:
+/// Variable members.
+/// These have to be declared and initialized after ccutil_, which contains
+/// the pointer to the params vector - the member of its base CCUtil class.
+STRING_VAR_H(user_words_file);
+STRING_VAR_H(user_words_suffix);
+STRING_VAR_H(user_patterns_file);
+STRING_VAR_H(user_patterns_suffix);
+BOOL_VAR_H(load_system_dawg);
+BOOL_VAR_H(load_freq_dawg);
+BOOL_VAR_H(load_unambig_dawg);
+BOOL_VAR_H(load_punc_dawg);
+BOOL_VAR_H(load_number_dawg);
+BOOL_VAR_H(load_bigram_dawg);
+double_VAR_H(xheight_penalty_subscripts);
+double_VAR_H(xheight_penalty_inconsistent);
+double_VAR_H(segment_penalty_dict_frequent_word);
+double_VAR_H(segment_penalty_dict_case_ok);
+double_VAR_H(segment_penalty_dict_case_bad);
+double_VAR_H(segment_penalty_dict_nonword);
+double_VAR_H(segment_penalty_garbage);
+STRING_VAR_H(output_ambig_words_file);
+INT_VAR_H(dawg_debug_level);
+INT_VAR_H(hyphen_debug_level);
+BOOL_VAR_H(use_only_first_uft8_step);
+double_VAR_H(certainty_scale);
+double_VAR_H(stopper_nondict_certainty_base);
+double_VAR_H(stopper_phase2_certainty_rejection_offset);
+INT_VAR_H(stopper_smallword_size);
+double_VAR_H(stopper_certainty_per_char);
+double_VAR_H(stopper_allowable_character_badness);
+INT_VAR_H(stopper_debug_level);
+BOOL_VAR_H(stopper_no_acceptable_choices);
+INT_VAR_H(tessedit_truncate_wordchoice_log);
+STRING_VAR_H(word_to_debug);
+BOOL_VAR_H(segment_nonalphabetic_script);
+BOOL_VAR_H(save_doc_words);
+double_VAR_H(doc_dict_pending_threshold);
+double_VAR_H(doc_dict_certainty_threshold);
+INT_VAR_H(max_permuter_attempts);
+};
+} // namespace tesseract
+#endif // TESSERACT_DICT_DICT_H_

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/dict/dict.h @ 2:b50eed0cc0ef upstream