Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/dict/dict.h @ 21:2f43e400f144
Provide an "all" target to build both the sdist and the wheel
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Fri, 19 Sep 2025 10:28:53 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
/////////////////////////////////////////////////////////////////////// // File: dict.h // Description: dict class. // Author: Samuel Charron // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #ifndef TESSERACT_DICT_DICT_H_ #define TESSERACT_DICT_DICT_H_ #ifdef HAVE_CONFIG_H # include "config_auto.h" // DISABLED_LEGACY_ENGINE #endif #ifndef DISABLED_LEGACY_ENGINE # include "ambigs.h" #endif #include "dawg.h" #include "dawg_cache.h" #include "ratngs.h" #include "stopper.h" #include "trie.h" #include "unicharset.h" #ifndef DISABLED_LEGACY_ENGINE # include "params_training_featdef.h" #endif // ndef DISABLED_LEGACY_ENGINE namespace tesseract { class MATRIX; class WERD_RES; #define CHARS_PER_LINE 500 #define MAX_WERD_LENGTH (int64_t)128 #define NO_RATING -1 /** Struct used to hold temporary information about fragments. */ struct CHAR_FRAGMENT_INFO { UNICHAR_ID unichar_id; const CHAR_FRAGMENT *fragment; int num_fragments; float rating; float certainty; }; using DawgVector = std::vector<Dawg *>; // // Constants // static const int kRatingPad = 4; static const int kDictMaxWildcards = 2; // max wildcards for a word // TODO(daria): If hyphens are different in different languages and can be // inferred from training data we should load their values dynamically. static const char kHyphenSymbol[] = "-"; static const char kSlashSymbol[] = "/"; static const char kQuestionSymbol[] = "?"; static const char kApostropheSymbol[] = "'"; static const float kSimCertaintyScale = -10.0; // similarity matcher scaling static const float kSimCertaintyOffset = -10.0; // similarity matcher offset static const float kSimilarityFloor = 100.0; // worst E*L product to stop on static const int kDocDictMaxRepChars = 4; // Enum for describing whether the x-height for the word is consistent: // 0 - everything is good. // 1 - there are one or two secondary (but consistent) baselines // [think subscript and superscript], or there is an oversized // first character. // 2 - the word is inconsistent. enum XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT }; struct DawgArgs { DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p) : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {} DawgPositionVector *active_dawgs; DawgPositionVector *updated_dawgs; PermuterType permuter; // True if the current position is a valid word end. bool valid_end; }; class TESS_API Dict { public: Dict(CCUtil *image_ptr); ~Dict(); const CCUtil *getCCUtil() const { return ccutil_; } CCUtil *getCCUtil() { return ccutil_; } const UNICHARSET &getUnicharset() const { return getCCUtil()->unicharset; } UNICHARSET &getUnicharset() { return getCCUtil()->unicharset; } #ifndef DISABLED_LEGACY_ENGINE const UnicharAmbigs &getUnicharAmbigs() const { return getCCUtil()->unichar_ambigs; } #endif // Returns true if unichar_id is a word compounding character like - or /. inline bool compound_marker(UNICHAR_ID unichar_id) { const UNICHARSET &unicharset = getUnicharset(); ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); const auto &normed_ids = unicharset.normed_ids(unichar_id); return normed_ids.size() == 1 && (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_); } // Returns true if unichar_id is an apostrophe-like character that may // separate prefix/suffix words from a main body word. inline bool is_apostrophe(UNICHAR_ID unichar_id) { const UNICHARSET &unicharset = getUnicharset(); ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); const auto &normed_ids = unicharset.normed_ids(unichar_id); return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_; } /* hyphen.cpp ************************************************************/ /// Returns true if we've recorded the beginning of a hyphenated word. inline bool hyphenated() const { return !last_word_on_line_ && hyphen_word_; } /// Size of the base word (the part on the line before) of a hyphenated word. inline int hyphen_base_size() const { return this->hyphenated() ? hyphen_word_->length() : 0; } /// If this word is hyphenated copy the base word (the part on /// the line before) of a hyphenated word into the given word. /// This function assumes that word is not nullptr. inline void copy_hyphen_info(WERD_CHOICE *word) const { if (this->hyphenated()) { *word = *hyphen_word_; if (hyphen_debug_level) { word->print("copy_hyphen_info: "); } } } /// Check whether the word has a hyphen at the end. inline bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const { if (!last_word_on_line_ || first_pos) { return false; } ASSERT_HOST(unicharset->contains_unichar_id(unichar_id)); const auto &normed_ids = unicharset->normed_ids(unichar_id); return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_; } /// Same as above, but check the unichar at the end of the word. inline bool has_hyphen_end(const WERD_CHOICE &word) const { int word_index = word.length() - 1; return has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0); } /// Unless the previous word was the last one on the line, and the current /// one is not (thus it is the first one on the line), erase hyphen_word_, /// clear hyphen_active_dawgs_, update last_word_on_line_. void reset_hyphen_vars(bool last_word_on_line); /// Update hyphen_word_, and copy the given DawgPositionVectors into /// hyphen_active_dawgs_ . void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs); /* permdawg.cpp ************************************************************/ // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig(). // When this function is refactored, permdawg.cpp can be removed. /// Copies word into best_choice if its rating is smaller /// than that of best_choice. inline void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) { if (word.rating() < best_choice->rating()) { *best_choice = word; } } /// Fill the given active_dawgs vector with dawgs that could contain the /// beginning of the word. If hyphenated() returns true, copy the entries /// from hyphen_active_dawgs_ instead. void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const; // Fill the given vector with the default collection of any-length dawgs void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const; /// Recursively explore all the possible character combinations in /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the /// dawgs in the dawgs_ vector in parallel and discard invalid words. /// /// Allocate and return a WERD_CHOICE with the best valid word found. WERD_CHOICE *dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); /// If the choice being composed so far could be a dictionary word /// and we have not reached the end of the word keep exploring the /// char_choices further. void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args); /// Pointer to go_deeper function. void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args); // // Helper functions for dawg_permute_and_select(). // void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args); void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args); bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info); /* stopper.cpp *************************************************************/ #if !defined(DISABLED_LEGACY_ENGINE) bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings); #endif // !defined(DISABLED_LEGACY_ENGINE) // Replaces the corresponding wrong ngram in werd_choice with the correct // one. The whole correct n-gram is inserted into the ratings matrix and // the werd_choice: no more fragments!. Rating and certainty of new entries // in matrix and werd_choice are the sum and mean of the wrong ngram // respectively. // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes // mystring", with a new entry in the ratings matrix for ". void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings); /// Returns the length of the shortest alpha run in WordChoice. int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const; /// Returns true if the certainty of the BestChoice word is within a /// reasonable range of the average certainties for the best choices for /// each character in the segmentation. This test is used to catch words /// in which one character is much worse than the other characters in the /// word (i.e. false will be returned in that case). The algorithm computes /// the mean and std deviation of the certainties in the word with the worst /// certainty thrown out. int UniformCertainties(const WERD_CHOICE &word); /// Returns true if the given best_choice is good enough to stop. bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency); /// Returns false if the best choice for the current word is questionable /// and should be tried again on the second pass or should be flagged to /// the user. bool AcceptableResult(WERD_RES *word) const; #if !defined(DISABLED_LEGACY_ENGINE) void EndDangerousAmbigs(); #endif // !defined(DISABLED_LEGACY_ENGINE) /// Prints the current choices for this word to stdout. void DebugWordChoices(); /// Sets up stopper variables in preparation for the first pass. void SetupStopperPass1(); /// Sets up stopper variables in preparation for the second pass. void SetupStopperPass2(); /* context.cpp *************************************************************/ /// Check a string to see if it matches a set of lexical rules. int case_ok(const WERD_CHOICE &word) const; /// Returns true if the word looks like an absolute garbage /// (e.g. image mistakenly recognized as text). bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset); /* dict.cpp ****************************************************************/ /// Initialize Dict class - load dawgs from [lang].traineddata and /// user-specified wordlist and parttern list. static DawgCache *GlobalDawgCache(); // Sets up ready for a Load or LoadLSTM. void SetupForLoad(DawgCache *dawg_cache); // Loads the dawgs needed by Tesseract. Call FinishLoad() after. void Load(const std::string &lang, TessdataManager *data_file); // Loads the dawgs needed by the LSTM model. Call FinishLoad() after. void LoadLSTM(const std::string &lang, TessdataManager *data_file); // Completes the loading process after Load() and/or LoadLSTM(). // Returns false if no dictionaries were loaded. bool FinishLoad(); void End(); // Resets the document dictionary analogous to ResetAdaptiveClassifier. void ResetDocumentDictionary() { if (pending_words_ != nullptr) { pending_words_->clear(); } if (document_words_ != nullptr) { document_words_->clear(); } } /** * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light * of the current state the letter at word_index in the given word * is allowed according to at least one of the dawgs in dawgs_, * otherwise returns NO_PERM. * * The state is described by void_dawg_args, which are interpreted as * DawgArgs and contain relevant active dawg positions. * Each entry in the active_dawgs vector contains an index * into the dawgs_ vector and an EDGE_REF that indicates the last edge * followed in the dawg. It also may contain a position in the punctuation * dawg which describes surrounding punctuation (see struct DawgPosition). * * Input: * At word_index 0 dawg_args->active_dawgs should contain an entry for each * dawg that may start at the beginning of a word, with punc_ref and edge_ref * initialized to NO_EDGE. Since the punctuation dawg includes the empty * pattern " " (meaning anything without surrounding punctuation), having a * single entry for the punctuation dawg will cover all dawgs reachable * there from -- that includes all number and word dawgs. The only dawg * non-reachable from the punctuation_dawg is the pattern dawg. * If hyphen state needs to be applied, initial dawg_args->active_dawgs can * be copied from the saved hyphen state (maintained by Dict). * For word_index > 0 the corresponding state (active_dawgs and punc position) * can be obtained from dawg_args->updated_dawgs passed to * def_letter_is_okay for word_index-1. * Note: the function assumes that active_dawgs, and updated_dawgs * member variables of dawg_args are not nullptr. * * Output: * The function fills in dawg_args->updated_dawgs vector with the * entries for dawgs that contain the word up to the letter at word_index. * */ // int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const; int (Dict::*letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const; /// Calls letter_is_okay_ member function. int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const { return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end); } /// Probability in context function used by the ngram permuter. double (Dict::*probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes); /// Calls probability_in_context_ member function. double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes) { return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes, character, character_bytes); } /// Default (no-op) implementation of probability in context function. double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) { (void)lang; (void)context; (void)context_bytes; (void)character; (void)character_bytes; return 0.0; } inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; } inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; } /// Return the number of dawgs in the dawgs_ vector. inline int NumDawgs() const { return dawgs_.size(); } /// Return i-th dawg pointer recorded in the dawgs_ vector. inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } /// Return the points to the punctuation dawg. inline const Dawg *GetPuncDawg() const { return punc_dawg_; } /// Return the points to the unambiguous words dawg. inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; } /// Returns the appropriate next node given the EDGE_REF. static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) { if (edge_ref == NO_EDGE) { return 0; // beginning to explore the dawg } NODE_REF node = dawg->next_node(edge_ref); if (node == 0) { node = NO_EDGE; // end of word } return node; } // Given a unichar from a string and a given dawg, return the unichar // we should use to match in that dawg type. (for example, in the number // dawg, all numbers are transformed to kPatternUnicharId). UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const { if (!dawg) { return ch; } switch (dawg->type()) { case DAWG_TYPE_NUMBER: return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch; default: return ch; } } /// For each of the character classes of the given unichar_id (and the /// unichar_id itself) finds the corresponding outgoing node or self-loop /// in the given dawg and (after checking that it is valid) records it in /// dawg_args->updated_ative_dawgs. Updates current_permuter if any valid /// edges were found. void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const; /// Read/Write/Access special purpose dawgs which contain words /// only of a certain length (used for phrase search for /// non-space-delimited languages). /// Check all the DAWGs to see if this word is in any of them. inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) { return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM || (numbers_ok && perm == NUMBER_PERM)); } int valid_word(const WERD_CHOICE &word, bool numbers_ok) const; int valid_word(const WERD_CHOICE &word) const { return valid_word(word, false); // return NO_PERM for words with digits } int valid_word_or_number(const WERD_CHOICE &word) const { return valid_word(word, true); // return NUMBER_PERM for valid numbers } /// This function is used by api/tesseract_cube_combiner.cpp int valid_word(const char *string) const { WERD_CHOICE word(string, getUnicharset()); return valid_word(word); } // Do the two WERD_CHOICEs form a meaningful bigram? bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const; /// Returns true if the word contains a valid punctuation pattern. /// Note: Since the domains of punctuation symbols and symblos /// used in numbers are not disjoint, a valid number might contain /// an invalid punctuation pattern (e.g. .99). bool valid_punctuation(const WERD_CHOICE &word); /// Returns true if a good answer is found for the unknown blob rating. int good_choice(const WERD_CHOICE &choice); /// Adds a word found on this document to the document specific dictionary. void add_document_word(const WERD_CHOICE &best_choice); /// Adjusts the rating of the given word. void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug); /// Set wordseg_rating_adjust_factor_ to the given value. inline void SetWordsegRatingAdjustFactor(float f) { wordseg_rating_adjust_factor_ = f; } /// Returns true if the language is space-delimited (not CJ, or T). bool IsSpaceDelimitedLang() const; private: /** Private member variables. */ CCUtil *ccutil_; /** * Table that stores ambiguities computed during training * (loaded when NoDangerousAmbigs() is called for the first time). * Each entry i in the table stores a set of amibiguities whose * wrong ngram starts with unichar id i. */ #ifndef DISABLED_LEGACY_ENGINE UnicharAmbigs *dang_ambigs_table_ = nullptr; /** Same as above, but for ambiguities with replace flag set. */ UnicharAmbigs *replace_ambigs_table_ = nullptr; #endif /** Additional certainty padding allowed before a word is rejected. */ float reject_offset_; // Cached UNICHAR_IDs: UNICHAR_ID wildcard_unichar_id_; // kDictWildcard. UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol. UNICHAR_ID question_unichar_id_; // kQuestionSymbol. UNICHAR_ID slash_unichar_id_; // kSlashSymbol. UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol. // Hyphen-related variables. WERD_CHOICE *hyphen_word_; DawgPositionVector hyphen_active_dawgs_; bool last_word_on_line_; // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary // matching. The first member of each list is taken as canonical. For // example, the first list contains hyphens and dashes with the first symbol // being the ASCII hyphen minus. std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_; // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs. DawgCache *dawg_cache_; bool dawg_cache_is_ours_; // we should delete our own dawg_cache_ // Dawgs. DawgVector dawgs_; SuccessorListsVector successors_; Trie *pending_words_; /// The following pointers are only cached for convenience. /// The dawgs will be deleted when dawgs_ vector is destroyed. // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if // any of them are present on the best choices list for a word pair. // the bigrams are stored as space-separated words where: // (1) leading and trailing punctuation has been removed from each word and // (2) any digits have been replaced with '?' marks. Dawg *bigram_dawg_; // TODO(daria): need to support multiple languages in the future, // so maybe will need to maintain a list of dawgs of each kind. Dawg *freq_dawg_; Dawg *unambig_dawg_; Dawg *punc_dawg_; Trie *document_words_; /// Current segmentation cost adjust factor for word rating. /// See comments in incorporate_segcost. float wordseg_rating_adjust_factor_; // File for recording ambiguities discovered during dictionary search. FILE *output_ambig_words_file_; public: /// Variable members. /// These have to be declared and initialized after image_ptr_, which contains /// the pointer to the params vector - the member of its base CCUtil class. STRING_VAR_H(user_words_file); STRING_VAR_H(user_words_suffix); STRING_VAR_H(user_patterns_file); STRING_VAR_H(user_patterns_suffix); BOOL_VAR_H(load_system_dawg); BOOL_VAR_H(load_freq_dawg); BOOL_VAR_H(load_unambig_dawg); BOOL_VAR_H(load_punc_dawg); BOOL_VAR_H(load_number_dawg); BOOL_VAR_H(load_bigram_dawg); double_VAR_H(xheight_penalty_subscripts); double_VAR_H(xheight_penalty_inconsistent); double_VAR_H(segment_penalty_dict_frequent_word); double_VAR_H(segment_penalty_dict_case_ok); double_VAR_H(segment_penalty_dict_case_bad); double_VAR_H(segment_penalty_dict_nonword); double_VAR_H(segment_penalty_garbage); STRING_VAR_H(output_ambig_words_file); INT_VAR_H(dawg_debug_level); INT_VAR_H(hyphen_debug_level); BOOL_VAR_H(use_only_first_uft8_step); double_VAR_H(certainty_scale); double_VAR_H(stopper_nondict_certainty_base); double_VAR_H(stopper_phase2_certainty_rejection_offset); INT_VAR_H(stopper_smallword_size); double_VAR_H(stopper_certainty_per_char); double_VAR_H(stopper_allowable_character_badness); INT_VAR_H(stopper_debug_level); BOOL_VAR_H(stopper_no_acceptable_choices); INT_VAR_H(tessedit_truncate_wordchoice_log); STRING_VAR_H(word_to_debug); BOOL_VAR_H(segment_nonalphabetic_script); BOOL_VAR_H(save_doc_words); double_VAR_H(doc_dict_pending_threshold); double_VAR_H(doc_dict_certainty_threshold); INT_VAR_H(max_permuter_attempts); }; } // namespace tesseract #endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_
