Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/dict/dict.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: dict.h | |
| 3 // Description: dict class. | |
| 4 // Author: Samuel Charron | |
| 5 // | |
| 6 // (C) Copyright 2006, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_DICT_DICT_H_ | |
| 20 #define TESSERACT_DICT_DICT_H_ | |
| 21 | |
| 22 #ifdef HAVE_CONFIG_H | |
| 23 # include "config_auto.h" // DISABLED_LEGACY_ENGINE | |
| 24 #endif | |
| 25 | |
| 26 #ifndef DISABLED_LEGACY_ENGINE | |
| 27 # include "ambigs.h" | |
| 28 #endif | |
| 29 #include "dawg.h" | |
| 30 #include "dawg_cache.h" | |
| 31 #include "ratngs.h" | |
| 32 #include "stopper.h" | |
| 33 #include "trie.h" | |
| 34 #include "unicharset.h" | |
| 35 #ifndef DISABLED_LEGACY_ENGINE | |
| 36 # include "params_training_featdef.h" | |
| 37 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 38 | |
| 39 namespace tesseract { | |
| 40 | |
| 41 class MATRIX; | |
| 42 class WERD_RES; | |
| 43 | |
| 44 #define CHARS_PER_LINE 500 | |
| 45 #define MAX_WERD_LENGTH (int64_t)128 | |
| 46 #define NO_RATING -1 | |
| 47 | |
| 48 /** Struct used to hold temporary information about fragments. */ | |
| 49 struct CHAR_FRAGMENT_INFO { | |
| 50 UNICHAR_ID unichar_id; | |
| 51 const CHAR_FRAGMENT *fragment; | |
| 52 int num_fragments; | |
| 53 float rating; | |
| 54 float certainty; | |
| 55 }; | |
| 56 | |
| 57 using DawgVector = std::vector<Dawg *>; | |
| 58 | |
| 59 // | |
| 60 // Constants | |
| 61 // | |
| 62 static const int kRatingPad = 4; | |
| 63 static const int kDictMaxWildcards = 2; // max wildcards for a word | |
| 64 // TODO(daria): If hyphens are different in different languages and can be | |
| 65 // inferred from training data we should load their values dynamically. | |
| 66 static const char kHyphenSymbol[] = "-"; | |
| 67 static const char kSlashSymbol[] = "/"; | |
| 68 static const char kQuestionSymbol[] = "?"; | |
| 69 static const char kApostropheSymbol[] = "'"; | |
| 70 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling | |
| 71 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset | |
| 72 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on | |
| 73 static const int kDocDictMaxRepChars = 4; | |
| 74 | |
| 75 // Enum for describing whether the x-height for the word is consistent: | |
| 76 // 0 - everything is good. | |
| 77 // 1 - there are one or two secondary (but consistent) baselines | |
| 78 // [think subscript and superscript], or there is an oversized | |
| 79 // first character. | |
| 80 // 2 - the word is inconsistent. | |
| 81 enum XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT }; | |
| 82 | |
| 83 struct DawgArgs { | |
| 84 DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p) | |
| 85 : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {} | |
| 86 | |
| 87 DawgPositionVector *active_dawgs; | |
| 88 DawgPositionVector *updated_dawgs; | |
| 89 PermuterType permuter; | |
| 90 // True if the current position is a valid word end. | |
| 91 bool valid_end; | |
| 92 }; | |
| 93 | |
| 94 class TESS_API Dict { | |
| 95 public: | |
| 96 Dict(CCUtil *image_ptr); | |
| 97 ~Dict(); | |
| 98 const CCUtil *getCCUtil() const { | |
| 99 return ccutil_; | |
| 100 } | |
| 101 CCUtil *getCCUtil() { | |
| 102 return ccutil_; | |
| 103 } | |
| 104 const UNICHARSET &getUnicharset() const { | |
| 105 return getCCUtil()->unicharset; | |
| 106 } | |
| 107 UNICHARSET &getUnicharset() { | |
| 108 return getCCUtil()->unicharset; | |
| 109 } | |
| 110 #ifndef DISABLED_LEGACY_ENGINE | |
| 111 const UnicharAmbigs &getUnicharAmbigs() const { | |
| 112 return getCCUtil()->unichar_ambigs; | |
| 113 } | |
| 114 #endif | |
| 115 // Returns true if unichar_id is a word compounding character like - or /. | |
| 116 inline bool compound_marker(UNICHAR_ID unichar_id) { | |
| 117 const UNICHARSET &unicharset = getUnicharset(); | |
| 118 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); | |
| 119 const auto &normed_ids = unicharset.normed_ids(unichar_id); | |
| 120 return normed_ids.size() == 1 && | |
| 121 (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_); | |
| 122 } | |
| 123 // Returns true if unichar_id is an apostrophe-like character that may | |
| 124 // separate prefix/suffix words from a main body word. | |
| 125 inline bool is_apostrophe(UNICHAR_ID unichar_id) { | |
| 126 const UNICHARSET &unicharset = getUnicharset(); | |
| 127 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); | |
| 128 const auto &normed_ids = unicharset.normed_ids(unichar_id); | |
| 129 return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_; | |
| 130 } | |
| 131 | |
| 132 /* hyphen.cpp ************************************************************/ | |
| 133 | |
| 134 /// Returns true if we've recorded the beginning of a hyphenated word. | |
| 135 inline bool hyphenated() const { | |
| 136 return !last_word_on_line_ && hyphen_word_; | |
| 137 } | |
| 138 /// Size of the base word (the part on the line before) of a hyphenated word. | |
| 139 inline int hyphen_base_size() const { | |
| 140 return this->hyphenated() ? hyphen_word_->length() : 0; | |
| 141 } | |
| 142 /// If this word is hyphenated copy the base word (the part on | |
| 143 /// the line before) of a hyphenated word into the given word. | |
| 144 /// This function assumes that word is not nullptr. | |
| 145 inline void copy_hyphen_info(WERD_CHOICE *word) const { | |
| 146 if (this->hyphenated()) { | |
| 147 *word = *hyphen_word_; | |
| 148 if (hyphen_debug_level) { | |
| 149 word->print("copy_hyphen_info: "); | |
| 150 } | |
| 151 } | |
| 152 } | |
| 153 /// Check whether the word has a hyphen at the end. | |
| 154 inline bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, | |
| 155 bool first_pos) const { | |
| 156 if (!last_word_on_line_ || first_pos) { | |
| 157 return false; | |
| 158 } | |
| 159 ASSERT_HOST(unicharset->contains_unichar_id(unichar_id)); | |
| 160 const auto &normed_ids = unicharset->normed_ids(unichar_id); | |
| 161 return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_; | |
| 162 } | |
| 163 /// Same as above, but check the unichar at the end of the word. | |
| 164 inline bool has_hyphen_end(const WERD_CHOICE &word) const { | |
| 165 int word_index = word.length() - 1; | |
| 166 return has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0); | |
| 167 } | |
| 168 /// Unless the previous word was the last one on the line, and the current | |
| 169 /// one is not (thus it is the first one on the line), erase hyphen_word_, | |
| 170 /// clear hyphen_active_dawgs_, update last_word_on_line_. | |
| 171 void reset_hyphen_vars(bool last_word_on_line); | |
| 172 /// Update hyphen_word_, and copy the given DawgPositionVectors into | |
| 173 /// hyphen_active_dawgs_ . | |
| 174 void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs); | |
| 175 | |
| 176 /* permdawg.cpp ************************************************************/ | |
| 177 // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig(). | |
| 178 // When this function is refactored, permdawg.cpp can be removed. | |
| 179 | |
| 180 /// Copies word into best_choice if its rating is smaller | |
| 181 /// than that of best_choice. | |
| 182 inline void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) { | |
| 183 if (word.rating() < best_choice->rating()) { | |
| 184 *best_choice = word; | |
| 185 } | |
| 186 } | |
| 187 /// Fill the given active_dawgs vector with dawgs that could contain the | |
| 188 /// beginning of the word. If hyphenated() returns true, copy the entries | |
| 189 /// from hyphen_active_dawgs_ instead. | |
| 190 void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const; | |
| 191 // Fill the given vector with the default collection of any-length dawgs | |
| 192 void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const; | |
| 193 | |
| 194 /// Recursively explore all the possible character combinations in | |
| 195 /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the | |
| 196 /// dawgs in the dawgs_ vector in parallel and discard invalid words. | |
| 197 /// | |
| 198 /// Allocate and return a WERD_CHOICE with the best valid word found. | |
| 199 WERD_CHOICE *dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 200 float rating_limit); | |
| 201 /// If the choice being composed so far could be a dictionary word | |
| 202 /// and we have not reached the end of the word keep exploring the | |
| 203 /// char_choices further. | |
| 204 void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 205 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, | |
| 206 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, | |
| 207 WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args); | |
| 208 | |
| 209 /// Pointer to go_deeper function. | |
| 210 void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 211 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, | |
| 212 bool word_ending, WERD_CHOICE *word, float certainties[], | |
| 213 float *limit, WERD_CHOICE *best_choice, int *attempts_left, | |
| 214 void *void_more_args); | |
| 215 // | |
| 216 // Helper functions for dawg_permute_and_select(). | |
| 217 // | |
| 218 void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 219 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, | |
| 220 WERD_CHOICE *word, float certainties[], float *limit, | |
| 221 WERD_CHOICE *best_choice, int *attempts_left, void *more_args); | |
| 222 | |
| 223 void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 224 const BLOB_CHOICE &blob_choice, int char_choice_index, | |
| 225 const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, | |
| 226 float certainties[], float *limit, WERD_CHOICE *best_choice, | |
| 227 int *attempts_left, void *more_args); | |
| 228 | |
| 229 bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, | |
| 230 const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, | |
| 231 int word_ending, CHAR_FRAGMENT_INFO *char_frag_info); | |
| 232 | |
| 233 /* stopper.cpp *************************************************************/ | |
| 234 #if !defined(DISABLED_LEGACY_ENGINE) | |
| 235 bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, | |
| 236 MATRIX *ratings); | |
| 237 #endif // !defined(DISABLED_LEGACY_ENGINE) | |
| 238 // Replaces the corresponding wrong ngram in werd_choice with the correct | |
| 239 // one. The whole correct n-gram is inserted into the ratings matrix and | |
| 240 // the werd_choice: no more fragments! Rating and certainty of new entries | |
| 241 // in matrix and werd_choice are the sum and mean of the wrong ngram | |
| 242 // respectively. | |
| 243 // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes | |
| 244 // mystring", with a new entry in the ratings matrix for ". | |
| 245 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, | |
| 246 WERD_CHOICE *werd_choice, MATRIX *ratings); | |
| 247 | |
| 248 /// Returns the length of the shortest alpha run in WordChoice. | |
| 249 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const; | |
| 250 /// Returns true if the certainty of the BestChoice word is within a | |
| 251 /// reasonable range of the average certainties for the best choices for | |
| 252 /// each character in the segmentation. This test is used to catch words | |
| 253 /// in which one character is much worse than the other characters in the | |
| 254 /// word (i.e. false will be returned in that case). The algorithm computes | |
| 255 /// the mean and std deviation of the certainties in the word with the worst | |
| 256 /// certainty thrown out. | |
| 257 int UniformCertainties(const WERD_CHOICE &word); | |
| 258 /// Returns true if the given best_choice is good enough to stop. | |
| 259 bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency); | |
| 260 /// Returns false if the best choice for the current word is questionable | |
| 261 /// and should be tried again on the second pass or should be flagged to | |
| 262 /// the user. | |
| 263 bool AcceptableResult(WERD_RES *word) const; | |
| 264 #if !defined(DISABLED_LEGACY_ENGINE) | |
| 265 void EndDangerousAmbigs(); | |
| 266 #endif // !defined(DISABLED_LEGACY_ENGINE) | |
| 267 /// Prints the current choices for this word to stdout. | |
| 268 void DebugWordChoices(); | |
| 269 /// Sets up stopper variables in preparation for the first pass. | |
| 270 void SetupStopperPass1(); | |
| 271 /// Sets up stopper variables in preparation for the second pass. | |
| 272 void SetupStopperPass2(); | |
| 273 /* context.cpp *************************************************************/ | |
| 274 /// Check a string to see if it matches a set of lexical rules. | |
| 275 int case_ok(const WERD_CHOICE &word) const; | |
| 276 /// Returns true if the word looks like absolute garbage | |
| 277 /// (e.g. image mistakenly recognized as text). | |
| 278 bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset); | |
| 279 | |
| 280 /* dict.cpp ****************************************************************/ | |
| 281 | |
| 282 /// Initialize Dict class - load dawgs from [lang].traineddata and | |
| 283 /// user-specified wordlist and pattern list. | |
| 284 static DawgCache *GlobalDawgCache(); | |
| 285 // Sets up ready for a Load or LoadLSTM. | |
| 286 void SetupForLoad(DawgCache *dawg_cache); | |
| 287 // Loads the dawgs needed by Tesseract. Call FinishLoad() after. | |
| 288 void Load(const std::string &lang, TessdataManager *data_file); | |
| 289 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after. | |
| 290 void LoadLSTM(const std::string &lang, TessdataManager *data_file); | |
| 291 // Completes the loading process after Load() and/or LoadLSTM(). | |
| 292 // Returns false if no dictionaries were loaded. | |
| 293 bool FinishLoad(); | |
| 294 void End(); | |
| 295 | |
| 296 // Resets the document dictionary analogous to ResetAdaptiveClassifier. | |
| 297 void ResetDocumentDictionary() { | |
| 298 if (pending_words_ != nullptr) { | |
| 299 pending_words_->clear(); | |
| 300 } | |
| 301 if (document_words_ != nullptr) { | |
| 302 document_words_->clear(); | |
| 303 } | |
| 304 } | |
| 305 | |
| 306 /** | |
| 307 * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light | |
| 308 * of the current state the letter at word_index in the given word | |
| 309 * is allowed according to at least one of the dawgs in dawgs_, | |
| 310 * otherwise returns NO_PERM. | |
| 311 * | |
| 312 * The state is described by void_dawg_args, which are interpreted as | |
| 313 * DawgArgs and contain relevant active dawg positions. | |
| 314 * Each entry in the active_dawgs vector contains an index | |
| 315 * into the dawgs_ vector and an EDGE_REF that indicates the last edge | |
| 316 * followed in the dawg. It also may contain a position in the punctuation | |
| 317 * dawg which describes surrounding punctuation (see struct DawgPosition). | |
| 318 * | |
| 319 * Input: | |
| 320 * At word_index 0 dawg_args->active_dawgs should contain an entry for each | |
| 321 * dawg that may start at the beginning of a word, with punc_ref and edge_ref | |
| 322 * initialized to NO_EDGE. Since the punctuation dawg includes the empty | |
| 323 * pattern " " (meaning anything without surrounding punctuation), having a | |
| 324 * single entry for the punctuation dawg will cover all dawgs reachable | |
| 325 * there from -- that includes all number and word dawgs. The only dawg | |
| 326 * non-reachable from the punctuation_dawg is the pattern dawg. | |
| 327 * If hyphen state needs to be applied, initial dawg_args->active_dawgs can | |
| 328 * be copied from the saved hyphen state (maintained by Dict). | |
| 329 * For word_index > 0 the corresponding state (active_dawgs and punc position) | |
| 330 * can be obtained from dawg_args->updated_dawgs passed to | |
| 331 * def_letter_is_okay for word_index-1. | |
| 332 * Note: the function assumes that active_dawgs, and updated_dawgs | |
| 333 * member variables of dawg_args are not nullptr. | |
| 334 * | |
| 335 * Output: | |
| 336 * The function fills in dawg_args->updated_dawgs vector with the | |
| 337 * entries for dawgs that contain the word up to the letter at word_index. | |
| 338 * | |
| 339 */ | |
| 340 | |
| 341 // | |
| 342 int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, | |
| 343 bool word_end) const; | |
| 344 | |
| 345 int (Dict::*letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, | |
| 346 UNICHAR_ID unichar_id, bool word_end) const; | |
| 347 /// Calls letter_is_okay_ member function. | |
| 348 int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, | |
| 349 bool word_end) const { | |
| 350 return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end); | |
| 351 } | |
| 352 | |
| 353 /// Probability in context function used by the ngram permuter. | |
| 354 double (Dict::*probability_in_context_)(const char *lang, const char *context, int context_bytes, | |
| 355 const char *character, int character_bytes); | |
| 356 /// Calls probability_in_context_ member function. | |
| 357 double ProbabilityInContext(const char *context, int context_bytes, const char *character, | |
| 358 int character_bytes) { | |
| 359 return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes, | |
| 360 character, character_bytes); | |
| 361 } | |
| 362 | |
| 363 /// Default (no-op) implementation of probability in context function. | |
| 364 double def_probability_in_context(const char *lang, const char *context, int context_bytes, | |
| 365 const char *character, int character_bytes) { | |
| 366 (void)lang; | |
| 367 (void)context; | |
| 368 (void)context_bytes; | |
| 369 (void)character; | |
| 370 (void)character_bytes; | |
| 371 return 0.0; | |
| 372 } | |
| 373 | |
| 374 inline void SetWildcardID(UNICHAR_ID id) { | |
| 375 wildcard_unichar_id_ = id; | |
| 376 } | |
| 377 inline UNICHAR_ID WildcardID() const { | |
| 378 return wildcard_unichar_id_; | |
| 379 } | |
| 380 /// Return the number of dawgs in the dawgs_ vector. | |
| 381 inline int NumDawgs() const { | |
| 382 return dawgs_.size(); | |
| 383 } | |
| 384 /// Return i-th dawg pointer recorded in the dawgs_ vector. | |
| 385 inline const Dawg *GetDawg(int index) const { | |
| 386 return dawgs_[index]; | |
| 387 } | |
| 388 /// Return the pointer to the punctuation dawg. | |
| 389 inline const Dawg *GetPuncDawg() const { | |
| 390 return punc_dawg_; | |
| 391 } | |
| 392 /// Return the pointer to the unambiguous words dawg. | |
| 393 inline const Dawg *GetUnambigDawg() const { | |
| 394 return unambig_dawg_; | |
| 395 } | |
| 396 /// Returns the appropriate next node given the EDGE_REF. | |
| 397 static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) { | |
| 398 if (edge_ref == NO_EDGE) { | |
| 399 return 0; // beginning to explore the dawg | |
| 400 } | |
| 401 NODE_REF node = dawg->next_node(edge_ref); | |
| 402 if (node == 0) { | |
| 403 node = NO_EDGE; // end of word | |
| 404 } | |
| 405 return node; | |
| 406 } | |
| 407 | |
| 408 // Given a unichar from a string and a given dawg, return the unichar | |
| 409 // we should use to match in that dawg type. (for example, in the number | |
| 410 // dawg, all numbers are transformed to kPatternUnicharId). | |
| 411 UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const { | |
| 412 if (!dawg) { | |
| 413 return ch; | |
| 414 } | |
| 415 switch (dawg->type()) { | |
| 416 case DAWG_TYPE_NUMBER: | |
| 417 return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch; | |
| 418 default: | |
| 419 return ch; | |
| 420 } | |
| 421 } | |
| 422 | |
| 423 /// For each of the character classes of the given unichar_id (and the | |
| 424 /// unichar_id itself) finds the corresponding outgoing node or self-loop | |
| 425 /// in the given dawg and (after checking that it is valid) records it in | |
| 426 /// dawg_args->updated_dawgs. Updates current_permuter if any valid | |
| 427 /// edges were found. | |
| 428 void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, | |
| 429 bool word_end, DawgArgs *dawg_args, | |
| 430 PermuterType *current_permuter) const; | |
| 431 | |
| 432 /// Read/Write/Access special purpose dawgs which contain words | |
| 433 /// only of a certain length (used for phrase search for | |
| 434 /// non-space-delimited languages). | |
| 435 | |
| 436 /// Check all the DAWGs to see if this word is in any of them. | |
| 437 inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) { | |
| 438 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM || | |
| 439 perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM || | |
| 440 (numbers_ok && perm == NUMBER_PERM)); | |
| 441 } | |
| 442 int valid_word(const WERD_CHOICE &word, bool numbers_ok) const; | |
| 443 int valid_word(const WERD_CHOICE &word) const { | |
| 444 return valid_word(word, false); // return NO_PERM for words with digits | |
| 445 } | |
| 446 int valid_word_or_number(const WERD_CHOICE &word) const { | |
| 447 return valid_word(word, true); // return NUMBER_PERM for valid numbers | |
| 448 } | |
| 449 /// This function is used by api/tesseract_cube_combiner.cpp | |
| 450 int valid_word(const char *string) const { | |
| 451 WERD_CHOICE word(string, getUnicharset()); | |
| 452 return valid_word(word); | |
| 453 } | |
| 454 // Do the two WERD_CHOICEs form a meaningful bigram? | |
| 455 bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const; | |
| 456 /// Returns true if the word contains a valid punctuation pattern. | |
| 457 /// Note: Since the domains of punctuation symbols and symbols | |
| 458 /// used in numbers are not disjoint, a valid number might contain | |
| 459 /// an invalid punctuation pattern (e.g. .99). | |
| 460 bool valid_punctuation(const WERD_CHOICE &word); | |
| 461 /// Returns true if a good answer is found for the unknown blob rating. | |
| 462 int good_choice(const WERD_CHOICE &choice); | |
| 463 /// Adds a word found on this document to the document specific dictionary. | |
| 464 void add_document_word(const WERD_CHOICE &best_choice); | |
| 465 /// Adjusts the rating of the given word. | |
| 466 void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, | |
| 467 float additional_adjust, bool modify_rating, bool debug); | |
| 468 /// Set wordseg_rating_adjust_factor_ to the given value. | |
| 469 inline void SetWordsegRatingAdjustFactor(float f) { | |
| 470 wordseg_rating_adjust_factor_ = f; | |
| 471 } | |
| 472 /// Returns true if the language is space-delimited (not CJ, or T). | |
| 473 bool IsSpaceDelimitedLang() const; | |
| 474 | |
| 475 private: | |
| 476 /** Private member variables. */ | |
| 477 CCUtil *ccutil_; | |
| 478 /** | |
| 479 * Table that stores ambiguities computed during training | |
| 480 * (loaded when NoDangerousAmbig() is called for the first time). | |
| 481 * Each entry i in the table stores a set of ambiguities whose | |
| 482 * wrong ngram starts with unichar id i. | |
| 483 */ | |
| 484 #ifndef DISABLED_LEGACY_ENGINE | |
| 485 UnicharAmbigs *dang_ambigs_table_ = nullptr; | |
| 486 /** Same as above, but for ambiguities with replace flag set. */ | |
| 487 UnicharAmbigs *replace_ambigs_table_ = nullptr; | |
| 488 #endif | |
| 489 /** Additional certainty padding allowed before a word is rejected. */ | |
| 490 float reject_offset_; | |
| 491 // Cached UNICHAR_IDs: | |
| 492 UNICHAR_ID wildcard_unichar_id_; // kDictWildcard. | |
| 493 UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol. | |
| 494 UNICHAR_ID question_unichar_id_; // kQuestionSymbol. | |
| 495 UNICHAR_ID slash_unichar_id_; // kSlashSymbol. | |
| 496 UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol. | |
| 497 // Hyphen-related variables. | |
| 498 WERD_CHOICE *hyphen_word_; | |
| 499 DawgPositionVector hyphen_active_dawgs_; | |
| 500 bool last_word_on_line_; | |
| 501 // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary | |
| 502 // matching. The first member of each list is taken as canonical. For | |
| 503 // example, the first list contains hyphens and dashes with the first symbol | |
| 504 // being the ASCII hyphen minus. | |
| 505 std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_; | |
| 506 // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs. | |
| 507 DawgCache *dawg_cache_; | |
| 508 bool dawg_cache_is_ours_; // we should delete our own dawg_cache_ | |
| 509 // Dawgs. | |
| 510 DawgVector dawgs_; | |
| 511 SuccessorListsVector successors_; | |
| 512 Trie *pending_words_; | |
| 513 /// The following pointers are only cached for convenience. | |
| 514 /// The dawgs will be deleted when dawgs_ vector is destroyed. | |
| 515 // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if | |
| 516 // any of them are present on the best choices list for a word pair. | |
| 517 // the bigrams are stored as space-separated words where: | |
| 518 // (1) leading and trailing punctuation has been removed from each word and | |
| 519 // (2) any digits have been replaced with '?' marks. | |
| 520 Dawg *bigram_dawg_; | |
| 521 // TODO(daria): need to support multiple languages in the future, | |
| 522 // so maybe will need to maintain a list of dawgs of each kind. | |
| 523 Dawg *freq_dawg_; | |
| 524 Dawg *unambig_dawg_; | |
| 525 Dawg *punc_dawg_; | |
| 526 Trie *document_words_; | |
| 527 /// Current segmentation cost adjust factor for word rating. | |
| 528 /// See comments in incorporate_segcost. | |
| 529 float wordseg_rating_adjust_factor_; | |
| 530 // File for recording ambiguities discovered during dictionary search. | |
| 531 FILE *output_ambig_words_file_; | |
| 532 | |
| 533 public: | |
| 534 /// Variable members. | |
| 535 /// These have to be declared and initialized after ccutil_, which contains | |
| 536 /// the pointer to the params vector - the member of its base CCUtil class. | |
| 537 STRING_VAR_H(user_words_file); | |
| 538 STRING_VAR_H(user_words_suffix); | |
| 539 STRING_VAR_H(user_patterns_file); | |
| 540 STRING_VAR_H(user_patterns_suffix); | |
| 541 BOOL_VAR_H(load_system_dawg); | |
| 542 BOOL_VAR_H(load_freq_dawg); | |
| 543 BOOL_VAR_H(load_unambig_dawg); | |
| 544 BOOL_VAR_H(load_punc_dawg); | |
| 545 BOOL_VAR_H(load_number_dawg); | |
| 546 BOOL_VAR_H(load_bigram_dawg); | |
| 547 double_VAR_H(xheight_penalty_subscripts); | |
| 548 double_VAR_H(xheight_penalty_inconsistent); | |
| 549 double_VAR_H(segment_penalty_dict_frequent_word); | |
| 550 double_VAR_H(segment_penalty_dict_case_ok); | |
| 551 double_VAR_H(segment_penalty_dict_case_bad); | |
| 552 double_VAR_H(segment_penalty_dict_nonword); | |
| 553 double_VAR_H(segment_penalty_garbage); | |
| 554 STRING_VAR_H(output_ambig_words_file); | |
| 555 INT_VAR_H(dawg_debug_level); | |
| 556 INT_VAR_H(hyphen_debug_level); | |
| 557 BOOL_VAR_H(use_only_first_uft8_step); | |
| 558 double_VAR_H(certainty_scale); | |
| 559 double_VAR_H(stopper_nondict_certainty_base); | |
| 560 double_VAR_H(stopper_phase2_certainty_rejection_offset); | |
| 561 INT_VAR_H(stopper_smallword_size); | |
| 562 double_VAR_H(stopper_certainty_per_char); | |
| 563 double_VAR_H(stopper_allowable_character_badness); | |
| 564 INT_VAR_H(stopper_debug_level); | |
| 565 BOOL_VAR_H(stopper_no_acceptable_choices); | |
| 566 INT_VAR_H(tessedit_truncate_wordchoice_log); | |
| 567 STRING_VAR_H(word_to_debug); | |
| 568 BOOL_VAR_H(segment_nonalphabetic_script); | |
| 569 BOOL_VAR_H(save_doc_words); | |
| 570 double_VAR_H(doc_dict_pending_threshold); | |
| 571 double_VAR_H(doc_dict_certainty_threshold); | |
| 572 INT_VAR_H(max_permuter_attempts); | |
| 573 }; | |
| 574 | |
| 575 } // namespace tesseract | |
| 576 | |
| 577 #endif // TESSERACT_DICT_DICT_H_ |
