comparison mupdf-source/thirdparty/tesseract/src/dict/dict.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: dict.h
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h" // DISABLED_LEGACY_ENGINE
24 #endif
25
26 #ifndef DISABLED_LEGACY_ENGINE
27 # include "ambigs.h"
28 #endif
29 #include "dawg.h"
30 #include "dawg_cache.h"
31 #include "ratngs.h"
32 #include "stopper.h"
33 #include "trie.h"
34 #include "unicharset.h"
35 #ifndef DISABLED_LEGACY_ENGINE
36 # include "params_training_featdef.h"
37 #endif // ndef DISABLED_LEGACY_ENGINE
38
39 namespace tesseract {
40
41 class MATRIX;
42 class WERD_RES;
43
// Numeric limits and sentinel values shared by the dictionary code.
#define CHARS_PER_LINE 500
// The replacement list is fully parenthesized so the cast cannot bind to
// neighbouring tokens at the expansion site (e.g. `sizeof MAX_WERD_LENGTH`).
#define MAX_WERD_LENGTH ((int64_t)128)
// Parenthesized so the unary minus always stays attached to its operand.
// NOTE(review): presumably marks "no rating available" - confirm at use sites.
#define NO_RATING (-1)
47
48 /** Struct used to hold temporary information about fragments. */
// Passed by pointer (as prev_char_frag_info / char_frag_info) to the
// permutation helpers declared further down in Dict.
49 struct CHAR_FRAGMENT_INFO {
50 UNICHAR_ID unichar_id; // unichar id this record refers to
51 const CHAR_FRAGMENT *fragment; // fragment descriptor; read-only through this pointer
52 int num_fragments; // NOTE(review): presumably fragments seen so far - confirm
53 float rating; // rating carried with the fragment(s)
54 float certainty; // certainty carried with the fragment(s)
55 };
56
// Convenience alias: a vector of (non-const) Dawg pointers.
57 using DawgVector = std::vector<Dawg *>;
58
59 //
60 // Constants
61 //
// NOTE(review): the purpose of kRatingPad is not documented in this header.
62 static const int kRatingPad = 4;
63 static const int kDictMaxWildcards = 2; // max wildcards for a word
64 // TODO(daria): If hyphens are different in different languages and can be
65 // inferred from training data we should load their values dynamically.
// Single-character symbol strings; Dict caches the matching UNICHAR_IDs in
// its *_unichar_id_ members (see the comments on those members).
66 static const char kHyphenSymbol[] = "-";
67 static const char kSlashSymbol[] = "/";
68 static const char kQuestionSymbol[] = "?";
69 static const char kApostropheSymbol[] = "'";
70 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
71 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
72 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
// NOTE(review): presumably the longest run of repeated characters accepted
// for document-dictionary words - confirm in dict.cpp.
73 static const int kDocDictMaxRepChars = 4;
74
75 // Enum for describing whether the x-height for the word is consistent:
76 // 0 - everything is good. (XH_GOOD)
77 // 1 - there are one or two secondary (but consistent) baselines
78 // [think subscript and superscript], or there is an oversized
79 // first character. (XH_SUBNORMAL)
80 // 2 - the word is inconsistent. (XH_INCONSISTENT)
// The enumerators take the implicit values 0, 1, 2 in declaration order.
81 enum XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT };
82
83 struct DawgArgs {
84 DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
85 : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
86
87 DawgPositionVector *active_dawgs;
88 DawgPositionVector *updated_dawgs;
89 PermuterType permuter;
90 // True if the current position is a valid word end.
91 bool valid_end;
92 };
93
94 class TESS_API Dict {
95 public:
// Constructor and destructor; both are defined out of line.
// NOTE(review): image_ptr is presumably what getCCUtil() later returns
// (member ccutil_) - confirm in dict.cpp.
96 Dict(CCUtil *image_ptr);
97 ~Dict();
98 const CCUtil *getCCUtil() const {
99 return ccutil_;
100 }
101 CCUtil *getCCUtil() {
102 return ccutil_;
103 }
104 const UNICHARSET &getUnicharset() const {
105 return getCCUtil()->unicharset;
106 }
107 UNICHARSET &getUnicharset() {
108 return getCCUtil()->unicharset;
109 }
110 #ifndef DISABLED_LEGACY_ENGINE
111 const UnicharAmbigs &getUnicharAmbigs() const {
112 return getCCUtil()->unichar_ambigs;
113 }
114 #endif
115 // Returns true if unichar_id is a word compounding character like - or /.
116 inline bool compound_marker(UNICHAR_ID unichar_id) {
117 const UNICHARSET &unicharset = getUnicharset();
118 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
119 const auto &normed_ids = unicharset.normed_ids(unichar_id);
120 return normed_ids.size() == 1 &&
121 (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_);
122 }
123 // Returns true if unichar_id is an apostrophe-like character that may
124 // separate prefix/suffix words from a main body word.
125 inline bool is_apostrophe(UNICHAR_ID unichar_id) {
126 const UNICHARSET &unicharset = getUnicharset();
127 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
128 const auto &normed_ids = unicharset.normed_ids(unichar_id);
129 return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
130 }
131
132 /* hyphen.cpp ************************************************************/
133
134 /// Returns true if we've recorded the beginning of a hyphenated word.
135 inline bool hyphenated() const {
136 return !last_word_on_line_ && hyphen_word_;
137 }
138 /// Size of the base word (the part on the line before) of a hyphenated word.
139 inline int hyphen_base_size() const {
140 return this->hyphenated() ? hyphen_word_->length() : 0;
141 }
142 /// If this word is hyphenated copy the base word (the part on
143 /// the line before) of a hyphenated word into the given word.
144 /// This function assumes that word is not nullptr.
145 inline void copy_hyphen_info(WERD_CHOICE *word) const {
146 if (this->hyphenated()) {
147 *word = *hyphen_word_;
148 if (hyphen_debug_level) {
149 word->print("copy_hyphen_info: ");
150 }
151 }
152 }
153 /// Check whether the word has a hyphen at the end.
154 inline bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id,
155 bool first_pos) const {
156 if (!last_word_on_line_ || first_pos) {
157 return false;
158 }
159 ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
160 const auto &normed_ids = unicharset->normed_ids(unichar_id);
161 return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
162 }
163 /// Same as above, but check the unichar at the end of the word.
164 inline bool has_hyphen_end(const WERD_CHOICE &word) const {
165 int word_index = word.length() - 1;
166 return has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0);
167 }
168 /// Unless the previous word was the last one on the line, and the current
169 /// one is not (thus it is the first one on the line), erase hyphen_word_,
170 /// clear hyphen_active_dawgs_, update last_word_on_line_.
/// @param last_word_on_line new value recorded in last_word_on_line_.
171 void reset_hyphen_vars(bool last_word_on_line);
172 /// Update hyphen_word_, and copy the given DawgPositionVectors into
173 /// hyphen_active_dawgs_ .
/// @param word          word to record as the hyphen base word.
/// @param active_dawgs  dawg positions to save alongside it.
174 void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs);
175
176 /* permdawg.cpp ************************************************************/
177 // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
178 // When this function is refactored, permdawg.cpp can be removed.
179
180 /// Copies word into best_choice if its rating is smaller
181 /// than that of best_choice.
182 inline void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
183 if (word.rating() < best_choice->rating()) {
184 *best_choice = word;
185 }
186 }
187 /// Fill the given active_dawgs vector with dawgs that could contain the
188 /// beginning of the word. If hyphenated() returns true, copy the entries
189 /// from hyphen_active_dawgs_ instead.
// NOTE(review): the effect of ambigs_mode is not described in this header.
190 void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const;
191 // Fill the given vector with the default collection of any-length dawgs
// NOTE(review): suppress_patterns presumably leaves out pattern dawgs - confirm.
192 void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const;
193
194 /// Recursively explore all the possible character combinations in
195 /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
196 /// dawgs in the dawgs_ vector in parallel and discard invalid words.
197 ///
198 /// Allocate and return a WERD_CHOICE with the best valid word found.
/// NOTE(review): since the result is allocated here, the caller presumably
/// owns and must delete it - confirm at the call sites.
199 WERD_CHOICE *dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
200 float rating_limit);
201 /// If the choice being composed so far could be a dictionary word
202 /// and we have not reached the end of the word keep exploring the
203 /// char_choices further.
204 void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
205 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
206 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
207 WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args);
208
209 /// Pointer to go_deeper function.
/// Its parameter list is identical to that of go_deeper_dawg_fxn().
210 void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
211 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
212 bool word_ending, WERD_CHOICE *word, float certainties[],
213 float *limit, WERD_CHOICE *best_choice, int *attempts_left,
214 void *void_more_args);
215 //
216 // Helper functions for dawg_permute_and_select().
217 //
// NOTE(review): the three helpers below carry no description of their own;
// their contracts have to be read from the implementation.
218 void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
219 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
220 WERD_CHOICE *word, float certainties[], float *limit,
221 WERD_CHOICE *best_choice, int *attempts_left, void *more_args);
222
223 void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
224 const BLOB_CHOICE &blob_choice, int char_choice_index,
225 const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
226 float certainties[], float *limit, WERD_CHOICE *best_choice,
227 int *attempts_left, void *more_args);
228
// Note that word_ending is declared as int here but as bool in
// go_deeper_dawg_fxn().
229 bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
230 const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
231 int word_ending, CHAR_FRAGMENT_INFO *char_frag_info);
232
233 /* stopper.cpp *************************************************************/
// Only declared when the legacy engine is compiled in.
// NOTE(review): presumably returns true when BestChoice contains no
// dangerous ambiguity - confirm in stopper.cpp.
234 #if !defined(DISABLED_LEGACY_ENGINE)
235 bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable,
236 MATRIX *ratings);
237 #endif // !defined(DISABLED_LEGACY_ENGINE)
238 // Replaces the corresponding wrong ngram in werd_choice with the correct
239 // one. The whole correct n-gram is inserted into the ratings matrix and
240 // the werd_choice: no more fragments!. Rating and certainty of new entries
241 // in matrix and werd_choice are the sum and mean of the wrong ngram
242 // respectively.
243 // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
244 // mystring", with a new entry in the ratings matrix for ".
245 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id,
246 WERD_CHOICE *werd_choice, MATRIX *ratings);
247
248 /// Returns the length of the shortest alpha run in WordChoice.
249 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
250 /// Returns true if the certainty of the BestChoice word is within a
251 /// reasonable range of the average certainties for the best choices for
252 /// each character in the segmentation. This test is used to catch words
253 /// in which one character is much worse than the other characters in the
254 /// word (i.e. false will be returned in that case). The algorithm computes
255 /// the mean and std deviation of the certainties in the word with the worst
256 /// certainty thrown out.
/// NOTE(review): the declared return type is int although the text above
/// speaks of true/false.
257 int UniformCertainties(const WERD_CHOICE &word);
258 /// Returns true if the given best_choice is good enough to stop.
259 bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency);
260 /// Returns false if the best choice for the current word is questionable
261 /// and should be tried again on the second pass or should be flagged to
262 /// the user.
263 bool AcceptableResult(WERD_RES *word) const;
// Only declared when the legacy engine is compiled in.
264 #if !defined(DISABLED_LEGACY_ENGINE)
265 void EndDangerousAmbigs();
266 #endif // !defined(DISABLED_LEGACY_ENGINE)
267 /// Prints the current choices for this word to stdout.
268 void DebugWordChoices();
269 /// Sets up stopper variables in preparation for the first pass.
270 void SetupStopperPass1();
271 /// Sets up stopper variables in preparation for the second pass.
272 void SetupStopperPass2();
273 /* context.cpp *************************************************************/
274 /// Check a string to see if it matches a set of lexical rules.
/// NOTE(review): the result is an int; the meaning of its values is not
/// stated in this header.
275 int case_ok(const WERD_CHOICE &word) const;
276 /// Returns true if the word looks like an absolute garbage
277 /// (e.g. image mistakenly recognized as text).
278 bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
279
280 /* dict.cpp ****************************************************************/
281
282 /// Initialize Dict class - load dawgs from [lang].traineddata and
283 /// user-specified wordlist and pattern list.
/// NOTE(review): the sentence above describes the loading sequence as a
/// whole; the declaration directly below only returns a DawgCache pointer
/// and, being static, needs no Dict instance.
284 static DawgCache *GlobalDawgCache();
285 // Sets up ready for a Load or LoadLSTM.
286 void SetupForLoad(DawgCache *dawg_cache);
287 // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
288 void Load(const std::string &lang, TessdataManager *data_file);
289 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
290 void LoadLSTM(const std::string &lang, TessdataManager *data_file);
291 // Completes the loading process after Load() and/or LoadLSTM().
292 // Returns false if no dictionaries were loaded.
293 bool FinishLoad();
// NOTE(review): End() is undocumented; presumably it releases what the
// Load functions set up - confirm in dict.cpp.
294 void End();
295
296 // Resets the document dictionary analogous to ResetAdaptiveClassifier.
297 void ResetDocumentDictionary() {
298 if (pending_words_ != nullptr) {
299 pending_words_->clear();
300 }
301 if (document_words_ != nullptr) {
302 document_words_->clear();
303 }
304 }
305
306 /**
307 * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
308 * of the current state the letter at word_index in the given word
309 * is allowed according to at least one of the dawgs in dawgs_,
310 * otherwise returns NO_PERM.
311 *
312 * The state is described by void_dawg_args, which are interpreted as
313 * DawgArgs and contain relevant active dawg positions.
314 * Each entry in the active_dawgs vector contains an index
315 * into the dawgs_ vector and an EDGE_REF that indicates the last edge
316 * followed in the dawg. It also may contain a position in the punctuation
317 * dawg which describes surrounding punctuation (see struct DawgPosition).
318 *
319 * Input:
320 * At word_index 0 dawg_args->active_dawgs should contain an entry for each
321 * dawg that may start at the beginning of a word, with punc_ref and edge_ref
322 * initialized to NO_EDGE. Since the punctuation dawg includes the empty
323 * pattern " " (meaning anything without surrounding punctuation), having a
324 * single entry for the punctuation dawg will cover all dawgs reachable
325 * there from -- that includes all number and word dawgs. The only dawg
326 * non-reachable from the punctuation_dawg is the pattern dawg.
327 * If hyphen state needs to be applied, initial dawg_args->active_dawgs can
328 * be copied from the saved hyphen state (maintained by Dict).
329 * For word_index > 0 the corresponding state (active_dawgs and punc position)
330 * can be obtained from dawg_args->updated_dawgs passed to
331 * def_letter_is_okay for word_index-1.
332 * Note: the function assumes that active_dawgs, and updated_dawgs
333 * member variables of dawg_args are not nullptr.
334 *
335 * Output:
336 * The function fills in dawg_args->updated_dawgs vector with the
337 * entries for dawgs that contain the word up to the letter at word_index.
338 *
339 */
340
341 //
// Default letter check; its contract is given in the block comment directly
// above (returns the maximal permuter code, or NO_PERM when no dawg allows
// the letter). void_dawg_args is interpreted as a DawgArgs.
342 int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
343 bool word_end) const;
344
// Member-function pointer through which LetterIsOkay() dispatches.
// NOTE(review): presumably set to &Dict::def_letter_is_okay by default -
// confirm in dict.cpp.
345 int (Dict::*letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset,
346 UNICHAR_ID unichar_id, bool word_end) const;
347 /// Calls letter_is_okay_ member function.
348 int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
349 bool word_end) const {
350 return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end);
351 }
352
353 /// Probability in context function used by the ngram permuter.
/// Member-function pointer through which ProbabilityInContext() dispatches;
/// def_probability_in_context() has a matching signature.
354 double (Dict::*probability_in_context_)(const char *lang, const char *context, int context_bytes,
355 const char *character, int character_bytes);
356 /// Calls probability_in_context_ member function.
357 double ProbabilityInContext(const char *context, int context_bytes, const char *character,
358 int character_bytes) {
359 return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes,
360 character, character_bytes);
361 }
362
/// Default (no-op) probability-in-context function: ignores every argument
/// and reports a probability of 0.0.
double def_probability_in_context(const char *lang, const char *context, int context_bytes,
                                  const char *character, int character_bytes) {
  // The casts only silence unused-parameter warnings.
  static_cast<void>(lang);
  static_cast<void>(context);
  static_cast<void>(context_bytes);
  static_cast<void>(character);
  static_cast<void>(character_bytes);
  return 0.0;
}
373
374 inline void SetWildcardID(UNICHAR_ID id) {
375 wildcard_unichar_id_ = id;
376 }
377 inline UNICHAR_ID WildcardID() const {
378 return wildcard_unichar_id_;
379 }
380 /// Return the number of dawgs in the dawgs_ vector.
381 inline int NumDawgs() const {
382 return dawgs_.size();
383 }
384 /// Return i-th dawg pointer recorded in the dawgs_ vector.
385 inline const Dawg *GetDawg(int index) const {
386 return dawgs_[index];
387 }
388 /// Return the points to the punctuation dawg.
389 inline const Dawg *GetPuncDawg() const {
390 return punc_dawg_;
391 }
392 /// Return the points to the unambiguous words dawg.
393 inline const Dawg *GetUnambigDawg() const {
394 return unambig_dawg_;
395 }
396 /// Returns the appropriate next node given the EDGE_REF.
397 static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
398 if (edge_ref == NO_EDGE) {
399 return 0; // beginning to explore the dawg
400 }
401 NODE_REF node = dawg->next_node(edge_ref);
402 if (node == 0) {
403 node = NO_EDGE; // end of word
404 }
405 return node;
406 }
407
408 // Given a unichar from a string and a given dawg, return the unichar
409 // we should use to match in that dawg type. (for example, in the number
410 // dawg, all numbers are transformed to kPatternUnicharId).
411 UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const {
412 if (!dawg) {
413 return ch;
414 }
415 switch (dawg->type()) {
416 case DAWG_TYPE_NUMBER:
417 return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
418 default:
419 return ch;
420 }
421 }
422
423 /// For each of the character classes of the given unichar_id (and the
424 /// unichar_id itself) finds the corresponding outgoing node or self-loop
425 /// in the given dawg and (after checking that it is valid) records it in
426 /// dawg_args->updated_dawgs. Updates current_permuter if any valid
427 /// edges were found.
428 void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id,
429 bool word_end, DawgArgs *dawg_args,
430 PermuterType *current_permuter) const;
431
432 /// Read/Write/Access special purpose dawgs which contain words
433 /// only of a certain length (used for phrase search for
434 /// non-space-delimited languages).
435
436 /// Check all the DAWGs to see if this word is in any of them.
437 inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
438 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM ||
439 perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
440 (numbers_ok && perm == NUMBER_PERM));
441 }
442 int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
443 int valid_word(const WERD_CHOICE &word) const {
444 return valid_word(word, false); // return NO_PERM for words with digits
445 }
446 int valid_word_or_number(const WERD_CHOICE &word) const {
447 return valid_word(word, true); // return NUMBER_PERM for valid numbers
448 }
449 /// This function is used by api/tesseract_cube_combiner.cpp
450 int valid_word(const char *string) const {
451 WERD_CHOICE word(string, getUnicharset());
452 return valid_word(word);
453 }
454 // Do the two WERD_CHOICEs form a meaningful bigram?
455 bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
456 /// Returns true if the word contains a valid punctuation pattern.
457 /// Note: Since the domains of punctuation symbols and symbols
458 /// used in numbers are not disjoint, a valid number might contain
459 /// an invalid punctuation pattern (e.g. .99).
460 bool valid_punctuation(const WERD_CHOICE &word);
461 /// Returns true if a good answer is found for the unknown blob rating.
/// NOTE(review): the declared return type is int although the text above
/// speaks of true.
462 int good_choice(const WERD_CHOICE &choice);
463 /// Adds a word found on this document to the document specific dictionary.
464 void add_document_word(const WERD_CHOICE &best_choice);
465 /// Adjusts the rating of the given word.
/// NOTE(review): the roles of nonword, additional_adjust, modify_rating and
/// debug are not described here; see the implementation.
466 void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,
467 float additional_adjust, bool modify_rating, bool debug);
468 /// Set wordseg_rating_adjust_factor_ to the given value.
469 inline void SetWordsegRatingAdjustFactor(float f) {
470 wordseg_rating_adjust_factor_ = f;
471 }
472 /// Returns true if the language is space-delimited (not CJ, or T).
/// NOTE(review): "CJ" and "T" are not expanded anywhere in this header;
/// presumably Chinese/Japanese and Thai - confirm in dict.cpp.
473 bool IsSpaceDelimitedLang() const;
474
475 private:
476 /** Private member variables. */
// Returned by getCCUtil(); also the source of the unicharset.
477 CCUtil *ccutil_;
478 /**
479 * Table that stores ambiguities computed during training
480 * (loaded when NoDangerousAmbig() is called for the first time).
481 * Each entry i in the table stores a set of ambiguities whose
482 * wrong ngram starts with unichar id i.
483 */
484 #ifndef DISABLED_LEGACY_ENGINE
485 UnicharAmbigs *dang_ambigs_table_ = nullptr;
486 /** Same as above, but for ambiguities with replace flag set. */
487 UnicharAmbigs *replace_ambigs_table_ = nullptr;
488 #endif
489 /** Additional certainty padding allowed before a word is rejected. */
490 float reject_offset_;
491 // Cached UNICHAR_IDs:
492 UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
493 UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
494 UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
495 UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
496 UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
497 // Hyphen-related variables.
// hyphen_word_: base word of a pending hyphenated word (null when none).
// hyphen_active_dawgs_: dawg positions saved by set_hyphen_word().
// last_word_on_line_: flag consulted by hyphenated() and has_hyphen_end().
498 WERD_CHOICE *hyphen_word_;
499 DawgPositionVector hyphen_active_dawgs_;
500 bool last_word_on_line_;
501 // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
502 // matching. The first member of each list is taken as canonical. For
503 // example, the first list contains hyphens and dashes with the first symbol
504 // being the ASCII hyphen minus.
505 std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
506 // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
507 DawgCache *dawg_cache_;
508 bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
509 // Dawgs.
// dawgs_ is the vector exposed through NumDawgs() / GetDawg().
// NOTE(review): successors_ is undocumented here; presumably it holds, per
// dawg, the dawgs that may follow it - confirm in dict.cpp.
510 DawgVector dawgs_;
511 SuccessorListsVector successors_;
// Cleared (when non-null) by ResetDocumentDictionary().
512 Trie *pending_words_;
513 /// The following pointers are only cached for convenience.
514 /// The dawgs will be deleted when dawgs_ vector is destroyed.
515 // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
516 // any of them are present on the best choices list for a word pair.
517 // the bigrams are stored as space-separated words where:
518 // (1) leading and trailing punctuation has been removed from each word and
519 // (2) any digits have been replaced with '?' marks.
520 Dawg *bigram_dawg_;
521 // TODO(daria): need to support multiple languages in the future,
522 // so maybe will need to maintain a list of dawgs of each kind.
523 Dawg *freq_dawg_;
524 Dawg *unambig_dawg_;
525 Dawg *punc_dawg_;
// Cleared (when non-null) by ResetDocumentDictionary().
526 Trie *document_words_;
527 /// Current segmentation cost adjust factor for word rating.
528 /// See comments in incorporate_segcost.
529 float wordseg_rating_adjust_factor_;
530 // File for recording ambiguities discovered during dictionary search.
531 FILE *output_ambig_words_file_;
532
533 public:
534 /// Variable members.
535 /// These have to be declared and initialized after image_ptr_, which contains
536 /// the pointer to the params vector - the member of its base CCUtil class.
/// NOTE(review): no member named image_ptr_ is visible in this class; the
/// CCUtil pointer is stored as ccutil_ (the constructor argument is named
/// image_ptr), so the sentence above presumably refers to ccutil_.
/// Each *_VAR_H macro declares one parameter member of the given kind; the
/// macros themselves are defined outside this header.
537 STRING_VAR_H(user_words_file);
538 STRING_VAR_H(user_words_suffix);
539 STRING_VAR_H(user_patterns_file);
540 STRING_VAR_H(user_patterns_suffix);
541 BOOL_VAR_H(load_system_dawg);
542 BOOL_VAR_H(load_freq_dawg);
543 BOOL_VAR_H(load_unambig_dawg);
544 BOOL_VAR_H(load_punc_dawg);
545 BOOL_VAR_H(load_number_dawg);
546 BOOL_VAR_H(load_bigram_dawg);
547 double_VAR_H(xheight_penalty_subscripts);
548 double_VAR_H(xheight_penalty_inconsistent);
549 double_VAR_H(segment_penalty_dict_frequent_word);
550 double_VAR_H(segment_penalty_dict_case_ok);
551 double_VAR_H(segment_penalty_dict_case_bad);
552 double_VAR_H(segment_penalty_dict_nonword);
553 double_VAR_H(segment_penalty_garbage);
554 STRING_VAR_H(output_ambig_words_file);
555 INT_VAR_H(dawg_debug_level);
// Read by copy_hyphen_info() to decide whether to print the copied word.
556 INT_VAR_H(hyphen_debug_level);
// The spelling "uft8" (sic) is part of the parameter's name; do not "fix" it.
557 BOOL_VAR_H(use_only_first_uft8_step);
558 double_VAR_H(certainty_scale);
559 double_VAR_H(stopper_nondict_certainty_base);
560 double_VAR_H(stopper_phase2_certainty_rejection_offset);
561 INT_VAR_H(stopper_smallword_size);
562 double_VAR_H(stopper_certainty_per_char);
563 double_VAR_H(stopper_allowable_character_badness);
564 INT_VAR_H(stopper_debug_level);
565 BOOL_VAR_H(stopper_no_acceptable_choices);
566 INT_VAR_H(tessedit_truncate_wordchoice_log);
567 STRING_VAR_H(word_to_debug);
568 BOOL_VAR_H(segment_nonalphabetic_script);
569 BOOL_VAR_H(save_doc_words);
570 double_VAR_H(doc_dict_pending_threshold);
571 double_VAR_H(doc_dict_certainty_threshold);
572 INT_VAR_H(max_permuter_attempts);
573 };
574
575 } // namespace tesseract
576
577 #endif // TESSERACT_DICT_DICT_H_