comparison mupdf-source/thirdparty/tesseract/src/dict/dict.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: the expanded directory no longer carries a version number.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
comparison
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: dict.cpp
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #include "dict.h"
20
21 #include "tesserrstream.h" // for tesserr
22 #include "tprintf.h"
23
24 #include <cstdio>
25
26 namespace tesseract {
27
28 class Image;
29
30 Dict::Dict(CCUtil *ccutil)
31 : letter_is_okay_(&tesseract::Dict::def_letter_is_okay)
32 , probability_in_context_(&tesseract::Dict::def_probability_in_context)
33 , ccutil_(ccutil)
34 , wildcard_unichar_id_(INVALID_UNICHAR_ID)
35 , apostrophe_unichar_id_(INVALID_UNICHAR_ID)
36 , question_unichar_id_(INVALID_UNICHAR_ID)
37 , slash_unichar_id_(INVALID_UNICHAR_ID)
38 , hyphen_unichar_id_(INVALID_UNICHAR_ID)
39 , STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
40 getCCUtil()->params())
41 , STRING_INIT_MEMBER(user_words_suffix, "",
42 "A suffix of user-provided words located in tessdata.",
43 getCCUtil()->params())
44 , STRING_MEMBER(user_patterns_file, "", "A filename of user-provided patterns.",
45 getCCUtil()->params())
46 , STRING_INIT_MEMBER(user_patterns_suffix, "",
47 "A suffix of user-provided patterns located in "
48 "tessdata.",
49 getCCUtil()->params())
50 , BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", getCCUtil()->params())
51 , BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getCCUtil()->params())
52 , BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
53 getCCUtil()->params())
54 , BOOL_INIT_MEMBER(load_punc_dawg, true,
55 "Load dawg with punctuation"
56 " patterns.",
57 getCCUtil()->params())
58 , BOOL_INIT_MEMBER(load_number_dawg, true,
59 "Load dawg with number"
60 " patterns.",
61 getCCUtil()->params())
62 , BOOL_INIT_MEMBER(load_bigram_dawg, true,
63 "Load dawg with special word "
64 "bigrams.",
65 getCCUtil()->params())
66 , double_MEMBER(xheight_penalty_subscripts, 0.125,
67 "Score penalty (0.1 = 10%) added if there are subscripts "
68 "or superscripts in a word, but it is otherwise OK.",
69 getCCUtil()->params())
70 , double_MEMBER(xheight_penalty_inconsistent, 0.25,
71 "Score penalty (0.1 = 10%) added if an xheight is "
72 "inconsistent.",
73 getCCUtil()->params())
74 , double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
75 "Score multiplier for word matches which have good case and"
76 " are frequent in the given language (lower is better).",
77 getCCUtil()->params())
78 , double_MEMBER(segment_penalty_dict_case_ok, 1.1,
79 "Score multiplier for word matches that have good case "
80 "(lower is better).",
81 getCCUtil()->params())
82 , double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
83 "Default score multiplier for word matches, which may have "
84 "case issues (lower is better).",
85 getCCUtil()->params())
86 , double_MEMBER(segment_penalty_dict_nonword, 1.25,
87 "Score multiplier for glyph fragment segmentations which "
88 "do not match a dictionary word (lower is better).",
89 getCCUtil()->params())
90 , double_MEMBER(segment_penalty_garbage, 1.50,
91 "Score multiplier for poorly cased strings that are not in"
92 " the dictionary and generally look like garbage (lower is"
93 " better).",
94 getCCUtil()->params())
95 , STRING_MEMBER(output_ambig_words_file, "",
96 "Output file for ambiguities found in the dictionary", getCCUtil()->params())
97 , INT_MEMBER(dawg_debug_level, 0,
98 "Set to 1 for general debug info"
99 ", to 2 for more details, to 3 to see all the debug messages",
100 getCCUtil()->params())
101 , INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", getCCUtil()->params())
102 , BOOL_MEMBER(use_only_first_uft8_step, false,
103 "Use only the first UTF8 step of the given string"
104 " when computing log probabilities.",
105 getCCUtil()->params())
106 , double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", getCCUtil()->params())
107 , double_MEMBER(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words",
108 getCCUtil()->params())
109 , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset",
110 getCCUtil()->params())
111 , INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word",
112 getCCUtil()->params())
113 , double_MEMBER(stopper_certainty_per_char, -0.50,
114 "Certainty to add"
115 " for each dict char above small word size.",
116 getCCUtil()->params())
117 , double_MEMBER(stopper_allowable_character_badness, 3.0,
118 "Max certainty variation allowed in a word (in sigma)", getCCUtil()->params())
119 , INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", getCCUtil()->params())
120 , BOOL_MEMBER(stopper_no_acceptable_choices, false,
121 "Make AcceptableChoice() always return false. Useful"
122 " when there is a need to explore all segmentations",
123 getCCUtil()->params())
124 , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list",
125 getCCUtil()->params())
126 , STRING_MEMBER(word_to_debug, "",
127 "Word for which stopper debug"
128 " information should be printed to stdout",
129 getCCUtil()->params())
130 , BOOL_MEMBER(segment_nonalphabetic_script, false,
131 "Don't use any alphabetic-specific tricks."
132 " Set to true in the traineddata config file for"
133 " scripts that are cursive or inherently fixed-pitch",
134 getCCUtil()->params())
135 , BOOL_MEMBER(save_doc_words, false, "Save Document Words", getCCUtil()->params())
136 , double_MEMBER(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary",
137 getCCUtil()->params())
138 , double_MEMBER(doc_dict_certainty_threshold, -2.25,
139 "Worst certainty for words that can be inserted into the"
140 " document dictionary",
141 getCCUtil()->params())
142 , INT_MEMBER(max_permuter_attempts, 10000,
143 "Maximum number of different"
144 " character choices to consider during permutation."
145 " This limit is especially useful when user patterns"
146 " are specified, since overly generic patterns can result in"
147 " dawg search exploring an overly large number of options.",
148 getCCUtil()->params()) {
149 reject_offset_ = 0.0;
150 go_deeper_fxn_ = nullptr;
151 hyphen_word_ = nullptr;
152 last_word_on_line_ = false;
153 document_words_ = nullptr;
154 dawg_cache_ = nullptr;
155 dawg_cache_is_ours_ = false;
156 pending_words_ = nullptr;
157 bigram_dawg_ = nullptr;
158 freq_dawg_ = nullptr;
159 punc_dawg_ = nullptr;
160 unambig_dawg_ = nullptr;
161 wordseg_rating_adjust_factor_ = -1.0f;
162 output_ambig_words_file_ = nullptr;
163 }
164
165 Dict::~Dict() {
166 End();
167 delete hyphen_word_;
168 if (output_ambig_words_file_ != nullptr) {
169 fclose(output_ambig_words_file_);
170 }
171 }
172
173 DawgCache *Dict::GlobalDawgCache() {
174 // This global cache (a singleton) will outlive every Tesseract instance
175 // (even those that someone else might declare as global static variables).
176 static DawgCache cache;
177 return &cache;
178 }
179
180 // Sets up the Dict, ready for a Load or LoadLSTM.
181 void Dict::SetupForLoad(DawgCache *dawg_cache) {
182 if (dawgs_.size() != 0) {
183 this->End();
184 }
185
186 apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
187 question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
188 slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
189 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
190
191 if (dawg_cache != nullptr) {
192 dawg_cache_ = dawg_cache;
193 dawg_cache_is_ours_ = false;
194 } else {
195 dawg_cache_ = new DawgCache();
196 dawg_cache_is_ours_ = true;
197 }
198 }
199
200 // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
201 void Dict::Load(const std::string &lang, TessdataManager *data_file) {
202 // Load dawgs_.
203 if (load_punc_dawg) {
204 punc_dawg_ =
205 dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
206 if (punc_dawg_) {
207 dawgs_.push_back(punc_dawg_);
208 }
209 }
210 if (load_system_dawg) {
211 Dawg *system_dawg =
212 dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
213 if (system_dawg) {
214 dawgs_.push_back(system_dawg);
215 }
216 }
217 if (load_number_dawg) {
218 Dawg *number_dawg =
219 dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
220 if (number_dawg) {
221 dawgs_.push_back(number_dawg);
222 }
223 }
224 if (load_bigram_dawg) {
225 bigram_dawg_ =
226 dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file);
227 // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
228 // dawgs_!!
229 }
230 if (load_freq_dawg) {
231 freq_dawg_ =
232 dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
233 if (freq_dawg_) {
234 dawgs_.push_back(freq_dawg_);
235 }
236 }
237 if (load_unambig_dawg) {
238 unambig_dawg_ =
239 dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
240 if (unambig_dawg_) {
241 dawgs_.push_back(unambig_dawg_);
242 }
243 }
244
245 std::string name;
246 if (!user_words_suffix.empty() || !user_words_file.empty()) {
247 Trie *trie_ptr =
248 new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
249 if (!user_words_file.empty()) {
250 name = user_words_file;
251 } else {
252 name = getCCUtil()->language_data_path_prefix;
253 name += user_words_suffix;
254 }
255 if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
256 Trie::RRP_REVERSE_IF_HAS_RTL)) {
257 tprintf("Error: failed to load %s\n", name.c_str());
258 delete trie_ptr;
259 } else {
260 dawgs_.push_back(trie_ptr);
261 }
262 }
263
264 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
265 Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
266 dawg_debug_level);
267 trie_ptr->initialize_patterns(&(getUnicharset()));
268 if (!user_patterns_file.empty()) {
269 name = user_patterns_file;
270 } else {
271 name = getCCUtil()->language_data_path_prefix;
272 name += user_patterns_suffix;
273 }
274 if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
275 tprintf("Error: failed to load %s\n", name.c_str());
276 delete trie_ptr;
277 } else {
278 dawgs_.push_back(trie_ptr);
279 }
280 }
281
282 document_words_ =
283 new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
284 dawgs_.push_back(document_words_);
285
286 // This dawg is temporary and should not be searched by letter_is_okay.
287 pending_words_ =
288 new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);
289 }
290
291 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
292 void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
293 // Load dawgs_.
294 if (load_punc_dawg) {
295 punc_dawg_ =
296 dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
297 if (punc_dawg_) {
298 dawgs_.push_back(punc_dawg_);
299 }
300 }
301 if (load_system_dawg) {
302 Dawg *system_dawg =
303 dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
304 if (system_dawg) {
305 dawgs_.push_back(system_dawg);
306 }
307 }
308 if (load_number_dawg) {
309 Dawg *number_dawg =
310 dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
311 if (number_dawg) {
312 dawgs_.push_back(number_dawg);
313 }
314 }
315
316 // stolen from Dict::Load (but needs params_ from Tesseract
317 // langdata/config/api):
318 std::string name;
319 if (!user_words_suffix.empty() || !user_words_file.empty()) {
320 Trie *trie_ptr =
321 new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
322 if (!user_words_file.empty()) {
323 name = user_words_file;
324 } else {
325 name = getCCUtil()->language_data_path_prefix;
326 name += user_words_suffix;
327 }
328 if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
329 Trie::RRP_REVERSE_IF_HAS_RTL)) {
330 tprintf("Error: failed to load %s\n", name.c_str());
331 delete trie_ptr;
332 } else {
333 dawgs_.push_back(trie_ptr);
334 }
335 }
336
337 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
338 Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
339 dawg_debug_level);
340 trie_ptr->initialize_patterns(&(getUnicharset()));
341 if (!user_patterns_file.empty()) {
342 name = user_patterns_file;
343 } else {
344 name = getCCUtil()->language_data_path_prefix;
345 name += user_patterns_suffix;
346 }
347 if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
348 tprintf("Error: failed to load %s\n", name.c_str());
349 delete trie_ptr;
350 } else {
351 dawgs_.push_back(trie_ptr);
352 }
353 }
354 }
355
356 // Completes the loading process after Load() and/or LoadLSTM().
357 // Returns false if no dictionaries were loaded.
358 bool Dict::FinishLoad() {
359 if (dawgs_.empty()) {
360 return false;
361 }
362 // Construct a list of corresponding successors for each dawg. Each entry, i,
363 // in the successors_ vector is a vector of integers that represent the
364 // indices into the dawgs_ vector of the successors for dawg i.
365 successors_.reserve(dawgs_.size());
366 for (auto dawg : dawgs_) {
367 auto *lst = new SuccessorList();
368 for (unsigned j = 0; j < dawgs_.size(); ++j) {
369 const Dawg *other = dawgs_[j];
370 if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
371 kDawgSuccessors[dawg->type()][other->type()]) {
372 lst->push_back(j);
373 }
374 }
375 successors_.push_back(lst);
376 }
377 return true;
378 }
379
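// Releases every dawg loaded by Load()/LoadLSTM(), returning cached dawgs to the
// dawg cache and deleting the rest, then frees the successor lists and resets the
// bookkeeping pointers.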
380 void Dict::End() {
381 if (dawgs_.empty()) {
382 return; // Not safe to call twice.
383 }
384 for (auto &dawg : dawgs_) {
385 if (!dawg_cache_->FreeDawg(dawg)) {
386 delete dawg;
387 }
388 }
389 dawg_cache_->FreeDawg(bigram_dawg_);
390 if (dawg_cache_is_ours_) {
391 delete dawg_cache_;
392 dawg_cache_ = nullptr;
393 }
394 for (auto successor : successors_) {
395 delete successor;
396 }
397 dawgs_.clear();
398 successors_.clear();
399 document_words_ = nullptr;
400 delete pending_words_;
401 pending_words_ = nullptr;
402 }
403
404 // Returns the updated permuter code if, in light of the current state, unichar_id
405 // is allowed according to at least one of the dawgs in the dawgs_ vector, and
406 // NO_PERM otherwise. See more extensive comments in dict.h where this function is declared.
407 int Dict::def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset,
408 UNICHAR_ID unichar_id, bool word_end) const {
409 auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
410
411 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
412
413 if (dawg_debug_level >= 3) {
414 tesserr << "def_letter_is_okay: current unichar="
415 << getUnicharset().debug_str(unichar_id)
416 << " word_end=" << word_end
417 << " num active dawgs=" << dawg_args->active_dawgs->size() << '\n';
418 }
419
420 // Do not accept words that contain kPatternUnicharID.
421 // (otherwise pattern dawgs would not function correctly).
422 // Do not accept words containing INVALID_UNICHAR_IDs.
423 if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {
424 dawg_args->permuter = NO_PERM;
425 return NO_PERM;
426 }
427
428 // Initialization.
429 PermuterType curr_perm = NO_PERM;
430 dawg_args->updated_dawgs->clear();
431 dawg_args->valid_end = false;
432
433 // Go over the active_dawgs vector and insert DawgPosition records
434 // with the updated ref (an edge with the corresponding unichar id) into
435 // dawg_args->updated_pos.
436 for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
437 const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
438 const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
439 const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
440
441 if (!dawg && !punc_dawg) {
442 // shouldn't happen.
443 tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
444 continue;
445 }
446 if (!dawg) {
447 // We're in the punctuation dawg. A core dawg has not been chosen.
448 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
449 EDGE_REF punc_transition_edge =
450 punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
451 if (punc_transition_edge != NO_EDGE) {
452 // Find all successors, and see which can transition.
453 const SuccessorList &slist = *(successors_[pos.punc_index]);
454 for (int sdawg_index : slist) {
455 const Dawg *sdawg = dawgs_[sdawg_index];
456 UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
457 EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
458 if (dawg_edge != NO_EDGE) {
459 if (dawg_debug_level >= 3) {
460 tprintf("Letter found in dawg %d\n", sdawg_index);
461 }
462 dawg_args->updated_dawgs->add_unique(
463 DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),
464 dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: ");
465 if (sdawg->permuter() > curr_perm) {
466 curr_perm = sdawg->permuter();
467 }
468 if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {
469 dawg_args->valid_end = true;
470 }
471 }
472 }
473 }
474 EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
475 if (punc_edge != NO_EDGE) {
476 if (dawg_debug_level >= 3) {
477 tprintf("Letter found in punctuation dawg\n");
478 }
479 dawg_args->updated_dawgs->add_unique(
480 DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,
481 "Extend punctuation dawg: ");
482 if (PUNC_PERM > curr_perm) {
483 curr_perm = PUNC_PERM;
484 }
485 if (punc_dawg->end_of_word(punc_edge)) {
486 dawg_args->valid_end = true;
487 }
488 }
489 continue;
490 }
491
492 if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
493 // We can end the main word here.
494 // If we can continue on the punc ref, add that possibility.
495 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
496 EDGE_REF punc_edge =
497 punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
498 if (punc_edge != NO_EDGE) {
499 dawg_args->updated_dawgs->add_unique(
500 DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),
501 dawg_debug_level > 0, "Return to punctuation dawg: ");
502 if (dawg->permuter() > curr_perm) {
503 curr_perm = dawg->permuter();
504 }
505 if (punc_dawg->end_of_word(punc_edge)) {
506 dawg_args->valid_end = true;
507 }
508 }
509 }
510
511 if (pos.back_to_punc) {
512 continue;
513 }
514
515 // If we are dealing with the pattern dawg, look up all the
516 // possible edges, not only for the exact unichar_id, but also
517 // for all its character classes (alpha, digit, etc).
518 if (dawg->type() == DAWG_TYPE_PATTERN) {
519 ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);
520 // There can't be any successors to dawg that is of type
521 // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
522 continue;
523 }
524
525 // Find the edge out of the node for the unichar_id.
526 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
527 EDGE_REF edge =
528 (node == NO_EDGE)
529 ? NO_EDGE
530 : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
531
532 if (dawg_debug_level >= 3) {
533 tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node, edge);
534 }
535
536 if (edge != NO_EDGE) { // the unichar was found in the current dawg
537 if (dawg_debug_level >= 3) {
538 tprintf("Letter found in dawg %d\n", pos.dawg_index);
539 }
540 if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
541 if (dawg_debug_level >= 3) {
542 tprintf("Punctuation constraint not satisfied at end of word.\n");
543 }
544 continue;
545 }
546 if (dawg->permuter() > curr_perm) {
547 curr_perm = dawg->permuter();
548 }
549 if (dawg->end_of_word(edge) &&
550 (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {
551 dawg_args->valid_end = true;
552 }
553 dawg_args->updated_dawgs->add_unique(
554 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),
555 dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
556 }
557 } // end for
558 // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
559 // or if we found the current letter in a non-punctuation dawg. This
560 // allows preserving information on which dawg the "core" word came from.
561 // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
562 if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
563 (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
564 dawg_args->permuter = curr_perm;
565 }
566 if (dawg_debug_level >= 2) {
567 tprintf("Returning %d for permuter code for this character.\n", dawg_args->permuter);
568 }
569 return dawg_args->permuter;
570 }
571
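// Looks up edges in the given pattern dawg for unichar_id and for each of its
// character-class patterns, checking both outgoing edges and self-loop edges, and
// records every match in dawg_args (updating curr_perm and valid_end as needed).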
572 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHAR_ID unichar_id,
573 bool word_end, DawgArgs *dawg_args, PermuterType *curr_perm) const {
574 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
575 // Try to find the edge corresponding to the exact unichar_id and to all the
576 // edges corresponding to the character class of unichar_id.
577 std::vector<UNICHAR_ID> unichar_id_patterns;
578 unichar_id_patterns.push_back(unichar_id);
579 dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
580 for (int unichar_id_pattern : unichar_id_patterns) {
581 // On the first iteration check all the outgoing edges.
582 // On the second iteration check all self-loops.
583 for (int k = 0; k < 2; ++k) {
584 EDGE_REF edge = (k == 0)
585 ? dawg->edge_char_of(node, unichar_id_pattern, word_end)
586 : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_pattern, word_end);
587 if (edge == NO_EDGE) {
588 continue;
589 }
590 if (dawg_debug_level >= 3) {
591 tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node,
592 edge);
593 tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
594 }
595 if (dawg->permuter() > *curr_perm) {
596 *curr_perm = dawg->permuter();
597 }
598 if (dawg->end_of_word(edge)) {
599 dawg_args->valid_end = true;
600 }
601 dawg_args->updated_dawgs->add_unique(
602 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),
603 dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
604 }
605 }
606 }
607
608 // Fill the given active_dawgs vector with dawgs that could contain the
609 // beginning of the word. If hyphenated() returns true, copy the entries
610 // from hyphen_active_dawgs_ instead.
611 void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const {
612 if (hyphenated()) {
613 *active_dawgs = hyphen_active_dawgs_;
614 if (dawg_debug_level >= 3) {
615 for (unsigned i = 0; i < hyphen_active_dawgs_.size(); ++i) {
616 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
617 hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref);
618 }
619 }
620 } else {
621 default_dawgs(active_dawgs, ambigs_mode);
622 }
623 }
624
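// Fills dawg_pos_vec with initial positions: one for each punctuation dawg, plus
// one for every other dawg that is not reachable via an available punctuation dawg
// (pattern dawgs are skipped when suppress_patterns is true).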
625 void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_patterns) const {
626 bool punc_dawg_available = (punc_dawg_ != nullptr) &&
627 punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
628
629 for (unsigned i = 0; i < dawgs_.size(); i++) {
630 if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
631 int dawg_ty = dawgs_[i]->type();
632 bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
633 if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
634 dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
635 if (dawg_debug_level >= 3) {
636 tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
637 }
638 } else if (!punc_dawg_available || !subsumed_by_punc) {
639 dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
640 if (dawg_debug_level >= 3) {
641 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
642 }
643 }
644 }
645 }
646 }
647
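// Adds best_choice to the document dawg if it is not already a valid word, is long
// enough, is not overly repetitive, and either clears the certainty thresholds or
// has already been seen once in pending_words_.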
648 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
649 // Do not add hyphenated word parts to the document dawg.
650 // hyphen_word_ will be non-nullptr after set_hyphen_word() is
651 // called when the first part of the hyphenated word is
652 // discovered and while the second part of the word is recognized.
653 // hyphen_word_ is cleared in cc_recg() before the next word on
654 // the line is recognized.
655 if (hyphen_word_) {
656 return;
657 }
658
659 int stringlen = best_choice.length();
660
661 if (valid_word(best_choice) || stringlen < 2) {
662 return;
663 }
664
665 // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
666 if (best_choice.length() >= kDocDictMaxRepChars) {
667 int num_rep_chars = 1;
668 UNICHAR_ID uch_id = best_choice.unichar_id(0);
669 for (unsigned i = 1; i < best_choice.length(); ++i) {
670 if (best_choice.unichar_id(i) != uch_id) {
671 num_rep_chars = 1;
672 uch_id = best_choice.unichar_id(i);
673 } else {
674 ++num_rep_chars;
675 if (num_rep_chars == kDocDictMaxRepChars) {
676 return;
677 }
678 }
679 }
680 }
681
682 if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) {
683 if (best_choice.certainty() < doc_dict_pending_threshold) {
684 return;
685 }
686
687 if (!pending_words_->word_in_dawg(best_choice)) {
688 if (stringlen > 2 ||
689 (stringlen == 2 && getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
690 getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
691 pending_words_->add_word_to_dawg(best_choice);
692 }
693 return;
694 }
695 }
696
697 if (save_doc_words) {
698 std::string filename(getCCUtil()->imagefile);
699 filename += ".doc";
700 FILE *doc_word_file = fopen(filename.c_str(), "a");
701 if (doc_word_file == nullptr) {
702 tprintf("Error: Could not open file %s\n", filename.c_str());
703 ASSERT_HOST(doc_word_file);
704 }
705 fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
706 fclose(doc_word_file);
707 }
708 document_words_->add_word_to_dawg(best_choice);
709 }
710
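// Computes a rating adjustment factor from x-height consistency, case, punctuation
// and dictionary/frequent-word status, applies it to the word's rating when
// modify_rating is true, and stores the factor with the word.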
711 void Dict::adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,
712 float additional_adjust, bool modify_rating, bool debug) {
713 bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
714 word->GetTopScriptID() == getUnicharset().han_sid());
715 bool case_is_ok = (is_han || case_ok(*word));
716 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
717
718 float adjust_factor = additional_adjust;
719 float new_rating = word->rating();
720 new_rating += kRatingPad;
721 const char *xheight_triggered = "";
722 if (word->length() > 1) {
723 // Calculate x-height and y-offset consistency penalties.
724 switch (xheight_consistency) {
725 case XH_INCONSISTENT:
726 adjust_factor += xheight_penalty_inconsistent;
727 xheight_triggered = ", xhtBAD";
728 break;
729 case XH_SUBNORMAL:
730 adjust_factor += xheight_penalty_subscripts;
731 xheight_triggered = ", xhtSUB";
732 break;
733 case XH_GOOD:
734 // leave the factor alone - all good!
735 break;
736 }
737 // TODO(eger): if nonword is true, but there is a "core" that is a dict
738 // word, negate nonword status.
739 } else {
740 if (debug) {
741 tprintf("Consistency could not be calculated.\n");
742 }
743 }
744 if (debug) {
745 tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", word->unichar_string().c_str(),
746 word->rating(), xheight_triggered);
747 }
748
749 if (nonword) { // non-dictionary word
750 if (case_is_ok && punc_is_ok) {
751 adjust_factor += segment_penalty_dict_nonword;
752 new_rating *= adjust_factor;
753 if (debug) {
754 tprintf(", W");
755 }
756 } else {
757 adjust_factor += segment_penalty_garbage;
758 new_rating *= adjust_factor;
759 if (debug) {
760 if (!case_is_ok) {
761 tprintf(", C");
762 }
763 if (!punc_is_ok) {
764 tprintf(", P");
765 }
766 }
767 }
768 } else { // dictionary word
769 if (case_is_ok) {
770 if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
771 word->set_permuter(FREQ_DAWG_PERM);
772 adjust_factor += segment_penalty_dict_frequent_word;
773 new_rating *= adjust_factor;
774 if (debug) {
775 tprintf(", F");
776 }
777 } else {
778 adjust_factor += segment_penalty_dict_case_ok;
779 new_rating *= adjust_factor;
780 if (debug) {
781 tprintf(", ");
782 }
783 }
784 } else {
785 adjust_factor += segment_penalty_dict_case_bad;
786 new_rating *= adjust_factor;
787 if (debug) {
788 tprintf(", C");
789 }
790 }
791 }
792 new_rating -= kRatingPad;
793 if (modify_rating) {
794 word->set_rating(new_rating);
795 }
796 if (debug) {
797 tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
798 }
799 word->set_adjust_factor(adjust_factor);
800 }
801
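// Runs letter_is_okay_ over the word (prefixed with any pending hyphenated part)
// and returns the resulting permuter code, or NO_PERM if the word is not accepted
// by any of the loaded dawgs (numbers_ok additionally accepts number-permuter matches).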
802 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
803 const WERD_CHOICE *word_ptr = &word;
804 WERD_CHOICE temp_word(word.unicharset());
805 if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
806 copy_hyphen_info(&temp_word);
807 temp_word += word;
808 word_ptr = &temp_word;
809 }
810 if (word_ptr->empty()) {
811 return NO_PERM;
812 }
813 // Allocate vectors for holding current and updated
814 // active_dawgs and initialize them.
815 DawgPositionVector active_dawgs[2];
816 init_active_dawgs(&(active_dawgs[0]), false);
817 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
818 int last_index = word_ptr->length() - 1;
819 // Call letter_is_okay for each letter in the word.
820 for (int i = hyphen_base_size(); i <= last_index; ++i) {
821 if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i),
822 i == last_index))) {
823 break;
824 }
825 // Swap active_dawgs, constraints with the corresponding updated vector.
826 if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
827 dawg_args.updated_dawgs = &(active_dawgs[0]);
828 ++(dawg_args.active_dawgs);
829 } else {
830 ++(dawg_args.updated_dawgs);
831 dawg_args.active_dawgs = &(active_dawgs[0]);
832 }
833 }
834 return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM;
835 }
836
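// Checks the bigram dawg for the pair "word1 word2", with surrounding punctuation
// stripped from each word and single-character digits replaced by the question-mark
// unichar (question_unichar_id_).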
837 bool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const {
838 if (bigram_dawg_ == nullptr) {
839 return false;
840 }
841
842 // Extract the core word from the middle of each word with any digits
843 // replaced with question marks.
844 unsigned w1start, w1end, w2start, w2end;
845 word1.punct_stripped(&w1start, &w1end);
846 word2.punct_stripped(&w2start, &w2end);
847
848 // We don't want to penalize a single guillemet, hyphen, etc.
849 // But our bigram list doesn't have any information about punctuation.
850 if (w1start >= w1end) {
851 return word1.length() < 3;
852 }
853 if (w2start >= w2end) {
854 return word2.length() < 3;
855 }
856
857 const UNICHARSET &uchset = getUnicharset();
858 std::vector<UNICHAR_ID> bigram_string;
859 bigram_string.reserve(w1end + w2end + 1);
860 for (auto i = w1start; i < w1end; i++) {
861 const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i));
862 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
863 bigram_string.push_back(question_unichar_id_);
864 } else {
865 bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
866 }
867 }
868 bigram_string.push_back(UNICHAR_SPACE);
869 for (auto i = w2start; i < w2end; i++) {
870 const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i));
871 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
872 bigram_string.push_back(question_unichar_id_);
873 } else {
874 bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
875 }
876 }
877 WERD_CHOICE normalized_word(&uchset, bigram_string.size());
878 for (int i : bigram_string) {
879 normalized_word.append_unichar_id_space_allocated(i, 1, 0.0f, 0.0f);
880 }
881 return bigram_dawg_->word_in_dawg(normalized_word);
882 }
883
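// Collapses every run of alphabetic/digit characters in the word to a single
// kPatternUnicharID and accepts the word if the resulting pattern appears in one of
// the punctuation dawgs; rejects words containing anything else.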
884 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
885 if (word.empty()) {
886 return false;
887 }
888 WERD_CHOICE new_word(word.unicharset());
889 auto last_index = word.length() - 1;
890 int new_len;
891 for (unsigned i = 0; i <= last_index; ++i) {
892 UNICHAR_ID unichar_id = (word.unichar_id(i));
893 if (getUnicharset().get_ispunctuation(unichar_id)) {
894 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
895 } else if (!getUnicharset().get_isalpha(unichar_id) &&
896 !getUnicharset().get_isdigit(unichar_id)) {
897 return false; // neither punc, nor alpha, nor digit
898 } else if ((new_len = new_word.length()) == 0 ||
899 new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
900 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
901 }
902 }
903 for (unsigned i = 0; i < dawgs_.size(); ++i) {
904 if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
905 dawgs_[i]->word_in_dawg(new_word)) {
906 return true;
907 }
908 }
909 return false;
910 }
911
912 /// Returns true if the language is space-delimited (not Han, Katakana, or Thai script).
913 bool Dict::IsSpaceDelimitedLang() const {
914 const UNICHARSET &u_set = getUnicharset();
915 if (u_set.han_sid() > 0) {
916 return false;
917 }
918 if (u_set.katakana_sid() > 0) {
919 return false;
920 }
921 if (u_set.thai_sid() > 0) {
922 return false;
923 }
924 return true;
925 }
926
927 } // namespace tesseract