comparison mupdf-source/thirdparty/tesseract/src/ccmain/control.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************
2 * File: control.cpp (Formerly control.c)
3 * Description: Module-independent matcher controller.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include <cctype>
25 #include <cmath>
26 #include <cstdint> // for int16_t, int32_t
27 #include <cstdio> // for fclose, fopen, FILE
28 #include <ctime> // for clock
29 #include "control.h"
30 #ifndef DISABLED_LEGACY_ENGINE
31 # include "docqual.h"
32 # include "drawfx.h"
33 # include "fixspace.h"
34 #endif
35 #include <tesseract/ocrclass.h>
36 #include "lstmrecognizer.h"
37 #include "output.h"
38 #include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
39 #ifndef DISABLED_LEGACY_ENGINE
40 # include "reject.h"
41 #endif
42 #include "sorthelper.h"
43 #include "tesseractclass.h"
44 #include "tesserrstream.h" // for tesserr
45 #include "tessvars.h"
46 #include "werdit.h"
47
// Fixed name of the temporary file that Tesseract::ProcessTargetWord uses to
// back up the current params before a word config file is applied.
constexpr const char *kBackUpConfigFile = "tempconfigdata.config";
#ifndef DISABLED_LEGACY_ENGINE
// Smallest believable x-height for any text when refitting, expressed as a
// fraction of the original x-height.
constexpr double kMinRefitXHeightFraction = 0.5;
#endif // ! DISABLED_LEGACY_ENGINE
54
55 namespace tesseract {
56
57 /**
58 * Make a word from the selected blobs and run Tess on them.
59 *
60 * @param page_res recognise blobs
61 * @param selection_box within this box
62 */
63
64 void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) {
65 PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);
66 if (it != nullptr) {
67 recog_interactive(it);
68 it->DeleteCurrentWord();
69 delete it;
70 }
71 }
72
73 /**
74 * Recognize a single word in interactive mode.
75 *
76 * @param pr_it the page results iterator
77 */
78 bool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) {
79 WordData word_data(*pr_it);
80 SetupWordPassN(2, &word_data);
81 // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
82 if (lstm_recognizer_ == nullptr) {
83 #ifndef DISABLED_LEGACY_ENGINE
84 classify_word_and_language(2, pr_it, &word_data);
85 #endif // ndef DISABLED_LEGACY_ENGINE
86 } else {
87 classify_word_and_language(1, pr_it, &word_data);
88 }
89 #ifndef DISABLED_LEGACY_ENGINE
90 if (tessedit_debug_quality_metrics) {
91 int16_t char_qual;
92 int16_t good_char_qual;
93 WERD_RES *word_res = pr_it->word();
94 word_char_quality(word_res, &char_qual, &good_char_qual);
95 tprintf(
96 "\n%d chars; word_blob_quality: %d; outline_errs: %d; "
97 "char_quality: %d; good_char_quality: %d\n",
98 word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res),
99 char_qual, good_char_qual);
100 }
101 #endif // ndef DISABLED_LEGACY_ENGINE
102 return true;
103 }
104
105 // Helper function to check for a target word and handle it appropriately.
106 // Inspired by Jetsoft's requirement to process only single words on pass2
107 // and beyond.
108 // If word_config is not null:
109 // If the word_box and target_word_box overlap, read the word_config file
110 // else reset to previous config data.
111 // return true.
112 // else
113 // If the word_box and target_word_box overlap or pass <= 1, return true.
114 // Note that this function uses a fixed temporary file for storing the previous
115 // configs, so it is neither thread-safe, nor process-safe, but the assumption
116 // is that it will only be used for one debug window at a time.
117 //
118 // Since this function is used for debugging (and not to change OCR results)
119 // set only debug params from the word config file.
120 bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box,
121 const char *word_config, int pass) {
122 if (word_config != nullptr) {
123 if (word_box.major_overlap(target_word_box)) {
124 if (backup_config_file_ == nullptr) {
125 backup_config_file_ = kBackUpConfigFile;
126 FILE *config_fp = fopen(backup_config_file_, "wb");
127 if (config_fp == nullptr) {
128 tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
129 } else {
130 ParamUtils::PrintParams(config_fp, params());
131 fclose(config_fp);
132 }
133 ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
134 }
135 } else {
136 if (backup_config_file_ != nullptr) {
137 ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
138 backup_config_file_ = nullptr;
139 }
140 }
141 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
142 return false;
143 }
144 return true;
145 }
146
147 /** If tesseract is to be run, sets the words up ready for it. */
148 void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,
149 PAGE_RES *page_res, std::vector<WordData> *words) {
150 // Prepare all the words.
151 PAGE_RES_IT page_res_it(page_res);
152 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
153 if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),
154 *target_word_box, word_config, 1)) {
155 words->push_back(WordData(page_res_it));
156 }
157 }
158 // Setup all the words for recognition with polygonal approximation.
159 for (unsigned w = 0; w < words->size(); ++w) {
160 SetupWordPassN(pass_n, &(*words)[w]);
161 if (w > 0) {
162 (*words)[w].prev_word = &(*words)[w - 1];
163 }
164 }
165 }
166
167 // Sets up the single word ready for whichever engine is to be run.
168 void Tesseract::SetupWordPassN(int pass_n, WordData *word) {
169 if (pass_n == 1 || !word->word->done) {
170 if (pass_n == 1) {
171 word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,
172 nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,
173 poly_allow_detailed_fx, word->row, word->block);
174 } else if (pass_n == 2) {
175 // TODO(rays) Should we do this on pass1 too?
176 word->word->caps_height = 0.0;
177 if (word->word->x_height == 0.0f) {
178 word->word->x_height = word->row->x_height();
179 }
180 }
181 word->lang_words.truncate(0);
182 for (unsigned s = 0; s <= sub_langs_.size(); ++s) {
183 // The sub_langs_.size() entry is for the master language.
184 Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
185 auto *word_res = new WERD_RES;
186 word_res->InitForRetryRecognition(*word->word);
187 word->lang_words.push_back(word_res);
188 // LSTM doesn't get setup for pass2.
189 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
190 word_res->SetupForRecognition(
191 lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,
192 lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,
193 lang_t->poly_allow_detailed_fx, word->row, word->block);
194 }
195 }
196 }
197 }
198
199 // Runs word recognition on all the words.
200 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,
201 std::vector<WordData> *words) {
202 // TODO(rays) Before this loop can be parallelized (it would yield a massive
203 // speed-up) all remaining member globals need to be converted to local/heap
204 // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
205 // added. The results will be significantly different with adaption on, and
206 // deterioration will need investigation.
207 pr_it->restart_page();
208 for (unsigned w = 0; w < words->size(); ++w) {
209 WordData *word = &(*words)[w];
210 if (w > 0) {
211 word->prev_word = &(*words)[w - 1];
212 }
213 if (monitor != nullptr) {
214 monitor->ocr_alive = true;
215 if (pass_n == 1) {
216 monitor->progress = 70 * w / words->size();
217 } else {
218 monitor->progress = 70 + 30 * w / words->size();
219 }
220 if (monitor->progress_callback2 != nullptr) {
221 TBOX box = pr_it->word()->word->bounding_box();
222 (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
223 }
224 if (monitor->deadline_exceeded() ||
225 (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
226 // Timeout. Fake out the rest of the words.
227 for (; w < words->size(); ++w) {
228 (*words)[w].word->SetupFake(unicharset);
229 }
230 return false;
231 }
232 }
233 if (word->word->tess_failed) {
234 unsigned s;
235 for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
236 }
237 // If all are failed, skip it. Image words are skipped by this test.
238 if (s > word->lang_words.size()) {
239 continue;
240 }
241 }
242 // Sync pr_it with the WordData.
243 while (pr_it->word() != nullptr && pr_it->word() != word->word) {
244 pr_it->forward();
245 }
246 ASSERT_HOST(pr_it->word() != nullptr);
247 bool make_next_word_fuzzy = false;
248 #ifndef DISABLED_LEGACY_ENGINE
249 if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
250 // Needs to be setup again to see the new outlines in the chopped_word.
251 SetupWordPassN(pass_n, word);
252 }
253 #endif // ndef DISABLED_LEGACY_ENGINE
254
255 classify_word_and_language(pass_n, pr_it, word);
256 if (tessedit_dump_choices || debug_noise_removal) {
257 tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
258 word->word->best_choice->debug_string().c_str());
259 }
260 pr_it->forward();
261 if (make_next_word_fuzzy && pr_it->word() != nullptr) {
262 pr_it->MakeCurrentWordFuzzy();
263 }
264 }
265 return true;
266 }
267
268 /**
269 * recog_all_words()
270 *
271 * Walk the page_res, recognizing all the words.
272 * If monitor is not null, it is used as a progress monitor/timeout/cancel.
273 * If dopasses is 0, all recognition passes are run,
274 * 1 just pass 1, 2 passes2 and higher.
275 * If target_word_box is not null, special things are done to words that
276 * overlap the target_word_box:
277 * if word_config is not null, the word config file is read for just the
278 * target word(s), otherwise, on pass 2 and beyond ONLY the target words
279 * are processed (Jetsoft modification.)
280 * Returns false if we cancelled prematurely.
281 *
282 * @param page_res page structure
283 * @param monitor progress monitor
284 * @param word_config word_config file
285 * @param target_word_box specifies just to extract a rectangle
286 * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher
287 */
288
289 bool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor,
290 const TBOX *target_word_box, const char *word_config,
291 int dopasses) {
292 PAGE_RES_IT page_res_it(page_res);
293
294 if (tessedit_minimal_rej_pass1) {
295 tessedit_test_adaption.set_value(true);
296 tessedit_minimal_rejection.set_value(true);
297 }
298
299 if (dopasses == 0 || dopasses == 1) {
300 page_res_it.restart_page();
301 // ****************** Pass 1 *******************
302
303 #ifndef DISABLED_LEGACY_ENGINE
304 // If the adaptive classifier is full switch to one we prepared earlier,
305 // ie on the previous page. If the current adaptive classifier is non-empty,
306 // prepare a backup starting at this page, in case it fills up. Do all this
307 // independently for each language.
308 if (AdaptiveClassifierIsFull()) {
309 SwitchAdaptiveClassifier();
310 } else if (!AdaptiveClassifierIsEmpty()) {
311 StartBackupAdaptiveClassifier();
312 }
313 // Now check the sub-langs as well.
314 for (auto &lang : sub_langs_) {
315 if (lang->AdaptiveClassifierIsFull()) {
316 lang->SwitchAdaptiveClassifier();
317 } else if (!lang->AdaptiveClassifierIsEmpty()) {
318 lang->StartBackupAdaptiveClassifier();
319 }
320 }
321
322 #endif // ndef DISABLED_LEGACY_ENGINE
323
324 // Set up all words ready for recognition, so that if parallelism is on
325 // all the input and output classes are ready to run the classifier.
326 std::vector<WordData> words;
327 SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
328 #ifndef DISABLED_LEGACY_ENGINE
329 if (tessedit_parallelize) {
330 PrerecAllWordsPar(words);
331 }
332 #endif // ndef DISABLED_LEGACY_ENGINE
333
334 stats_.word_count = words.size();
335
336 stats_.dict_words = 0;
337 stats_.doc_blob_quality = 0;
338 stats_.doc_outline_errs = 0;
339 stats_.doc_char_quality = 0;
340 stats_.good_char_count = 0;
341 stats_.doc_good_char_quality = 0;
342
343 most_recently_used_ = this;
344 // Run pass 1 word recognition.
345 if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
346 return false;
347 }
348 // Pass 1 post-processing.
349 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
350 if (page_res_it.word()->word->flag(W_REP_CHAR)) {
351 fix_rep_char(&page_res_it);
352 continue;
353 }
354
355 // Count dict words.
356 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
357 ++(stats_.dict_words);
358 }
359
360 // Update misadaption log (we only need to do it on pass 1, since
361 // adaption only happens on this pass).
362 if (page_res_it.word()->blamer_bundle != nullptr &&
363 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
364 page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
365 }
366 }
367 }
368
369 if (dopasses == 1) {
370 return true;
371 }
372
373 #ifndef DISABLED_LEGACY_ENGINE
374
375 // ****************** Pass 2 *******************
376 if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
377 page_res_it.restart_page();
378 std::vector<WordData> words;
379 SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
380 if (tessedit_parallelize) {
381 PrerecAllWordsPar(words);
382 }
383 most_recently_used_ = this;
384 // Run pass 2 word recognition.
385 if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
386 return false;
387 }
388 }
389
390 // The next passes are only required for Tess-only.
391 if (AnyTessLang() && !AnyLSTMLang()) {
392 // ****************** Pass 3 *******************
393 // Fix fuzzy spaces.
394
395 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
396 !right_to_left()) {
397 fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
398 }
399
400 // ****************** Pass 4 *******************
401 if (tessedit_enable_dict_correction) {
402 dictionary_correction_pass(page_res);
403 }
404 if (tessedit_enable_bigram_correction) {
405 bigram_correction_pass(page_res);
406 }
407
408 // ****************** Pass 5,6 *******************
409 rejection_passes(page_res, monitor, target_word_box, word_config);
410
411 // ****************** Pass 8 *******************
412 font_recognition_pass(page_res);
413
414 // ****************** Pass 9 *******************
415 // Check the correctness of the final results.
416 blamer_pass(page_res);
417 script_pos_pass(page_res);
418 }
419
420 #endif // ndef DISABLED_LEGACY_ENGINE
421
422 // Write results pass.
423 // This is now redundant, but retained commented so show how to obtain
424 // bounding boxes and style information.
425
426 #ifndef DISABLED_LEGACY_ENGINE
427 // changed by jetsoft
428 // needed for dll to output memory structure
429 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
430 output_pass(page_res_it, target_word_box);
431 }
432 // end jetsoft
433 #endif // ndef DISABLED_LEGACY_ENGINE
434
435 const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
436 textord_.CleanupSingleRowResult(pageseg_mode, page_res);
437
438 // Remove empty words, as these mess up the result iterators.
439 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
440 const WERD_RES *word = page_res_it.word();
441 const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
442 ? page_res_it.block()->block->pdblk.poly_block()
443 : nullptr;
444 if (word->best_choice == nullptr || word->best_choice->empty() ||
445 (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
446 page_res_it.DeleteCurrentWord();
447 }
448 }
449
450 if (monitor != nullptr) {
451 monitor->progress = 100;
452 }
453 return true;
454 }
455
456 #ifndef DISABLED_LEGACY_ENGINE
457
458 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
459 PAGE_RES_IT word_it(page_res);
460
461 WERD_RES *w_prev = nullptr;
462 WERD_RES *w = word_it.word();
463 while (true) {
464 w_prev = w;
465 while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {
466 // advance word_it, skipping over parts of combos
467 }
468 if (!word_it.word()) {
469 break;
470 }
471 w = word_it.word();
472 if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
473 continue;
474 }
475 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
476 if (tessedit_bigram_debug) {
477 tprintf("Skipping because one of the words is W_REP_CHAR\n");
478 }
479 continue;
480 }
481 // Two words sharing the same language model, excellent!
482 std::vector<WERD_CHOICE *> overrides_word1;
483 std::vector<WERD_CHOICE *> overrides_word2;
484
485 const auto &orig_w1_str = w_prev->best_choice->unichar_string();
486 const auto &orig_w2_str = w->best_choice->unichar_string();
487 WERD_CHOICE prev_best(w->uch_set);
488 {
489 int w1start, w1end;
490 w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
491 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
492 }
493 WERD_CHOICE this_best(w->uch_set);
494 {
495 int w2start, w2end;
496 w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
497 this_best = w->best_choice->shallow_copy(w2start, w2end);
498 }
499
500 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
501 if (tessedit_bigram_debug) {
502 tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
503 orig_w2_str.c_str());
504 }
505 continue;
506 }
507 if (tessedit_bigram_debug > 2) {
508 tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
509 }
510 if (tessedit_bigram_debug > 1) {
511 if (!w_prev->best_choices.singleton()) {
512 w_prev->PrintBestChoices();
513 }
514 if (!w->best_choices.singleton()) {
515 w->PrintBestChoices();
516 }
517 }
518 float best_rating = 0.0;
519 int best_idx = 0;
520 WERD_CHOICE_IT prev_it(&w_prev->best_choices);
521 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
522 WERD_CHOICE *p1 = prev_it.data();
523 WERD_CHOICE strip1(w->uch_set);
524 {
525 int p1start, p1end;
526 p1->GetNonSuperscriptSpan(&p1start, &p1end);
527 strip1 = p1->shallow_copy(p1start, p1end);
528 }
529 WERD_CHOICE_IT w_it(&w->best_choices);
530 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
531 WERD_CHOICE *p2 = w_it.data();
532 WERD_CHOICE strip2(w->uch_set);
533 {
534 int p2start, p2end;
535 p2->GetNonSuperscriptSpan(&p2start, &p2end);
536 strip2 = p2->shallow_copy(p2start, p2end);
537 }
538 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
539 overrides_word1.push_back(p1);
540 overrides_word2.push_back(p2);
541 if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {
542 best_rating = p1->rating() + p2->rating();
543 best_idx = overrides_word1.size() - 1;
544 }
545 }
546 }
547 }
548 if (!overrides_word1.empty()) {
549 // Excellent, we have some bigram matches.
550 if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&
551 EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {
552 if (tessedit_bigram_debug > 1) {
553 tprintf(
554 "Top choice \"%s %s\" verified (sans case) by bigram "
555 "model.\n",
556 orig_w1_str.c_str(), orig_w2_str.c_str());
557 }
558 continue;
559 }
560 const auto &new_w1_str = overrides_word1[best_idx]->unichar_string();
561 const auto &new_w2_str = overrides_word2[best_idx]->unichar_string();
562 if (new_w1_str != orig_w1_str) {
563 w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
564 }
565 if (new_w2_str != orig_w2_str) {
566 w->ReplaceBestChoice(overrides_word2[best_idx]);
567 }
568 if (tessedit_bigram_debug > 0) {
569 std::string choices_description;
570 int num_bigram_choices = overrides_word1.size() * overrides_word2.size();
571 if (num_bigram_choices == 1) {
572 choices_description = "This was the unique bigram choice.";
573 } else {
574 if (tessedit_bigram_debug > 1) {
575 std::string bigrams_list;
576 const int kMaxChoicesToPrint = 20;
577 for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {
578 if (i > 0) {
579 bigrams_list += ", ";
580 }
581 WERD_CHOICE *p1 = overrides_word1[i];
582 WERD_CHOICE *p2 = overrides_word2[i];
583 bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
584 }
585 choices_description = "There were many choices: {";
586 choices_description += bigrams_list;
587 choices_description += "}";
588 } else {
589 choices_description += "There were " + std::to_string(num_bigram_choices);
590 choices_description += " compatible bigrams.";
591 }
592 }
593 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
594 orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
595 choices_description.c_str());
596 }
597 }
598 }
599 }
600
601 void Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor,
602 const TBOX *target_word_box, const char *word_config) {
603 PAGE_RES_IT page_res_it(page_res);
604 // ****************** Pass 5 *******************
605 // Gather statistics on rejects.
606 int word_index = 0;
607 while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
608 WERD_RES *word = page_res_it.word();
609 word_index++;
610 if (monitor != nullptr) {
611 monitor->ocr_alive = true;
612 monitor->progress = 95 + 5 * word_index / stats_.word_count;
613 }
614 if (word->rebuild_word == nullptr) {
615 // Word was not processed by tesseract.
616 page_res_it.forward();
617 continue;
618 }
619 check_debug_pt(word, 70);
620
621 // changed by jetsoft
622 // specific to its needs to extract one word when need
623 if (target_word_box &&
624 !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
625 page_res_it.forward();
626 continue;
627 }
628 // end jetsoft
629
630 page_res_it.rej_stat_word();
631 const int chars_in_word = word->reject_map.length();
632 const int rejects_in_word = word->reject_map.reject_count();
633
634 const int blob_quality = word_blob_quality(word);
635 stats_.doc_blob_quality += blob_quality;
636 const int outline_errs = word_outline_errs(word);
637 stats_.doc_outline_errs += outline_errs;
638 int16_t all_char_quality;
639 int16_t accepted_all_char_quality;
640 word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
641 stats_.doc_char_quality += all_char_quality;
642 const uint8_t permuter_type = word->best_choice->permuter();
643 if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
644 (permuter_type == USER_DAWG_PERM)) {
645 stats_.good_char_count += chars_in_word - rejects_in_word;
646 stats_.doc_good_char_quality += accepted_all_char_quality;
647 }
648 check_debug_pt(word, 80);
649 if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
650 word->reject_map.rej_word_bad_quality();
651 }
652 check_debug_pt(word, 90);
653 page_res_it.forward();
654 }
655
656 if (tessedit_debug_quality_metrics) {
657 tprintf(
658 "QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
659 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
660 page_res->char_count, page_res->rej_count,
661 page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
662 stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
663 stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
664 stats_.doc_char_quality / static_cast<float>(page_res->char_count),
665 stats_.doc_good_char_quality,
666 (stats_.good_char_count > 0)
667 ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
668 : 0.0);
669 }
670 bool good_quality_doc =
671 ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
672 (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
673 (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
674 (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);
675
676 // ****************** Pass 6 *******************
677 // Do whole document or whole block rejection pass
678 if (!tessedit_test_adaption) {
679 quality_based_rejection(page_res_it, good_quality_doc);
680 }
681 }
682
683 #endif // ndef DISABLED_LEGACY_ENGINE
684
685 void Tesseract::blamer_pass(PAGE_RES *page_res) {
686 if (!wordrec_run_blamer) {
687 return;
688 }
689 PAGE_RES_IT page_res_it(page_res);
690 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
691 WERD_RES *word = page_res_it.word();
692 BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
693 page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
694 }
695 tprintf("Blame reasons:\n");
696 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
697 tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(static_cast<IncorrectResultReason>(bl)),
698 page_res->blame_reasons[bl]);
699 }
700 if (page_res->misadaption_log.size() > 0) {
701 tprintf("Misadaption log:\n");
702 for (auto &log : page_res->misadaption_log) {
703 tprintf("%s\n", log.c_str());
704 }
705 }
706 }
707
708 // Sets script positions and detects smallcaps on all output words.
709 void Tesseract::script_pos_pass(PAGE_RES *page_res) {
710 PAGE_RES_IT page_res_it(page_res);
711 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
712 WERD_RES *word = page_res_it.word();
713 if (word->word->flag(W_REP_CHAR)) {
714 page_res_it.forward();
715 continue;
716 }
717 const float x_height = page_res_it.block()->block->x_height();
718 float word_x_height = word->x_height;
719 if (word_x_height < word->best_choice->min_x_height() ||
720 word_x_height > word->best_choice->max_x_height()) {
721 word_x_height =
722 (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;
723 }
724 // Test for small caps. Word capheight must be close to block xheight,
725 // and word must contain no lower case letters, and at least one upper case.
726 const double small_cap_xheight = x_height * kXHeightCapRatio;
727 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
728 if (word->uch_set->script_has_xheight() &&
729 small_cap_xheight - small_cap_delta <= word_x_height &&
730 word_x_height <= small_cap_xheight + small_cap_delta) {
731 // Scan for upper/lower.
732 int num_upper = 0;
733 int num_lower = 0;
734 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
735 if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
736 ++num_upper;
737 } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
738 ++num_lower;
739 }
740 }
741 if (num_upper > 0 && num_lower == 0) {
742 word->small_caps = true;
743 }
744 }
745 word->SetScriptPositions();
746 }
747 }
748
749 // Helper finds the gap between the index word and the next.
750 static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {
751 *right = -INT32_MAX;
752 *next_left = INT32_MAX;
753 if (index < words.size()) {
754 *right = words[index]->word->bounding_box().right();
755 if (index + 1 < words.size()) {
756 *next_left = words[index + 1]->word->bounding_box().left();
757 }
758 }
759 }
760
761 // Factored helper computes the rating, certainty, badness and validity of
762 // the permuter of the words in [first_index, end_index).
763 static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index,
764 float *rating, float *certainty, bool *bad, bool *valid_permuter) {
765 if (end_index <= first_index) {
766 *bad = true;
767 *valid_permuter = false;
768 }
769 for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {
770 WERD_CHOICE *choice = words[index]->best_choice;
771 if (choice == nullptr) {
772 *bad = true;
773 } else {
774 *rating += choice->rating();
775 *certainty = std::min(*certainty, choice->certainty());
776 if (!Dict::valid_word_permuter(choice->permuter(), false)) {
777 *valid_permuter = false;
778 }
779 }
780 }
781 }
782
783 // Helper chooses the best combination of words, transferring good ones from
784 // new_words to best_words. To win, a new word must have (better rating and
785 // certainty) or (better permuter status and rating within rating ratio and
786 // certainty within certainty margin) than current best.
787 // All the new_words are consumed (moved to best_words or deleted.)
788 // The return value is the number of new_words used minus the number of
789 // best_words that remain in the output.
790 static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug,
791 PointerVector<WERD_RES> *new_words,
792 PointerVector<WERD_RES> *best_words) {
793 // Process the smallest groups of words that have an overlapping word
794 // boundary at the end.
795 std::vector<WERD_RES *> out_words;
796 // Index into each word vector (best, new).
797 unsigned b = 0, n = 0;
798 int num_best = 0, num_new = 0;
799 while (b < best_words->size() || n < new_words->size()) {
800 // Start of the current run in each.
801 auto start_b = b, start_n = n;
802 while (b < best_words->size() || n < new_words->size()) {
803 int b_right = -INT32_MAX;
804 int next_b_left = INT32_MAX;
805 WordGap(*best_words, b, &b_right, &next_b_left);
806 int n_right = -INT32_MAX;
807 int next_n_left = INT32_MAX;
808 WordGap(*new_words, n, &n_right, &next_n_left);
809 if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
810 // The word breaks overlap. [start_b,b] and [start_n, n] match.
811 break;
812 }
813 // Keep searching for the matching word break.
814 if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) {
815 ++b;
816 } else {
817 ++n;
818 }
819 }
820 // Rating of the current run in each.
821 float b_rating = 0.0f, n_rating = 0.0f;
822 // Certainty of the current run in each.
823 float b_certainty = 0.0f, n_certainty = 0.0f;
824 // True if any word is missing its best choice.
825 bool b_bad = false, n_bad = false;
826 // True if all words have a valid permuter.
827 bool b_valid_permuter = true, n_valid_permuter = true;
828 const int end_b = b < best_words->size() ? b + 1 : b;
829 const int end_n = n < new_words->size() ? n + 1 : n;
830 EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad,
831 &b_valid_permuter);
832 EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad,
833 &n_valid_permuter);
834 bool new_better = false;
835 if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) ||
836 (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio &&
837 n_certainty > b_certainty - certainty_margin))) {
838 // New is better.
839 for (int i = start_n; i < end_n; ++i) {
840 out_words.push_back((*new_words)[i]);
841 (*new_words)[i] = nullptr;
842 ++num_new;
843 }
844 new_better = true;
845 } else if (!b_bad) {
846 // Current best is better.
847 for (int i = start_b; i < end_b; ++i) {
848 out_words.push_back((*best_words)[i]);
849 (*best_words)[i] = nullptr;
850 ++num_best;
851 }
852 }
853 if (debug) {
854 tprintf(
855 "%d new words %s than %d old words: r: %g v %g c: %g v %g"
856 " valid dict: %d v %d\n",
857 end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating,
858 n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
859 }
860 // Move on to the next group.
861 b = end_b;
862 n = end_n;
863 }
864 // Transfer from out_words to best_words.
865 best_words->clear();
866 for (auto &out_word : out_words) {
867 best_words->push_back(out_word);
868 }
869 return num_new - num_best;
870 }
871
872 // Helper to recognize the word using the given (language-specific) tesseract.
873 // Returns positive if this recognizer found more new best words than the
874 // number kept from best_words.
875 int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,
876 WERD_RES **in_word, PointerVector<WERD_RES> *best_words) {
877 if (debug) {
878 tprintf("Trying word using lang %s, oem %d\n", lang.c_str(),
879 static_cast<int>(tessedit_ocr_engine_mode));
880 }
881 // Run the recognizer on the word.
882 PointerVector<WERD_RES> new_words;
883 (this->*recognizer)(word_data, in_word, &new_words);
884 if (new_words.empty()) {
885 // Transfer input word to new_words, as the classifier must have put
886 // the result back in the input.
887 new_words.push_back(*in_word);
888 *in_word = nullptr;
889 }
890 if (debug) {
891 for (unsigned i = 0; i < new_words.size(); ++i) {
892 new_words[i]->DebugTopChoice("Lang result");
893 }
894 }
895 // Initial version is a bit of a hack based on better certainty and rating
896 // or a dictionary vs non-dictionary word.
897 return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,
898 &new_words, best_words);
899 }
900
901 // Helper returns true if all the words are acceptable.
902 static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {
903 for (unsigned w = 0; w < words.size(); ++w) {
904 if (words[w]->tess_failed || !words[w]->tess_accepted) {
905 return false;
906 }
907 }
908 return true;
909 }
910
911 #ifndef DISABLED_LEGACY_ENGINE
912
913 // Moves good-looking "noise"/diacritics from the reject list to the main
914 // blob list on the current word. Returns true if anything was done, and
915 // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
916 bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) {
917 *make_next_word_fuzzy = false;
918 WERD *real_word = pr_it->word()->word;
919 if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||
920 real_word->rej_cblob_list()->length() > noise_maxperword) {
921 return false;
922 }
923 real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
924 // Get the noise outlines into a vector with matching bool map.
925 std::vector<C_OUTLINE *> outlines;
926 real_word->GetNoiseOutlines(&outlines);
927 std::vector<bool> word_wanted;
928 std::vector<bool> overlapped_any_blob;
929 std::vector<C_BLOB *> target_blobs;
930 AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
931 &overlapped_any_blob, &target_blobs);
932 // Filter the outlines that overlapped any blob and put them into the word
933 // now. This simplifies the remaining task and also makes it more accurate
934 // as it has more completed blobs to work on.
935 std::vector<bool> wanted;
936 std::vector<C_BLOB *> wanted_blobs;
937 std::vector<C_OUTLINE *> wanted_outlines;
938 int num_overlapped = 0;
939 int num_overlapped_used = 0;
940 for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {
941 if (overlapped_any_blob[i]) {
942 ++num_overlapped;
943 if (word_wanted[i]) {
944 ++num_overlapped_used;
945 }
946 wanted.push_back(word_wanted[i]);
947 wanted_blobs.push_back(target_blobs[i]);
948 wanted_outlines.push_back(outlines[i]);
949 outlines[i] = nullptr;
950 }
951 }
952 real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
953 AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);
954 // TODO: check code.
955 int non_overlapped = 0;
956 int non_overlapped_used = 0;
957 for (unsigned i = 0; i < word_wanted.size(); ++i) {
958 if (word_wanted[i]) {
959 ++non_overlapped_used;
960 }
961 if (outlines[i] != nullptr) {
962 ++non_overlapped_used;
963 }
964 }
965 if (debug_noise_removal) {
966 tprintf("Used %d/%d overlapped %d/%d non-overlapped diacritics on word:", num_overlapped_used,
967 num_overlapped, non_overlapped_used, non_overlapped);
968 real_word->bounding_box().print();
969 }
970 // Now we have decided which outlines we want, put them into the real_word.
971 if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
972 pr_it->MakeCurrentWordFuzzy();
973 }
974 // TODO(rays) Parts of combos have a deep copy of the real word, and need
975 // to have their noise outlines moved/assigned in the same way!!
976 return num_overlapped_used != 0 || non_overlapped_used != 0;
977 }
978
979 // Attempts to put noise/diacritic outlines into the blobs that they overlap.
980 // Input: a set of noisy outlines that probably belong to the real_word.
981 // Output: word_wanted indicates which outlines are to be assigned to a blob,
982 // target_blobs indicates which to assign to, and overlapped_any_blob is
983 // true for all outlines that overlapped a blob.
984 void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
985 int pass, WERD *real_word, PAGE_RES_IT *pr_it,
986 std::vector<bool> *word_wanted,
987 std::vector<bool> *overlapped_any_blob,
988 std::vector<C_BLOB *> *target_blobs) {
989 std::vector<bool> blob_wanted;
990 word_wanted->clear();
991 word_wanted->resize(outlines.size());
992 overlapped_any_blob->clear();
993 overlapped_any_blob->resize(outlines.size());
994 target_blobs->clear();
995 target_blobs->resize(outlines.size());
996 // For each real blob, find the outlines that seriously overlap it.
997 // A single blob could be several merged characters, so there can be quite
998 // a few outlines overlapping, and the full engine needs to be used to chop
999 // and join to get a sensible result.
1000 C_BLOB_IT blob_it(real_word->cblob_list());
1001 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1002 C_BLOB *blob = blob_it.data();
1003 const TBOX blob_box = blob->bounding_box();
1004 blob_wanted.clear();
1005 blob_wanted.resize(outlines.size());
1006 int num_blob_outlines = 0;
1007 for (unsigned i = 0; i < outlines.size(); ++i) {
1008 if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) {
1009 blob_wanted[i] = true;
1010 (*overlapped_any_blob)[i] = true;
1011 ++num_blob_outlines;
1012 }
1013 }
1014 if (debug_noise_removal) {
1015 tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1016 blob_box.print();
1017 }
1018 // If any outlines overlap the blob, and not too many, classify the blob
1019 // (using the full engine, languages and all), and choose the maximal
1020 // combination of outlines that doesn't hurt the end-result classification
1021 // by too much. Mark them as wanted.
1022 if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1023 if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,
1024 num_blob_outlines, &blob_wanted)) {
1025 for (unsigned i = 0; i < blob_wanted.size(); ++i) {
1026 if (blob_wanted[i]) {
1027 // Claim the outline and record where it is going.
1028 (*word_wanted)[i] = true;
1029 (*target_blobs)[i] = blob;
1030 }
1031 }
1032 }
1033 }
1034 }
1035 }
1036
1037 // Attempts to assign non-overlapping outlines to their nearest blobs or
1038 // make new blobs out of them.
1039 void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
1040 WERD *real_word, PAGE_RES_IT *pr_it,
1041 std::vector<bool> *word_wanted,
1042 std::vector<C_BLOB *> *target_blobs) {
1043 std::vector<bool> blob_wanted;
1044 word_wanted->clear();
1045 word_wanted->resize(outlines.size());
1046 target_blobs->clear();
1047 target_blobs->resize(outlines.size());
1048 // Check for outlines that need to be turned into stand-alone blobs.
1049 for (unsigned i = 0; i < outlines.size(); ++i) {
1050 if (outlines[i] == nullptr) {
1051 continue;
1052 }
1053 // Get a set of adjacent outlines that don't overlap any existing blob.
1054 blob_wanted.clear();
1055 blob_wanted.resize(outlines.size());
1056 int num_blob_outlines = 0;
1057 TBOX total_ol_box(outlines[i]->bounding_box());
1058 while (i < outlines.size() && outlines[i] != nullptr) {
1059 blob_wanted[i] = true;
1060 total_ol_box += outlines[i]->bounding_box();
1061 ++i;
1062 ++num_blob_outlines;
1063 }
1064 // Find the insertion point.
1065 C_BLOB_IT blob_it(real_word->cblob_list());
1066 while (!blob_it.at_last() &&
1067 blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {
1068 blob_it.forward();
1069 }
1070 // Choose which combination of them we actually want and where to put
1071 // them.
1072 if (debug_noise_removal) {
1073 tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1074 }
1075 C_BLOB *left_blob = blob_it.data();
1076 TBOX left_box = left_blob->bounding_box();
1077 C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1078 if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1079 !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1080 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,
1081 num_blob_outlines, &blob_wanted)) {
1082 if (debug_noise_removal) {
1083 tprintf("Added to left blob\n");
1084 }
1085 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1086 if (blob_wanted[j]) {
1087 (*word_wanted)[j] = true;
1088 (*target_blobs)[j] = left_blob;
1089 }
1090 }
1091 } else if (right_blob != nullptr &&
1092 (!left_box.x_overlap(total_ol_box) ||
1093 right_blob->bounding_box().x_overlap(total_ol_box)) &&
1094 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,
1095 num_blob_outlines, &blob_wanted)) {
1096 if (debug_noise_removal) {
1097 tprintf("Added to right blob\n");
1098 }
1099 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1100 if (blob_wanted[j]) {
1101 (*word_wanted)[j] = true;
1102 (*target_blobs)[j] = right_blob;
1103 }
1104 }
1105 } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,
1106 num_blob_outlines, &blob_wanted)) {
1107 if (debug_noise_removal) {
1108 tprintf("Fitted between blobs\n");
1109 }
1110 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1111 if (blob_wanted[j]) {
1112 (*word_wanted)[j] = true;
1113 (*target_blobs)[j] = nullptr;
1114 }
1115 }
1116 }
1117 }
1118 }
1119
1120 // Starting with ok_outlines set to indicate which outlines overlap the blob,
1121 // chooses the optimal set (approximately) and returns true if any outlines
1122 // are desired, in which case ok_outlines indicates which ones.
1123 bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
1124 C_BLOB *blob,
1125 const std::vector<C_OUTLINE *> &outlines,
1126 int num_outlines, std::vector<bool> *ok_outlines) {
1127 float target_cert = certainty_threshold;
1128 if (blob != nullptr) {
1129 std::string best_str;
1130 float target_c2;
1131 target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);
1132 if (debug_noise_removal) {
1133 tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
1134 target_c2);
1135 blob->bounding_box().print();
1136 }
1137 target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1138 }
1139 std::vector<bool> test_outlines = *ok_outlines;
1140 // Start with all the outlines in.
1141 std::string all_str;
1142 std::vector<bool> best_outlines = *ok_outlines;
1143 float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);
1144 if (debug_noise_removal) {
1145 TBOX ol_box;
1146 for (unsigned i = 0; i < test_outlines.size(); ++i) {
1147 if (test_outlines[i]) {
1148 ol_box += outlines[i]->bounding_box();
1149 }
1150 }
1151 tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
1152 best_cert - target_cert);
1153 ol_box.print();
1154 }
1155 // Iteratively zero out the bit that improves the certainty the most, until
1156 // we get past the threshold, have zero bits, or fail to improve.
1157 int best_index = 0; // To zero out.
1158 while (num_outlines > 1 && best_index >= 0 &&
1159 (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1160 // Find the best bit to zero out.
1161 best_index = -1;
1162 for (unsigned i = 0; i < outlines.size(); ++i) {
1163 if (test_outlines[i]) {
1164 test_outlines[i] = false;
1165 std::string str;
1166 float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);
1167 if (debug_noise_removal) {
1168 TBOX ol_box;
1169 for (unsigned j = 0; j < outlines.size(); ++j) {
1170 if (test_outlines[j]) {
1171 ol_box += outlines[j]->bounding_box();
1172 }
1173 tprintf("%c", test_outlines[j] ? 'T' : 'F');
1174 }
1175 tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
1176 cert - target_cert);
1177 ol_box.print();
1178 }
1179 if (cert > best_cert) {
1180 best_cert = cert;
1181 best_index = i;
1182 best_outlines = test_outlines;
1183 }
1184 test_outlines[i] = true;
1185 }
1186 }
1187 if (best_index >= 0) {
1188 test_outlines[best_index] = false;
1189 --num_outlines;
1190 }
1191 }
1192 if (best_cert >= target_cert) {
1193 // Save the best combination.
1194 *ok_outlines = best_outlines;
1195 if (debug_noise_removal) {
1196 tprintf("%s noise combination ", blob ? "Adding" : "New");
1197 for (auto &&best_outline : best_outlines) {
1198 tprintf("%c", best_outline ? 'T' : 'F');
1199 }
1200 tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert);
1201 }
1202 return true;
1203 }
1204
1205 return false;
1206 }
1207
1208 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1209 // the inclusion of the outlines, and returns the certainty of the raw choice.
1210 float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
1211 const std::vector<C_OUTLINE *> &outlines, int pass_n,
1212 PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
1213 C_OUTLINE_IT ol_it;
1214 C_OUTLINE *first_to_keep = nullptr;
1215 C_BLOB *local_blob = nullptr;
1216 if (blob != nullptr) {
1217 // Add the required outlines to the blob.
1218 ol_it.set_to_list(blob->out_list());
1219 first_to_keep = ol_it.data();
1220 }
1221 for (unsigned i = 0; i < ok_outlines.size(); ++i) {
1222 if (ok_outlines[i]) {
1223 // This outline is to be added.
1224 if (blob == nullptr) {
1225 local_blob = new C_BLOB(outlines[i]);
1226 blob = local_blob;
1227 ol_it.set_to_list(blob->out_list());
1228 } else {
1229 ol_it.add_before_stay_put(outlines[i]);
1230 }
1231 }
1232 }
1233 float c2;
1234 float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1235 ol_it.move_to_first();
1236 if (first_to_keep == nullptr) {
1237 // We created blob. Empty its outlines and delete it.
1238 for (; !ol_it.empty(); ol_it.forward()) {
1239 ol_it.extract();
1240 }
1241 delete local_blob;
1242 cert = -c2;
1243 } else {
1244 // Remove the outlines that we put in.
1245 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1246 ol_it.extract();
1247 }
1248 }
1249 return cert;
1250 }
1251
1252 // Classifies the given blob (part of word_data->word->word) as an individual
1253 // word, using languages, chopper etc, returning only the certainty of the
1254 // best raw choice, and undoing all the work done to fake out the word.
1255 float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,
1256 float *c2) {
1257 WERD *real_word = pr_it->word()->word;
1258 WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),
1259 C_BLOB::deep_copy(blob));
1260 WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1261 // Get a new iterator that points to the new word.
1262 PAGE_RES_IT it(pr_it->page_res);
1263 while (it.word() != word_res && it.word() != nullptr) {
1264 it.forward();
1265 }
1266 ASSERT_HOST(it.word() == word_res);
1267 WordData wd(it);
1268 // Force full initialization.
1269 SetupWordPassN(1, &wd);
1270 classify_word_and_language(pass_n, &it, &wd);
1271 if (debug_noise_removal) {
1272 if (wd.word->raw_choice != nullptr) {
1273 tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(),
1274 wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height());
1275 } else {
1276 tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1277 wd.row->x_height());
1278 }
1279 }
1280 float cert = 0.0f;
1281 if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1282 cert = wd.word->raw_choice->certainty();
1283 float rat = wd.word->raw_choice->rating();
1284 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1285 best_str = wd.word->raw_choice->unichar_string();
1286 } else {
1287 *c2 = 0.0f;
1288 best_str.clear();
1289 }
1290 it.DeleteCurrentWord();
1291 pr_it->ResetWordIterator();
1292 return cert;
1293 }
1294
1295 #endif // ndef DISABLED_LEGACY_ENGINE
1296
1297 // Generic function for classifying a word. Can be used either for pass1 or
1298 // pass2 according to the function passed to recognizer.
1299 // word_data holds the word to be recognized, and its block and row, and
1300 // pr_it points to the word as well, in case we are running LSTM and it wants
1301 // to output multiple words.
1302 // Recognizes in the current language, and if successful that is all.
1303 // If recognition was not successful, tries all available languages until
1304 // it gets a successful result or runs out of languages. Keeps the best result.
1305 void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) {
1306 #ifdef DISABLED_LEGACY_ENGINE
1307 WordRecognizer recognizer = &Tesseract::classify_word_pass1;
1308 #else
1309 WordRecognizer recognizer =
1310 pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;
1311 #endif // def DISABLED_LEGACY_ENGINE
1312
1313 // Best result so far.
1314 PointerVector<WERD_RES> best_words;
1315 // Points to the best result. May be word or in lang_words.
1316 const WERD_RES *word = word_data->word;
1317 clock_t total_time = 0;
1318 const bool timing_debug = tessedit_timing_debug;
1319 if (timing_debug) {
1320 total_time = clock();
1321 }
1322 const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1323 if (debug) {
1324 tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
1325 most_recently_used_->lang.c_str());
1326 word->word->bounding_box().print();
1327 }
1328 if (word->done) {
1329 // If done on pass1, leave it as-is.
1330 if (!word->tess_failed) {
1331 most_recently_used_ = word->tesseract;
1332 }
1333 return;
1334 }
1335 auto sub = sub_langs_.size();
1336 if (most_recently_used_ != this) {
1337 // Get the index of the most_recently_used_.
1338 for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
1339 }
1340 }
1341 most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
1342 &best_words);
1343 Tesseract *best_lang_tess = most_recently_used_;
1344 if (!WordsAcceptable(best_words)) {
1345 // Try all the other languages to see if they are any better.
1346 if (most_recently_used_ != this &&
1347 this->RetryWithLanguage(*word_data, recognizer, debug,
1348 &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
1349 best_lang_tess = this;
1350 }
1351 for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
1352 if (most_recently_used_ != sub_langs_[i] &&
1353 sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
1354 &best_words) > 0) {
1355 best_lang_tess = sub_langs_[i];
1356 }
1357 }
1358 }
1359 most_recently_used_ = best_lang_tess;
1360 if (!best_words.empty()) {
1361 if (best_words.size() == 1 && !best_words[0]->combination) {
1362 // Move the best single result to the main word.
1363 word_data->word->ConsumeWordResults(best_words[0]);
1364 } else {
1365 // Words came from LSTM, and must be moved to the PAGE_RES properly.
1366 word_data->word = best_words.back();
1367 pr_it->ReplaceCurrentWord(&best_words);
1368 }
1369 ASSERT_HOST(word_data->word->box_word != nullptr);
1370 } else {
1371 tprintf("no best words!!\n");
1372 }
1373 if (timing_debug) {
1374 total_time = clock() - total_time;
1375 tesserr << word_data->word->best_choice->unichar_string()
1376 << " (ocr took " << 1000 * total_time / CLOCKS_PER_SEC << " ms)\n";
1377 }
1378 }
1379
1380 /**
1381 * classify_word_pass1
1382 *
1383 * Baseline normalize the word and pass it to Tess.
1384 */
1385
1386 void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word,
1387 PointerVector<WERD_RES> *out_words) {
1388 ROW *row = word_data.row;
1389 BLOCK *block = word_data.block;
1390 prev_word_best_choice_ =
1391 word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1392 #ifdef DISABLED_LEGACY_ENGINE
1393 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1394 #else
1395 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
1396 tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
1397 #endif // def DISABLED_LEGACY_ENGINE
1398 if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1399 LSTMRecognizeWord(*block, row, *in_word, out_words);
1400 if (!out_words->empty()) {
1401 return; // Successful lstm recognition.
1402 }
1403 }
1404 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1405 // No fallback allowed, so use a fake.
1406 (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1407 return;
1408 }
1409
1410 #ifndef DISABLED_LEGACY_ENGINE
1411 // Fall back to tesseract for failed words or odd words.
1412 (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,
1413 classify_bln_numeric_mode, textord_use_cjk_fp_model,
1414 poly_allow_detailed_fx, row, block);
1415 #endif // ndef DISABLED_LEGACY_ENGINE
1416 }
1417
1418 #ifndef DISABLED_LEGACY_ENGINE
1419 WERD_RES *word = *in_word;
1420 match_word_pass_n(1, word, row, block);
1421 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1422 word->tess_would_adapt = AdaptableWord(word);
1423 bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1424
1425 if (adapt_ok) {
1426 // Send word to adaptive classifier for training.
1427 word->BestChoiceToCorrectText();
1428 LearnWord(nullptr, word);
1429 // Mark misadaptions if running blamer.
1430 if (word->blamer_bundle != nullptr) {
1431 word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);
1432 }
1433 }
1434
1435 if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
1436 tess_add_doc_word(word->best_choice);
1437 }
1438 }
1439 #endif // ndef DISABLED_LEGACY_ENGINE
1440 }
1441
1442 // Helper to report the result of the xheight fix.
1443 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word,
1444 WERD_RES *new_word) {
1445 tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(),
1446 word->best_choice->debug_string().c_str());
1447 word->reject_map.print(debug_fp);
1448 tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(),
1449 new_word->best_choice->debug_string().c_str());
1450 new_word->reject_map.print(debug_fp);
1451 tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT",
1452 new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1453 accept_new_word ? "ACCEPTED" : "");
1454 }
1455
1456 #ifndef DISABLED_LEGACY_ENGINE
1457
1458 // Run the x-height fix-up, based on min/max top/bottom information in
1459 // unicharset.
1460 // Returns true if the word was changed.
1461 // See the comment in fixxht.cpp for a description of the overall process.
1462 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) {
1463 int original_misfits = CountMisfitTops(word);
1464 if (original_misfits == 0) {
1465 return false;
1466 }
1467 float baseline_shift = 0.0f;
1468 float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1469 if (baseline_shift != 0.0f) {
1470 // Try the shift on its own first.
1471 if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) {
1472 return false;
1473 }
1474 original_misfits = CountMisfitTops(word);
1475 if (original_misfits > 0) {
1476 float new_baseline_shift;
1477 // Now recompute the new x_height.
1478 new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1479 if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1480 // No test of return value here, as we are definitely making a change
1481 // to the word by shifting the baseline.
1482 TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row);
1483 }
1484 }
1485 return true;
1486 } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1487 return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row);
1488 } else {
1489 return false;
1490 }
1491 }
1492
1493 // Runs recognition with the test baseline shift and x-height and returns true
1494 // if there was an improvement in recognition result.
1495 bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,
1496 WERD_RES *word, BLOCK *block, ROW *row) {
1497 bool accept_new_x_ht = false;
1498 WERD_RES new_x_ht_word(word->word);
1499 if (word->blamer_bundle != nullptr) {
1500 new_x_ht_word.blamer_bundle = new BlamerBundle();
1501 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1502 }
1503 new_x_ht_word.x_height = new_x_ht;
1504 new_x_ht_word.baseline_shift = baseline_shift;
1505 new_x_ht_word.caps_height = 0.0;
1506 new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1507 classify_bln_numeric_mode, textord_use_cjk_fp_model,
1508 poly_allow_detailed_fx, row, block);
1509 match_word_pass_n(2, &new_x_ht_word, row, block);
1510 if (!new_x_ht_word.tess_failed) {
1511 int new_misfits = CountMisfitTops(&new_x_ht_word);
1512 if (debug_x_ht_level >= 1) {
1513 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,
1514 word->x_height, new_misfits, new_x_ht);
1515 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(),
1516 word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),
1517 new_x_ht_word.best_choice->certainty());
1518 }
1519 // The misfits must improve and either the rating or certainty.
1520 accept_new_x_ht = new_misfits < original_misfits &&
1521 (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||
1522 new_x_ht_word.best_choice->rating() < word->best_choice->rating());
1523 if (debug_x_ht_level >= 1) {
1524 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1525 }
1526 }
1527 if (accept_new_x_ht) {
1528 word->ConsumeWordResults(&new_x_ht_word);
1529 return true;
1530 }
1531 return false;
1532 }
1533
1534 #endif // ndef DISABLED_LEGACY_ENGINE
1535
1536 /**
1537 * classify_word_pass2
1538 *
1539 * Control what to do with the word in pass 2
1540 */
1541
1542 void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
1543 PointerVector<WERD_RES> *out_words) {
1544 // Return if we do not want to run Tesseract.
1545 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1546 return;
1547 }
1548 #ifndef DISABLED_LEGACY_ENGINE
1549 ROW *row = word_data.row;
1550 BLOCK *block = word_data.block;
1551 WERD_RES *word = *in_word;
1552 prev_word_best_choice_ =
1553 word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1554
1555 check_debug_pt(word, 30);
1556 if (!word->done) {
1557 word->caps_height = 0.0;
1558 if (word->x_height == 0.0f) {
1559 word->x_height = row->x_height();
1560 }
1561 match_word_pass_n(2, word, row, block);
1562 check_debug_pt(word, 40);
1563 }
1564
1565 SubAndSuperscriptFix(word);
1566
1567 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1568 if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
1569 block->classify_rotation().y() == 0.0f) {
1570 // Use the tops and bottoms since they are available.
1571 TrainedXheightFix(word, block, row);
1572 }
1573 }
1574 # ifndef GRAPHICS_DISABLED
1575 if (tessedit_display_outwords) {
1576 if (fx_win == nullptr) {
1577 create_fx_win();
1578 }
1579 clear_fx_win();
1580 word->rebuild_word->plot(fx_win);
1581 TBOX wbox = word->rebuild_word->bounding_box();
1582 fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
1583 ScrollView::Update();
1584 }
1585 # endif
1586 check_debug_pt(word, 50);
1587 #endif // ndef DISABLED_LEGACY_ENGINE
1588 }
1589
1590 #ifndef DISABLED_LEGACY_ENGINE
1591 /**
1592 * match_word_pass2
1593 *
1594 * Baseline normalize the word and pass it to Tess.
1595 */
1596 void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) {
1597 if (word->tess_failed) {
1598 return;
1599 }
1600 tess_segment_pass_n(pass_n, word);
1601
1602 if (!word->tess_failed) {
1603 if (!word->word->flag(W_REP_CHAR)) {
1604 word->fix_quotes();
1605 if (tessedit_fix_hyphens) {
1606 word->fix_hyphens();
1607 }
1608 /* Don't trust fix_quotes! - though I think I've fixed the bug */
1609 if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
1610 tprintf(
1611 "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1612 " #Blobs=%u\n",
1613 word->best_choice->debug_string().c_str(), word->best_choice->length(),
1614 word->box_word->length());
1615 }
1616 word->tess_accepted = tess_acceptable_word(word);
1617
1618 // Also sets word->done flag
1619 make_reject_map(word, row, pass_n);
1620 }
1621 }
1622 set_word_fonts(word);
1623
1624 ASSERT_HOST(word->raw_choice != nullptr);
1625 }
1626 #endif // ndef DISABLED_LEGACY_ENGINE
1627
1628 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
1629 // the given char_id, or nullptr if none can be found.
1630 static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
1631 // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1632 BLOB_CHOICE *best_choice = nullptr;
1633 for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1634 BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
1635 if (choice != nullptr) {
1636 if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
1637 best_choice = choice;
1638 }
1639 }
1640 }
1641 return best_choice;
1642 }
1643
1644 // Helper to insert blob_choice in each location in the leader word if there is
1645 // no matching BLOB_CHOICE there already, and correct any incorrect results
1646 // in the best_choice.
1647 static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
1648 WERD_CHOICE *word = word_res->best_choice;
1649 for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1650 BLOB_CHOICE *choice =
1651 FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
1652 if (choice == nullptr) {
1653 BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1654 choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1655 }
1656 }
1657 // Correct any incorrect results in word.
1658 for (unsigned i = 0; i < word->length(); ++i) {
1659 if (word->unichar_id(i) != blob_choice->unichar_id()) {
1660 word->set_unichar_id(blob_choice->unichar_id(), i);
1661 }
1662 }
1663 }
1664
1665 /**
1666 * fix_rep_char()
1667 * The word is a repeated char. (Leader.) Find the repeated char character.
1668 * Create the appropriate single-word or multi-word sequence according to
1669 * the size of spaces in between blobs, and correct the classifications
1670 * where some of the characters disagree with the majority.
1671 */
1672 void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {
1673 WERD_RES *word_res = page_res_it->word();
1674 const WERD_CHOICE &word = *(word_res->best_choice);
1675
1676 // Find the frequency of each unique character in the word.
1677 SortHelper<UNICHAR_ID> rep_ch(word.length());
1678 for (unsigned i = 0; i < word.length(); ++i) {
1679 rep_ch.Add(word.unichar_id(i), 1);
1680 }
1681
1682 // Find the most frequent result.
1683 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1684 int max_count = rep_ch.MaxCount(&maxch_id);
1685 // Find the best exemplar of a classifier result for maxch_id.
1686 BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
1687 if (best_choice == nullptr) {
1688 tprintf("Failed to find a choice for %s, occurring %d times\n",
1689 word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1690 return;
1691 }
1692 word_res->done = true;
1693
1694 // Just correct existing classification.
1695 CorrectRepcharChoices(best_choice, word_res);
1696 word_res->reject_map.initialise(word.length());
1697 }
1698
// Classifies the text s into one of the ACCEPTABLE_WERD_TYPE categories.
//
//   char_set - consulted through get_isupper()/get_islower() for letter case.
//   s        - NUL-terminated text of the word.
//   lengths  - NUL-terminated array parallel to the characters of s:
//              lengths[i] is the amount by which the byte offset into s is
//              advanced for character i.
//
// First pass (word pattern), all of s must be consumed:
//   [one leading punctuation char] upper-case run, then either
//     - more than one upper-case char           -> AC_UPPER_CASE, or
//     - a lower-case run, optionally followed by "-" + lower-case run, or by
//       "'s" (non-hyphenated only)              -> AC_INITIAL_CAP if exactly
//       one upper-case char led the word, else AC_LOWER_CASE,
//   followed by up to two different trailing punctuation chars.
// If that fails, a second pass restarts from the beginning of s and accepts a
// sequence of <letter> "." pairs of consistent case as AC_UC_ABBREV or
// AC_LC_ABBREV. Anything else, and any input with more than 20 entries in
// lengths, yields AC_UNACCEPTABLE.
1699 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s,
1700 const char *lengths) {
// i indexes lengths (character index); offset is the matching byte offset in s.
1701 int i = 0;
1702 int offset = 0;
1703 int leading_punct_count;
1704 int upper_count = 0;
1705 int hyphen_pos = -1;
1706 ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1707
// Words with more than 20 characters are never acceptable.
1708 if (strlen(lengths) > 20) {
1709 return word_type;
1710 }
1711
1712 /* Single Leading punctuation char*/
1713
1714 if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) {
1715 offset += lengths[i++];
1716 }
// 0 or 1: number of characters consumed as leading punctuation.
1717 leading_punct_count = i;
1718
1719 /* Initial cap */
// Consumes the whole leading run of upper-case characters, counting them.
1720 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1721 offset += lengths[i++];
1722 upper_count++;
1723 }
1724 if (upper_count > 1) {
1725 word_type = AC_UPPER_CASE;
1726 } else {
1727 /* Lower case word, possibly with an initial cap */
1728 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1729 offset += lengths[i++];
1730 }
// Too few leading letters: word_type is still AC_UNACCEPTABLE at this point,
// so the jump goes straight to the abbreviation test.
1731 if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
1732 goto not_a_word;
1733 }
1734 /*
1735 Allow a single hyphen in a lower case word
1736 - don't trust upper case - I've seen several cases of "H" -> "I-I"
1737 */
1738 if (lengths[i] == 1 && s[offset] == '-') {
1739 hyphen_pos = i;
1740 offset += lengths[i++];
// A hyphen at the very end of s is accepted as is; otherwise at least two
// lower-case characters must follow it.
1741 if (s[offset] != '\0') {
1742 while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) {
1743 offset += lengths[i++];
1744 }
1745 if (i < hyphen_pos + 3) {
1746 goto not_a_word;
1747 }
1748 }
1749 } else {
1750 /* Allow "'s" in NON hyphenated lower case words */
1751 if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 &&
1752 (s[offset + lengths[i]] == 's')) {
1753 offset += lengths[i++];
1754 offset += lengths[i++];
1755 }
1756 }
1757 if (upper_count > 0) {
1758 word_type = AC_INITIAL_CAP;
1759 } else {
1760 word_type = AC_LOWER_CASE;
1761 }
1762 }
1763
1764 /* Up to two different, constrained trailing punctuation chars */
1765 if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) {
1766 offset += lengths[i++];
1767 }
// The second trailing char must differ from the character just before it.
1768 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
1769 chs_trailing_punct2.contains(s[offset])) {
1770 offset += lengths[i++];
1771 }
1772
// Any unconsumed text means the word pattern did not match.
1773 if (s[offset] != '\0') {
1774 word_type = AC_UNACCEPTABLE;
1775 }
1776
1777 not_a_word:
1778
1779 if (word_type == AC_UNACCEPTABLE) {
1780 /* Look for abbreviation string */
// Restart from the beginning of s; the case of the first character selects
// which abbreviation type is attempted.
1781 i = 0;
1782 offset = 0;
1783 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1784 word_type = AC_UC_ABBREV;
// Consume <upper-case letter> "." pairs.
1785 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
1786 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1787 offset += lengths[i++];
1788 offset += lengths[i++];
1789 }
1790 } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1791 word_type = AC_LC_ABBREV;
// Consume <lower-case letter> "." pairs.
1792 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
1793 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1794 offset += lengths[i++];
1795 offset += lengths[i++];
1796 }
1797 }
// Leftover text after the pairs: not an abbreviation either.
1798 if (s[offset] != '\0') {
1799 word_type = AC_UNACCEPTABLE;
1800 }
1801 }
1802
1803 return word_type;
1804 }
1805
1806 bool Tesseract::check_debug_pt(WERD_RES *word, int location) {
1807 if (!test_pt) {
1808 return false;
1809 }
1810
1811 tessedit_rejection_debug.set_value(false);
1812 debug_x_ht_level.set_value(0);
1813
1814 if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
1815 if (location < 0) {
1816 return true; // For breakpoint use
1817 }
1818 bool show_map_detail = false;
1819 tessedit_rejection_debug.set_value(true);
1820 debug_x_ht_level.set_value(2);
1821 tprintf("\n\nTESTWD::");
1822 switch (location) {
1823 case 0:
1824 tprintf("classify_word_pass1 start\n");
1825 word->word->print();
1826 break;
1827 case 10:
1828 tprintf("make_reject_map: initial map");
1829 break;
1830 case 20:
1831 tprintf("make_reject_map: after NN");
1832 break;
1833 case 30:
1834 tprintf("classify_word_pass2 - START");
1835 break;
1836 case 40:
1837 tprintf("classify_word_pass2 - Pre Xht");
1838 break;
1839 case 50:
1840 tprintf("classify_word_pass2 - END");
1841 show_map_detail = true;
1842 break;
1843 case 60:
1844 tprintf("fixspace");
1845 break;
1846 case 70:
1847 tprintf("MM pass START");
1848 break;
1849 case 80:
1850 tprintf("MM pass END");
1851 break;
1852 case 90:
1853 tprintf("After Poor quality rejection");
1854 break;
1855 case 100:
1856 tprintf("unrej_good_quality_words - START");
1857 break;
1858 case 110:
1859 tprintf("unrej_good_quality_words - END");
1860 break;
1861 case 120:
1862 tprintf("Write results pass");
1863 show_map_detail = true;
1864 break;
1865 }
1866 if (word->best_choice != nullptr) {
1867 tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
1868 word->reject_map.print(debug_fp);
1869 tprintf("\n");
1870 if (show_map_detail) {
1871 tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
1872 for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1873 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1874 word->reject_map[i].full_print(debug_fp);
1875 }
1876 }
1877 } else {
1878 tprintf("null best choice\n");
1879 }
1880 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1881 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1882 return true;
1883 } else {
1884 return false;
1885 }
1886 }
1887
1888 /**
1889 * find_modal_font
1890 *
1891 * Find the modal font and remove from the stats.
1892 */
1893 #ifndef DISABLED_LEGACY_ENGINE
1894 static void find_modal_font( // good chars in word
1895 STATS *fonts, // font stats
1896 int16_t *font_out, // output font
1897 int8_t *font_count // output count
1898 ) {
1899 if (fonts->get_total() > 0) {
1900 // font index
1901 int16_t font = static_cast<int16_t>(fonts->mode());
1902 *font_out = font;
1903 // pile count
1904 int32_t count = fonts->pile_count(font);
1905 *font_count = count < INT8_MAX ? count : INT8_MAX;
1906 fonts->add(font, -*font_count);
1907 } else {
1908 *font_out = -1;
1909 *font_count = 0;
1910 }
1911 }
1912 #endif // ! DISABLED_LEGACY_ENGINE
1913
1914 /**
1915 * set_word_fonts
1916 *
1917 * Get the fonts for the word.
1918 */
1919 void Tesseract::set_word_fonts(WERD_RES *word) {
1920 // Don't try to set the word fonts for an lstm word, as the configs
1921 // will be meaningless.
1922 if (word->chopped_word == nullptr) {
1923 return;
1924 }
1925 ASSERT_HOST(word->best_choice != nullptr);
1926
1927 #ifndef DISABLED_LEGACY_ENGINE
1928 const int fontinfo_size = fontinfo_table_.size();
1929 if (fontinfo_size == 0) {
1930 return;
1931 }
1932 if (tessedit_font_id > 0) {
1933 if (tessedit_font_id >= fontinfo_size) {
1934 tprintf("Error, invalid font ID provided: must be below %d.\n"
1935 "Falling back to font auto-detection.\n", fontinfo_size);
1936 } else {
1937 word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
1938 word->fontinfo2 = nullptr;
1939 word->fontinfo_id_count = INT8_MAX;
1940 word->fontinfo_id2_count = 0;
1941 return;
1942 }
1943 }
1944 std::vector<int> font_total_score(fontinfo_size);
1945
1946 // Compute the font scores for the word
1947 if (tessedit_debug_fonts) {
1948 tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
1949 }
1950 for (unsigned b = 0; b < word->best_choice->length(); ++b) {
1951 const BLOB_CHOICE *choice = word->GetBlobChoice(b);
1952 if (choice == nullptr) {
1953 continue;
1954 }
1955 auto &fonts = choice->fonts();
1956 for (auto &f : fonts) {
1957 const int fontinfo_id = f.fontinfo_id;
1958 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1959 font_total_score[fontinfo_id] += f.score;
1960 }
1961 }
1962 }
1963 // Find the top and 2nd choice for the word.
1964 int score1 = 0, score2 = 0;
1965 int16_t font_id1 = -1, font_id2 = -1;
1966 for (int f = 0; f < fontinfo_size; ++f) {
1967 if (tessedit_debug_fonts && font_total_score[f] > 0) {
1968 tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]);
1969 }
1970 if (font_total_score[f] > score1) {
1971 score2 = score1;
1972 font_id2 = font_id1;
1973 score1 = font_total_score[f];
1974 font_id1 = f;
1975 } else if (font_total_score[f] > score2) {
1976 score2 = font_total_score[f];
1977 font_id2 = f;
1978 }
1979 }
1980 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;
1981 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;
1982 // Each score has a limit of UINT16_MAX, so divide by that to get the number
1983 // of "votes" for that font, ie number of perfect scores.
1984 word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
1985 word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
1986 if (score1 > 0) {
1987 const FontInfo fi = fontinfo_table_.at(font_id1);
1988 if (tessedit_debug_fonts) {
1989 if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
1990 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name,
1991 word->fontinfo_id_count, fontinfo_table_.at(font_id2).name,
1992 word->fontinfo_id2_count);
1993 } else {
1994 tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count);
1995 }
1996 }
1997 }
1998 #endif // ndef DISABLED_LEGACY_ENGINE
1999 }
2000
2001 #ifndef DISABLED_LEGACY_ENGINE
2002 /**
2003 * font_recognition_pass
2004 *
2005 * Smooth the fonts for the document.
2006 */
2007 void Tesseract::font_recognition_pass(PAGE_RES *page_res) {
2008 PAGE_RES_IT page_res_it(page_res);
2009 WERD_RES *word; // current word
2010 STATS doc_fonts(0, font_table_size_ - 1); // font counters
2011
2012 // Gather font id statistics.
2013 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2014 word = page_res_it.word();
2015 if (word->fontinfo != nullptr) {
2016 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2017 }
2018 if (word->fontinfo2 != nullptr) {
2019 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2020 }
2021 }
2022 int16_t doc_font; // modal font
2023 int8_t doc_font_count; // modal font
2024 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2025 if (doc_font_count == 0) {
2026 return;
2027 }
2028 // Get the modal font pointer.
2029 const FontInfo *modal_font = nullptr;
2030 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2031 word = page_res_it.word();
2032 if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2033 modal_font = word->fontinfo;
2034 break;
2035 }
2036 if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2037 modal_font = word->fontinfo2;
2038 break;
2039 }
2040 }
2041 ASSERT_HOST(modal_font != nullptr);
2042
2043 // Assign modal font to weak words.
2044 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2045 word = page_res_it.word();
2046 const int length = word->best_choice->length();
2047
2048 const int count = word->fontinfo_id_count;
2049 if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2050 word->fontinfo = modal_font;
2051 // Counts only get 1 as it came from the doc.
2052 word->fontinfo_id_count = 1;
2053 }
2054 }
2055 }
2056 #endif // ndef DISABLED_LEGACY_ENGINE
2057
2058 // If a word has multiple alternates check if the best choice is in the
2059 // dictionary. If not, replace it with an alternate that exists in the
2060 // dictionary.
2061 void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
2062 PAGE_RES_IT word_it(page_res);
2063 for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {
2064 if (word->best_choices.singleton()) {
2065 continue; // There are no alternates.
2066 }
2067
2068 const WERD_CHOICE *best = word->best_choice;
2069 if (word->tesseract->getDict().valid_word(*best) != 0) {
2070 continue; // The best choice is in the dictionary.
2071 }
2072
2073 WERD_CHOICE_IT choice_it(&word->best_choices);
2074 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
2075 WERD_CHOICE *alternate = choice_it.data();
2076 if (word->tesseract->getDict().valid_word(*alternate)) {
2077 // The alternate choice is in the dictionary.
2078 if (tessedit_bigram_debug) {
2079 tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2080 best->unichar_string().c_str(), alternate->unichar_string().c_str());
2081 }
2082 // Replace the 'best' choice with a better choice.
2083 word->ReplaceBestChoice(alternate);
2084 break;
2085 }
2086 }
2087 }
2088 }
2089
2090 } // namespace tesseract