diff mupdf-source/thirdparty/tesseract/src/ccmain/control.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer carries a version number.
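As a hedged illustration (not part of this changeset), the vendored tree corresponds to what a default source build of PyMuPDF downloads and expands on its own; the exact pip invocation below is an assumption about a typical workflow, not something recorded in this repository.

```python
# Sketch only: force a source build of PyMuPDF so that its setup step
# downloads and expands the matching MuPDF source (committed here as
# mupdf-source/, now without a version number in the directory name).
# The pip flags are an assumed typical workflow, not taken from this changeset.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "--no-binary=pymupdf",   # skip the prebuilt wheel, build from the sdist
     "pymupdf==1.26.4"],
    check=True,
)
```

If a pre-downloaded MuPDF tree is preferred, PyMuPDF's build documentation describes pointing the build at it via the PYMUPDF_SETUP_MUPDF_BUILD environment variable.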
|  |  |
|---|---|
| author | Franz Glasner <fzglas.hg@dom66.de> |
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccmain/control.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,2090 @@ +/****************************************************************** + * File: control.cpp (Formerly control.c) + * Description: Module-independent matcher controller. + * Author: Ray Smith + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include <cctype> +#include <cmath> +#include <cstdint> // for int16_t, int32_t +#include <cstdio> // for fclose, fopen, FILE +#include <ctime> // for clock +#include "control.h" +#ifndef DISABLED_LEGACY_ENGINE +# include "docqual.h" +# include "drawfx.h" +# include "fixspace.h" +#endif +#include <tesseract/ocrclass.h> +#include "lstmrecognizer.h" +#include "output.h" +#include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO... +#ifndef DISABLED_LEGACY_ENGINE +# include "reject.h" +#endif +#include "sorthelper.h" +#include "tesseractclass.h" +#include "tesserrstream.h" // for tesserr +#include "tessvars.h" +#include "werdit.h" + +const char *const kBackUpConfigFile = "tempconfigdata.config"; +#ifndef DISABLED_LEGACY_ENGINE +// Min believable x-height for any text when refitting as a fraction of +// original x-height +const double kMinRefitXHeightFraction = 0.5; +#endif // ! DISABLED_LEGACY_ENGINE + +namespace tesseract { + +/** + * Make a word from the selected blobs and run Tess on them. + * + * @param page_res recognise blobs + * @param selection_box within this box + */ + +void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) { + PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box); + if (it != nullptr) { + recog_interactive(it); + it->DeleteCurrentWord(); + delete it; + } +} + +/** + * Recognize a single word in interactive mode. + * + * @param pr_it the page results iterator + */ +bool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) { + WordData word_data(*pr_it); + SetupWordPassN(2, &word_data); + // LSTM doesn't run on pass2, but we want to run pass2 for tesseract. 
+ if (lstm_recognizer_ == nullptr) { +#ifndef DISABLED_LEGACY_ENGINE + classify_word_and_language(2, pr_it, &word_data); +#endif // ndef DISABLED_LEGACY_ENGINE + } else { + classify_word_and_language(1, pr_it, &word_data); + } +#ifndef DISABLED_LEGACY_ENGINE + if (tessedit_debug_quality_metrics) { + int16_t char_qual; + int16_t good_char_qual; + WERD_RES *word_res = pr_it->word(); + word_char_quality(word_res, &char_qual, &good_char_qual); + tprintf( + "\n%d chars; word_blob_quality: %d; outline_errs: %d; " + "char_quality: %d; good_char_quality: %d\n", + word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res), + char_qual, good_char_qual); + } +#endif // ndef DISABLED_LEGACY_ENGINE + return true; +} + +// Helper function to check for a target word and handle it appropriately. +// Inspired by Jetsoft's requirement to process only single words on pass2 +// and beyond. +// If word_config is not null: +// If the word_box and target_word_box overlap, read the word_config file +// else reset to previous config data. +// return true. +// else +// If the word_box and target_word_box overlap or pass <= 1, return true. +// Note that this function uses a fixed temporary file for storing the previous +// configs, so it is neither thread-safe, nor process-safe, but the assumption +// is that it will only be used for one debug window at a time. +// +// Since this function is used for debugging (and not to change OCR results) +// set only debug params from the word config file. +bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, + const char *word_config, int pass) { + if (word_config != nullptr) { + if (word_box.major_overlap(target_word_box)) { + if (backup_config_file_ == nullptr) { + backup_config_file_ = kBackUpConfigFile; + FILE *config_fp = fopen(backup_config_file_, "wb"); + if (config_fp == nullptr) { + tprintf("Error, failed to open file \"%s\"\n", backup_config_file_); + } else { + ParamUtils::PrintParams(config_fp, params()); + fclose(config_fp); + } + ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params()); + } + } else { + if (backup_config_file_ != nullptr) { + ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params()); + backup_config_file_ = nullptr; + } + } + } else if (pass > 1 && !word_box.major_overlap(target_word_box)) { + return false; + } + return true; +} + +/** If tesseract is to be run, sets the words up ready for it. */ +void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, + PAGE_RES *page_res, std::vector<WordData> *words) { + // Prepare all the words. + PAGE_RES_IT page_res_it(page_res); + for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { + if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(), + *target_word_box, word_config, 1)) { + words->push_back(WordData(page_res_it)); + } + } + // Setup all the words for recognition with polygonal approximation. + for (unsigned w = 0; w < words->size(); ++w) { + SetupWordPassN(pass_n, &(*words)[w]); + if (w > 0) { + (*words)[w].prev_word = &(*words)[w - 1]; + } + } +} + +// Sets up the single word ready for whichever engine is to be run. 
+void Tesseract::SetupWordPassN(int pass_n, WordData *word) { + if (pass_n == 1 || !word->word->done) { + if (pass_n == 1) { + word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, + nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model, + poly_allow_detailed_fx, word->row, word->block); + } else if (pass_n == 2) { + // TODO(rays) Should we do this on pass1 too? + word->word->caps_height = 0.0; + if (word->word->x_height == 0.0f) { + word->word->x_height = word->row->x_height(); + } + } + word->lang_words.truncate(0); + for (unsigned s = 0; s <= sub_langs_.size(); ++s) { + // The sub_langs_.size() entry is for the master language. + Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this; + auto *word_res = new WERD_RES; + word_res->InitForRetryRecognition(*word->word); + word->lang_words.push_back(word_res); + // LSTM doesn't get setup for pass2. + if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) { + word_res->SetupForRecognition( + lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr, + lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model, + lang_t->poly_allow_detailed_fx, word->row, word->block); + } + } + } +} + +// Runs word recognition on all the words. +bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, + std::vector<WordData> *words) { + // TODO(rays) Before this loop can be parallelized (it would yield a massive + // speed-up) all remaining member globals need to be converted to local/heap + // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be + // added. The results will be significantly different with adaption on, and + // deterioration will need investigation. + pr_it->restart_page(); + for (unsigned w = 0; w < words->size(); ++w) { + WordData *word = &(*words)[w]; + if (w > 0) { + word->prev_word = &(*words)[w - 1]; + } + if (monitor != nullptr) { + monitor->ocr_alive = true; + if (pass_n == 1) { + monitor->progress = 70 * w / words->size(); + } else { + monitor->progress = 70 + 30 * w / words->size(); + } + if (monitor->progress_callback2 != nullptr) { + TBOX box = pr_it->word()->word->bounding_box(); + (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom()); + } + if (monitor->deadline_exceeded() || + (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) { + // Timeout. Fake out the rest of the words. + for (; w < words->size(); ++w) { + (*words)[w].word->SetupFake(unicharset); + } + return false; + } + } + if (word->word->tess_failed) { + unsigned s; + for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) { + } + // If all are failed, skip it. Image words are skipped by this test. + if (s > word->lang_words.size()) { + continue; + } + } + // Sync pr_it with the WordData. + while (pr_it->word() != nullptr && pr_it->word() != word->word) { + pr_it->forward(); + } + ASSERT_HOST(pr_it->word() != nullptr); + bool make_next_word_fuzzy = false; +#ifndef DISABLED_LEGACY_ENGINE + if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) { + // Needs to be setup again to see the new outlines in the chopped_word. 
+ SetupWordPassN(pass_n, word); + } +#endif // ndef DISABLED_LEGACY_ENGINE + + classify_word_and_language(pass_n, pr_it, word); + if (tessedit_dump_choices || debug_noise_removal) { + tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(), + word->word->best_choice->debug_string().c_str()); + } + pr_it->forward(); + if (make_next_word_fuzzy && pr_it->word() != nullptr) { + pr_it->MakeCurrentWordFuzzy(); + } + } + return true; +} + +/** + * recog_all_words() + * + * Walk the page_res, recognizing all the words. + * If monitor is not null, it is used as a progress monitor/timeout/cancel. + * If dopasses is 0, all recognition passes are run, + * 1 just pass 1, 2 passes2 and higher. + * If target_word_box is not null, special things are done to words that + * overlap the target_word_box: + * if word_config is not null, the word config file is read for just the + * target word(s), otherwise, on pass 2 and beyond ONLY the target words + * are processed (Jetsoft modification.) + * Returns false if we cancelled prematurely. + * + * @param page_res page structure + * @param monitor progress monitor + * @param word_config word_config file + * @param target_word_box specifies just to extract a rectangle + * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher + */ + +bool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, + const TBOX *target_word_box, const char *word_config, + int dopasses) { + PAGE_RES_IT page_res_it(page_res); + + if (tessedit_minimal_rej_pass1) { + tessedit_test_adaption.set_value(true); + tessedit_minimal_rejection.set_value(true); + } + + if (dopasses == 0 || dopasses == 1) { + page_res_it.restart_page(); + // ****************** Pass 1 ******************* + +#ifndef DISABLED_LEGACY_ENGINE + // If the adaptive classifier is full switch to one we prepared earlier, + // ie on the previous page. If the current adaptive classifier is non-empty, + // prepare a backup starting at this page, in case it fills up. Do all this + // independently for each language. + if (AdaptiveClassifierIsFull()) { + SwitchAdaptiveClassifier(); + } else if (!AdaptiveClassifierIsEmpty()) { + StartBackupAdaptiveClassifier(); + } + // Now check the sub-langs as well. + for (auto &lang : sub_langs_) { + if (lang->AdaptiveClassifierIsFull()) { + lang->SwitchAdaptiveClassifier(); + } else if (!lang->AdaptiveClassifierIsEmpty()) { + lang->StartBackupAdaptiveClassifier(); + } + } + +#endif // ndef DISABLED_LEGACY_ENGINE + + // Set up all words ready for recognition, so that if parallelism is on + // all the input and output classes are ready to run the classifier. + std::vector<WordData> words; + SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words); +#ifndef DISABLED_LEGACY_ENGINE + if (tessedit_parallelize) { + PrerecAllWordsPar(words); + } +#endif // ndef DISABLED_LEGACY_ENGINE + + stats_.word_count = words.size(); + + stats_.dict_words = 0; + stats_.doc_blob_quality = 0; + stats_.doc_outline_errs = 0; + stats_.doc_char_quality = 0; + stats_.good_char_count = 0; + stats_.doc_good_char_quality = 0; + + most_recently_used_ = this; + // Run pass 1 word recognition. + if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) { + return false; + } + // Pass 1 post-processing. + for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { + if (page_res_it.word()->word->flag(W_REP_CHAR)) { + fix_rep_char(&page_res_it); + continue; + } + + // Count dict words. 
+ if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) { + ++(stats_.dict_words); + } + + // Update misadaption log (we only need to do it on pass 1, since + // adaption only happens on this pass). + if (page_res_it.word()->blamer_bundle != nullptr && + page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) { + page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug()); + } + } + } + + if (dopasses == 1) { + return true; + } + +#ifndef DISABLED_LEGACY_ENGINE + + // ****************** Pass 2 ******************* + if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) { + page_res_it.restart_page(); + std::vector<WordData> words; + SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words); + if (tessedit_parallelize) { + PrerecAllWordsPar(words); + } + most_recently_used_ = this; + // Run pass 2 word recognition. + if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) { + return false; + } + } + + // The next passes are only required for Tess-only. + if (AnyTessLang() && !AnyLSTMLang()) { + // ****************** Pass 3 ******************* + // Fix fuzzy spaces. + + if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word && + !right_to_left()) { + fix_fuzzy_spaces(monitor, stats_.word_count, page_res); + } + + // ****************** Pass 4 ******************* + if (tessedit_enable_dict_correction) { + dictionary_correction_pass(page_res); + } + if (tessedit_enable_bigram_correction) { + bigram_correction_pass(page_res); + } + + // ****************** Pass 5,6 ******************* + rejection_passes(page_res, monitor, target_word_box, word_config); + + // ****************** Pass 8 ******************* + font_recognition_pass(page_res); + + // ****************** Pass 9 ******************* + // Check the correctness of the final results. + blamer_pass(page_res); + script_pos_pass(page_res); + } + +#endif // ndef DISABLED_LEGACY_ENGINE + + // Write results pass. + // This is now redundant, but retained commented so show how to obtain + // bounding boxes and style information. + +#ifndef DISABLED_LEGACY_ENGINE + // changed by jetsoft + // needed for dll to output memory structure + if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) { + output_pass(page_res_it, target_word_box); + } +// end jetsoft +#endif // ndef DISABLED_LEGACY_ENGINE + + const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode)); + textord_.CleanupSingleRowResult(pageseg_mode, page_res); + + // Remove empty words, as these mess up the result iterators. + for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { + const WERD_RES *word = page_res_it.word(); + const POLY_BLOCK *pb = page_res_it.block()->block != nullptr + ? 
page_res_it.block()->block->pdblk.poly_block() + : nullptr; + if (word->best_choice == nullptr || word->best_choice->empty() || + (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) { + page_res_it.DeleteCurrentWord(); + } + } + + if (monitor != nullptr) { + monitor->progress = 100; + } + return true; +} + +#ifndef DISABLED_LEGACY_ENGINE + +void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { + PAGE_RES_IT word_it(page_res); + + WERD_RES *w_prev = nullptr; + WERD_RES *w = word_it.word(); + while (true) { + w_prev = w; + while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) { + // advance word_it, skipping over parts of combos + } + if (!word_it.word()) { + break; + } + w = word_it.word(); + if (!w || !w_prev || w->uch_set != w_prev->uch_set) { + continue; + } + if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) { + if (tessedit_bigram_debug) { + tprintf("Skipping because one of the words is W_REP_CHAR\n"); + } + continue; + } + // Two words sharing the same language model, excellent! + std::vector<WERD_CHOICE *> overrides_word1; + std::vector<WERD_CHOICE *> overrides_word2; + + const auto &orig_w1_str = w_prev->best_choice->unichar_string(); + const auto &orig_w2_str = w->best_choice->unichar_string(); + WERD_CHOICE prev_best(w->uch_set); + { + int w1start, w1end; + w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end); + prev_best = w_prev->best_choice->shallow_copy(w1start, w1end); + } + WERD_CHOICE this_best(w->uch_set); + { + int w2start, w2end; + w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end); + this_best = w->best_choice->shallow_copy(w2start, w2end); + } + + if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) { + if (tessedit_bigram_debug) { + tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(), + orig_w2_str.c_str()); + } + continue; + } + if (tessedit_bigram_debug > 2) { + tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str()); + } + if (tessedit_bigram_debug > 1) { + if (!w_prev->best_choices.singleton()) { + w_prev->PrintBestChoices(); + } + if (!w->best_choices.singleton()) { + w->PrintBestChoices(); + } + } + float best_rating = 0.0; + int best_idx = 0; + WERD_CHOICE_IT prev_it(&w_prev->best_choices); + for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) { + WERD_CHOICE *p1 = prev_it.data(); + WERD_CHOICE strip1(w->uch_set); + { + int p1start, p1end; + p1->GetNonSuperscriptSpan(&p1start, &p1end); + strip1 = p1->shallow_copy(p1start, p1end); + } + WERD_CHOICE_IT w_it(&w->best_choices); + for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { + WERD_CHOICE *p2 = w_it.data(); + WERD_CHOICE strip2(w->uch_set); + { + int p2start, p2end; + p2->GetNonSuperscriptSpan(&p2start, &p2end); + strip2 = p2->shallow_copy(p2start, p2end); + } + if (w->tesseract->getDict().valid_bigram(strip1, strip2)) { + overrides_word1.push_back(p1); + overrides_word2.push_back(p2); + if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) { + best_rating = p1->rating() + p2->rating(); + best_idx = overrides_word1.size() - 1; + } + } + } + } + if (!overrides_word1.empty()) { + // Excellent, we have some bigram matches. 
+ if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) && + EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) { + if (tessedit_bigram_debug > 1) { + tprintf( + "Top choice \"%s %s\" verified (sans case) by bigram " + "model.\n", + orig_w1_str.c_str(), orig_w2_str.c_str()); + } + continue; + } + const auto &new_w1_str = overrides_word1[best_idx]->unichar_string(); + const auto &new_w2_str = overrides_word2[best_idx]->unichar_string(); + if (new_w1_str != orig_w1_str) { + w_prev->ReplaceBestChoice(overrides_word1[best_idx]); + } + if (new_w2_str != orig_w2_str) { + w->ReplaceBestChoice(overrides_word2[best_idx]); + } + if (tessedit_bigram_debug > 0) { + std::string choices_description; + int num_bigram_choices = overrides_word1.size() * overrides_word2.size(); + if (num_bigram_choices == 1) { + choices_description = "This was the unique bigram choice."; + } else { + if (tessedit_bigram_debug > 1) { + std::string bigrams_list; + const int kMaxChoicesToPrint = 20; + for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) { + if (i > 0) { + bigrams_list += ", "; + } + WERD_CHOICE *p1 = overrides_word1[i]; + WERD_CHOICE *p2 = overrides_word2[i]; + bigrams_list += p1->unichar_string() + " " + p2->unichar_string(); + } + choices_description = "There were many choices: {"; + choices_description += bigrams_list; + choices_description += "}"; + } else { + choices_description += "There were " + std::to_string(num_bigram_choices); + choices_description += " compatible bigrams."; + } + } + tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(), + orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(), + choices_description.c_str()); + } + } + } +} + +void Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, + const TBOX *target_word_box, const char *word_config) { + PAGE_RES_IT page_res_it(page_res); + // ****************** Pass 5 ******************* + // Gather statistics on rejects. + int word_index = 0; + while (!tessedit_test_adaption && page_res_it.word() != nullptr) { + WERD_RES *word = page_res_it.word(); + word_index++; + if (monitor != nullptr) { + monitor->ocr_alive = true; + monitor->progress = 95 + 5 * word_index / stats_.word_count; + } + if (word->rebuild_word == nullptr) { + // Word was not processed by tesseract. 
+ page_res_it.forward(); + continue; + } + check_debug_pt(word, 70); + + // changed by jetsoft + // specific to its needs to extract one word when need + if (target_word_box && + !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) { + page_res_it.forward(); + continue; + } + // end jetsoft + + page_res_it.rej_stat_word(); + const int chars_in_word = word->reject_map.length(); + const int rejects_in_word = word->reject_map.reject_count(); + + const int blob_quality = word_blob_quality(word); + stats_.doc_blob_quality += blob_quality; + const int outline_errs = word_outline_errs(word); + stats_.doc_outline_errs += outline_errs; + int16_t all_char_quality; + int16_t accepted_all_char_quality; + word_char_quality(word, &all_char_quality, &accepted_all_char_quality); + stats_.doc_char_quality += all_char_quality; + const uint8_t permuter_type = word->best_choice->permuter(); + if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) || + (permuter_type == USER_DAWG_PERM)) { + stats_.good_char_count += chars_in_word - rejects_in_word; + stats_.doc_good_char_quality += accepted_all_char_quality; + } + check_debug_pt(word, 80); + if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) { + word->reject_map.rej_word_bad_quality(); + } + check_debug_pt(word, 90); + page_res_it.forward(); + } + + if (tessedit_debug_quality_metrics) { + tprintf( + "QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f" + " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n", + page_res->char_count, page_res->rej_count, + page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality, + stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs, + stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality, + stats_.doc_char_quality / static_cast<float>(page_res->char_count), + stats_.doc_good_char_quality, + (stats_.good_char_count > 0) + ? 
(stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count)) + : 0.0); + } + bool good_quality_doc = + ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) && + (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) && + (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) && + (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc); + + // ****************** Pass 6 ******************* + // Do whole document or whole block rejection pass + if (!tessedit_test_adaption) { + quality_based_rejection(page_res_it, good_quality_doc); + } +} + +#endif // ndef DISABLED_LEGACY_ENGINE + +void Tesseract::blamer_pass(PAGE_RES *page_res) { + if (!wordrec_run_blamer) { + return; + } + PAGE_RES_IT page_res_it(page_res); + for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word); + page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++; + } + tprintf("Blame reasons:\n"); + for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) { + tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(static_cast<IncorrectResultReason>(bl)), + page_res->blame_reasons[bl]); + } + if (page_res->misadaption_log.size() > 0) { + tprintf("Misadaption log:\n"); + for (auto &log : page_res->misadaption_log) { + tprintf("%s\n", log.c_str()); + } + } +} + +// Sets script positions and detects smallcaps on all output words. +void Tesseract::script_pos_pass(PAGE_RES *page_res) { + PAGE_RES_IT page_res_it(page_res); + for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + if (word->word->flag(W_REP_CHAR)) { + page_res_it.forward(); + continue; + } + const float x_height = page_res_it.block()->block->x_height(); + float word_x_height = word->x_height; + if (word_x_height < word->best_choice->min_x_height() || + word_x_height > word->best_choice->max_x_height()) { + word_x_height = + (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f; + } + // Test for small caps. Word capheight must be close to block xheight, + // and word must contain no lower case letters, and at least one upper case. + const double small_cap_xheight = x_height * kXHeightCapRatio; + const double small_cap_delta = (x_height - small_cap_xheight) / 2.0; + if (word->uch_set->script_has_xheight() && + small_cap_xheight - small_cap_delta <= word_x_height && + word_x_height <= small_cap_xheight + small_cap_delta) { + // Scan for upper/lower. + int num_upper = 0; + int num_lower = 0; + for (unsigned i = 0; i < word->best_choice->length(); ++i) { + if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) { + ++num_upper; + } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) { + ++num_lower; + } + } + if (num_upper > 0 && num_lower == 0) { + word->small_caps = true; + } + } + word->SetScriptPositions(); + } +} + +// Helper finds the gap between the index word and the next. 
+static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) { + *right = -INT32_MAX; + *next_left = INT32_MAX; + if (index < words.size()) { + *right = words[index]->word->bounding_box().right(); + if (index + 1 < words.size()) { + *next_left = words[index + 1]->word->bounding_box().left(); + } + } +} + +// Factored helper computes the rating, certainty, badness and validity of +// the permuter of the words in [first_index, end_index). +static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index, + float *rating, float *certainty, bool *bad, bool *valid_permuter) { + if (end_index <= first_index) { + *bad = true; + *valid_permuter = false; + } + for (unsigned index = first_index; index < end_index && index < words.size(); ++index) { + WERD_CHOICE *choice = words[index]->best_choice; + if (choice == nullptr) { + *bad = true; + } else { + *rating += choice->rating(); + *certainty = std::min(*certainty, choice->certainty()); + if (!Dict::valid_word_permuter(choice->permuter(), false)) { + *valid_permuter = false; + } + } + } +} + +// Helper chooses the best combination of words, transferring good ones from +// new_words to best_words. To win, a new word must have (better rating and +// certainty) or (better permuter status and rating within rating ratio and +// certainty within certainty margin) than current best. +// All the new_words are consumed (moved to best_words or deleted.) +// The return value is the number of new_words used minus the number of +// best_words that remain in the output. +static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug, + PointerVector<WERD_RES> *new_words, + PointerVector<WERD_RES> *best_words) { + // Process the smallest groups of words that have an overlapping word + // boundary at the end. + std::vector<WERD_RES *> out_words; + // Index into each word vector (best, new). + unsigned b = 0, n = 0; + int num_best = 0, num_new = 0; + while (b < best_words->size() || n < new_words->size()) { + // Start of the current run in each. + auto start_b = b, start_n = n; + while (b < best_words->size() || n < new_words->size()) { + int b_right = -INT32_MAX; + int next_b_left = INT32_MAX; + WordGap(*best_words, b, &b_right, &next_b_left); + int n_right = -INT32_MAX; + int next_n_left = INT32_MAX; + WordGap(*new_words, n, &n_right, &next_n_left); + if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) { + // The word breaks overlap. [start_b,b] and [start_n, n] match. + break; + } + // Keep searching for the matching word break. + if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) { + ++b; + } else { + ++n; + } + } + // Rating of the current run in each. + float b_rating = 0.0f, n_rating = 0.0f; + // Certainty of the current run in each. + float b_certainty = 0.0f, n_certainty = 0.0f; + // True if any word is missing its best choice. + bool b_bad = false, n_bad = false; + // True if all words have a valid permuter. + bool b_valid_permuter = true, n_valid_permuter = true; + const int end_b = b < best_words->size() ? b + 1 : b; + const int end_n = n < new_words->size() ? 
n + 1 : n; + EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad, + &b_valid_permuter); + EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad, + &n_valid_permuter); + bool new_better = false; + if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) || + (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio && + n_certainty > b_certainty - certainty_margin))) { + // New is better. + for (int i = start_n; i < end_n; ++i) { + out_words.push_back((*new_words)[i]); + (*new_words)[i] = nullptr; + ++num_new; + } + new_better = true; + } else if (!b_bad) { + // Current best is better. + for (int i = start_b; i < end_b; ++i) { + out_words.push_back((*best_words)[i]); + (*best_words)[i] = nullptr; + ++num_best; + } + } + if (debug) { + tprintf( + "%d new words %s than %d old words: r: %g v %g c: %g v %g" + " valid dict: %d v %d\n", + end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating, + n_certainty, b_certainty, n_valid_permuter, b_valid_permuter); + } + // Move on to the next group. + b = end_b; + n = end_n; + } + // Transfer from out_words to best_words. + best_words->clear(); + for (auto &out_word : out_words) { + best_words->push_back(out_word); + } + return num_new - num_best; +} + +// Helper to recognize the word using the given (language-specific) tesseract. +// Returns positive if this recognizer found more new best words than the +// number kept from best_words. +int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, + WERD_RES **in_word, PointerVector<WERD_RES> *best_words) { + if (debug) { + tprintf("Trying word using lang %s, oem %d\n", lang.c_str(), + static_cast<int>(tessedit_ocr_engine_mode)); + } + // Run the recognizer on the word. + PointerVector<WERD_RES> new_words; + (this->*recognizer)(word_data, in_word, &new_words); + if (new_words.empty()) { + // Transfer input word to new_words, as the classifier must have put + // the result back in the input. + new_words.push_back(*in_word); + *in_word = nullptr; + } + if (debug) { + for (unsigned i = 0; i < new_words.size(); ++i) { + new_words[i]->DebugTopChoice("Lang result"); + } + } + // Initial version is a bit of a hack based on better certainty and rating + // or a dictionary vs non-dictionary word. + return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug, + &new_words, best_words); +} + +// Helper returns true if all the words are acceptable. +static bool WordsAcceptable(const PointerVector<WERD_RES> &words) { + for (unsigned w = 0; w < words.size(); ++w) { + if (words[w]->tess_failed || !words[w]->tess_accepted) { + return false; + } + } + return true; +} + +#ifndef DISABLED_LEGACY_ENGINE + +// Moves good-looking "noise"/diacritics from the reject list to the main +// blob list on the current word. Returns true if anything was done, and +// sets make_next_word_fuzzy if blob(s) were added to the end of the word. +bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) { + *make_next_word_fuzzy = false; + WERD *real_word = pr_it->word()->word; + if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() || + real_word->rej_cblob_list()->length() > noise_maxperword) { + return false; + } + real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle); + // Get the noise outlines into a vector with matching bool map. 
+ std::vector<C_OUTLINE *> outlines; + real_word->GetNoiseOutlines(&outlines); + std::vector<bool> word_wanted; + std::vector<bool> overlapped_any_blob; + std::vector<C_BLOB *> target_blobs; + AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted, + &overlapped_any_blob, &target_blobs); + // Filter the outlines that overlapped any blob and put them into the word + // now. This simplifies the remaining task and also makes it more accurate + // as it has more completed blobs to work on. + std::vector<bool> wanted; + std::vector<C_BLOB *> wanted_blobs; + std::vector<C_OUTLINE *> wanted_outlines; + int num_overlapped = 0; + int num_overlapped_used = 0; + for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) { + if (overlapped_any_blob[i]) { + ++num_overlapped; + if (word_wanted[i]) { + ++num_overlapped_used; + } + wanted.push_back(word_wanted[i]); + wanted_blobs.push_back(target_blobs[i]); + wanted_outlines.push_back(outlines[i]); + outlines[i] = nullptr; + } + } + real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr); + AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs); + // TODO: check code. + int non_overlapped = 0; + int non_overlapped_used = 0; + for (unsigned i = 0; i < word_wanted.size(); ++i) { + if (word_wanted[i]) { + ++non_overlapped_used; + } + if (outlines[i] != nullptr) { + ++non_overlapped_used; + } + } + if (debug_noise_removal) { + tprintf("Used %d/%d overlapped %d/%d non-overlapped diacritics on word:", num_overlapped_used, + num_overlapped, non_overlapped_used, non_overlapped); + real_word->bounding_box().print(); + } + // Now we have decided which outlines we want, put them into the real_word. + if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) { + pr_it->MakeCurrentWordFuzzy(); + } + // TODO(rays) Parts of combos have a deep copy of the real word, and need + // to have their noise outlines moved/assigned in the same way!! + return num_overlapped_used != 0 || non_overlapped_used != 0; +} + +// Attempts to put noise/diacritic outlines into the blobs that they overlap. +// Input: a set of noisy outlines that probably belong to the real_word. +// Output: word_wanted indicates which outlines are to be assigned to a blob, +// target_blobs indicates which to assign to, and overlapped_any_blob is +// true for all outlines that overlapped a blob. +void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, + int pass, WERD *real_word, PAGE_RES_IT *pr_it, + std::vector<bool> *word_wanted, + std::vector<bool> *overlapped_any_blob, + std::vector<C_BLOB *> *target_blobs) { + std::vector<bool> blob_wanted; + word_wanted->clear(); + word_wanted->resize(outlines.size()); + overlapped_any_blob->clear(); + overlapped_any_blob->resize(outlines.size()); + target_blobs->clear(); + target_blobs->resize(outlines.size()); + // For each real blob, find the outlines that seriously overlap it. + // A single blob could be several merged characters, so there can be quite + // a few outlines overlapping, and the full engine needs to be used to chop + // and join to get a sensible result. 
+ C_BLOB_IT blob_it(real_word->cblob_list()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB *blob = blob_it.data(); + const TBOX blob_box = blob->bounding_box(); + blob_wanted.clear(); + blob_wanted.resize(outlines.size()); + int num_blob_outlines = 0; + for (unsigned i = 0; i < outlines.size(); ++i) { + if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) { + blob_wanted[i] = true; + (*overlapped_any_blob)[i] = true; + ++num_blob_outlines; + } + } + if (debug_noise_removal) { + tprintf("%d noise outlines overlap blob at:", num_blob_outlines); + blob_box.print(); + } + // If any outlines overlap the blob, and not too many, classify the blob + // (using the full engine, languages and all), and choose the maximal + // combination of outlines that doesn't hurt the end-result classification + // by too much. Mark them as wanted. + if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) { + if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines, + num_blob_outlines, &blob_wanted)) { + for (unsigned i = 0; i < blob_wanted.size(); ++i) { + if (blob_wanted[i]) { + // Claim the outline and record where it is going. + (*word_wanted)[i] = true; + (*target_blobs)[i] = blob; + } + } + } + } + } +} + +// Attempts to assign non-overlapping outlines to their nearest blobs or +// make new blobs out of them. +void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass, + WERD *real_word, PAGE_RES_IT *pr_it, + std::vector<bool> *word_wanted, + std::vector<C_BLOB *> *target_blobs) { + std::vector<bool> blob_wanted; + word_wanted->clear(); + word_wanted->resize(outlines.size()); + target_blobs->clear(); + target_blobs->resize(outlines.size()); + // Check for outlines that need to be turned into stand-alone blobs. + for (unsigned i = 0; i < outlines.size(); ++i) { + if (outlines[i] == nullptr) { + continue; + } + // Get a set of adjacent outlines that don't overlap any existing blob. + blob_wanted.clear(); + blob_wanted.resize(outlines.size()); + int num_blob_outlines = 0; + TBOX total_ol_box(outlines[i]->bounding_box()); + while (i < outlines.size() && outlines[i] != nullptr) { + blob_wanted[i] = true; + total_ol_box += outlines[i]->bounding_box(); + ++i; + ++num_blob_outlines; + } + // Find the insertion point. + C_BLOB_IT blob_it(real_word->cblob_list()); + while (!blob_it.at_last() && + blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) { + blob_it.forward(); + } + // Choose which combination of them we actually want and where to put + // them. + if (debug_noise_removal) { + tprintf("Num blobless outlines = %d\n", num_blob_outlines); + } + C_BLOB *left_blob = blob_it.data(); + TBOX left_box = left_blob->bounding_box(); + C_BLOB *right_blob = blob_it.at_last() ? 
nullptr : blob_it.data_relative(1); + if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr || + !right_blob->bounding_box().x_overlap(total_ol_box)) && + SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines, + num_blob_outlines, &blob_wanted)) { + if (debug_noise_removal) { + tprintf("Added to left blob\n"); + } + for (unsigned j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = left_blob; + } + } + } else if (right_blob != nullptr && + (!left_box.x_overlap(total_ol_box) || + right_blob->bounding_box().x_overlap(total_ol_box)) && + SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines, + num_blob_outlines, &blob_wanted)) { + if (debug_noise_removal) { + tprintf("Added to right blob\n"); + } + for (unsigned j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = right_blob; + } + } + } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines, + num_blob_outlines, &blob_wanted)) { + if (debug_noise_removal) { + tprintf("Fitted between blobs\n"); + } + for (unsigned j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = nullptr; + } + } + } + } +} + +// Starting with ok_outlines set to indicate which outlines overlap the blob, +// chooses the optimal set (approximately) and returns true if any outlines +// are desired, in which case ok_outlines indicates which ones. +bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, + C_BLOB *blob, + const std::vector<C_OUTLINE *> &outlines, + int num_outlines, std::vector<bool> *ok_outlines) { + float target_cert = certainty_threshold; + if (blob != nullptr) { + std::string best_str; + float target_c2; + target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2); + if (debug_noise_removal) { + tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert, + target_c2); + blob->bounding_box().print(); + } + target_cert -= (target_cert - certainty_threshold) * noise_cert_factor; + } + std::vector<bool> test_outlines = *ok_outlines; + // Start with all the outlines in. + std::string all_str; + std::vector<bool> best_outlines = *ok_outlines; + float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str); + if (debug_noise_removal) { + TBOX ol_box; + for (unsigned i = 0; i < test_outlines.size(); ++i) { + if (test_outlines[i]) { + ol_box += outlines[i]->bounding_box(); + } + } + tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert, + best_cert - target_cert); + ol_box.print(); + } + // Iteratively zero out the bit that improves the certainty the most, until + // we get past the threshold, have zero bits, or fail to improve. + int best_index = 0; // To zero out. + while (num_outlines > 1 && best_index >= 0 && + (blob == nullptr || best_cert < target_cert || blob != nullptr)) { + // Find the best bit to zero out. + best_index = -1; + for (unsigned i = 0; i < outlines.size(); ++i) { + if (test_outlines[i]) { + test_outlines[i] = false; + std::string str; + float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str); + if (debug_noise_removal) { + TBOX ol_box; + for (unsigned j = 0; j < outlines.size(); ++j) { + if (test_outlines[j]) { + ol_box += outlines[j]->bounding_box(); + } + tprintf("%c", test_outlines[j] ? 
'T' : 'F'); + } + tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert, + cert - target_cert); + ol_box.print(); + } + if (cert > best_cert) { + best_cert = cert; + best_index = i; + best_outlines = test_outlines; + } + test_outlines[i] = true; + } + } + if (best_index >= 0) { + test_outlines[best_index] = false; + --num_outlines; + } + } + if (best_cert >= target_cert) { + // Save the best combination. + *ok_outlines = best_outlines; + if (debug_noise_removal) { + tprintf("%s noise combination ", blob ? "Adding" : "New"); + for (auto &&best_outline : best_outlines) { + tprintf("%c", best_outline ? 'T' : 'F'); + } + tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert); + } + return true; + } + + return false; +} + +// Classifies the given blob plus the outlines flagged by ok_outlines, undoes +// the inclusion of the outlines, and returns the certainty of the raw choice. +float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines, + const std::vector<C_OUTLINE *> &outlines, int pass_n, + PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) { + C_OUTLINE_IT ol_it; + C_OUTLINE *first_to_keep = nullptr; + C_BLOB *local_blob = nullptr; + if (blob != nullptr) { + // Add the required outlines to the blob. + ol_it.set_to_list(blob->out_list()); + first_to_keep = ol_it.data(); + } + for (unsigned i = 0; i < ok_outlines.size(); ++i) { + if (ok_outlines[i]) { + // This outline is to be added. + if (blob == nullptr) { + local_blob = new C_BLOB(outlines[i]); + blob = local_blob; + ol_it.set_to_list(blob->out_list()); + } else { + ol_it.add_before_stay_put(outlines[i]); + } + } + } + float c2; + float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2); + ol_it.move_to_first(); + if (first_to_keep == nullptr) { + // We created blob. Empty its outlines and delete it. + for (; !ol_it.empty(); ol_it.forward()) { + ol_it.extract(); + } + delete local_blob; + cert = -c2; + } else { + // Remove the outlines that we put in. + for (; ol_it.data() != first_to_keep; ol_it.forward()) { + ol_it.extract(); + } + } + return cert; +} + +// Classifies the given blob (part of word_data->word->word) as an individual +// word, using languages, chopper etc, returning only the certainty of the +// best raw choice, and undoing all the work done to fake out the word. +float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, + float *c2) { + WERD *real_word = pr_it->word()->word; + WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL), + C_BLOB::deep_copy(blob)); + WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word); + // Get a new iterator that points to the new word. + PAGE_RES_IT it(pr_it->page_res); + while (it.word() != word_res && it.word() != nullptr) { + it.forward(); + } + ASSERT_HOST(it.word() == word_res); + WordData wd(it); + // Force full initialization. + SetupWordPassN(1, &wd); + classify_word_and_language(pass_n, &it, &wd); + if (debug_noise_removal) { + if (wd.word->raw_choice != nullptr) { + tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(), + wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height()); + } else { + tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height, + wd.row->x_height()); + } + } + float cert = 0.0f; + if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but... 
+ cert = wd.word->raw_choice->certainty(); + float rat = wd.word->raw_choice->rating(); + *c2 = rat > 0.0f ? cert * cert / rat : 0.0f; + best_str = wd.word->raw_choice->unichar_string(); + } else { + *c2 = 0.0f; + best_str.clear(); + } + it.DeleteCurrentWord(); + pr_it->ResetWordIterator(); + return cert; +} + +#endif // ndef DISABLED_LEGACY_ENGINE + +// Generic function for classifying a word. Can be used either for pass1 or +// pass2 according to the function passed to recognizer. +// word_data holds the word to be recognized, and its block and row, and +// pr_it points to the word as well, in case we are running LSTM and it wants +// to output multiple words. +// Recognizes in the current language, and if successful that is all. +// If recognition was not successful, tries all available languages until +// it gets a successful result or runs out of languages. Keeps the best result. +void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) { +#ifdef DISABLED_LEGACY_ENGINE + WordRecognizer recognizer = &Tesseract::classify_word_pass1; +#else + WordRecognizer recognizer = + pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2; +#endif // def DISABLED_LEGACY_ENGINE + + // Best result so far. + PointerVector<WERD_RES> best_words; + // Points to the best result. May be word or in lang_words. + const WERD_RES *word = word_data->word; + clock_t total_time = 0; + const bool timing_debug = tessedit_timing_debug; + if (timing_debug) { + total_time = clock(); + } + const bool debug = classify_debug_level > 0 || multilang_debug_level > 0; + if (debug) { + tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing", + most_recently_used_->lang.c_str()); + word->word->bounding_box().print(); + } + if (word->done) { + // If done on pass1, leave it as-is. + if (!word->tess_failed) { + most_recently_used_ = word->tesseract; + } + return; + } + auto sub = sub_langs_.size(); + if (most_recently_used_ != this) { + // Get the index of the most_recently_used_. + for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) { + } + } + most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub], + &best_words); + Tesseract *best_lang_tess = most_recently_used_; + if (!WordsAcceptable(best_words)) { + // Try all the other languages to see if they are any better. + if (most_recently_used_ != this && + this->RetryWithLanguage(*word_data, recognizer, debug, + &word_data->lang_words[sub_langs_.size()], &best_words) > 0) { + best_lang_tess = this; + } + for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) { + if (most_recently_used_ != sub_langs_[i] && + sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i], + &best_words) > 0) { + best_lang_tess = sub_langs_[i]; + } + } + } + most_recently_used_ = best_lang_tess; + if (!best_words.empty()) { + if (best_words.size() == 1 && !best_words[0]->combination) { + // Move the best single result to the main word. + word_data->word->ConsumeWordResults(best_words[0]); + } else { + // Words came from LSTM, and must be moved to the PAGE_RES properly. 
+ word_data->word = best_words.back(); + pr_it->ReplaceCurrentWord(&best_words); + } + ASSERT_HOST(word_data->word->box_word != nullptr); + } else { + tprintf("no best words!!\n"); + } + if (timing_debug) { + total_time = clock() - total_time; + tesserr << word_data->word->best_choice->unichar_string() + << " (ocr took " << 1000 * total_time / CLOCKS_PER_SEC << " ms)\n"; + } +} + +/** + * classify_word_pass1 + * + * Baseline normalize the word and pass it to Tess. + */ + +void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word, + PointerVector<WERD_RES> *out_words) { + ROW *row = word_data.row; + BLOCK *block = word_data.block; + prev_word_best_choice_ = + word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr; +#ifdef DISABLED_LEGACY_ENGINE + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { +#else + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY || + tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { +#endif // def DISABLED_LEGACY_ENGINE + if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { + LSTMRecognizeWord(*block, row, *in_word, out_words); + if (!out_words->empty()) { + return; // Successful lstm recognition. + } + } + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { + // No fallback allowed, so use a fake. + (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset()); + return; + } + +#ifndef DISABLED_LEGACY_ENGINE + // Fall back to tesseract for failed words or odd words. + (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr, + classify_bln_numeric_mode, textord_use_cjk_fp_model, + poly_allow_detailed_fx, row, block); +#endif // ndef DISABLED_LEGACY_ENGINE + } + +#ifndef DISABLED_LEGACY_ENGINE + WERD_RES *word = *in_word; + match_word_pass_n(1, word, row, block); + if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { + word->tess_would_adapt = AdaptableWord(word); + bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode); + + if (adapt_ok) { + // Send word to adaptive classifier for training. + word->BestChoiceToCorrectText(); + LearnWord(nullptr, word); + // Mark misadaptions if running blamer. + if (word->blamer_bundle != nullptr) { + word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer); + } + } + + if (tessedit_enable_doc_dict && !word->IsAmbiguous()) { + tess_add_doc_word(word->best_choice); + } + } +#endif // ndef DISABLED_LEGACY_ENGINE +} + +// Helper to report the result of the xheight fix. +void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, + WERD_RES *new_word) { + tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(), + word->best_choice->debug_string().c_str()); + word->reject_map.print(debug_fp); + tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(), + new_word->best_choice->debug_string().c_str()); + new_word->reject_map.print(debug_fp); + tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT", + new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK", + accept_new_word ? "ACCEPTED" : ""); +} + +#ifndef DISABLED_LEGACY_ENGINE + +// Run the x-height fix-up, based on min/max top/bottom information in +// unicharset. +// Returns true if the word was changed. +// See the comment in fixxht.cpp for a description of the overall process. 
+bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) { + int original_misfits = CountMisfitTops(word); + if (original_misfits == 0) { + return false; + } + float baseline_shift = 0.0f; + float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift); + if (baseline_shift != 0.0f) { + // Try the shift on its own first. + if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) { + return false; + } + original_misfits = CountMisfitTops(word); + if (original_misfits > 0) { + float new_baseline_shift; + // Now recompute the new x_height. + new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift); + if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { + // No test of return value here, as we are definitely making a change + // to the word by shifting the baseline. + TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row); + } + } + return true; + } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { + return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row); + } else { + return false; + } +} + +// Runs recognition with the test baseline shift and x-height and returns true +// if there was an improvement in recognition result. +bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, + WERD_RES *word, BLOCK *block, ROW *row) { + bool accept_new_x_ht = false; + WERD_RES new_x_ht_word(word->word); + if (word->blamer_bundle != nullptr) { + new_x_ht_word.blamer_bundle = new BlamerBundle(); + new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); + } + new_x_ht_word.x_height = new_x_ht; + new_x_ht_word.baseline_shift = baseline_shift; + new_x_ht_word.caps_height = 0.0; + new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr, + classify_bln_numeric_mode, textord_use_cjk_fp_model, + poly_allow_detailed_fx, row, block); + match_word_pass_n(2, &new_x_ht_word, row, block); + if (!new_x_ht_word.tess_failed) { + int new_misfits = CountMisfitTops(&new_x_ht_word); + if (debug_x_ht_level >= 1) { + tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits, + word->x_height, new_misfits, new_x_ht); + tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(), + word->best_choice->certainty(), new_x_ht_word.best_choice->rating(), + new_x_ht_word.best_choice->certainty()); + } + // The misfits must improve and either the rating or certainty. + accept_new_x_ht = new_misfits < original_misfits && + (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() || + new_x_ht_word.best_choice->rating() < word->best_choice->rating()); + if (debug_x_ht_level >= 1) { + ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); + } + } + if (accept_new_x_ht) { + word->ConsumeWordResults(&new_x_ht_word); + return true; + } + return false; +} + +#endif // ndef DISABLED_LEGACY_ENGINE + +/** + * classify_word_pass2 + * + * Control what to do with the word in pass 2 + */ + +void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word, + PointerVector<WERD_RES> *out_words) { + // Return if we do not want to run Tesseract. + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { + return; + } +#ifndef DISABLED_LEGACY_ENGINE + ROW *row = word_data.row; + BLOCK *block = word_data.block; + WERD_RES *word = *in_word; + prev_word_best_choice_ = + word_data.prev_word != nullptr ? 
word_data.prev_word->word->best_choice : nullptr;
+
+  check_debug_pt(word, 30);
+  if (!word->done) {
+    word->caps_height = 0.0;
+    if (word->x_height == 0.0f) {
+      word->x_height = row->x_height();
+    }
+    match_word_pass_n(2, word, row, block);
+    check_debug_pt(word, 40);
+  }
+
+  SubAndSuperscriptFix(word);
+
+  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
+    if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
+        block->classify_rotation().y() == 0.0f) {
+      // Use the tops and bottoms since they are available.
+      TrainedXheightFix(word, block, row);
+    }
+  }
+# ifndef GRAPHICS_DISABLED
+  if (tessedit_display_outwords) {
+    if (fx_win == nullptr) {
+      create_fx_win();
+    }
+    clear_fx_win();
+    word->rebuild_word->plot(fx_win);
+    TBOX wbox = word->rebuild_word->bounding_box();
+    fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
+    ScrollView::Update();
+  }
+# endif
+  check_debug_pt(word, 50);
+#endif // ndef DISABLED_LEGACY_ENGINE
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+/**
+ * match_word_pass_n
+ *
+ * Baseline normalize the word and pass it to Tess.
+ */
+void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) {
+  if (word->tess_failed) {
+    return;
+  }
+  tess_segment_pass_n(pass_n, word);
+
+  if (!word->tess_failed) {
+    if (!word->word->flag(W_REP_CHAR)) {
+      word->fix_quotes();
+      if (tessedit_fix_hyphens) {
+        word->fix_hyphens();
+      }
+      /* Don't trust fix_quotes! - though I think I've fixed the bug */
+      if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
+        tprintf(
+            "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
+            " #Blobs=%u\n",
+            word->best_choice->debug_string().c_str(), word->best_choice->length(),
+            word->box_word->length());
+      }
+      word->tess_accepted = tess_acceptable_word(word);
+
+      // Also sets word->done flag
+      make_reject_map(word, row, pass_n);
+    }
+  }
+  set_word_fonts(word);
+
+  ASSERT_HOST(word->raw_choice != nullptr);
+}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// Helper to return the best rated BLOB_CHOICE in the whole word that matches
+// the given char_id, or nullptr if none can be found.
+static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
+  // Find the corresponding best BLOB_CHOICE from any position in the word_res.
+  BLOB_CHOICE *best_choice = nullptr;
+  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
+    BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
+    if (choice != nullptr) {
+      if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
+        best_choice = choice;
+      }
+    }
+  }
+  return best_choice;
+}
+
+// Helper to insert blob_choice in each location in the leader word if there is
+// no matching BLOB_CHOICE there already, and correct any incorrect results
+// in the best_choice.
+static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
+  WERD_CHOICE *word = word_res->best_choice;
+  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
+    BLOB_CHOICE *choice =
+        FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
+    if (choice == nullptr) {
+      BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
+      choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
+    }
+  }
+  // Correct any incorrect results in word.
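+  // For example, if a dot leader was read as ".....,...." and '.' is the
+  // majority character, the minority ',' position is overwritten with '.' here.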
+  for (unsigned i = 0; i < word->length(); ++i) {
+    if (word->unichar_id(i) != blob_choice->unichar_id()) {
+      word->set_unichar_id(blob_choice->unichar_id(), i);
+    }
+  }
+}
+
+/**
+ * fix_rep_char()
+ * The word is a repeated char (a leader). Find the repeated character.
+ * Create the appropriate single-word or multi-word sequence according to
+ * the size of spaces in between blobs, and correct the classifications
+ * where some of the characters disagree with the majority.
+ */
+void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {
+  WERD_RES *word_res = page_res_it->word();
+  const WERD_CHOICE &word = *(word_res->best_choice);
+
+  // Find the frequency of each unique character in the word.
+  SortHelper<UNICHAR_ID> rep_ch(word.length());
+  for (unsigned i = 0; i < word.length(); ++i) {
+    rep_ch.Add(word.unichar_id(i), 1);
+  }
+
+  // Find the most frequent result.
+  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
+  int max_count = rep_ch.MaxCount(&maxch_id);
+  // Find the best exemplar of a classifier result for maxch_id.
+  BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
+  if (best_choice == nullptr) {
+    tprintf("Failed to find a choice for %s, occurring %d times\n",
+            word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
+    return;
+  }
+  word_res->done = true;
+
+  // Just correct existing classification.
+  CorrectRepcharChoices(best_choice, word_res);
+  word_res->reject_map.initialise(word.length());
+}
+
+ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s,
+                                                       const char *lengths) {
+  int i = 0;
+  int offset = 0;
+  int leading_punct_count;
+  int upper_count = 0;
+  int hyphen_pos = -1;
+  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
+
+  if (strlen(lengths) > 20) {
+    return word_type;
+  }
+
+  /* Single leading punctuation char */
+
+  if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) {
+    offset += lengths[i++];
+  }
+  leading_punct_count = i;
+
+  /* Initial cap */
+  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
+    offset += lengths[i++];
+    upper_count++;
+  }
+  if (upper_count > 1) {
+    word_type = AC_UPPER_CASE;
+  } else {
+    /* Lower case word, possibly with an initial cap */
+    while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
+      offset += lengths[i++];
+    }
+    if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
+      goto not_a_word;
+    }
+    /*
+      Allow a single hyphen in a lower case word
+      - don't trust upper case - I've seen several cases of "H" -> "I-I"
+    */
+    if (lengths[i] == 1 && s[offset] == '-') {
+      hyphen_pos = i;
+      offset += lengths[i++];
+      if (s[offset] != '\0') {
+        while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) {
+          offset += lengths[i++];
+        }
+        if (i < hyphen_pos + 3) {
+          goto not_a_word;
+        }
+      }
+    } else {
+      /* Allow "'s" in NON hyphenated lower case words */
+      if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 &&
+          (s[offset + lengths[i]] == 's')) {
+        offset += lengths[i++];
+        offset += lengths[i++];
+      }
+    }
+    if (upper_count > 0) {
+      word_type = AC_INITIAL_CAP;
+    } else {
+      word_type = AC_LOWER_CASE;
+    }
+  }
+
+  /* Up to two different, constrained trailing punctuation chars */
+  if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) {
+    offset += lengths[i++];
+  }
+  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
+      chs_trailing_punct2.contains(s[offset])) {
+    offset += lengths[i++];
+  }
+
+ 
if (s[offset] != '\0') { + word_type = AC_UNACCEPTABLE; + } + +not_a_word: + + if (word_type == AC_UNACCEPTABLE) { + /* Look for abbreviation string */ + i = 0; + offset = 0; + if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) { + word_type = AC_UC_ABBREV; + while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) && + lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { + offset += lengths[i++]; + offset += lengths[i++]; + } + } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) { + word_type = AC_LC_ABBREV; + while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) && + lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { + offset += lengths[i++]; + offset += lengths[i++]; + } + } + if (s[offset] != '\0') { + word_type = AC_UNACCEPTABLE; + } + } + + return word_type; +} + +bool Tesseract::check_debug_pt(WERD_RES *word, int location) { + if (!test_pt) { + return false; + } + + tessedit_rejection_debug.set_value(false); + debug_x_ht_level.set_value(0); + + if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) { + if (location < 0) { + return true; // For breakpoint use + } + bool show_map_detail = false; + tessedit_rejection_debug.set_value(true); + debug_x_ht_level.set_value(2); + tprintf("\n\nTESTWD::"); + switch (location) { + case 0: + tprintf("classify_word_pass1 start\n"); + word->word->print(); + break; + case 10: + tprintf("make_reject_map: initial map"); + break; + case 20: + tprintf("make_reject_map: after NN"); + break; + case 30: + tprintf("classify_word_pass2 - START"); + break; + case 40: + tprintf("classify_word_pass2 - Pre Xht"); + break; + case 50: + tprintf("classify_word_pass2 - END"); + show_map_detail = true; + break; + case 60: + tprintf("fixspace"); + break; + case 70: + tprintf("MM pass START"); + break; + case 80: + tprintf("MM pass END"); + break; + case 90: + tprintf("After Poor quality rejection"); + break; + case 100: + tprintf("unrej_good_quality_words - START"); + break; + case 110: + tprintf("unrej_good_quality_words - END"); + break; + case 120: + tprintf("Write results pass"); + show_map_detail = true; + break; + } + if (word->best_choice != nullptr) { + tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str()); + word->reject_map.print(debug_fp); + tprintf("\n"); + if (show_map_detail) { + tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str()); + for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { + tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); + word->reject_map[i].full_print(debug_fp); + } + } + } else { + tprintf("null best choice\n"); + } + tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); + tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); + return true; + } else { + return false; + } +} + +/** + * find_modal_font + * + * Find the modal font and remove from the stats. + */ +#ifndef DISABLED_LEGACY_ENGINE +static void find_modal_font( // good chars in word + STATS *fonts, // font stats + int16_t *font_out, // output font + int8_t *font_count // output count +) { + if (fonts->get_total() > 0) { + // font index + int16_t font = static_cast<int16_t>(fonts->mode()); + *font_out = font; + // pile count + int32_t count = fonts->pile_count(font); + *font_count = count < INT8_MAX ? count : INT8_MAX; + fonts->add(font, -*font_count); + } else { + *font_out = -1; + *font_count = 0; + } +} +#endif // ! DISABLED_LEGACY_ENGINE + +/** + * set_word_fonts + * + * Get the fonts for the word. 
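+ *
+ * In outline: every BLOB_CHOICE carries per-font scores, which are summed per
+ * font across the whole word. The two highest-scoring fonts become fontinfo
+ * and fontinfo2, and each total is turned into a vote count by dividing by
+ * UINT16_MAX, the maximum score a single blob can contribute.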
+ */ +void Tesseract::set_word_fonts(WERD_RES *word) { + // Don't try to set the word fonts for an lstm word, as the configs + // will be meaningless. + if (word->chopped_word == nullptr) { + return; + } + ASSERT_HOST(word->best_choice != nullptr); + +#ifndef DISABLED_LEGACY_ENGINE + const int fontinfo_size = fontinfo_table_.size(); + if (fontinfo_size == 0) { + return; + } + if (tessedit_font_id > 0) { + if (tessedit_font_id >= fontinfo_size) { + tprintf("Error, invalid font ID provided: must be below %d.\n" + "Falling back to font auto-detection.\n", fontinfo_size); + } else { + word->fontinfo = &fontinfo_table_.at(tessedit_font_id); + word->fontinfo2 = nullptr; + word->fontinfo_id_count = INT8_MAX; + word->fontinfo_id2_count = 0; + return; + } + } + std::vector<int> font_total_score(fontinfo_size); + + // Compute the font scores for the word + if (tessedit_debug_fonts) { + tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str()); + } + for (unsigned b = 0; b < word->best_choice->length(); ++b) { + const BLOB_CHOICE *choice = word->GetBlobChoice(b); + if (choice == nullptr) { + continue; + } + auto &fonts = choice->fonts(); + for (auto &f : fonts) { + const int fontinfo_id = f.fontinfo_id; + if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) { + font_total_score[fontinfo_id] += f.score; + } + } + } + // Find the top and 2nd choice for the word. + int score1 = 0, score2 = 0; + int16_t font_id1 = -1, font_id2 = -1; + for (int f = 0; f < fontinfo_size; ++f) { + if (tessedit_debug_fonts && font_total_score[f] > 0) { + tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]); + } + if (font_total_score[f] > score1) { + score2 = score1; + font_id2 = font_id1; + score1 = font_total_score[f]; + font_id1 = f; + } else if (font_total_score[f] > score2) { + score2 = font_total_score[f]; + font_id2 = f; + } + } + word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr; + word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr; + // Each score has a limit of UINT16_MAX, so divide by that to get the number + // of "votes" for that font, ie number of perfect scores. + word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX); + word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX); + if (score1 > 0) { + const FontInfo fi = fontinfo_table_.at(font_id1); + if (tessedit_debug_fonts) { + if (word->fontinfo_id2_count > 0 && font_id2 >= 0) { + tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name, + word->fontinfo_id_count, fontinfo_table_.at(font_id2).name, + word->fontinfo_id2_count); + } else { + tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count); + } + } + } +#endif // ndef DISABLED_LEGACY_ENGINE +} + +#ifndef DISABLED_LEGACY_ENGINE +/** + * font_recognition_pass + * + * Smooth the fonts for the document. + */ +void Tesseract::font_recognition_pass(PAGE_RES *page_res) { + PAGE_RES_IT page_res_it(page_res); + WERD_RES *word; // current word + STATS doc_fonts(0, font_table_size_ - 1); // font counters + + // Gather font id statistics. 
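+  // Each word contributes its first and second font choice, weighted by its
+  // vote counts; find_modal_font() then picks the most common font for the
+  // page. Words whose own font evidence is weak (the vote count must equal the
+  // word length, or, for words longer than 3 characters, reach at least 3/4 of
+  // it) are later reassigned to that modal font. For example, an 8-character
+  // word with only 5 font votes (< 6) is switched to the document's modal font.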
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    word = page_res_it.word();
+    if (word->fontinfo != nullptr) {
+      doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
+    }
+    if (word->fontinfo2 != nullptr) {
+      doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
+    }
+  }
+  int16_t doc_font;      // modal font
+  int8_t doc_font_count; // modal font count
+  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
+  if (doc_font_count == 0) {
+    return;
+  }
+  // Get the modal font pointer.
+  const FontInfo *modal_font = nullptr;
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    word = page_res_it.word();
+    if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
+      modal_font = word->fontinfo;
+      break;
+    }
+    if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
+      modal_font = word->fontinfo2;
+      break;
+    }
+  }
+  ASSERT_HOST(modal_font != nullptr);
+
+  // Assign modal font to weak words.
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    word = page_res_it.word();
+    const int length = word->best_choice->length();
+
+    const int count = word->fontinfo_id_count;
+    if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
+      word->fontinfo = modal_font;
+      // Counts only get 1 as it came from the doc.
+      word->fontinfo_id_count = 1;
+    }
+  }
+}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// If a word has multiple alternates, check if the best choice is in the
+// dictionary. If not, replace it with an alternate that exists in the
+// dictionary.
+void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
+  PAGE_RES_IT word_it(page_res);
+  for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {
+    if (word->best_choices.singleton()) {
+      continue; // There are no alternates.
+    }
+
+    const WERD_CHOICE *best = word->best_choice;
+    if (word->tesseract->getDict().valid_word(*best) != 0) {
+      continue; // The best choice is in the dictionary.
+    }
+
+    WERD_CHOICE_IT choice_it(&word->best_choices);
+    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
+      WERD_CHOICE *alternate = choice_it.data();
+      if (word->tesseract->getDict().valid_word(*alternate)) {
+        // The alternate choice is in the dictionary.
+        if (tessedit_bigram_debug) {
+          tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
+                  best->unichar_string().c_str(), alternate->unichar_string().c_str());
+        }
+        // Replace the 'best' choice with a better choice.
+        word->ReplaceBestChoice(alternate);
+        break;
+      }
+    }
+  }
+}
+
+} // namespace tesseract
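+
+// Illustrative example for dictionary_correction_pass(): if a word's best
+// choice is, say, "tbe" while word->best_choices also holds "the" and "tke",
+// and only "the" passes Dict::valid_word(), the pass calls ReplaceBestChoice()
+// so that "the" becomes the reported result. (Hypothetical strings, shown only
+// to illustrate the control flow above.)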
