Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/control.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************** | |
| 2 * File: control.cpp (Formerly control.c) | |
| 3 * Description: Module-independent matcher controller. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 // Include automatically generated configuration file if running autoconf. | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include <cctype> | |
| 25 #include <cmath> | |
| 26 #include <cstdint> // for int16_t, int32_t | |
| 27 #include <cstdio> // for fclose, fopen, FILE | |
| 28 #include <ctime> // for clock | |
| 29 #include "control.h" | |
| 30 #ifndef DISABLED_LEGACY_ENGINE | |
| 31 # include "docqual.h" | |
| 32 # include "drawfx.h" | |
| 33 # include "fixspace.h" | |
| 34 #endif | |
| 35 #include <tesseract/ocrclass.h> | |
| 36 #include "lstmrecognizer.h" | |
| 37 #include "output.h" | |
| 38 #include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO... | |
| 39 #ifndef DISABLED_LEGACY_ENGINE | |
| 40 # include "reject.h" | |
| 41 #endif | |
| 42 #include "sorthelper.h" | |
| 43 #include "tesseractclass.h" | |
| 44 #include "tesserrstream.h" // for tesserr | |
| 45 #include "tessvars.h" | |
| 46 #include "werdit.h" | |
| 47 | |
// Fixed file name that Tesseract::ProcessTargetWord uses to dump the current
// parameters before a per-word config file is applied, and to read them back
// afterwards.
const char *const kBackUpConfigFile = "tempconfigdata.config";
#ifndef DISABLED_LEGACY_ENGINE
// Min believable x-height for any text when refitting as a fraction of
// original x-height
const double kMinRefitXHeightFraction = 0.5;
#endif // ! DISABLED_LEGACY_ENGINE
| 54 | |
| 55 namespace tesseract { | |
| 56 | |
| 57 /** | |
| 58 * Make a word from the selected blobs and run Tess on them. | |
| 59 * | |
| 60 * @param page_res recognise blobs | |
| 61 * @param selection_box within this box | |
| 62 */ | |
| 63 | |
| 64 void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) { | |
| 65 PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box); | |
| 66 if (it != nullptr) { | |
| 67 recog_interactive(it); | |
| 68 it->DeleteCurrentWord(); | |
| 69 delete it; | |
| 70 } | |
| 71 } | |
| 72 | |
| 73 /** | |
| 74 * Recognize a single word in interactive mode. | |
| 75 * | |
| 76 * @param pr_it the page results iterator | |
| 77 */ | |
| 78 bool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) { | |
| 79 WordData word_data(*pr_it); | |
| 80 SetupWordPassN(2, &word_data); | |
| 81 // LSTM doesn't run on pass2, but we want to run pass2 for tesseract. | |
| 82 if (lstm_recognizer_ == nullptr) { | |
| 83 #ifndef DISABLED_LEGACY_ENGINE | |
| 84 classify_word_and_language(2, pr_it, &word_data); | |
| 85 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 86 } else { | |
| 87 classify_word_and_language(1, pr_it, &word_data); | |
| 88 } | |
| 89 #ifndef DISABLED_LEGACY_ENGINE | |
| 90 if (tessedit_debug_quality_metrics) { | |
| 91 int16_t char_qual; | |
| 92 int16_t good_char_qual; | |
| 93 WERD_RES *word_res = pr_it->word(); | |
| 94 word_char_quality(word_res, &char_qual, &good_char_qual); | |
| 95 tprintf( | |
| 96 "\n%d chars; word_blob_quality: %d; outline_errs: %d; " | |
| 97 "char_quality: %d; good_char_quality: %d\n", | |
| 98 word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res), | |
| 99 char_qual, good_char_qual); | |
| 100 } | |
| 101 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 102 return true; | |
| 103 } | |
| 104 | |
| 105 // Helper function to check for a target word and handle it appropriately. | |
| 106 // Inspired by Jetsoft's requirement to process only single words on pass2 | |
| 107 // and beyond. | |
| 108 // If word_config is not null: | |
| 109 // If the word_box and target_word_box overlap, read the word_config file | |
| 110 // else reset to previous config data. | |
| 111 // return true. | |
| 112 // else | |
| 113 // If the word_box and target_word_box overlap or pass <= 1, return true. | |
| 114 // Note that this function uses a fixed temporary file for storing the previous | |
| 115 // configs, so it is neither thread-safe, nor process-safe, but the assumption | |
| 116 // is that it will only be used for one debug window at a time. | |
| 117 // | |
| 118 // Since this function is used for debugging (and not to change OCR results) | |
| 119 // set only debug params from the word config file. | |
| 120 bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, | |
| 121 const char *word_config, int pass) { | |
| 122 if (word_config != nullptr) { | |
| 123 if (word_box.major_overlap(target_word_box)) { | |
| 124 if (backup_config_file_ == nullptr) { | |
| 125 backup_config_file_ = kBackUpConfigFile; | |
| 126 FILE *config_fp = fopen(backup_config_file_, "wb"); | |
| 127 if (config_fp == nullptr) { | |
| 128 tprintf("Error, failed to open file \"%s\"\n", backup_config_file_); | |
| 129 } else { | |
| 130 ParamUtils::PrintParams(config_fp, params()); | |
| 131 fclose(config_fp); | |
| 132 } | |
| 133 ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params()); | |
| 134 } | |
| 135 } else { | |
| 136 if (backup_config_file_ != nullptr) { | |
| 137 ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params()); | |
| 138 backup_config_file_ = nullptr; | |
| 139 } | |
| 140 } | |
| 141 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) { | |
| 142 return false; | |
| 143 } | |
| 144 return true; | |
| 145 } | |
| 146 | |
| 147 /** If tesseract is to be run, sets the words up ready for it. */ | |
| 148 void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, | |
| 149 PAGE_RES *page_res, std::vector<WordData> *words) { | |
| 150 // Prepare all the words. | |
| 151 PAGE_RES_IT page_res_it(page_res); | |
| 152 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { | |
| 153 if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(), | |
| 154 *target_word_box, word_config, 1)) { | |
| 155 words->push_back(WordData(page_res_it)); | |
| 156 } | |
| 157 } | |
| 158 // Setup all the words for recognition with polygonal approximation. | |
| 159 for (unsigned w = 0; w < words->size(); ++w) { | |
| 160 SetupWordPassN(pass_n, &(*words)[w]); | |
| 161 if (w > 0) { | |
| 162 (*words)[w].prev_word = &(*words)[w - 1]; | |
| 163 } | |
| 164 } | |
| 165 } | |
| 166 | |
| 167 // Sets up the single word ready for whichever engine is to be run. | |
| 168 void Tesseract::SetupWordPassN(int pass_n, WordData *word) { | |
| 169 if (pass_n == 1 || !word->word->done) { | |
| 170 if (pass_n == 1) { | |
| 171 word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, | |
| 172 nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model, | |
| 173 poly_allow_detailed_fx, word->row, word->block); | |
| 174 } else if (pass_n == 2) { | |
| 175 // TODO(rays) Should we do this on pass1 too? | |
| 176 word->word->caps_height = 0.0; | |
| 177 if (word->word->x_height == 0.0f) { | |
| 178 word->word->x_height = word->row->x_height(); | |
| 179 } | |
| 180 } | |
| 181 word->lang_words.truncate(0); | |
| 182 for (unsigned s = 0; s <= sub_langs_.size(); ++s) { | |
| 183 // The sub_langs_.size() entry is for the master language. | |
| 184 Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this; | |
| 185 auto *word_res = new WERD_RES; | |
| 186 word_res->InitForRetryRecognition(*word->word); | |
| 187 word->lang_words.push_back(word_res); | |
| 188 // LSTM doesn't get setup for pass2. | |
| 189 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) { | |
| 190 word_res->SetupForRecognition( | |
| 191 lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr, | |
| 192 lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model, | |
| 193 lang_t->poly_allow_detailed_fx, word->row, word->block); | |
| 194 } | |
| 195 } | |
| 196 } | |
| 197 } | |
| 198 | |
// Runs word recognition on all the words.
// For each entry of words: links it to its predecessor, updates the progress
// monitor (if any), advances pr_it to the matching word, optionally reassigns
// diacritics (legacy engine builds with no LSTM language), and then calls
// classify_word_and_language for pass pass_n.
// Returns false if the monitor's deadline was exceeded or its cancel callback
// fired, in which case the remaining words are given fake results; returns
// true otherwise.
bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,
                                   std::vector<WordData> *words) {
  // TODO(rays) Before this loop can be parallelized (it would yield a massive
  // speed-up) all remaining member globals need to be converted to local/heap
  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
  // added. The results will be significantly different with adaption on, and
  // deterioration will need investigation.
  pr_it->restart_page();
  for (unsigned w = 0; w < words->size(); ++w) {
    WordData *word = &(*words)[w];
    if (w > 0) {
      word->prev_word = &(*words)[w - 1];
    }
    if (monitor != nullptr) {
      monitor->ocr_alive = true;
      // Pass 1 reports progress in the range 0..70, later passes 70..100.
      if (pass_n == 1) {
        monitor->progress = 70 * w / words->size();
      } else {
        monitor->progress = 70 + 30 * w / words->size();
      }
      if (monitor->progress_callback2 != nullptr) {
        // Note that the box reported is that of the word pr_it currently
        // points at, which is only synced with the WordData further below.
        TBOX box = pr_it->word()->word->bounding_box();
        (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
      }
      if (monitor->deadline_exceeded() ||
          (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
        // Timeout. Fake out the rest of the words.
        for (; w < words->size(); ++w) {
          (*words)[w].word->SetupFake(unicharset);
        }
        return false;
      }
    }
    if (word->word->tess_failed) {
      // Count how many leading lang_words entries have also failed.
      unsigned s;
      for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
      }
      // If all are failed, skip it. Image words are skipped by this test.
      // NOTE(review): the loop above stops with s <= lang_words.size(), so the
      // strict '>' below can never be true and the skip never happens. '>='
      // looks like the intent, but enabling it would change which words reach
      // the classifier -- confirm against upstream before changing.
      if (s > word->lang_words.size()) {
        continue;
      }
    }
    // Sync pr_it with the WordData.
    while (pr_it->word() != nullptr && pr_it->word() != word->word) {
      pr_it->forward();
    }
    ASSERT_HOST(pr_it->word() != nullptr);
    bool make_next_word_fuzzy = false;
#ifndef DISABLED_LEGACY_ENGINE
    if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
      // Needs to be setup again to see the new outlines in the chopped_word.
      SetupWordPassN(pass_n, word);
    }
#endif // ndef DISABLED_LEGACY_ENGINE

    classify_word_and_language(pass_n, pr_it, word);
    if (tessedit_dump_choices || debug_noise_removal) {
      tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
              word->word->best_choice->debug_string().c_str());
    }
    // Step past the word just classified; the following word is made fuzzy
    // if ReassignDiacritics asked for it.
    pr_it->forward();
    if (make_next_word_fuzzy && pr_it->word() != nullptr) {
      pr_it->MakeCurrentWordFuzzy();
    }
  }
  return true;
}
| 267 | |
/**
 * recog_all_words()
 *
 * Walk the page_res, recognizing all the words.
 * If monitor is not null, it is used as a progress monitor/timeout/cancel.
 * If dopasses is 0, all recognition passes are run,
 * 1 just pass 1, 2 passes 2 and higher.
 * If target_word_box is not null, special things are done to words that
 * overlap the target_word_box:
 * if word_config is not null, the word config file is read for just the
 * target word(s), otherwise, on pass 2 and beyond ONLY the target words
 * are processed (Jetsoft modification.)
 * Returns false if we cancelled prematurely.
 *
 * @param page_res page structure
 * @param monitor progress monitor
 * @param word_config word_config file
 * @param target_word_box specifies just to extract a rectangle
 * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher
 */

bool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor,
                                const TBOX *target_word_box, const char *word_config,
                                int dopasses) {
  PAGE_RES_IT page_res_it(page_res);

  // tessedit_minimal_rej_pass1 forces both test-adaption and minimal
  // rejection on for the rest of this call.
  if (tessedit_minimal_rej_pass1) {
    tessedit_test_adaption.set_value(true);
    tessedit_minimal_rejection.set_value(true);
  }

  if (dopasses == 0 || dopasses == 1) {
    page_res_it.restart_page();
    // ****************** Pass 1 *******************

#ifndef DISABLED_LEGACY_ENGINE
    // If the adaptive classifier is full switch to one we prepared earlier,
    // ie on the previous page. If the current adaptive classifier is non-empty,
    // prepare a backup starting at this page, in case it fills up. Do all this
    // independently for each language.
    if (AdaptiveClassifierIsFull()) {
      SwitchAdaptiveClassifier();
    } else if (!AdaptiveClassifierIsEmpty()) {
      StartBackupAdaptiveClassifier();
    }
    // Now check the sub-langs as well.
    for (auto &lang : sub_langs_) {
      if (lang->AdaptiveClassifierIsFull()) {
        lang->SwitchAdaptiveClassifier();
      } else if (!lang->AdaptiveClassifierIsEmpty()) {
        lang->StartBackupAdaptiveClassifier();
      }
    }

#endif // ndef DISABLED_LEGACY_ENGINE

    // Set up all words ready for recognition, so that if parallelism is on
    // all the input and output classes are ready to run the classifier.
    std::vector<WordData> words;
    SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
#ifndef DISABLED_LEGACY_ENGINE
    if (tessedit_parallelize) {
      PrerecAllWordsPar(words);
    }
#endif // ndef DISABLED_LEGACY_ENGINE

    // Reset the per-document statistics; word_count is also used later for
    // progress reporting in the rejection passes.
    stats_.word_count = words.size();

    stats_.dict_words = 0;
    stats_.doc_blob_quality = 0;
    stats_.doc_outline_errs = 0;
    stats_.doc_char_quality = 0;
    stats_.good_char_count = 0;
    stats_.doc_good_char_quality = 0;

    most_recently_used_ = this;
    // Run pass 1 word recognition.
    if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
      return false;
    }
    // Pass 1 post-processing.
    for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
      if (page_res_it.word()->word->flag(W_REP_CHAR)) {
        fix_rep_char(&page_res_it);
        continue;
      }

      // Count dict words.
      if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
        ++(stats_.dict_words);
      }

      // Update misadaption log (we only need to do it on pass 1, since
      // adaption only happens on this pass).
      if (page_res_it.word()->blamer_bundle != nullptr &&
          page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
        page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
      }
    }
  }

  // Callers asking for pass 1 only get none of the later passes or cleanup.
  if (dopasses == 1) {
    return true;
  }

#ifndef DISABLED_LEGACY_ENGINE

  // ****************** Pass 2 *******************
  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
    page_res_it.restart_page();
    std::vector<WordData> words;
    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
    if (tessedit_parallelize) {
      PrerecAllWordsPar(words);
    }
    most_recently_used_ = this;
    // Run pass 2 word recognition.
    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
      return false;
    }
  }

  // The next passes are only required for Tess-only.
  if (AnyTessLang() && !AnyLSTMLang()) {
    // ****************** Pass 3 *******************
    // Fix fuzzy spaces.

    if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
        !right_to_left()) {
      fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
    }

    // ****************** Pass 4 *******************
    if (tessedit_enable_dict_correction) {
      dictionary_correction_pass(page_res);
    }
    if (tessedit_enable_bigram_correction) {
      bigram_correction_pass(page_res);
    }

    // ****************** Pass 5,6 *******************
    rejection_passes(page_res, monitor, target_word_box, word_config);

    // ****************** Pass 8 *******************
    font_recognition_pass(page_res);

    // ****************** Pass 9 *******************
    // Check the correctness of the final results.
    blamer_pass(page_res);
    script_pos_pass(page_res);
  }

#endif // ndef DISABLED_LEGACY_ENGINE

  // Write results pass.
  // This is now redundant, but retained to show how to obtain
  // bounding boxes and style information.

#ifndef DISABLED_LEGACY_ENGINE
  // changed by jetsoft
  // needed for dll to output memory structure
  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
    output_pass(page_res_it, target_word_box);
  }
// end jetsoft
#endif // ndef DISABLED_LEGACY_ENGINE

  const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
  textord_.CleanupSingleRowResult(pageseg_mode, page_res);

  // Remove empty words, as these mess up the result iterators.
  // A word is dropped when it has no best choice, an empty best choice, or an
  // all-spaces best choice in a block that has no poly block or a text one.
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    const WERD_RES *word = page_res_it.word();
    const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
                               ? page_res_it.block()->block->pdblk.poly_block()
                               : nullptr;
    if (word->best_choice == nullptr || word->best_choice->empty() ||
        (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
      page_res_it.DeleteCurrentWord();
    }
  }

  if (monitor != nullptr) {
    monitor->progress = 100;
  }
  return true;
}
| 455 | |
| 456 #ifndef DISABLED_LEGACY_ENGINE | |
| 457 | |
| 458 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { | |
| 459 PAGE_RES_IT word_it(page_res); | |
| 460 | |
| 461 WERD_RES *w_prev = nullptr; | |
| 462 WERD_RES *w = word_it.word(); | |
| 463 while (true) { | |
| 464 w_prev = w; | |
| 465 while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) { | |
| 466 // advance word_it, skipping over parts of combos | |
| 467 } | |
| 468 if (!word_it.word()) { | |
| 469 break; | |
| 470 } | |
| 471 w = word_it.word(); | |
| 472 if (!w || !w_prev || w->uch_set != w_prev->uch_set) { | |
| 473 continue; | |
| 474 } | |
| 475 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) { | |
| 476 if (tessedit_bigram_debug) { | |
| 477 tprintf("Skipping because one of the words is W_REP_CHAR\n"); | |
| 478 } | |
| 479 continue; | |
| 480 } | |
| 481 // Two words sharing the same language model, excellent! | |
| 482 std::vector<WERD_CHOICE *> overrides_word1; | |
| 483 std::vector<WERD_CHOICE *> overrides_word2; | |
| 484 | |
| 485 const auto &orig_w1_str = w_prev->best_choice->unichar_string(); | |
| 486 const auto &orig_w2_str = w->best_choice->unichar_string(); | |
| 487 WERD_CHOICE prev_best(w->uch_set); | |
| 488 { | |
| 489 int w1start, w1end; | |
| 490 w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end); | |
| 491 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end); | |
| 492 } | |
| 493 WERD_CHOICE this_best(w->uch_set); | |
| 494 { | |
| 495 int w2start, w2end; | |
| 496 w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end); | |
| 497 this_best = w->best_choice->shallow_copy(w2start, w2end); | |
| 498 } | |
| 499 | |
| 500 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) { | |
| 501 if (tessedit_bigram_debug) { | |
| 502 tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(), | |
| 503 orig_w2_str.c_str()); | |
| 504 } | |
| 505 continue; | |
| 506 } | |
| 507 if (tessedit_bigram_debug > 2) { | |
| 508 tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str()); | |
| 509 } | |
| 510 if (tessedit_bigram_debug > 1) { | |
| 511 if (!w_prev->best_choices.singleton()) { | |
| 512 w_prev->PrintBestChoices(); | |
| 513 } | |
| 514 if (!w->best_choices.singleton()) { | |
| 515 w->PrintBestChoices(); | |
| 516 } | |
| 517 } | |
| 518 float best_rating = 0.0; | |
| 519 int best_idx = 0; | |
| 520 WERD_CHOICE_IT prev_it(&w_prev->best_choices); | |
| 521 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) { | |
| 522 WERD_CHOICE *p1 = prev_it.data(); | |
| 523 WERD_CHOICE strip1(w->uch_set); | |
| 524 { | |
| 525 int p1start, p1end; | |
| 526 p1->GetNonSuperscriptSpan(&p1start, &p1end); | |
| 527 strip1 = p1->shallow_copy(p1start, p1end); | |
| 528 } | |
| 529 WERD_CHOICE_IT w_it(&w->best_choices); | |
| 530 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { | |
| 531 WERD_CHOICE *p2 = w_it.data(); | |
| 532 WERD_CHOICE strip2(w->uch_set); | |
| 533 { | |
| 534 int p2start, p2end; | |
| 535 p2->GetNonSuperscriptSpan(&p2start, &p2end); | |
| 536 strip2 = p2->shallow_copy(p2start, p2end); | |
| 537 } | |
| 538 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) { | |
| 539 overrides_word1.push_back(p1); | |
| 540 overrides_word2.push_back(p2); | |
| 541 if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) { | |
| 542 best_rating = p1->rating() + p2->rating(); | |
| 543 best_idx = overrides_word1.size() - 1; | |
| 544 } | |
| 545 } | |
| 546 } | |
| 547 } | |
| 548 if (!overrides_word1.empty()) { | |
| 549 // Excellent, we have some bigram matches. | |
| 550 if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) && | |
| 551 EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) { | |
| 552 if (tessedit_bigram_debug > 1) { | |
| 553 tprintf( | |
| 554 "Top choice \"%s %s\" verified (sans case) by bigram " | |
| 555 "model.\n", | |
| 556 orig_w1_str.c_str(), orig_w2_str.c_str()); | |
| 557 } | |
| 558 continue; | |
| 559 } | |
| 560 const auto &new_w1_str = overrides_word1[best_idx]->unichar_string(); | |
| 561 const auto &new_w2_str = overrides_word2[best_idx]->unichar_string(); | |
| 562 if (new_w1_str != orig_w1_str) { | |
| 563 w_prev->ReplaceBestChoice(overrides_word1[best_idx]); | |
| 564 } | |
| 565 if (new_w2_str != orig_w2_str) { | |
| 566 w->ReplaceBestChoice(overrides_word2[best_idx]); | |
| 567 } | |
| 568 if (tessedit_bigram_debug > 0) { | |
| 569 std::string choices_description; | |
| 570 int num_bigram_choices = overrides_word1.size() * overrides_word2.size(); | |
| 571 if (num_bigram_choices == 1) { | |
| 572 choices_description = "This was the unique bigram choice."; | |
| 573 } else { | |
| 574 if (tessedit_bigram_debug > 1) { | |
| 575 std::string bigrams_list; | |
| 576 const int kMaxChoicesToPrint = 20; | |
| 577 for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) { | |
| 578 if (i > 0) { | |
| 579 bigrams_list += ", "; | |
| 580 } | |
| 581 WERD_CHOICE *p1 = overrides_word1[i]; | |
| 582 WERD_CHOICE *p2 = overrides_word2[i]; | |
| 583 bigrams_list += p1->unichar_string() + " " + p2->unichar_string(); | |
| 584 } | |
| 585 choices_description = "There were many choices: {"; | |
| 586 choices_description += bigrams_list; | |
| 587 choices_description += "}"; | |
| 588 } else { | |
| 589 choices_description += "There were " + std::to_string(num_bigram_choices); | |
| 590 choices_description += " compatible bigrams."; | |
| 591 } | |
| 592 } | |
| 593 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(), | |
| 594 orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(), | |
| 595 choices_description.c_str()); | |
| 596 } | |
| 597 } | |
| 598 } | |
| 599 } | |
| 600 | |
// Passes 5 and 6 of recognition.
// Pass 5 walks every word of page_res (unless tessedit_test_adaption is set),
// accumulating blob/outline/character quality figures into stats_ and
// optionally rejecting words of bad quality. Pass 6 then derives a
// good_quality_doc flag from those figures and hands it to
// quality_based_rejection.
// monitor, if not null, receives liveness and progress updates.
// target_word_box / word_config, if given, restrict processing to the target
// word(s) via ProcessTargetWord.
void Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor,
                                 const TBOX *target_word_box, const char *word_config) {
  PAGE_RES_IT page_res_it(page_res);
  // ****************** Pass 5 *******************
  // Gather statistics on rejects.
  int word_index = 0;
  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
    WERD_RES *word = page_res_it.word();
    word_index++;
    if (monitor != nullptr) {
      monitor->ocr_alive = true;
      // NOTE(review): integer division by stats_.word_count, which is set
      // during pass 1 of recog_all_words -- presumably non-zero whenever this
      // loop has words to visit; confirm for callers that run only pass 2+.
      monitor->progress = 95 + 5 * word_index / stats_.word_count;
    }
    if (word->rebuild_word == nullptr) {
      // Word was not processed by tesseract.
      page_res_it.forward();
      continue;
    }
    check_debug_pt(word, 70);

    // changed by jetsoft
    // specific to its needs to extract one word when need
    if (target_word_box &&
        !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
      page_res_it.forward();
      continue;
    }
    // end jetsoft

    page_res_it.rej_stat_word();
    const int chars_in_word = word->reject_map.length();
    const int rejects_in_word = word->reject_map.reject_count();

    const int blob_quality = word_blob_quality(word);
    stats_.doc_blob_quality += blob_quality;
    const int outline_errs = word_outline_errs(word);
    stats_.doc_outline_errs += outline_errs;
    int16_t all_char_quality;
    int16_t accepted_all_char_quality;
    word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
    stats_.doc_char_quality += all_char_quality;
    // Only words whose best choice came from one of the dawg permuters count
    // towards the "good character" statistics.
    const uint8_t permuter_type = word->best_choice->permuter();
    if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
        (permuter_type == USER_DAWG_PERM)) {
      stats_.good_char_count += chars_in_word - rejects_in_word;
      stats_.doc_good_char_quality += accepted_all_char_quality;
    }
    check_debug_pt(word, 80);
    if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
      word->reject_map.rej_word_bad_quality();
    }
    check_debug_pt(word, 90);
    page_res_it.forward();
  }

  if (tessedit_debug_quality_metrics) {
    tprintf(
        "QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
        page_res->char_count, page_res->rej_count,
        page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
        stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
        stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
        stats_.doc_char_quality / static_cast<float>(page_res->char_count),
        stats_.doc_good_char_quality,
        (stats_.good_char_count > 0)
            ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
            : 0.0);
  }
  // The document counts as good quality only if all four per-character ratios
  // are on the right side of their respective thresholds.
  bool good_quality_doc =
      ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
      (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
      (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
      (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);

  // ****************** Pass 6 *******************
  // Do whole document or whole block rejection pass
  if (!tessedit_test_adaption) {
    quality_based_rejection(page_res_it, good_quality_doc);
  }
}
| 682 | |
| 683 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 684 | |
| 685 void Tesseract::blamer_pass(PAGE_RES *page_res) { | |
| 686 if (!wordrec_run_blamer) { | |
| 687 return; | |
| 688 } | |
| 689 PAGE_RES_IT page_res_it(page_res); | |
| 690 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { | |
| 691 WERD_RES *word = page_res_it.word(); | |
| 692 BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word); | |
| 693 page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++; | |
| 694 } | |
| 695 tprintf("Blame reasons:\n"); | |
| 696 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) { | |
| 697 tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(static_cast<IncorrectResultReason>(bl)), | |
| 698 page_res->blame_reasons[bl]); | |
| 699 } | |
| 700 if (page_res->misadaption_log.size() > 0) { | |
| 701 tprintf("Misadaption log:\n"); | |
| 702 for (auto &log : page_res->misadaption_log) { | |
| 703 tprintf("%s\n", log.c_str()); | |
| 704 } | |
| 705 } | |
| 706 } | |
| 707 | |
| 708 // Sets script positions and detects smallcaps on all output words. | |
| 709 void Tesseract::script_pos_pass(PAGE_RES *page_res) { | |
| 710 PAGE_RES_IT page_res_it(page_res); | |
| 711 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { | |
| 712 WERD_RES *word = page_res_it.word(); | |
| 713 if (word->word->flag(W_REP_CHAR)) { | |
| 714 page_res_it.forward(); | |
| 715 continue; | |
| 716 } | |
| 717 const float x_height = page_res_it.block()->block->x_height(); | |
| 718 float word_x_height = word->x_height; | |
| 719 if (word_x_height < word->best_choice->min_x_height() || | |
| 720 word_x_height > word->best_choice->max_x_height()) { | |
| 721 word_x_height = | |
| 722 (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f; | |
| 723 } | |
| 724 // Test for small caps. Word capheight must be close to block xheight, | |
| 725 // and word must contain no lower case letters, and at least one upper case. | |
| 726 const double small_cap_xheight = x_height * kXHeightCapRatio; | |
| 727 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0; | |
| 728 if (word->uch_set->script_has_xheight() && | |
| 729 small_cap_xheight - small_cap_delta <= word_x_height && | |
| 730 word_x_height <= small_cap_xheight + small_cap_delta) { | |
| 731 // Scan for upper/lower. | |
| 732 int num_upper = 0; | |
| 733 int num_lower = 0; | |
| 734 for (unsigned i = 0; i < word->best_choice->length(); ++i) { | |
| 735 if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) { | |
| 736 ++num_upper; | |
| 737 } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) { | |
| 738 ++num_lower; | |
| 739 } | |
| 740 } | |
| 741 if (num_upper > 0 && num_lower == 0) { | |
| 742 word->small_caps = true; | |
| 743 } | |
| 744 } | |
| 745 word->SetScriptPositions(); | |
| 746 } | |
| 747 } | |
| 748 | |
| 749 // Helper finds the gap between the index word and the next. | |
| 750 static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) { | |
| 751 *right = -INT32_MAX; | |
| 752 *next_left = INT32_MAX; | |
| 753 if (index < words.size()) { | |
| 754 *right = words[index]->word->bounding_box().right(); | |
| 755 if (index + 1 < words.size()) { | |
| 756 *next_left = words[index + 1]->word->bounding_box().left(); | |
| 757 } | |
| 758 } | |
| 759 } | |
| 760 | |
| 761 // Factored helper computes the rating, certainty, badness and validity of | |
| 762 // the permuter of the words in [first_index, end_index). | |
| 763 static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index, | |
| 764 float *rating, float *certainty, bool *bad, bool *valid_permuter) { | |
| 765 if (end_index <= first_index) { | |
| 766 *bad = true; | |
| 767 *valid_permuter = false; | |
| 768 } | |
| 769 for (unsigned index = first_index; index < end_index && index < words.size(); ++index) { | |
| 770 WERD_CHOICE *choice = words[index]->best_choice; | |
| 771 if (choice == nullptr) { | |
| 772 *bad = true; | |
| 773 } else { | |
| 774 *rating += choice->rating(); | |
| 775 *certainty = std::min(*certainty, choice->certainty()); | |
| 776 if (!Dict::valid_word_permuter(choice->permuter(), false)) { | |
| 777 *valid_permuter = false; | |
| 778 } | |
| 779 } | |
| 780 } | |
| 781 } | |
| 782 | |
| 783 // Helper chooses the best combination of words, transferring good ones from | |
| 784 // new_words to best_words. To win, a new word must have (better rating and | |
| 785 // certainty) or (better permuter status and rating within rating ratio and | |
| 786 // certainty within certainty margin) than current best. | |
| 787 // All the new_words are consumed (moved to best_words or deleted.) | |
| 788 // The return value is the number of new_words used minus the number of | |
| 789 // best_words that remain in the output. | |
| 790 static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug, | |
| 791 PointerVector<WERD_RES> *new_words, | |
| 792 PointerVector<WERD_RES> *best_words) { | |
| 793 // Process the smallest groups of words that have an overlapping word | |
| 794 // boundary at the end. | |
| 795 std::vector<WERD_RES *> out_words; | |
| 796 // Index into each word vector (best, new). | |
| 797 unsigned b = 0, n = 0; | |
| 798 int num_best = 0, num_new = 0; | |
| 799 while (b < best_words->size() || n < new_words->size()) { | |
| 800 // Start of the current run in each. | |
| 801 auto start_b = b, start_n = n; | |
| 802 while (b < best_words->size() || n < new_words->size()) { | |
| 803 int b_right = -INT32_MAX; | |
| 804 int next_b_left = INT32_MAX; | |
| 805 WordGap(*best_words, b, &b_right, &next_b_left); | |
| 806 int n_right = -INT32_MAX; | |
| 807 int next_n_left = INT32_MAX; | |
| 808 WordGap(*new_words, n, &n_right, &next_n_left); | |
| 809 if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) { | |
| 810 // The word breaks overlap. [start_b,b] and [start_n, n] match. | |
| 811 break; | |
| 812 } | |
| 813 // Keep searching for the matching word break. | |
| 814 if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) { | |
| 815 ++b; | |
| 816 } else { | |
| 817 ++n; | |
| 818 } | |
| 819 } | |
| 820 // Rating of the current run in each. | |
| 821 float b_rating = 0.0f, n_rating = 0.0f; | |
| 822 // Certainty of the current run in each. | |
| 823 float b_certainty = 0.0f, n_certainty = 0.0f; | |
| 824 // True if any word is missing its best choice. | |
| 825 bool b_bad = false, n_bad = false; | |
| 826 // True if all words have a valid permuter. | |
| 827 bool b_valid_permuter = true, n_valid_permuter = true; | |
| 828 const int end_b = b < best_words->size() ? b + 1 : b; | |
| 829 const int end_n = n < new_words->size() ? n + 1 : n; | |
| 830 EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad, | |
| 831 &b_valid_permuter); | |
| 832 EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad, | |
| 833 &n_valid_permuter); | |
| 834 bool new_better = false; | |
| 835 if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) || | |
| 836 (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio && | |
| 837 n_certainty > b_certainty - certainty_margin))) { | |
| 838 // New is better. | |
| 839 for (int i = start_n; i < end_n; ++i) { | |
| 840 out_words.push_back((*new_words)[i]); | |
| 841 (*new_words)[i] = nullptr; | |
| 842 ++num_new; | |
| 843 } | |
| 844 new_better = true; | |
| 845 } else if (!b_bad) { | |
| 846 // Current best is better. | |
| 847 for (int i = start_b; i < end_b; ++i) { | |
| 848 out_words.push_back((*best_words)[i]); | |
| 849 (*best_words)[i] = nullptr; | |
| 850 ++num_best; | |
| 851 } | |
| 852 } | |
| 853 if (debug) { | |
| 854 tprintf( | |
| 855 "%d new words %s than %d old words: r: %g v %g c: %g v %g" | |
| 856 " valid dict: %d v %d\n", | |
| 857 end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating, | |
| 858 n_certainty, b_certainty, n_valid_permuter, b_valid_permuter); | |
| 859 } | |
| 860 // Move on to the next group. | |
| 861 b = end_b; | |
| 862 n = end_n; | |
| 863 } | |
| 864 // Transfer from out_words to best_words. | |
| 865 best_words->clear(); | |
| 866 for (auto &out_word : out_words) { | |
| 867 best_words->push_back(out_word); | |
| 868 } | |
| 869 return num_new - num_best; | |
| 870 } | |
| 871 | |
| 872 // Helper to recognize the word using the given (language-specific) tesseract. | |
| 873 // Returns positive if this recognizer found more new best words than the | |
| 874 // number kept from best_words. | |
| 875 int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, | |
| 876 WERD_RES **in_word, PointerVector<WERD_RES> *best_words) { | |
| 877 if (debug) { | |
| 878 tprintf("Trying word using lang %s, oem %d\n", lang.c_str(), | |
| 879 static_cast<int>(tessedit_ocr_engine_mode)); | |
| 880 } | |
| 881 // Run the recognizer on the word. | |
| 882 PointerVector<WERD_RES> new_words; | |
| 883 (this->*recognizer)(word_data, in_word, &new_words); | |
| 884 if (new_words.empty()) { | |
| 885 // Transfer input word to new_words, as the classifier must have put | |
| 886 // the result back in the input. | |
| 887 new_words.push_back(*in_word); | |
| 888 *in_word = nullptr; | |
| 889 } | |
| 890 if (debug) { | |
| 891 for (unsigned i = 0; i < new_words.size(); ++i) { | |
| 892 new_words[i]->DebugTopChoice("Lang result"); | |
| 893 } | |
| 894 } | |
| 895 // Initial version is a bit of a hack based on better certainty and rating | |
| 896 // or a dictionary vs non-dictionary word. | |
| 897 return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug, | |
| 898 &new_words, best_words); | |
| 899 } | |
| 900 | |
| 901 // Helper returns true if all the words are acceptable. | |
| 902 static bool WordsAcceptable(const PointerVector<WERD_RES> &words) { | |
| 903 for (unsigned w = 0; w < words.size(); ++w) { | |
| 904 if (words[w]->tess_failed || !words[w]->tess_accepted) { | |
| 905 return false; | |
| 906 } | |
| 907 } | |
| 908 return true; | |
| 909 } | |
| 910 | |
| 911 #ifndef DISABLED_LEGACY_ENGINE | |
| 912 | |
| 913 // Moves good-looking "noise"/diacritics from the reject list to the main | |
| 914 // blob list on the current word. Returns true if anything was done, and | |
| 915 // sets make_next_word_fuzzy if blob(s) were added to the end of the word. | |
| 916 bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) { | |
| 917 *make_next_word_fuzzy = false; | |
| 918 WERD *real_word = pr_it->word()->word; | |
| 919 if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() || | |
| 920 real_word->rej_cblob_list()->length() > noise_maxperword) { | |
| 921 return false; | |
| 922 } | |
| 923 real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle); | |
| 924 // Get the noise outlines into a vector with matching bool map. | |
| 925 std::vector<C_OUTLINE *> outlines; | |
| 926 real_word->GetNoiseOutlines(&outlines); | |
| 927 std::vector<bool> word_wanted; | |
| 928 std::vector<bool> overlapped_any_blob; | |
| 929 std::vector<C_BLOB *> target_blobs; | |
| 930 AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted, | |
| 931 &overlapped_any_blob, &target_blobs); | |
| 932 // Filter the outlines that overlapped any blob and put them into the word | |
| 933 // now. This simplifies the remaining task and also makes it more accurate | |
| 934 // as it has more completed blobs to work on. | |
| 935 std::vector<bool> wanted; | |
| 936 std::vector<C_BLOB *> wanted_blobs; | |
| 937 std::vector<C_OUTLINE *> wanted_outlines; | |
| 938 int num_overlapped = 0; | |
| 939 int num_overlapped_used = 0; | |
| 940 for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) { | |
| 941 if (overlapped_any_blob[i]) { | |
| 942 ++num_overlapped; | |
| 943 if (word_wanted[i]) { | |
| 944 ++num_overlapped_used; | |
| 945 } | |
| 946 wanted.push_back(word_wanted[i]); | |
| 947 wanted_blobs.push_back(target_blobs[i]); | |
| 948 wanted_outlines.push_back(outlines[i]); | |
| 949 outlines[i] = nullptr; | |
| 950 } | |
| 951 } | |
| 952 real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr); | |
| 953 AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs); | |
| 954 // TODO: check code. | |
| 955 int non_overlapped = 0; | |
| 956 int non_overlapped_used = 0; | |
| 957 for (unsigned i = 0; i < word_wanted.size(); ++i) { | |
| 958 if (word_wanted[i]) { | |
| 959 ++non_overlapped_used; | |
| 960 } | |
| 961 if (outlines[i] != nullptr) { | |
| 962 ++non_overlapped_used; | |
| 963 } | |
| 964 } | |
| 965 if (debug_noise_removal) { | |
| 966 tprintf("Used %d/%d overlapped %d/%d non-overlapped diacritics on word:", num_overlapped_used, | |
| 967 num_overlapped, non_overlapped_used, non_overlapped); | |
| 968 real_word->bounding_box().print(); | |
| 969 } | |
| 970 // Now we have decided which outlines we want, put them into the real_word. | |
| 971 if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) { | |
| 972 pr_it->MakeCurrentWordFuzzy(); | |
| 973 } | |
| 974 // TODO(rays) Parts of combos have a deep copy of the real word, and need | |
| 975 // to have their noise outlines moved/assigned in the same way!! | |
| 976 return num_overlapped_used != 0 || non_overlapped_used != 0; | |
| 977 } | |
| 978 | |
| 979 // Attempts to put noise/diacritic outlines into the blobs that they overlap. | |
| 980 // Input: a set of noisy outlines that probably belong to the real_word. | |
| 981 // Output: word_wanted indicates which outlines are to be assigned to a blob, | |
| 982 // target_blobs indicates which to assign to, and overlapped_any_blob is | |
| 983 // true for all outlines that overlapped a blob. | |
| 984 void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, | |
| 985 int pass, WERD *real_word, PAGE_RES_IT *pr_it, | |
| 986 std::vector<bool> *word_wanted, | |
| 987 std::vector<bool> *overlapped_any_blob, | |
| 988 std::vector<C_BLOB *> *target_blobs) { | |
| 989 std::vector<bool> blob_wanted; | |
| 990 word_wanted->clear(); | |
| 991 word_wanted->resize(outlines.size()); | |
| 992 overlapped_any_blob->clear(); | |
| 993 overlapped_any_blob->resize(outlines.size()); | |
| 994 target_blobs->clear(); | |
| 995 target_blobs->resize(outlines.size()); | |
| 996 // For each real blob, find the outlines that seriously overlap it. | |
| 997 // A single blob could be several merged characters, so there can be quite | |
| 998 // a few outlines overlapping, and the full engine needs to be used to chop | |
| 999 // and join to get a sensible result. | |
| 1000 C_BLOB_IT blob_it(real_word->cblob_list()); | |
| 1001 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | |
| 1002 C_BLOB *blob = blob_it.data(); | |
| 1003 const TBOX blob_box = blob->bounding_box(); | |
| 1004 blob_wanted.clear(); | |
| 1005 blob_wanted.resize(outlines.size()); | |
| 1006 int num_blob_outlines = 0; | |
| 1007 for (unsigned i = 0; i < outlines.size(); ++i) { | |
| 1008 if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) { | |
| 1009 blob_wanted[i] = true; | |
| 1010 (*overlapped_any_blob)[i] = true; | |
| 1011 ++num_blob_outlines; | |
| 1012 } | |
| 1013 } | |
| 1014 if (debug_noise_removal) { | |
| 1015 tprintf("%d noise outlines overlap blob at:", num_blob_outlines); | |
| 1016 blob_box.print(); | |
| 1017 } | |
| 1018 // If any outlines overlap the blob, and not too many, classify the blob | |
| 1019 // (using the full engine, languages and all), and choose the maximal | |
| 1020 // combination of outlines that doesn't hurt the end-result classification | |
| 1021 // by too much. Mark them as wanted. | |
| 1022 if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) { | |
| 1023 if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines, | |
| 1024 num_blob_outlines, &blob_wanted)) { | |
| 1025 for (unsigned i = 0; i < blob_wanted.size(); ++i) { | |
| 1026 if (blob_wanted[i]) { | |
| 1027 // Claim the outline and record where it is going. | |
| 1028 (*word_wanted)[i] = true; | |
| 1029 (*target_blobs)[i] = blob; | |
| 1030 } | |
| 1031 } | |
| 1032 } | |
| 1033 } | |
| 1034 } | |
| 1035 } | |
| 1036 | |
| 1037 // Attempts to assign non-overlapping outlines to their nearest blobs or | |
| 1038 // make new blobs out of them. | |
| 1039 void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass, | |
| 1040 WERD *real_word, PAGE_RES_IT *pr_it, | |
| 1041 std::vector<bool> *word_wanted, | |
| 1042 std::vector<C_BLOB *> *target_blobs) { | |
| 1043 std::vector<bool> blob_wanted; | |
| 1044 word_wanted->clear(); | |
| 1045 word_wanted->resize(outlines.size()); | |
| 1046 target_blobs->clear(); | |
| 1047 target_blobs->resize(outlines.size()); | |
| 1048 // Check for outlines that need to be turned into stand-alone blobs. | |
| 1049 for (unsigned i = 0; i < outlines.size(); ++i) { | |
| 1050 if (outlines[i] == nullptr) { | |
| 1051 continue; | |
| 1052 } | |
| 1053 // Get a set of adjacent outlines that don't overlap any existing blob. | |
| 1054 blob_wanted.clear(); | |
| 1055 blob_wanted.resize(outlines.size()); | |
| 1056 int num_blob_outlines = 0; | |
| 1057 TBOX total_ol_box(outlines[i]->bounding_box()); | |
| 1058 while (i < outlines.size() && outlines[i] != nullptr) { | |
| 1059 blob_wanted[i] = true; | |
| 1060 total_ol_box += outlines[i]->bounding_box(); | |
| 1061 ++i; | |
| 1062 ++num_blob_outlines; | |
| 1063 } | |
| 1064 // Find the insertion point. | |
| 1065 C_BLOB_IT blob_it(real_word->cblob_list()); | |
| 1066 while (!blob_it.at_last() && | |
| 1067 blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) { | |
| 1068 blob_it.forward(); | |
| 1069 } | |
| 1070 // Choose which combination of them we actually want and where to put | |
| 1071 // them. | |
| 1072 if (debug_noise_removal) { | |
| 1073 tprintf("Num blobless outlines = %d\n", num_blob_outlines); | |
| 1074 } | |
| 1075 C_BLOB *left_blob = blob_it.data(); | |
| 1076 TBOX left_box = left_blob->bounding_box(); | |
| 1077 C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1); | |
| 1078 if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr || | |
| 1079 !right_blob->bounding_box().x_overlap(total_ol_box)) && | |
| 1080 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines, | |
| 1081 num_blob_outlines, &blob_wanted)) { | |
| 1082 if (debug_noise_removal) { | |
| 1083 tprintf("Added to left blob\n"); | |
| 1084 } | |
| 1085 for (unsigned j = 0; j < blob_wanted.size(); ++j) { | |
| 1086 if (blob_wanted[j]) { | |
| 1087 (*word_wanted)[j] = true; | |
| 1088 (*target_blobs)[j] = left_blob; | |
| 1089 } | |
| 1090 } | |
| 1091 } else if (right_blob != nullptr && | |
| 1092 (!left_box.x_overlap(total_ol_box) || | |
| 1093 right_blob->bounding_box().x_overlap(total_ol_box)) && | |
| 1094 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines, | |
| 1095 num_blob_outlines, &blob_wanted)) { | |
| 1096 if (debug_noise_removal) { | |
| 1097 tprintf("Added to right blob\n"); | |
| 1098 } | |
| 1099 for (unsigned j = 0; j < blob_wanted.size(); ++j) { | |
| 1100 if (blob_wanted[j]) { | |
| 1101 (*word_wanted)[j] = true; | |
| 1102 (*target_blobs)[j] = right_blob; | |
| 1103 } | |
| 1104 } | |
| 1105 } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines, | |
| 1106 num_blob_outlines, &blob_wanted)) { | |
| 1107 if (debug_noise_removal) { | |
| 1108 tprintf("Fitted between blobs\n"); | |
| 1109 } | |
| 1110 for (unsigned j = 0; j < blob_wanted.size(); ++j) { | |
| 1111 if (blob_wanted[j]) { | |
| 1112 (*word_wanted)[j] = true; | |
| 1113 (*target_blobs)[j] = nullptr; | |
| 1114 } | |
| 1115 } | |
| 1116 } | |
| 1117 } | |
| 1118 } | |
| 1119 | |
| 1120 // Starting with ok_outlines set to indicate which outlines overlap the blob, | |
| 1121 // chooses the optimal set (approximately) and returns true if any outlines | |
| 1122 // are desired, in which case ok_outlines indicates which ones. | |
| 1123 bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, | |
| 1124 C_BLOB *blob, | |
| 1125 const std::vector<C_OUTLINE *> &outlines, | |
| 1126 int num_outlines, std::vector<bool> *ok_outlines) { | |
| 1127 float target_cert = certainty_threshold; | |
| 1128 if (blob != nullptr) { | |
| 1129 std::string best_str; | |
| 1130 float target_c2; | |
| 1131 target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2); | |
| 1132 if (debug_noise_removal) { | |
| 1133 tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert, | |
| 1134 target_c2); | |
| 1135 blob->bounding_box().print(); | |
| 1136 } | |
| 1137 target_cert -= (target_cert - certainty_threshold) * noise_cert_factor; | |
| 1138 } | |
| 1139 std::vector<bool> test_outlines = *ok_outlines; | |
| 1140 // Start with all the outlines in. | |
| 1141 std::string all_str; | |
| 1142 std::vector<bool> best_outlines = *ok_outlines; | |
| 1143 float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str); | |
| 1144 if (debug_noise_removal) { | |
| 1145 TBOX ol_box; | |
| 1146 for (unsigned i = 0; i < test_outlines.size(); ++i) { | |
| 1147 if (test_outlines[i]) { | |
| 1148 ol_box += outlines[i]->bounding_box(); | |
| 1149 } | |
| 1150 } | |
| 1151 tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert, | |
| 1152 best_cert - target_cert); | |
| 1153 ol_box.print(); | |
| 1154 } | |
| 1155 // Iteratively zero out the bit that improves the certainty the most, until | |
| 1156 // we get past the threshold, have zero bits, or fail to improve. | |
| 1157 int best_index = 0; // To zero out. | |
| 1158 while (num_outlines > 1 && best_index >= 0 && | |
| 1159 (blob == nullptr || best_cert < target_cert || blob != nullptr)) { | |
| 1160 // Find the best bit to zero out. | |
| 1161 best_index = -1; | |
| 1162 for (unsigned i = 0; i < outlines.size(); ++i) { | |
| 1163 if (test_outlines[i]) { | |
| 1164 test_outlines[i] = false; | |
| 1165 std::string str; | |
| 1166 float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str); | |
| 1167 if (debug_noise_removal) { | |
| 1168 TBOX ol_box; | |
| 1169 for (unsigned j = 0; j < outlines.size(); ++j) { | |
| 1170 if (test_outlines[j]) { | |
| 1171 ol_box += outlines[j]->bounding_box(); | |
| 1172 } | |
| 1173 tprintf("%c", test_outlines[j] ? 'T' : 'F'); | |
| 1174 } | |
| 1175 tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert, | |
| 1176 cert - target_cert); | |
| 1177 ol_box.print(); | |
| 1178 } | |
| 1179 if (cert > best_cert) { | |
| 1180 best_cert = cert; | |
| 1181 best_index = i; | |
| 1182 best_outlines = test_outlines; | |
| 1183 } | |
| 1184 test_outlines[i] = true; | |
| 1185 } | |
| 1186 } | |
| 1187 if (best_index >= 0) { | |
| 1188 test_outlines[best_index] = false; | |
| 1189 --num_outlines; | |
| 1190 } | |
| 1191 } | |
| 1192 if (best_cert >= target_cert) { | |
| 1193 // Save the best combination. | |
| 1194 *ok_outlines = best_outlines; | |
| 1195 if (debug_noise_removal) { | |
| 1196 tprintf("%s noise combination ", blob ? "Adding" : "New"); | |
| 1197 for (auto &&best_outline : best_outlines) { | |
| 1198 tprintf("%c", best_outline ? 'T' : 'F'); | |
| 1199 } | |
| 1200 tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert); | |
| 1201 } | |
| 1202 return true; | |
| 1203 } | |
| 1204 | |
| 1205 return false; | |
| 1206 } | |
| 1207 | |
| 1208 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes | |
| 1209 // the inclusion of the outlines, and returns the certainty of the raw choice. | |
| 1210 float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines, | |
| 1211 const std::vector<C_OUTLINE *> &outlines, int pass_n, | |
| 1212 PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) { | |
| 1213 C_OUTLINE_IT ol_it; | |
| 1214 C_OUTLINE *first_to_keep = nullptr; | |
| 1215 C_BLOB *local_blob = nullptr; | |
| 1216 if (blob != nullptr) { | |
| 1217 // Add the required outlines to the blob. | |
| 1218 ol_it.set_to_list(blob->out_list()); | |
| 1219 first_to_keep = ol_it.data(); | |
| 1220 } | |
| 1221 for (unsigned i = 0; i < ok_outlines.size(); ++i) { | |
| 1222 if (ok_outlines[i]) { | |
| 1223 // This outline is to be added. | |
| 1224 if (blob == nullptr) { | |
| 1225 local_blob = new C_BLOB(outlines[i]); | |
| 1226 blob = local_blob; | |
| 1227 ol_it.set_to_list(blob->out_list()); | |
| 1228 } else { | |
| 1229 ol_it.add_before_stay_put(outlines[i]); | |
| 1230 } | |
| 1231 } | |
| 1232 } | |
| 1233 float c2; | |
| 1234 float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2); | |
| 1235 ol_it.move_to_first(); | |
| 1236 if (first_to_keep == nullptr) { | |
| 1237 // We created blob. Empty its outlines and delete it. | |
| 1238 for (; !ol_it.empty(); ol_it.forward()) { | |
| 1239 ol_it.extract(); | |
| 1240 } | |
| 1241 delete local_blob; | |
| 1242 cert = -c2; | |
| 1243 } else { | |
| 1244 // Remove the outlines that we put in. | |
| 1245 for (; ol_it.data() != first_to_keep; ol_it.forward()) { | |
| 1246 ol_it.extract(); | |
| 1247 } | |
| 1248 } | |
| 1249 return cert; | |
| 1250 } | |
| 1251 | |
| 1252 // Classifies the given blob (part of word_data->word->word) as an individual | |
| 1253 // word, using languages, chopper etc, returning only the certainty of the | |
| 1254 // best raw choice, and undoing all the work done to fake out the word. | |
| 1255 float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, | |
| 1256 float *c2) { | |
| 1257 WERD *real_word = pr_it->word()->word; | |
| 1258 WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL), | |
| 1259 C_BLOB::deep_copy(blob)); | |
| 1260 WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word); | |
| 1261 // Get a new iterator that points to the new word. | |
| 1262 PAGE_RES_IT it(pr_it->page_res); | |
| 1263 while (it.word() != word_res && it.word() != nullptr) { | |
| 1264 it.forward(); | |
| 1265 } | |
| 1266 ASSERT_HOST(it.word() == word_res); | |
| 1267 WordData wd(it); | |
| 1268 // Force full initialization. | |
| 1269 SetupWordPassN(1, &wd); | |
| 1270 classify_word_and_language(pass_n, &it, &wd); | |
| 1271 if (debug_noise_removal) { | |
| 1272 if (wd.word->raw_choice != nullptr) { | |
| 1273 tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(), | |
| 1274 wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height()); | |
| 1275 } else { | |
| 1276 tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height, | |
| 1277 wd.row->x_height()); | |
| 1278 } | |
| 1279 } | |
| 1280 float cert = 0.0f; | |
| 1281 if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but... | |
| 1282 cert = wd.word->raw_choice->certainty(); | |
| 1283 float rat = wd.word->raw_choice->rating(); | |
| 1284 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f; | |
| 1285 best_str = wd.word->raw_choice->unichar_string(); | |
| 1286 } else { | |
| 1287 *c2 = 0.0f; | |
| 1288 best_str.clear(); | |
| 1289 } | |
| 1290 it.DeleteCurrentWord(); | |
| 1291 pr_it->ResetWordIterator(); | |
| 1292 return cert; | |
| 1293 } | |
| 1294 | |
| 1295 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1296 | |
| 1297 // Generic function for classifying a word. Can be used either for pass1 or | |
| 1298 // pass2 according to the function passed to recognizer. | |
| 1299 // word_data holds the word to be recognized, and its block and row, and | |
| 1300 // pr_it points to the word as well, in case we are running LSTM and it wants | |
| 1301 // to output multiple words. | |
| 1302 // Recognizes in the current language, and if successful that is all. | |
| 1303 // If recognition was not successful, tries all available languages until | |
| 1304 // it gets a successful result or runs out of languages. Keeps the best result. | |
| 1305 void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) { | |
| 1306 #ifdef DISABLED_LEGACY_ENGINE | |
| 1307 WordRecognizer recognizer = &Tesseract::classify_word_pass1; | |
| 1308 #else | |
| 1309 WordRecognizer recognizer = | |
| 1310 pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2; | |
| 1311 #endif // def DISABLED_LEGACY_ENGINE | |
| 1312 | |
| 1313 // Best result so far. | |
| 1314 PointerVector<WERD_RES> best_words; | |
| 1315 // Points to the best result. May be word or in lang_words. | |
| 1316 const WERD_RES *word = word_data->word; | |
| 1317 clock_t total_time = 0; | |
| 1318 const bool timing_debug = tessedit_timing_debug; | |
| 1319 if (timing_debug) { | |
| 1320 total_time = clock(); | |
| 1321 } | |
| 1322 const bool debug = classify_debug_level > 0 || multilang_debug_level > 0; | |
| 1323 if (debug) { | |
| 1324 tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing", | |
| 1325 most_recently_used_->lang.c_str()); | |
| 1326 word->word->bounding_box().print(); | |
| 1327 } | |
| 1328 if (word->done) { | |
| 1329 // If done on pass1, leave it as-is. | |
| 1330 if (!word->tess_failed) { | |
| 1331 most_recently_used_ = word->tesseract; | |
| 1332 } | |
| 1333 return; | |
| 1334 } | |
| 1335 auto sub = sub_langs_.size(); | |
| 1336 if (most_recently_used_ != this) { | |
| 1337 // Get the index of the most_recently_used_. | |
| 1338 for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) { | |
| 1339 } | |
| 1340 } | |
| 1341 most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub], | |
| 1342 &best_words); | |
| 1343 Tesseract *best_lang_tess = most_recently_used_; | |
| 1344 if (!WordsAcceptable(best_words)) { | |
| 1345 // Try all the other languages to see if they are any better. | |
| 1346 if (most_recently_used_ != this && | |
| 1347 this->RetryWithLanguage(*word_data, recognizer, debug, | |
| 1348 &word_data->lang_words[sub_langs_.size()], &best_words) > 0) { | |
| 1349 best_lang_tess = this; | |
| 1350 } | |
| 1351 for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) { | |
| 1352 if (most_recently_used_ != sub_langs_[i] && | |
| 1353 sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i], | |
| 1354 &best_words) > 0) { | |
| 1355 best_lang_tess = sub_langs_[i]; | |
| 1356 } | |
| 1357 } | |
| 1358 } | |
| 1359 most_recently_used_ = best_lang_tess; | |
| 1360 if (!best_words.empty()) { | |
| 1361 if (best_words.size() == 1 && !best_words[0]->combination) { | |
| 1362 // Move the best single result to the main word. | |
| 1363 word_data->word->ConsumeWordResults(best_words[0]); | |
| 1364 } else { | |
| 1365 // Words came from LSTM, and must be moved to the PAGE_RES properly. | |
| 1366 word_data->word = best_words.back(); | |
| 1367 pr_it->ReplaceCurrentWord(&best_words); | |
| 1368 } | |
| 1369 ASSERT_HOST(word_data->word->box_word != nullptr); | |
| 1370 } else { | |
| 1371 tprintf("no best words!!\n"); | |
| 1372 } | |
| 1373 if (timing_debug) { | |
| 1374 total_time = clock() - total_time; | |
| 1375 tesserr << word_data->word->best_choice->unichar_string() | |
| 1376 << " (ocr took " << 1000 * total_time / CLOCKS_PER_SEC << " ms)\n"; | |
| 1377 } | |
| 1378 } | |
| 1379 | |
| 1380 /** | |
| 1381 * classify_word_pass1 | |
| 1382 * | |
| 1383 * Baseline normalize the word and pass it to Tess. | |
| 1384 */ | |
| 1385 | |
| 1386 void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word, | |
| 1387 PointerVector<WERD_RES> *out_words) { | |
| 1388 ROW *row = word_data.row; | |
| 1389 BLOCK *block = word_data.block; | |
| 1390 prev_word_best_choice_ = | |
| 1391 word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr; | |
| 1392 #ifdef DISABLED_LEGACY_ENGINE | |
| 1393 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { | |
| 1394 #else | |
| 1395 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY || | |
| 1396 tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { | |
| 1397 #endif // def DISABLED_LEGACY_ENGINE | |
| 1398 if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { | |
| 1399 LSTMRecognizeWord(*block, row, *in_word, out_words); | |
| 1400 if (!out_words->empty()) { | |
| 1401 return; // Successful lstm recognition. | |
| 1402 } | |
| 1403 } | |
| 1404 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { | |
| 1405 // No fallback allowed, so use a fake. | |
| 1406 (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset()); | |
| 1407 return; | |
| 1408 } | |
| 1409 | |
| 1410 #ifndef DISABLED_LEGACY_ENGINE | |
| 1411 // Fall back to tesseract for failed words or odd words. | |
| 1412 (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr, | |
| 1413 classify_bln_numeric_mode, textord_use_cjk_fp_model, | |
| 1414 poly_allow_detailed_fx, row, block); | |
| 1415 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1416 } | |
| 1417 | |
| 1418 #ifndef DISABLED_LEGACY_ENGINE | |
| 1419 WERD_RES *word = *in_word; | |
| 1420 match_word_pass_n(1, word, row, block); | |
| 1421 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { | |
| 1422 word->tess_would_adapt = AdaptableWord(word); | |
| 1423 bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode); | |
| 1424 | |
| 1425 if (adapt_ok) { | |
| 1426 // Send word to adaptive classifier for training. | |
| 1427 word->BestChoiceToCorrectText(); | |
| 1428 LearnWord(nullptr, word); | |
| 1429 // Mark misadaptions if running blamer. | |
| 1430 if (word->blamer_bundle != nullptr) { | |
| 1431 word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer); | |
| 1432 } | |
| 1433 } | |
| 1434 | |
| 1435 if (tessedit_enable_doc_dict && !word->IsAmbiguous()) { | |
| 1436 tess_add_doc_word(word->best_choice); | |
| 1437 } | |
| 1438 } | |
| 1439 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1440 } | |
| 1441 | |
| 1442 // Helper to report the result of the xheight fix. | |
| 1443 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, | |
| 1444 WERD_RES *new_word) { | |
| 1445 tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(), | |
| 1446 word->best_choice->debug_string().c_str()); | |
| 1447 word->reject_map.print(debug_fp); | |
| 1448 tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(), | |
| 1449 new_word->best_choice->debug_string().c_str()); | |
| 1450 new_word->reject_map.print(debug_fp); | |
| 1451 tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT", | |
| 1452 new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK", | |
| 1453 accept_new_word ? "ACCEPTED" : ""); | |
| 1454 } | |
| 1455 | |
| 1456 #ifndef DISABLED_LEGACY_ENGINE | |
| 1457 | |
| 1458 // Run the x-height fix-up, based on min/max top/bottom information in | |
| 1459 // unicharset. | |
| 1460 // Returns true if the word was changed. | |
| 1461 // See the comment in fixxht.cpp for a description of the overall process. | |
| 1462 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) { | |
| 1463 int original_misfits = CountMisfitTops(word); | |
| 1464 if (original_misfits == 0) { | |
| 1465 return false; | |
| 1466 } | |
| 1467 float baseline_shift = 0.0f; | |
| 1468 float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift); | |
| 1469 if (baseline_shift != 0.0f) { | |
| 1470 // Try the shift on its own first. | |
| 1471 if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) { | |
| 1472 return false; | |
| 1473 } | |
| 1474 original_misfits = CountMisfitTops(word); | |
| 1475 if (original_misfits > 0) { | |
| 1476 float new_baseline_shift; | |
| 1477 // Now recompute the new x_height. | |
| 1478 new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift); | |
| 1479 if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { | |
| 1480 // No test of return value here, as we are definitely making a change | |
| 1481 // to the word by shifting the baseline. | |
| 1482 TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row); | |
| 1483 } | |
| 1484 } | |
| 1485 return true; | |
| 1486 } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { | |
| 1487 return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row); | |
| 1488 } else { | |
| 1489 return false; | |
| 1490 } | |
| 1491 } | |
| 1492 | |
| 1493 // Runs recognition with the test baseline shift and x-height and returns true | |
| 1494 // if there was an improvement in recognition result. | |
| 1495 bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, | |
| 1496 WERD_RES *word, BLOCK *block, ROW *row) { | |
| 1497 bool accept_new_x_ht = false; | |
| 1498 WERD_RES new_x_ht_word(word->word); | |
| 1499 if (word->blamer_bundle != nullptr) { | |
| 1500 new_x_ht_word.blamer_bundle = new BlamerBundle(); | |
| 1501 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); | |
| 1502 } | |
| 1503 new_x_ht_word.x_height = new_x_ht; | |
| 1504 new_x_ht_word.baseline_shift = baseline_shift; | |
| 1505 new_x_ht_word.caps_height = 0.0; | |
| 1506 new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr, | |
| 1507 classify_bln_numeric_mode, textord_use_cjk_fp_model, | |
| 1508 poly_allow_detailed_fx, row, block); | |
| 1509 match_word_pass_n(2, &new_x_ht_word, row, block); | |
| 1510 if (!new_x_ht_word.tess_failed) { | |
| 1511 int new_misfits = CountMisfitTops(&new_x_ht_word); | |
| 1512 if (debug_x_ht_level >= 1) { | |
| 1513 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits, | |
| 1514 word->x_height, new_misfits, new_x_ht); | |
| 1515 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(), | |
| 1516 word->best_choice->certainty(), new_x_ht_word.best_choice->rating(), | |
| 1517 new_x_ht_word.best_choice->certainty()); | |
| 1518 } | |
| 1519 // The misfits must improve and either the rating or certainty. | |
| 1520 accept_new_x_ht = new_misfits < original_misfits && | |
| 1521 (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() || | |
| 1522 new_x_ht_word.best_choice->rating() < word->best_choice->rating()); | |
| 1523 if (debug_x_ht_level >= 1) { | |
| 1524 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); | |
| 1525 } | |
| 1526 } | |
| 1527 if (accept_new_x_ht) { | |
| 1528 word->ConsumeWordResults(&new_x_ht_word); | |
| 1529 return true; | |
| 1530 } | |
| 1531 return false; | |
| 1532 } | |
| 1533 | |
| 1534 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1535 | |
| 1536 /** | |
| 1537 * classify_word_pass2 | |
| 1538 * | |
| 1539 * Control what to do with the word in pass 2 | |
| 1540 */ | |
| 1541 | |
| 1542 void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word, | |
| 1543 PointerVector<WERD_RES> *out_words) { | |
| 1544 // Return if we do not want to run Tesseract. | |
| 1545 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { | |
| 1546 return; | |
| 1547 } | |
| 1548 #ifndef DISABLED_LEGACY_ENGINE | |
| 1549 ROW *row = word_data.row; | |
| 1550 BLOCK *block = word_data.block; | |
| 1551 WERD_RES *word = *in_word; | |
| 1552 prev_word_best_choice_ = | |
| 1553 word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr; | |
| 1554 | |
| 1555 check_debug_pt(word, 30); | |
| 1556 if (!word->done) { | |
| 1557 word->caps_height = 0.0; | |
| 1558 if (word->x_height == 0.0f) { | |
| 1559 word->x_height = row->x_height(); | |
| 1560 } | |
| 1561 match_word_pass_n(2, word, row, block); | |
| 1562 check_debug_pt(word, 40); | |
| 1563 } | |
| 1564 | |
| 1565 SubAndSuperscriptFix(word); | |
| 1566 | |
| 1567 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { | |
| 1568 if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() && | |
| 1569 block->classify_rotation().y() == 0.0f) { | |
| 1570 // Use the tops and bottoms since they are available. | |
| 1571 TrainedXheightFix(word, block, row); | |
| 1572 } | |
| 1573 } | |
| 1574 # ifndef GRAPHICS_DISABLED | |
| 1575 if (tessedit_display_outwords) { | |
| 1576 if (fx_win == nullptr) { | |
| 1577 create_fx_win(); | |
| 1578 } | |
| 1579 clear_fx_win(); | |
| 1580 word->rebuild_word->plot(fx_win); | |
| 1581 TBOX wbox = word->rebuild_word->bounding_box(); | |
| 1582 fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom()); | |
| 1583 ScrollView::Update(); | |
| 1584 } | |
| 1585 # endif | |
| 1586 check_debug_pt(word, 50); | |
| 1587 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1588 } | |
| 1589 | |
| 1590 #ifndef DISABLED_LEGACY_ENGINE | |
| 1591 /** | |
| 1592 * match_word_pass2 | |
| 1593 * | |
| 1594 * Baseline normalize the word and pass it to Tess. | |
| 1595 */ | |
| 1596 void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) { | |
| 1597 if (word->tess_failed) { | |
| 1598 return; | |
| 1599 } | |
| 1600 tess_segment_pass_n(pass_n, word); | |
| 1601 | |
| 1602 if (!word->tess_failed) { | |
| 1603 if (!word->word->flag(W_REP_CHAR)) { | |
| 1604 word->fix_quotes(); | |
| 1605 if (tessedit_fix_hyphens) { | |
| 1606 word->fix_hyphens(); | |
| 1607 } | |
| 1608 /* Don't trust fix_quotes! - though I think I've fixed the bug */ | |
| 1609 if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) { | |
| 1610 tprintf( | |
| 1611 "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" | |
| 1612 " #Blobs=%u\n", | |
| 1613 word->best_choice->debug_string().c_str(), word->best_choice->length(), | |
| 1614 word->box_word->length()); | |
| 1615 } | |
| 1616 word->tess_accepted = tess_acceptable_word(word); | |
| 1617 | |
| 1618 // Also sets word->done flag | |
| 1619 make_reject_map(word, row, pass_n); | |
| 1620 } | |
| 1621 } | |
| 1622 set_word_fonts(word); | |
| 1623 | |
| 1624 ASSERT_HOST(word->raw_choice != nullptr); | |
| 1625 } | |
| 1626 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1627 | |
| 1628 // Helper to return the best rated BLOB_CHOICE in the whole word that matches | |
| 1629 // the given char_id, or nullptr if none can be found. | |
| 1630 static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) { | |
| 1631 // Find the corresponding best BLOB_CHOICE from any position in the word_res. | |
| 1632 BLOB_CHOICE *best_choice = nullptr; | |
| 1633 for (unsigned i = 0; i < word_res->best_choice->length(); ++i) { | |
| 1634 BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i)); | |
| 1635 if (choice != nullptr) { | |
| 1636 if (best_choice == nullptr || choice->rating() < best_choice->rating()) { | |
| 1637 best_choice = choice; | |
| 1638 } | |
| 1639 } | |
| 1640 } | |
| 1641 return best_choice; | |
| 1642 } | |
| 1643 | |
| 1644 // Helper to insert blob_choice in each location in the leader word if there is | |
| 1645 // no matching BLOB_CHOICE there already, and correct any incorrect results | |
| 1646 // in the best_choice. | |
| 1647 static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) { | |
| 1648 WERD_CHOICE *word = word_res->best_choice; | |
| 1649 for (unsigned i = 0; i < word_res->best_choice->length(); ++i) { | |
| 1650 BLOB_CHOICE *choice = | |
| 1651 FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i)); | |
| 1652 if (choice == nullptr) { | |
| 1653 BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i)); | |
| 1654 choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice)); | |
| 1655 } | |
| 1656 } | |
| 1657 // Correct any incorrect results in word. | |
| 1658 for (unsigned i = 0; i < word->length(); ++i) { | |
| 1659 if (word->unichar_id(i) != blob_choice->unichar_id()) { | |
| 1660 word->set_unichar_id(blob_choice->unichar_id(), i); | |
| 1661 } | |
| 1662 } | |
| 1663 } | |
| 1664 | |
| 1665 /** | |
| 1666 * fix_rep_char() | |
| 1667 * The word is a repeated char. (Leader.) Find the repeated char character. | |
| 1668 * Create the appropriate single-word or multi-word sequence according to | |
| 1669 * the size of spaces in between blobs, and correct the classifications | |
| 1670 * where some of the characters disagree with the majority. | |
| 1671 */ | |
| 1672 void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) { | |
| 1673 WERD_RES *word_res = page_res_it->word(); | |
| 1674 const WERD_CHOICE &word = *(word_res->best_choice); | |
| 1675 | |
| 1676 // Find the frequency of each unique character in the word. | |
| 1677 SortHelper<UNICHAR_ID> rep_ch(word.length()); | |
| 1678 for (unsigned i = 0; i < word.length(); ++i) { | |
| 1679 rep_ch.Add(word.unichar_id(i), 1); | |
| 1680 } | |
| 1681 | |
| 1682 // Find the most frequent result. | |
| 1683 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char | |
| 1684 int max_count = rep_ch.MaxCount(&maxch_id); | |
| 1685 // Find the best exemplar of a classifier result for maxch_id. | |
| 1686 BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res); | |
| 1687 if (best_choice == nullptr) { | |
| 1688 tprintf("Failed to find a choice for %s, occurring %d times\n", | |
| 1689 word_res->uch_set->debug_str(maxch_id).c_str(), max_count); | |
| 1690 return; | |
| 1691 } | |
| 1692 word_res->done = true; | |
| 1693 | |
| 1694 // Just correct existing classification. | |
| 1695 CorrectRepcharChoices(best_choice, word_res); | |
| 1696 word_res->reject_map.initialise(word.length()); | |
| 1697 } | |
| 1698 | |
| 1699 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s, | |
| 1700 const char *lengths) { | |
| 1701 int i = 0; | |
| 1702 int offset = 0; | |
| 1703 int leading_punct_count; | |
| 1704 int upper_count = 0; | |
| 1705 int hyphen_pos = -1; | |
| 1706 ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; | |
| 1707 | |
| 1708 if (strlen(lengths) > 20) { | |
| 1709 return word_type; | |
| 1710 } | |
| 1711 | |
| 1712 /* Single Leading punctuation char*/ | |
| 1713 | |
| 1714 if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) { | |
| 1715 offset += lengths[i++]; | |
| 1716 } | |
| 1717 leading_punct_count = i; | |
| 1718 | |
| 1719 /* Initial cap */ | |
| 1720 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) { | |
| 1721 offset += lengths[i++]; | |
| 1722 upper_count++; | |
| 1723 } | |
| 1724 if (upper_count > 1) { | |
| 1725 word_type = AC_UPPER_CASE; | |
| 1726 } else { | |
| 1727 /* Lower case word, possibly with an initial cap */ | |
| 1728 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) { | |
| 1729 offset += lengths[i++]; | |
| 1730 } | |
| 1731 if (i - leading_punct_count < quality_min_initial_alphas_reqd) { | |
| 1732 goto not_a_word; | |
| 1733 } | |
| 1734 /* | |
| 1735 Allow a single hyphen in a lower case word | |
| 1736 - don't trust upper case - I've seen several cases of "H" -> "I-I" | |
| 1737 */ | |
| 1738 if (lengths[i] == 1 && s[offset] == '-') { | |
| 1739 hyphen_pos = i; | |
| 1740 offset += lengths[i++]; | |
| 1741 if (s[offset] != '\0') { | |
| 1742 while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) { | |
| 1743 offset += lengths[i++]; | |
| 1744 } | |
| 1745 if (i < hyphen_pos + 3) { | |
| 1746 goto not_a_word; | |
| 1747 } | |
| 1748 } | |
| 1749 } else { | |
| 1750 /* Allow "'s" in NON hyphenated lower case words */ | |
| 1751 if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 && | |
| 1752 (s[offset + lengths[i]] == 's')) { | |
| 1753 offset += lengths[i++]; | |
| 1754 offset += lengths[i++]; | |
| 1755 } | |
| 1756 } | |
| 1757 if (upper_count > 0) { | |
| 1758 word_type = AC_INITIAL_CAP; | |
| 1759 } else { | |
| 1760 word_type = AC_LOWER_CASE; | |
| 1761 } | |
| 1762 } | |
| 1763 | |
| 1764 /* Up to two different, constrained trailing punctuation chars */ | |
| 1765 if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) { | |
| 1766 offset += lengths[i++]; | |
| 1767 } | |
| 1768 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] && | |
| 1769 chs_trailing_punct2.contains(s[offset])) { | |
| 1770 offset += lengths[i++]; | |
| 1771 } | |
| 1772 | |
| 1773 if (s[offset] != '\0') { | |
| 1774 word_type = AC_UNACCEPTABLE; | |
| 1775 } | |
| 1776 | |
| 1777 not_a_word: | |
| 1778 | |
| 1779 if (word_type == AC_UNACCEPTABLE) { | |
| 1780 /* Look for abbreviation string */ | |
| 1781 i = 0; | |
| 1782 offset = 0; | |
| 1783 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) { | |
| 1784 word_type = AC_UC_ABBREV; | |
| 1785 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) && | |
| 1786 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { | |
| 1787 offset += lengths[i++]; | |
| 1788 offset += lengths[i++]; | |
| 1789 } | |
| 1790 } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) { | |
| 1791 word_type = AC_LC_ABBREV; | |
| 1792 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) && | |
| 1793 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { | |
| 1794 offset += lengths[i++]; | |
| 1795 offset += lengths[i++]; | |
| 1796 } | |
| 1797 } | |
| 1798 if (s[offset] != '\0') { | |
| 1799 word_type = AC_UNACCEPTABLE; | |
| 1800 } | |
| 1801 } | |
| 1802 | |
| 1803 return word_type; | |
| 1804 } | |
| 1805 | |
| 1806 bool Tesseract::check_debug_pt(WERD_RES *word, int location) { | |
| 1807 if (!test_pt) { | |
| 1808 return false; | |
| 1809 } | |
| 1810 | |
| 1811 tessedit_rejection_debug.set_value(false); | |
| 1812 debug_x_ht_level.set_value(0); | |
| 1813 | |
| 1814 if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) { | |
| 1815 if (location < 0) { | |
| 1816 return true; // For breakpoint use | |
| 1817 } | |
| 1818 bool show_map_detail = false; | |
| 1819 tessedit_rejection_debug.set_value(true); | |
| 1820 debug_x_ht_level.set_value(2); | |
| 1821 tprintf("\n\nTESTWD::"); | |
| 1822 switch (location) { | |
| 1823 case 0: | |
| 1824 tprintf("classify_word_pass1 start\n"); | |
| 1825 word->word->print(); | |
| 1826 break; | |
| 1827 case 10: | |
| 1828 tprintf("make_reject_map: initial map"); | |
| 1829 break; | |
| 1830 case 20: | |
| 1831 tprintf("make_reject_map: after NN"); | |
| 1832 break; | |
| 1833 case 30: | |
| 1834 tprintf("classify_word_pass2 - START"); | |
| 1835 break; | |
| 1836 case 40: | |
| 1837 tprintf("classify_word_pass2 - Pre Xht"); | |
| 1838 break; | |
| 1839 case 50: | |
| 1840 tprintf("classify_word_pass2 - END"); | |
| 1841 show_map_detail = true; | |
| 1842 break; | |
| 1843 case 60: | |
| 1844 tprintf("fixspace"); | |
| 1845 break; | |
| 1846 case 70: | |
| 1847 tprintf("MM pass START"); | |
| 1848 break; | |
| 1849 case 80: | |
| 1850 tprintf("MM pass END"); | |
| 1851 break; | |
| 1852 case 90: | |
| 1853 tprintf("After Poor quality rejection"); | |
| 1854 break; | |
| 1855 case 100: | |
| 1856 tprintf("unrej_good_quality_words - START"); | |
| 1857 break; | |
| 1858 case 110: | |
| 1859 tprintf("unrej_good_quality_words - END"); | |
| 1860 break; | |
| 1861 case 120: | |
| 1862 tprintf("Write results pass"); | |
| 1863 show_map_detail = true; | |
| 1864 break; | |
| 1865 } | |
| 1866 if (word->best_choice != nullptr) { | |
| 1867 tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str()); | |
| 1868 word->reject_map.print(debug_fp); | |
| 1869 tprintf("\n"); | |
| 1870 if (show_map_detail) { | |
| 1871 tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 1872 for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { | |
| 1873 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); | |
| 1874 word->reject_map[i].full_print(debug_fp); | |
| 1875 } | |
| 1876 } | |
| 1877 } else { | |
| 1878 tprintf("null best choice\n"); | |
| 1879 } | |
| 1880 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); | |
| 1881 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); | |
| 1882 return true; | |
| 1883 } else { | |
| 1884 return false; | |
| 1885 } | |
| 1886 } | |
| 1887 | |
| 1888 /** | |
| 1889 * find_modal_font | |
| 1890 * | |
| 1891 * Find the modal font and remove from the stats. | |
| 1892 */ | |
| 1893 #ifndef DISABLED_LEGACY_ENGINE | |
| 1894 static void find_modal_font( // good chars in word | |
| 1895 STATS *fonts, // font stats | |
| 1896 int16_t *font_out, // output font | |
| 1897 int8_t *font_count // output count | |
| 1898 ) { | |
| 1899 if (fonts->get_total() > 0) { | |
| 1900 // font index | |
| 1901 int16_t font = static_cast<int16_t>(fonts->mode()); | |
| 1902 *font_out = font; | |
| 1903 // pile count | |
| 1904 int32_t count = fonts->pile_count(font); | |
| 1905 *font_count = count < INT8_MAX ? count : INT8_MAX; | |
| 1906 fonts->add(font, -*font_count); | |
| 1907 } else { | |
| 1908 *font_out = -1; | |
| 1909 *font_count = 0; | |
| 1910 } | |
| 1911 } | |
| 1912 #endif // ! DISABLED_LEGACY_ENGINE | |
| 1913 | |
| 1914 /** | |
| 1915 * set_word_fonts | |
| 1916 * | |
| 1917 * Get the fonts for the word. | |
| 1918 */ | |
| 1919 void Tesseract::set_word_fonts(WERD_RES *word) { | |
| 1920 // Don't try to set the word fonts for an lstm word, as the configs | |
| 1921 // will be meaningless. | |
| 1922 if (word->chopped_word == nullptr) { | |
| 1923 return; | |
| 1924 } | |
| 1925 ASSERT_HOST(word->best_choice != nullptr); | |
| 1926 | |
| 1927 #ifndef DISABLED_LEGACY_ENGINE | |
| 1928 const int fontinfo_size = fontinfo_table_.size(); | |
| 1929 if (fontinfo_size == 0) { | |
| 1930 return; | |
| 1931 } | |
| 1932 if (tessedit_font_id > 0) { | |
| 1933 if (tessedit_font_id >= fontinfo_size) { | |
| 1934 tprintf("Error, invalid font ID provided: must be below %d.\n" | |
| 1935 "Falling back to font auto-detection.\n", fontinfo_size); | |
| 1936 } else { | |
| 1937 word->fontinfo = &fontinfo_table_.at(tessedit_font_id); | |
| 1938 word->fontinfo2 = nullptr; | |
| 1939 word->fontinfo_id_count = INT8_MAX; | |
| 1940 word->fontinfo_id2_count = 0; | |
| 1941 return; | |
| 1942 } | |
| 1943 } | |
| 1944 std::vector<int> font_total_score(fontinfo_size); | |
| 1945 | |
| 1946 // Compute the font scores for the word | |
| 1947 if (tessedit_debug_fonts) { | |
| 1948 tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str()); | |
| 1949 } | |
| 1950 for (unsigned b = 0; b < word->best_choice->length(); ++b) { | |
| 1951 const BLOB_CHOICE *choice = word->GetBlobChoice(b); | |
| 1952 if (choice == nullptr) { | |
| 1953 continue; | |
| 1954 } | |
| 1955 auto &fonts = choice->fonts(); | |
| 1956 for (auto &f : fonts) { | |
| 1957 const int fontinfo_id = f.fontinfo_id; | |
| 1958 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) { | |
| 1959 font_total_score[fontinfo_id] += f.score; | |
| 1960 } | |
| 1961 } | |
| 1962 } | |
| 1963 // Find the top and 2nd choice for the word. | |
| 1964 int score1 = 0, score2 = 0; | |
| 1965 int16_t font_id1 = -1, font_id2 = -1; | |
| 1966 for (int f = 0; f < fontinfo_size; ++f) { | |
| 1967 if (tessedit_debug_fonts && font_total_score[f] > 0) { | |
| 1968 tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]); | |
| 1969 } | |
| 1970 if (font_total_score[f] > score1) { | |
| 1971 score2 = score1; | |
| 1972 font_id2 = font_id1; | |
| 1973 score1 = font_total_score[f]; | |
| 1974 font_id1 = f; | |
| 1975 } else if (font_total_score[f] > score2) { | |
| 1976 score2 = font_total_score[f]; | |
| 1977 font_id2 = f; | |
| 1978 } | |
| 1979 } | |
| 1980 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr; | |
| 1981 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr; | |
| 1982 // Each score has a limit of UINT16_MAX, so divide by that to get the number | |
| 1983 // of "votes" for that font, ie number of perfect scores. | |
| 1984 word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX); | |
| 1985 word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX); | |
| 1986 if (score1 > 0) { | |
| 1987 const FontInfo fi = fontinfo_table_.at(font_id1); | |
| 1988 if (tessedit_debug_fonts) { | |
| 1989 if (word->fontinfo_id2_count > 0 && font_id2 >= 0) { | |
| 1990 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name, | |
| 1991 word->fontinfo_id_count, fontinfo_table_.at(font_id2).name, | |
| 1992 word->fontinfo_id2_count); | |
| 1993 } else { | |
| 1994 tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count); | |
| 1995 } | |
| 1996 } | |
| 1997 } | |
| 1998 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1999 } | |
| 2000 | |
| 2001 #ifndef DISABLED_LEGACY_ENGINE | |
| 2002 /** | |
| 2003 * font_recognition_pass | |
| 2004 * | |
| 2005 * Smooth the fonts for the document. | |
| 2006 */ | |
| 2007 void Tesseract::font_recognition_pass(PAGE_RES *page_res) { | |
| 2008 PAGE_RES_IT page_res_it(page_res); | |
| 2009 WERD_RES *word; // current word | |
| 2010 STATS doc_fonts(0, font_table_size_ - 1); // font counters | |
| 2011 | |
| 2012 // Gather font id statistics. | |
| 2013 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { | |
| 2014 word = page_res_it.word(); | |
| 2015 if (word->fontinfo != nullptr) { | |
| 2016 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count); | |
| 2017 } | |
| 2018 if (word->fontinfo2 != nullptr) { | |
| 2019 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count); | |
| 2020 } | |
| 2021 } | |
| 2022 int16_t doc_font; // modal font | |
| 2023 int8_t doc_font_count; // modal font | |
| 2024 find_modal_font(&doc_fonts, &doc_font, &doc_font_count); | |
| 2025 if (doc_font_count == 0) { | |
| 2026 return; | |
| 2027 } | |
| 2028 // Get the modal font pointer. | |
| 2029 const FontInfo *modal_font = nullptr; | |
| 2030 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { | |
| 2031 word = page_res_it.word(); | |
| 2032 if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) { | |
| 2033 modal_font = word->fontinfo; | |
| 2034 break; | |
| 2035 } | |
| 2036 if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) { | |
| 2037 modal_font = word->fontinfo2; | |
| 2038 break; | |
| 2039 } | |
| 2040 } | |
| 2041 ASSERT_HOST(modal_font != nullptr); | |
| 2042 | |
| 2043 // Assign modal font to weak words. | |
| 2044 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { | |
| 2045 word = page_res_it.word(); | |
| 2046 const int length = word->best_choice->length(); | |
| 2047 | |
| 2048 const int count = word->fontinfo_id_count; | |
| 2049 if (!(count == length || (length > 3 && count >= length * 3 / 4))) { | |
| 2050 word->fontinfo = modal_font; | |
| 2051 // Counts only get 1 as it came from the doc. | |
| 2052 word->fontinfo_id_count = 1; | |
| 2053 } | |
| 2054 } | |
| 2055 } | |
| 2056 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 2057 | |
| 2058 // If a word has multiple alternates check if the best choice is in the | |
| 2059 // dictionary. If not, replace it with an alternate that exists in the | |
| 2060 // dictionary. | |
| 2061 void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) { | |
| 2062 PAGE_RES_IT word_it(page_res); | |
| 2063 for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) { | |
| 2064 if (word->best_choices.singleton()) { | |
| 2065 continue; // There are no alternates. | |
| 2066 } | |
| 2067 | |
| 2068 const WERD_CHOICE *best = word->best_choice; | |
| 2069 if (word->tesseract->getDict().valid_word(*best) != 0) { | |
| 2070 continue; // The best choice is in the dictionary. | |
| 2071 } | |
| 2072 | |
| 2073 WERD_CHOICE_IT choice_it(&word->best_choices); | |
| 2074 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { | |
| 2075 WERD_CHOICE *alternate = choice_it.data(); | |
| 2076 if (word->tesseract->getDict().valid_word(*alternate)) { | |
| 2077 // The alternate choice is in the dictionary. | |
| 2078 if (tessedit_bigram_debug) { | |
| 2079 tprintf("Dictionary correction replaces best choice '%s' with '%s'\n", | |
| 2080 best->unichar_string().c_str(), alternate->unichar_string().c_str()); | |
| 2081 } | |
| 2082 // Replace the 'best' choice with a better choice. | |
| 2083 word->ReplaceBestChoice(alternate); | |
| 2084 break; | |
| 2085 } | |
| 2086 } | |
| 2087 } | |
| 2088 } | |
| 2089 | |
| 2090 } // namespace tesseract |
