Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/dict/stopper.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 ** Filename: stopper.cpp | |
| 3 ** Purpose: Stopping criteria for word classifier. | |
| 4 ** Author: Dan Johnson | |
| 5 ** | |
| 6 ** (c) Copyright Hewlett-Packard Company, 1988. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 ******************************************************************************/ | |
| 17 | |
| 18 #include <cctype> | |
| 19 #include <cmath> | |
| 20 #include <cstdio> | |
| 21 #include <cstring> | |
| 22 | |
| 23 #include "stopper.h" | |
| 24 #ifndef DISABLED_LEGACY_ENGINE | |
| 25 # include "ambigs.h" | |
| 26 #endif | |
| 27 #include <tesseract/unichar.h> | |
| 28 #include "ccutil.h" | |
| 29 #include "dict.h" | |
| 30 #include "helpers.h" | |
| 31 #include "matchdefs.h" | |
| 32 #include "pageres.h" | |
| 33 #include "params.h" | |
| 34 #include "ratngs.h" | |
| 35 | |
| 36 /*---------------------------------------------------------------------------- | |
| 37 Private Code | |
| 38 ----------------------------------------------------------------------------*/ | |
| 39 | |
| 40 namespace tesseract { | |
| 41 | |
| 42 bool Dict::AcceptableChoice(const WERD_CHOICE &best_choice, | |
| 43 XHeightConsistencyEnum xheight_consistency) { | |
| 44 float CertaintyThreshold = stopper_nondict_certainty_base; | |
| 45 int WordSize; | |
| 46 | |
| 47 if (stopper_no_acceptable_choices) { | |
| 48 return false; | |
| 49 } | |
| 50 | |
| 51 if (best_choice.empty()) { | |
| 52 return false; | |
| 53 } | |
| 54 | |
| 55 bool no_dang_ambigs = !best_choice.dangerous_ambig_found(); | |
| 56 bool is_valid_word = valid_word_permuter(best_choice.permuter(), false); | |
| 57 bool is_case_ok = case_ok(best_choice); | |
| 58 | |
| 59 if (stopper_debug_level >= 1) { | |
| 60 const char *xht = "UNKNOWN"; | |
| 61 switch (xheight_consistency) { | |
| 62 case XH_GOOD: | |
| 63 xht = "NORMAL"; | |
| 64 break; | |
| 65 case XH_SUBNORMAL: | |
| 66 xht = "SUBNORMAL"; | |
| 67 break; | |
| 68 case XH_INCONSISTENT: | |
| 69 xht = "INCONSISTENT"; | |
| 70 break; | |
| 71 default: | |
| 72 xht = "UNKNOWN"; | |
| 73 } | |
| 74 tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n", | |
| 75 best_choice.unichar_string().c_str(), (is_valid_word ? 'y' : 'n'), | |
| 76 (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height()); | |
| 77 } | |
| 78 // Do not accept invalid words in PASS1. | |
| 79 if (reject_offset_ <= 0.0f && !is_valid_word) { | |
| 80 return false; | |
| 81 } | |
| 82 if (is_valid_word && is_case_ok) { | |
| 83 WordSize = LengthOfShortestAlphaRun(best_choice); | |
| 84 WordSize -= stopper_smallword_size; | |
| 85 if (WordSize < 0) { | |
| 86 WordSize = 0; | |
| 87 } | |
| 88 CertaintyThreshold += WordSize * stopper_certainty_per_char; | |
| 89 } | |
| 90 | |
| 91 if (stopper_debug_level >= 1) { | |
| 92 tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n", | |
| 93 best_choice.rating(), best_choice.certainty(), CertaintyThreshold); | |
| 94 } | |
| 95 | |
| 96 if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold && | |
| 97 xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) { | |
| 98 return true; | |
| 99 } else { | |
| 100 if (stopper_debug_level >= 1) { | |
| 101 tprintf( | |
| 102 "AcceptableChoice() returned false" | |
| 103 " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n", | |
| 104 no_dang_ambigs, best_choice.certainty(), CertaintyThreshold, | |
| 105 UniformCertainties(best_choice)); | |
| 106 } | |
| 107 return false; | |
| 108 } | |
| 109 } | |
| 110 | |
| 111 bool Dict::AcceptableResult(WERD_RES *word) const { | |
| 112 if (word->best_choice == nullptr) { | |
| 113 return false; | |
| 114 } | |
| 115 float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; | |
| 116 int WordSize; | |
| 117 | |
| 118 if (stopper_debug_level >= 1) { | |
| 119 tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n", | |
| 120 word->best_choice->debug_string().c_str(), (valid_word(*word->best_choice) ? 'y' : 'n'), | |
| 121 (case_ok(*word->best_choice) ? 'y' : 'n'), | |
| 122 word->best_choice->dangerous_ambig_found() ? 'n' : 'y', | |
| 123 word->best_choices.singleton() ? 'n' : 'y'); | |
| 124 } | |
| 125 | |
| 126 if (word->best_choice->empty() || !word->best_choices.singleton()) { | |
| 127 return false; | |
| 128 } | |
| 129 if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) { | |
| 130 WordSize = LengthOfShortestAlphaRun(*word->best_choice); | |
| 131 WordSize -= stopper_smallword_size; | |
| 132 if (WordSize < 0) { | |
| 133 WordSize = 0; | |
| 134 } | |
| 135 CertaintyThreshold += WordSize * stopper_certainty_per_char; | |
| 136 } | |
| 137 | |
| 138 if (stopper_debug_level >= 1) { | |
| 139 tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", word->best_choice->certainty(), | |
| 140 CertaintyThreshold); | |
| 141 } | |
| 142 | |
| 143 if (word->best_choice->certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) { | |
| 144 if (stopper_debug_level >= 1) { | |
| 145 tprintf("ACCEPTED\n"); | |
| 146 } | |
| 147 return true; | |
| 148 } else { | |
| 149 if (stopper_debug_level >= 1) { | |
| 150 tprintf("REJECTED\n"); | |
| 151 } | |
| 152 return false; | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 #if !defined(DISABLED_LEGACY_ENGINE) | |
| 157 | |
| 158 bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_replaceable, | |
| 159 MATRIX *ratings) { | |
| 160 if (stopper_debug_level > 2) { | |
| 161 tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().c_str()); | |
| 162 } | |
| 163 | |
| 164 // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities | |
| 165 // for each unichar id in BestChoice. | |
| 166 BLOB_CHOICE_LIST_VECTOR ambig_blob_choices; | |
| 167 bool ambigs_found = false; | |
| 168 // For each position in best_choice: | |
| 169 // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i] | |
| 170 // -- initialize wrong_ngram with a single unichar_id at best_choice[i] | |
| 171 // -- look for ambiguities corresponding to wrong_ngram in the list while | |
| 172 // adding the following unichar_ids from best_choice to wrong_ngram | |
| 173 // | |
| 174 // Repeat the above procedure twice: first time look through | |
| 175 // ambigs to be replaced and replace all the ambiguities found; | |
| 176 // second time look through dangerous ambiguities and construct | |
| 177 // ambig_blob_choices with fake a blob choice for each ambiguity | |
| 178 // and pass them to dawg_permute_and_select() to search for | |
| 179 // ambiguous words in the dictionaries. | |
| 180 // | |
| 181 // Note that during the execution of the for loop (on the first pass) | |
| 182 // if replacements are made the length of best_choice might change. | |
| 183 for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) { | |
| 184 bool replace = (fix_replaceable && pass == 0); | |
| 185 const UnicharAmbigsVector &table = | |
| 186 replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs(); | |
| 187 if (!replace) { | |
| 188 // Initialize ambig_blob_choices with lists containing a single | |
| 189 // unichar id for the corresponding position in best_choice. | |
| 190 // best_choice consisting from only the original letters will | |
| 191 // have a rating of 0.0. | |
| 192 for (unsigned i = 0; i < best_choice->length(); ++i) { | |
| 193 auto *lst = new BLOB_CHOICE_LIST(); | |
| 194 BLOB_CHOICE_IT lst_it(lst); | |
| 195 // TODO(rays/antonova) Put real xheights and y shifts here. | |
| 196 lst_it.add_to_end( | |
| 197 new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); | |
| 198 ambig_blob_choices.push_back(lst); | |
| 199 } | |
| 200 } | |
| 201 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; | |
| 202 int wrong_ngram_index; | |
| 203 int blob_index = 0; | |
| 204 for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) { | |
| 205 auto curr_unichar_id = best_choice->unichar_id(i); | |
| 206 if (stopper_debug_level > 2) { | |
| 207 tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous", | |
| 208 getUnicharset().debug_str(curr_unichar_id).c_str()); | |
| 209 } | |
| 210 int num_wrong_blobs = best_choice->state(i); | |
| 211 wrong_ngram_index = 0; | |
| 212 wrong_ngram[wrong_ngram_index] = curr_unichar_id; | |
| 213 if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() || | |
| 214 table[curr_unichar_id] == nullptr) { | |
| 215 continue; // there is no ambig spec for this unichar id | |
| 216 } | |
| 217 AmbigSpec_IT spec_it(table[curr_unichar_id]); | |
| 218 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) { | |
| 219 const AmbigSpec *ambig_spec = spec_it.data(); | |
| 220 wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID; | |
| 221 int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram); | |
| 222 if (stopper_debug_level > 2) { | |
| 223 tprintf("candidate ngram: "); | |
| 224 UnicharIdArrayUtils::print(wrong_ngram, getUnicharset()); | |
| 225 tprintf("current ngram from spec: "); | |
| 226 UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset()); | |
| 227 tprintf("comparison result: %d\n", compare); | |
| 228 } | |
| 229 if (compare == 0) { | |
| 230 // Record the place where we found an ambiguity. | |
| 231 if (fixpt != nullptr) { | |
| 232 UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0]; | |
| 233 fixpt->push_back(DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace, | |
| 234 getUnicharset().get_isngram(ambig_spec->correct_ngram_id), | |
| 235 leftmost_id)); | |
| 236 if (stopper_debug_level > 1) { | |
| 237 tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false, | |
| 238 getUnicharset().get_isngram(ambig_spec->correct_ngram_id), | |
| 239 getUnicharset().id_to_unichar(leftmost_id)); | |
| 240 } | |
| 241 } | |
| 242 | |
| 243 if (replace) { | |
| 244 if (stopper_debug_level > 2) { | |
| 245 tprintf("replace ambiguity with %s : ", | |
| 246 getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id)); | |
| 247 UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset()); | |
| 248 } | |
| 249 ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice, | |
| 250 ratings); | |
| 251 } else if (i > 0 || ambig_spec->type != CASE_AMBIG) { | |
| 252 // We found dang ambig - update ambig_blob_choices. | |
| 253 if (stopper_debug_level > 2) { | |
| 254 tprintf("found ambiguity: "); | |
| 255 UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset()); | |
| 256 } | |
| 257 ambigs_found = true; | |
| 258 for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) { | |
| 259 // Add a blob choice for the corresponding fragment of the | |
| 260 // ambiguity. These fake blob choices are initialized with | |
| 261 // negative ratings (which are not possible for real blob | |
| 262 // choices), so that dawg_permute_and_select() considers any | |
| 263 // word not consisting of only the original letters a better | |
| 264 // choice and stops searching for alternatives once such a | |
| 265 // choice is found. | |
| 266 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]); | |
| 267 bc_it.add_to_end(new BLOB_CHOICE(ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, | |
| 268 -1, 0, 1, 0, BCC_AMBIG)); | |
| 269 } | |
| 270 } | |
| 271 spec_it.forward(); | |
| 272 } else if (compare == -1) { | |
| 273 unsigned next_index; | |
| 274 if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size && | |
| 275 ((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) { | |
| 276 // Add the next unichar id to wrong_ngram and keep looking for | |
| 277 // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST. | |
| 278 wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index); | |
| 279 num_wrong_blobs += best_choice->state(next_index); | |
| 280 } else { | |
| 281 break; // no more matching ambigs in this AMBIG_SPEC_LIST | |
| 282 } | |
| 283 } else { | |
| 284 spec_it.forward(); | |
| 285 } | |
| 286 } // end searching AmbigSpec_LIST | |
| 287 } // end searching best_choice | |
| 288 } // end searching replace and dangerous ambigs | |
| 289 | |
| 290 // If any ambiguities were found permute the constructed ambig_blob_choices | |
| 291 // to see if an alternative dictionary word can be found. | |
| 292 if (ambigs_found) { | |
| 293 if (stopper_debug_level > 2) { | |
| 294 tprintf("\nResulting ambig_blob_choices:\n"); | |
| 295 for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) { | |
| 296 print_ratings_list("", ambig_blob_choices.at(i), getUnicharset()); | |
| 297 tprintf("\n"); | |
| 298 } | |
| 299 } | |
| 300 WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0); | |
| 301 ambigs_found = (alt_word->rating() < 0.0); | |
| 302 if (ambigs_found) { | |
| 303 if (stopper_debug_level >= 1) { | |
| 304 tprintf("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().c_str()); | |
| 305 } | |
| 306 if (fixpt != nullptr) { | |
| 307 // Note: Currently character choices combined from fragments can only | |
| 308 // be generated by NoDangrousAmbigs(). This code should be updated if | |
| 309 // the capability to produce classifications combined from character | |
| 310 // fragments is added to other functions. | |
| 311 int orig_i = 0; | |
| 312 for (unsigned i = 0; i < alt_word->length(); ++i) { | |
| 313 const UNICHARSET &uchset = getUnicharset(); | |
| 314 bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i)); | |
| 315 UNICHAR_ID leftmost_id = alt_word->unichar_id(i); | |
| 316 if (replacement_is_ngram) { | |
| 317 // we have to extract the leftmost unichar from the ngram. | |
| 318 const char *str = uchset.id_to_unichar(leftmost_id); | |
| 319 int step = uchset.step(str); | |
| 320 if (step) { | |
| 321 leftmost_id = uchset.unichar_to_id(str, step); | |
| 322 } | |
| 323 } | |
| 324 int end_i = orig_i + alt_word->state(i); | |
| 325 if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) { | |
| 326 // Compute proper blob indices. | |
| 327 int blob_start = 0; | |
| 328 for (int j = 0; j < orig_i; ++j) { | |
| 329 blob_start += best_choice->state(j); | |
| 330 } | |
| 331 int blob_end = blob_start; | |
| 332 for (int j = orig_i; j < end_i; ++j) { | |
| 333 blob_end += best_choice->state(j); | |
| 334 } | |
| 335 fixpt->push_back( | |
| 336 DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id)); | |
| 337 if (stopper_debug_level > 1) { | |
| 338 tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true, | |
| 339 replacement_is_ngram, uchset.id_to_unichar(leftmost_id)); | |
| 340 } | |
| 341 } | |
| 342 orig_i += alt_word->state(i); | |
| 343 } | |
| 344 } | |
| 345 } | |
| 346 delete alt_word; | |
| 347 } | |
| 348 if (output_ambig_words_file_ != nullptr) { | |
| 349 fprintf(output_ambig_words_file_, "\n"); | |
| 350 } | |
| 351 | |
| 352 for (auto data : ambig_blob_choices) { | |
| 353 delete data; | |
| 354 } | |
| 355 return !ambigs_found; | |
| 356 } | |
| 357 | |
| 358 void Dict::EndDangerousAmbigs() {} | |
| 359 | |
| 360 #endif // !defined(DISABLED_LEGACY_ENGINE) | |
| 361 | |
| 362 void Dict::SetupStopperPass1() { | |
| 363 reject_offset_ = 0.0; | |
| 364 } | |
| 365 | |
| 366 void Dict::SetupStopperPass2() { | |
| 367 reject_offset_ = stopper_phase2_certainty_rejection_offset; | |
| 368 } | |
| 369 | |
| 370 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, | |
| 371 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) { | |
| 372 int num_blobs_to_replace = 0; | |
| 373 int begin_blob_index = 0; | |
| 374 int i; | |
| 375 // Rating and certainty for the new BLOB_CHOICE are derived from the | |
| 376 // replaced choices. | |
| 377 float new_rating = 0.0f; | |
| 378 float new_certainty = 0.0f; | |
| 379 BLOB_CHOICE *old_choice = nullptr; | |
| 380 for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { | |
| 381 if (i >= wrong_ngram_begin_index) { | |
| 382 int num_blobs = werd_choice->state(i); | |
| 383 int col = begin_blob_index + num_blobs_to_replace; | |
| 384 int row = col + num_blobs - 1; | |
| 385 BLOB_CHOICE_LIST *choices = ratings->get(col, row); | |
| 386 ASSERT_HOST(choices != nullptr); | |
| 387 old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices); | |
| 388 ASSERT_HOST(old_choice != nullptr); | |
| 389 new_rating += old_choice->rating(); | |
| 390 new_certainty += old_choice->certainty(); | |
| 391 num_blobs_to_replace += num_blobs; | |
| 392 } else { | |
| 393 begin_blob_index += werd_choice->state(i); | |
| 394 } | |
| 395 } | |
| 396 new_certainty /= wrong_ngram_size; | |
| 397 // If there is no entry in the ratings matrix, add it. | |
| 398 MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1); | |
| 399 if (!coord.Valid(*ratings)) { | |
| 400 ratings->IncreaseBandSize(coord.row - coord.col + 1); | |
| 401 } | |
| 402 if (ratings->get(coord.col, coord.row) == nullptr) { | |
| 403 ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST); | |
| 404 } | |
| 405 BLOB_CHOICE_LIST *new_choices = ratings->get(coord.col, coord.row); | |
| 406 BLOB_CHOICE *choice = FindMatchingChoice(correct_ngram_id, new_choices); | |
| 407 if (choice != nullptr) { | |
| 408 // Already there. Upgrade if new rating better. | |
| 409 if (new_rating < choice->rating()) { | |
| 410 choice->set_rating(new_rating); | |
| 411 } | |
| 412 if (new_certainty < choice->certainty()) { | |
| 413 choice->set_certainty(new_certainty); | |
| 414 } | |
| 415 // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState. | |
| 416 } else { | |
| 417 // Need a new choice with the correct_ngram_id. | |
| 418 choice = new BLOB_CHOICE(*old_choice); | |
| 419 choice->set_unichar_id(correct_ngram_id); | |
| 420 choice->set_rating(new_rating); | |
| 421 choice->set_certainty(new_certainty); | |
| 422 choice->set_classifier(BCC_AMBIG); | |
| 423 choice->set_matrix_cell(coord.col, coord.row); | |
| 424 BLOB_CHOICE_IT it(new_choices); | |
| 425 it.add_to_end(choice); | |
| 426 } | |
| 427 // Remove current unichar from werd_choice. On the last iteration | |
| 428 // set the correct replacement unichar instead of removing a unichar. | |
| 429 for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) { | |
| 430 if (replaced_count + 1 == wrong_ngram_size) { | |
| 431 werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice); | |
| 432 } else { | |
| 433 werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1); | |
| 434 } | |
| 435 } | |
| 436 if (stopper_debug_level >= 1) { | |
| 437 werd_choice->print("ReplaceAmbig() "); | |
| 438 tprintf("Modified blob_choices: "); | |
| 439 print_ratings_list("\n", new_choices, getUnicharset()); | |
| 440 } | |
| 441 } | |
| 442 | |
| 443 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const { | |
| 444 int shortest = INT32_MAX; | |
| 445 int curr_len = 0; | |
| 446 for (unsigned w = 0; w < WordChoice.length(); ++w) { | |
| 447 if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) { | |
| 448 curr_len++; | |
| 449 } else if (curr_len > 0) { | |
| 450 if (curr_len < shortest) { | |
| 451 shortest = curr_len; | |
| 452 } | |
| 453 curr_len = 0; | |
| 454 } | |
| 455 } | |
| 456 if (curr_len > 0 && curr_len < shortest) { | |
| 457 shortest = curr_len; | |
| 458 } else if (shortest == INT32_MAX) { | |
| 459 shortest = 0; | |
| 460 } | |
| 461 return shortest; | |
| 462 } | |
| 463 | |
| 464 int Dict::UniformCertainties(const WERD_CHOICE &word) { | |
| 465 float Certainty; | |
| 466 float WorstCertainty = FLT_MAX; | |
| 467 float CertaintyThreshold; | |
| 468 double TotalCertainty; | |
| 469 double TotalCertaintySquared; | |
| 470 double Variance; | |
| 471 float Mean, StdDev; | |
| 472 int word_length = word.length(); | |
| 473 | |
| 474 if (word_length < 3) { | |
| 475 return true; | |
| 476 } | |
| 477 | |
| 478 TotalCertainty = TotalCertaintySquared = 0.0; | |
| 479 for (int i = 0; i < word_length; ++i) { | |
| 480 Certainty = word.certainty(i); | |
| 481 TotalCertainty += Certainty; | |
| 482 TotalCertaintySquared += static_cast<double>(Certainty) * Certainty; | |
| 483 if (Certainty < WorstCertainty) { | |
| 484 WorstCertainty = Certainty; | |
| 485 } | |
| 486 } | |
| 487 | |
| 488 // Subtract off worst certainty from statistics. | |
| 489 word_length--; | |
| 490 TotalCertainty -= WorstCertainty; | |
| 491 TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty; | |
| 492 | |
| 493 Mean = TotalCertainty / word_length; | |
| 494 Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) / | |
| 495 (word_length * (word_length - 1))); | |
| 496 if (Variance < 0.0) { | |
| 497 Variance = 0.0; | |
| 498 } | |
| 499 StdDev = sqrt(Variance); | |
| 500 | |
| 501 CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev; | |
| 502 if (CertaintyThreshold > stopper_nondict_certainty_base) { | |
| 503 CertaintyThreshold = stopper_nondict_certainty_base; | |
| 504 } | |
| 505 | |
| 506 if (word.certainty() < CertaintyThreshold) { | |
| 507 if (stopper_debug_level >= 1) { | |
| 508 tprintf( | |
| 509 "Stopper: Non-uniform certainty = %4.1f" | |
| 510 " (m=%4.1f, s=%4.1f, t=%4.1f)\n", | |
| 511 word.certainty(), Mean, StdDev, CertaintyThreshold); | |
| 512 } | |
| 513 return false; | |
| 514 } else { | |
| 515 return true; | |
| 516 } | |
| 517 } | |
| 518 | |
| 519 } // namespace tesseract |
