Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/reject.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: reject.cpp (Formerly reject.c) | |
| 3 * Description: Rejection functions used in tessedit | |
| 4 * Author: Phil Cheatle | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 // Include automatically generated configuration file if running autoconf. | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include "reject.h" | |
| 25 | |
| 26 #ifdef DISABLED_LEGACY_ENGINE | |
| 27 | |
| 28 # include "tesseractclass.h" | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
| 32 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) { | |
| 33 const WERD_CHOICE &word = *werd_res->best_choice; | |
| 34 int dict_word_type = werd_res->tesseract->dict_word(word); | |
| 35 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type; | |
| 36 } | |
| 37 } // namespace tesseract | |
| 38 | |
| 39 #else | |
| 40 | |
| 41 # include "control.h" | |
| 42 # include "docqual.h" | |
| 43 # include "tesseractclass.h" | |
| 44 # include "tessvars.h" | |
| 45 | |
| 46 # include "helpers.h" | |
| 47 | |
| 48 # include <algorithm> // for std::sort | |
| 49 # include <cctype> | |
| 50 # include <cerrno> | |
| 51 # include <cstring> | |
| 52 # include <vector> // for std::vector | |
| 53 | |
| 54 namespace tesseract { | |
| 55 | |
| 56 /************************************************************************* | |
| 57 * set_done() | |
| 58 * | |
| 59 * Set the done flag based on the word acceptability criteria | |
| 60 *************************************************************************/ | |
| 61 | |
| 62 void Tesseract::set_done(WERD_RES *word, int16_t pass) { | |
| 63 word->done = | |
| 64 word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr); | |
| 65 bool word_is_ambig = word->best_choice->dangerous_ambig_found(); | |
| 66 bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM || | |
| 67 word->best_choice->permuter() == FREQ_DAWG_PERM || | |
| 68 word->best_choice->permuter() == USER_DAWG_PERM; | |
| 69 if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) && | |
| 70 one_ell_conflict(word, false)) { | |
| 71 if (tessedit_rejection_debug) { | |
| 72 tprintf("one_ell_conflict detected\n"); | |
| 73 } | |
| 74 word->done = false; | |
| 75 } | |
| 76 if (word->done && | |
| 77 ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) { | |
| 78 if (tessedit_rejection_debug) { | |
| 79 tprintf("non-dict or ambig word detected\n"); | |
| 80 } | |
| 81 word->done = false; | |
| 82 } | |
| 83 if (tessedit_rejection_debug) { | |
| 84 tprintf("set_done(): done=%d\n", word->done); | |
| 85 word->best_choice->print(""); | |
| 86 } | |
| 87 } | |
| 88 | |
| 89 /************************************************************************* | |
| 90 * make_reject_map() | |
| 91 * | |
| 92 * Sets the done flag to indicate whether the resylt is acceptable. | |
| 93 * | |
| 94 * Sets a reject map for the word. | |
| 95 *************************************************************************/ | |
| 96 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) { | |
| 97 flip_0O(word); | |
| 98 check_debug_pt(word, -1); // For trap only | |
| 99 set_done(word, pass); // Set acceptance | |
| 100 word->reject_map.initialise(word->best_choice->unichar_lengths().length()); | |
| 101 reject_blanks(word); | |
| 102 /* | |
| 103 0: Rays original heuristic - the baseline | |
| 104 */ | |
| 105 if (tessedit_reject_mode == 0) { | |
| 106 if (!word->done) { | |
| 107 reject_poor_matches(word); | |
| 108 } | |
| 109 } else if (tessedit_reject_mode == 5) { | |
| 110 /* | |
| 111 5: Reject I/1/l from words where there is no strong contextual confirmation; | |
| 112 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); | |
| 113 and the whole of any words which are very small | |
| 114 */ | |
| 115 if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { | |
| 116 word->reject_map.rej_word_small_xht(); | |
| 117 } else { | |
| 118 one_ell_conflict(word, true); | |
| 119 /* | |
| 120 Originally the code here just used the done flag. Now I have duplicated | |
| 121 and unpacked the conditions for setting the done flag so that each | |
| 122 mechanism can be turned on or off independently. This works WITHOUT | |
| 123 affecting the done flag setting. | |
| 124 */ | |
| 125 if (rej_use_tess_accepted && !word->tess_accepted) { | |
| 126 word->reject_map.rej_word_not_tess_accepted(); | |
| 127 } | |
| 128 | |
| 129 if (rej_use_tess_blanks && | |
| 130 (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) { | |
| 131 word->reject_map.rej_word_contains_blanks(); | |
| 132 } | |
| 133 | |
| 134 WERD_CHOICE *best_choice = word->best_choice; | |
| 135 if (rej_use_good_perm) { | |
| 136 if ((best_choice->permuter() == SYSTEM_DAWG_PERM || | |
| 137 best_choice->permuter() == FREQ_DAWG_PERM || | |
| 138 best_choice->permuter() == USER_DAWG_PERM) && | |
| 139 (!rej_use_sensible_wd || | |
| 140 acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(), | |
| 141 best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) { | |
| 142 // PASSED TEST | |
| 143 } else if (best_choice->permuter() == NUMBER_PERM) { | |
| 144 if (rej_alphas_in_number_perm) { | |
| 145 for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0'; | |
| 146 offset += best_choice->unichar_lengths()[i++]) { | |
| 147 if (word->reject_map[i].accepted() && | |
| 148 word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset, | |
| 149 best_choice->unichar_lengths()[i])) { | |
| 150 word->reject_map[i].setrej_bad_permuter(); | |
| 151 } | |
| 152 // rej alpha | |
| 153 } | |
| 154 } | |
| 155 } else { | |
| 156 word->reject_map.rej_word_bad_permuter(); | |
| 157 } | |
| 158 } | |
| 159 /* Ambig word rejection was here once !!*/ | |
| 160 } | |
| 161 } else { | |
| 162 tprintf("BAD tessedit_reject_mode\n"); | |
| 163 ASSERT_HOST("Fatal error encountered!" == nullptr); | |
| 164 } | |
| 165 | |
| 166 if (tessedit_image_border > -1) { | |
| 167 reject_edge_blobs(word); | |
| 168 } | |
| 169 | |
| 170 check_debug_pt(word, 10); | |
| 171 if (tessedit_rejection_debug) { | |
| 172 tprintf("Permuter Type = %d\n", word->best_choice->permuter()); | |
| 173 tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(), | |
| 174 word->best_choice->rating()); | |
| 175 tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); | |
| 176 } | |
| 177 | |
| 178 flip_hyphens(word); | |
| 179 check_debug_pt(word, 20); | |
| 180 } | |
| 181 | |
| 182 void reject_blanks(WERD_RES *word) { | |
| 183 int16_t i; | |
| 184 int16_t offset; | |
| 185 | |
| 186 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; | |
| 187 offset += word->best_choice->unichar_lengths()[i], i += 1) { | |
| 188 if (word->best_choice->unichar_string()[offset] == ' ') { | |
| 189 // rej unrecognised blobs | |
| 190 word->reject_map[i].setrej_tess_failure(); | |
| 191 } | |
| 192 } | |
| 193 } | |
| 194 | |
| 195 void Tesseract::reject_I_1_L(WERD_RES *word) { | |
| 196 int16_t i; | |
| 197 int16_t offset; | |
| 198 | |
| 199 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; | |
| 200 offset += word->best_choice->unichar_lengths()[i], i += 1) { | |
| 201 if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) { | |
| 202 // rej 1Il conflict | |
| 203 word->reject_map[i].setrej_1Il_conflict(); | |
| 204 } | |
| 205 } | |
| 206 } | |
| 207 | |
| 208 void reject_poor_matches(WERD_RES *word) { | |
| 209 float threshold = compute_reject_threshold(word->best_choice); | |
| 210 for (unsigned i = 0; i < word->best_choice->length(); ++i) { | |
| 211 if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) { | |
| 212 word->reject_map[i].setrej_tess_failure(); | |
| 213 } else if (word->best_choice->certainty(i) < threshold) { | |
| 214 word->reject_map[i].setrej_poor_match(); | |
| 215 } | |
| 216 } | |
| 217 } | |
| 218 | |
| 219 /********************************************************************** | |
| 220 * compute_reject_threshold | |
| 221 * | |
| 222 * Set a rejection threshold for this word. | |
| 223 * Initially this is a trivial function which looks for the largest | |
| 224 * gap in the certainty value. | |
| 225 **********************************************************************/ | |
| 226 | |
| 227 float compute_reject_threshold(WERD_CHOICE *word) { | |
| 228 float threshold; // rejection threshold | |
| 229 float bestgap = 0.0f; // biggest gap | |
| 230 float gapstart; // bottom of gap | |
| 231 | |
| 232 auto blob_count = word->length(); | |
| 233 std::vector<float> ratings; | |
| 234 ratings.reserve(blob_count); | |
| 235 for (unsigned i = 0; i < blob_count; ++i) { | |
| 236 ratings.push_back(word->certainty(i)); | |
| 237 } | |
| 238 std::sort(ratings.begin(), ratings.end()); | |
| 239 gapstart = ratings[0] - 1; // all reject if none better | |
| 240 if (blob_count >= 3) { | |
| 241 for (unsigned index = 0; index < blob_count - 1; index++) { | |
| 242 if (ratings[index + 1] - ratings[index] > bestgap) { | |
| 243 bestgap = ratings[index + 1] - ratings[index]; | |
| 244 // find biggest | |
| 245 gapstart = ratings[index]; | |
| 246 } | |
| 247 } | |
| 248 } | |
| 249 threshold = gapstart + bestgap / 2; | |
| 250 | |
| 251 return threshold; | |
| 252 } | |
| 253 | |
| 254 /************************************************************************* | |
| 255 * reject_edge_blobs() | |
| 256 * | |
| 257 * If the word is perilously close to the edge of the image, reject those blobs | |
| 258 * in the word which are too close to the edge as they could be clipped. | |
| 259 *************************************************************************/ | |
| 260 void Tesseract::reject_edge_blobs(WERD_RES *word) { | |
| 261 TBOX word_box = word->word->bounding_box(); | |
| 262 // Use the box_word as it is already denormed back to image coordinates. | |
| 263 int blobcount = word->box_word->length(); | |
| 264 | |
| 265 if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border || | |
| 266 word_box.right() + tessedit_image_border > ImageWidth() - 1 || | |
| 267 word_box.top() + tessedit_image_border > ImageHeight() - 1) { | |
| 268 ASSERT_HOST(word->reject_map.length() == blobcount); | |
| 269 for (int blobindex = 0; blobindex < blobcount; blobindex++) { | |
| 270 TBOX blob_box = word->box_word->BlobBox(blobindex); | |
| 271 if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border || | |
| 272 blob_box.right() + tessedit_image_border > ImageWidth() - 1 || | |
| 273 blob_box.top() + tessedit_image_border > ImageHeight() - 1) { | |
| 274 word->reject_map[blobindex].setrej_edge_char(); | |
| 275 // Close to edge | |
| 276 } | |
| 277 } | |
| 278 } | |
| 279 } | |
| 280 | |
| 281 /********************************************************************** | |
| 282 * one_ell_conflict() | |
| 283 * | |
| 284 * Identify words where there is a potential I/l/1 error. | |
| 285 * - A bundle of contextual heuristics! | |
| 286 **********************************************************************/ | |
| 287 bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) { | |
| 288 const char *word; | |
| 289 const char *lengths; | |
| 290 int16_t word_len; // its length | |
| 291 int16_t first_alphanum_index_; | |
| 292 int16_t first_alphanum_offset_; | |
| 293 int16_t i; | |
| 294 int16_t offset; | |
| 295 bool non_conflict_set_char; // non conf set a/n? | |
| 296 ACCEPTABLE_WERD_TYPE word_type; | |
| 297 bool dict_perm_type; | |
| 298 bool dict_word_ok; | |
| 299 int dict_word_type; | |
| 300 | |
| 301 word = word_res->best_choice->unichar_string().c_str(); | |
| 302 lengths = word_res->best_choice->unichar_lengths().c_str(); | |
| 303 word_len = strlen(lengths); | |
| 304 /* | |
| 305 If there are no occurrences of the conflict set characters then the word | |
| 306 is OK. | |
| 307 */ | |
| 308 if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) { | |
| 309 return false; | |
| 310 } | |
| 311 | |
| 312 /* | |
| 313 There is a conflict if there are NO other (confirmed) alphanumerics apart | |
| 314 from those in the conflict set. | |
| 315 */ | |
| 316 | |
| 317 for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char; | |
| 318 offset += lengths[i++]) { | |
| 319 non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) || | |
| 320 word_res->uch_set->get_isdigit(word + offset, lengths[i])) && | |
| 321 !conflict_set_I_l_1.contains(word[offset]); | |
| 322 } | |
| 323 if (!non_conflict_set_char) { | |
| 324 if (update_map) { | |
| 325 reject_I_1_L(word_res); | |
| 326 } | |
| 327 return true; | |
| 328 } | |
| 329 | |
| 330 /* | |
| 331 If the word is accepted by a dawg permuter, and the first alpha character | |
| 332 is "I" or "l", check to see if the alternative is also a dawg word. If it | |
| 333 is, then there is a potential error otherwise the word is ok. | |
| 334 */ | |
| 335 | |
| 336 dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) || | |
| 337 (word_res->best_choice->permuter() == USER_DAWG_PERM) || | |
| 338 (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) || | |
| 339 (word_res->best_choice->permuter() == FREQ_DAWG_PERM); | |
| 340 dict_word_type = dict_word(*(word_res->best_choice)); | |
| 341 dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM)); | |
| 342 | |
| 343 if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) || | |
| 344 (dict_perm_type && dict_word_ok)) { | |
| 345 first_alphanum_index_ = first_alphanum_index(word, lengths); | |
| 346 first_alphanum_offset_ = first_alphanum_offset(word, lengths); | |
| 347 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { | |
| 348 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; | |
| 349 if (safe_dict_word(word_res) > 0) { | |
| 350 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; | |
| 351 if (update_map) { | |
| 352 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict(); | |
| 353 } | |
| 354 return true; | |
| 355 } else { | |
| 356 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; | |
| 357 return false; | |
| 358 } | |
| 359 } | |
| 360 | |
| 361 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { | |
| 362 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; | |
| 363 if (safe_dict_word(word_res) > 0) { | |
| 364 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; | |
| 365 if (update_map) { | |
| 366 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict(); | |
| 367 } | |
| 368 return true; | |
| 369 } else { | |
| 370 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; | |
| 371 return false; | |
| 372 } | |
| 373 } | |
| 374 return false; | |
| 375 } | |
| 376 | |
| 377 /* | |
| 378 NEW 1Il code. The old code relied on permuter types too much. In fact, | |
| 379 tess will use TOP_CHOICE permute for good things like "palette". | |
| 380 In this code the string is examined independently to see if it looks like | |
| 381 a well formed word. | |
| 382 */ | |
| 383 | |
| 384 /* | |
| 385 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a | |
| 386 dictionary word. | |
| 387 */ | |
| 388 first_alphanum_index_ = first_alphanum_index(word, lengths); | |
| 389 first_alphanum_offset_ = first_alphanum_offset(word, lengths); | |
| 390 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { | |
| 391 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; | |
| 392 if (safe_dict_word(word_res) > 0) { | |
| 393 return false; | |
| 394 } else { | |
| 395 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; | |
| 396 } | |
| 397 } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { | |
| 398 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; | |
| 399 if (safe_dict_word(word_res) > 0) { | |
| 400 return false; | |
| 401 } else { | |
| 402 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; | |
| 403 } | |
| 404 } | |
| 405 /* | |
| 406 For strings containing digits: | |
| 407 If there are no alphas OR the numeric permuter liked the word, | |
| 408 reject any non 1 conflict chs | |
| 409 Else reject all conflict chs | |
| 410 */ | |
| 411 if (word_contains_non_1_digit(word, lengths)) { | |
| 412 bool allow_1s = | |
| 413 (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM); | |
| 414 | |
| 415 int16_t offset; | |
| 416 bool conflict = false; | |
| 417 for (i = 0, offset = 0; word[offset] != '\0'; | |
| 418 offset += word_res->best_choice->unichar_lengths()[i++]) { | |
| 419 if ((!allow_1s || (word[offset] != '1')) && | |
| 420 conflict_set_I_l_1.contains(word[offset])) { | |
| 421 if (update_map) { | |
| 422 word_res->reject_map[i].setrej_1Il_conflict(); | |
| 423 } | |
| 424 conflict = true; | |
| 425 } | |
| 426 } | |
| 427 return conflict; | |
| 428 } | |
| 429 /* | |
| 430 For anything else. See if it conforms to an acceptable word type. If so, | |
| 431 treat accordingly. | |
| 432 */ | |
| 433 word_type = acceptable_word_string(*word_res->uch_set, word, lengths); | |
| 434 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { | |
| 435 first_alphanum_index_ = first_alphanum_index(word, lengths); | |
| 436 first_alphanum_offset_ = first_alphanum_offset(word, lengths); | |
| 437 if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) { | |
| 438 if (update_map) { | |
| 439 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict(); | |
| 440 } | |
| 441 return true; | |
| 442 } else { | |
| 443 return false; | |
| 444 } | |
| 445 } else if (word_type == AC_UPPER_CASE) { | |
| 446 return false; | |
| 447 } else { | |
| 448 if (update_map) { | |
| 449 reject_I_1_L(word_res); | |
| 450 } | |
| 451 return true; | |
| 452 } | |
| 453 } | |
| 454 | |
| 455 int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) { | |
| 456 int16_t i; | |
| 457 int16_t offset; | |
| 458 | |
| 459 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { | |
| 460 if (unicharset.get_isalpha(word + offset, word_lengths[i]) || | |
| 461 unicharset.get_isdigit(word + offset, word_lengths[i])) { | |
| 462 return i; | |
| 463 } | |
| 464 } | |
| 465 return -1; | |
| 466 } | |
| 467 | |
| 468 int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) { | |
| 469 int16_t i; | |
| 470 int16_t offset; | |
| 471 | |
| 472 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { | |
| 473 if (unicharset.get_isalpha(word + offset, word_lengths[i]) || | |
| 474 unicharset.get_isdigit(word + offset, word_lengths[i])) { | |
| 475 return offset; | |
| 476 } | |
| 477 } | |
| 478 return -1; | |
| 479 } | |
| 480 | |
| 481 int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) { | |
| 482 int16_t i; | |
| 483 int16_t offset; | |
| 484 int16_t count = 0; | |
| 485 | |
| 486 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { | |
| 487 if (unicharset.get_isalpha(word + offset, word_lengths[i])) { | |
| 488 count++; | |
| 489 } | |
| 490 } | |
| 491 return count; | |
| 492 } | |
| 493 | |
| 494 bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) { | |
| 495 int16_t i; | |
| 496 int16_t offset; | |
| 497 | |
| 498 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { | |
| 499 if (unicharset.get_isdigit(word + offset, word_lengths[i]) && | |
| 500 (word_lengths[i] != 1 || word[offset] != '1')) { | |
| 501 return true; | |
| 502 } | |
| 503 } | |
| 504 return false; | |
| 505 } | |
| 506 | |
| 507 /************************************************************************* | |
| 508 * dont_allow_1Il() | |
| 509 * Don't unreject LONE accepted 1Il conflict set chars | |
| 510 *************************************************************************/ | |
| 511 void Tesseract::dont_allow_1Il(WERD_RES *word) { | |
| 512 int word_len = word->reject_map.length(); | |
| 513 const char *s = word->best_choice->unichar_string().c_str(); | |
| 514 const char *lengths = word->best_choice->unichar_lengths().c_str(); | |
| 515 bool accepted_1Il = false; | |
| 516 | |
| 517 for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) { | |
| 518 if (word->reject_map[i].accepted()) { | |
| 519 if (conflict_set_I_l_1.contains(s[offset])) { | |
| 520 accepted_1Il = true; | |
| 521 } else { | |
| 522 if (word->uch_set->get_isalpha(s + offset, lengths[i]) || | |
| 523 word->uch_set->get_isdigit(s + offset, lengths[i])) { | |
| 524 return; // >=1 non 1Il ch accepted | |
| 525 } | |
| 526 } | |
| 527 } | |
| 528 } | |
| 529 if (!accepted_1Il) { | |
| 530 return; // Nothing to worry about | |
| 531 } | |
| 532 | |
| 533 for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) { | |
| 534 if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) { | |
| 535 word->reject_map[i].setrej_postNN_1Il(); | |
| 536 } | |
| 537 } | |
| 538 } | |
| 539 | |
| 540 int16_t Tesseract::count_alphanums(WERD_RES *word_res) { | |
| 541 int count = 0; | |
| 542 const WERD_CHOICE *best_choice = word_res->best_choice; | |
| 543 for (unsigned i = 0; i < word_res->reject_map.length(); ++i) { | |
| 544 if ((word_res->reject_map[i].accepted()) && | |
| 545 (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) || | |
| 546 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) { | |
| 547 count++; | |
| 548 } | |
| 549 } | |
| 550 return count; | |
| 551 } | |
| 552 | |
| 553 // reject all if most rejected. | |
| 554 void Tesseract::reject_mostly_rejects(WERD_RES *word) { | |
| 555 /* Reject the whole of the word if the fraction of rejects exceeds a limit */ | |
| 556 | |
| 557 if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >= | |
| 558 rej_whole_of_mostly_reject_word_fract) { | |
| 559 word->reject_map.rej_word_mostly_rej(); | |
| 560 } | |
| 561 } | |
| 562 | |
| 563 bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { | |
| 564 if (word->best_choice->unichar_lengths().length() <= 1) { | |
| 565 return false; | |
| 566 } | |
| 567 | |
| 568 if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) { | |
| 569 return false; | |
| 570 } | |
| 571 | |
| 572 UNICHAR_ID uch_id = word->best_choice->unichar_id(0); | |
| 573 for (unsigned i = 1; i < word->best_choice->length(); ++i) { | |
| 574 if (word->best_choice->unichar_id(i) != uch_id) { | |
| 575 return false; | |
| 576 } | |
| 577 } | |
| 578 | |
| 579 int16_t char_quality; | |
| 580 int16_t accepted_char_quality; | |
| 581 word_char_quality(word, &char_quality, &accepted_char_quality); | |
| 582 | |
| 583 if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) && | |
| 584 (char_quality == accepted_char_quality)) { | |
| 585 return true; | |
| 586 } else { | |
| 587 return false; | |
| 588 } | |
| 589 } | |
| 590 | |
| 591 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) { | |
| 592 const WERD_CHOICE &word = *werd_res->best_choice; | |
| 593 int dict_word_type = werd_res->tesseract->dict_word(word); | |
| 594 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type; | |
| 595 } | |
| 596 | |
| 597 // Note: After running this function word_res->ratings | |
| 598 // might not contain the right BLOB_CHOICE corresponding to each character | |
| 599 // in word_res->best_choice. | |
| 600 void Tesseract::flip_hyphens(WERD_RES *word_res) { | |
| 601 WERD_CHOICE *best_choice = word_res->best_choice; | |
| 602 int prev_right = -9999; | |
| 603 int next_left; | |
| 604 TBOX out_box; | |
| 605 float aspect_ratio; | |
| 606 | |
| 607 if (tessedit_lower_flip_hyphen <= 1) { | |
| 608 return; | |
| 609 } | |
| 610 | |
| 611 auto num_blobs = word_res->rebuild_word->NumBlobs(); | |
| 612 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); | |
| 613 for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) { | |
| 614 TBLOB *blob = word_res->rebuild_word->blobs[i]; | |
| 615 out_box = blob->bounding_box(); | |
| 616 if (i + 1 == num_blobs) { | |
| 617 next_left = 9999; | |
| 618 } else { | |
| 619 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left(); | |
| 620 } | |
| 621 // Don't touch small or touching blobs - it is too dangerous. | |
| 622 if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) && | |
| 623 (out_box.right() < next_left)) { | |
| 624 aspect_ratio = out_box.width() / static_cast<float>(out_box.height()); | |
| 625 if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) { | |
| 626 if (aspect_ratio >= tessedit_upper_flip_hyphen && | |
| 627 word_res->uch_set->contains_unichar_id(unichar_dash) && | |
| 628 word_res->uch_set->get_enabled(unichar_dash)) { | |
| 629 /* Certain HYPHEN */ | |
| 630 best_choice->set_unichar_id(unichar_dash, i); | |
| 631 if (word_res->reject_map[i].rejected()) { | |
| 632 word_res->reject_map[i].setrej_hyphen_accept(); | |
| 633 } | |
| 634 } | |
| 635 if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) { | |
| 636 // Suspected HYPHEN | |
| 637 word_res->reject_map[i].setrej_hyphen(); | |
| 638 } | |
| 639 } else if (best_choice->unichar_id(i) == unichar_dash) { | |
| 640 if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) { | |
| 641 word_res->reject_map[i].setrej_hyphen_accept(); | |
| 642 } | |
| 643 // Certain HYPHEN | |
| 644 | |
| 645 if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) { | |
| 646 // Suspected HYPHEN | |
| 647 word_res->reject_map[i].setrej_hyphen(); | |
| 648 } | |
| 649 } | |
| 650 } | |
| 651 prev_right = out_box.right(); | |
| 652 } | |
| 653 } | |
| 654 | |
| 655 // Note: After running this function word_res->ratings | |
| 656 // might not contain the right BLOB_CHOICE corresponding to each character | |
| 657 // in word_res->best_choice. | |
| 658 void Tesseract::flip_0O(WERD_RES *word_res) { | |
| 659 WERD_CHOICE *best_choice = word_res->best_choice; | |
| 660 TBOX out_box; | |
| 661 | |
| 662 if (!tessedit_flip_0O) { | |
| 663 return; | |
| 664 } | |
| 665 | |
| 666 auto num_blobs = word_res->rebuild_word->NumBlobs(); | |
| 667 for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) { | |
| 668 TBLOB *blob = word_res->rebuild_word->blobs[i]; | |
| 669 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) || | |
| 670 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) { | |
| 671 out_box = blob->bounding_box(); | |
| 672 if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) || | |
| 673 (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) { | |
| 674 return; // Beware words with sub/superscripts | |
| 675 } | |
| 676 } | |
| 677 } | |
| 678 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0"); | |
| 679 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O"); | |
| 680 if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) || | |
| 681 unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) { | |
| 682 return; // 0 or O are not present/enabled in unicharset | |
| 683 } | |
| 684 for (unsigned i = 1; i < best_choice->length(); ++i) { | |
| 685 if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) { | |
| 686 /* A0A */ | |
| 687 if ((i + 1) < best_choice->length() && | |
| 688 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) && | |
| 689 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) { | |
| 690 best_choice->set_unichar_id(unichar_O, i); | |
| 691 } | |
| 692 /* A00A */ | |
| 693 if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) && | |
| 694 (i + 1) < best_choice->length() && | |
| 695 (best_choice->unichar_id(i + 1) == unichar_0 || | |
| 696 best_choice->unichar_id(i + 1) == unichar_O) && | |
| 697 (i + 2) < best_choice->length() && | |
| 698 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) { | |
| 699 best_choice->set_unichar_id(unichar_O, i); | |
| 700 i++; | |
| 701 } | |
| 702 /* AA0<non digit or end of word> */ | |
| 703 if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) && | |
| 704 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) && | |
| 705 (((i + 1) < best_choice->length() && | |
| 706 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) && | |
| 707 !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") && | |
| 708 !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) || | |
| 709 (i == best_choice->length() - 1))) { | |
| 710 best_choice->set_unichar_id(unichar_O, i); | |
| 711 } | |
| 712 /* 9O9 */ | |
| 713 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) && | |
| 714 (i + 1) < best_choice->length() && | |
| 715 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) { | |
| 716 best_choice->set_unichar_id(unichar_0, i); | |
| 717 } | |
| 718 /* 9OOO */ | |
| 719 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) && | |
| 720 (i + 2) < best_choice->length() && | |
| 721 (best_choice->unichar_id(i + 1) == unichar_0 || | |
| 722 best_choice->unichar_id(i + 1) == unichar_O) && | |
| 723 (best_choice->unichar_id(i + 2) == unichar_0 || | |
| 724 best_choice->unichar_id(i + 2) == unichar_O)) { | |
| 725 best_choice->set_unichar_id(unichar_0, i); | |
| 726 best_choice->set_unichar_id(unichar_0, i + 1); | |
| 727 best_choice->set_unichar_id(unichar_0, i + 2); | |
| 728 i += 2; | |
| 729 } | |
| 730 /* 9OO<non upper> */ | |
| 731 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) && | |
| 732 (i + 2) < best_choice->length() && | |
| 733 (best_choice->unichar_id(i + 1) == unichar_0 || | |
| 734 best_choice->unichar_id(i + 1) == unichar_O) && | |
| 735 !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) { | |
| 736 best_choice->set_unichar_id(unichar_0, i); | |
| 737 best_choice->set_unichar_id(unichar_0, i + 1); | |
| 738 i++; | |
| 739 } | |
| 740 /* 9O<non upper> */ | |
| 741 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) && | |
| 742 (i + 1) < best_choice->length() && | |
| 743 !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) { | |
| 744 best_choice->set_unichar_id(unichar_0, i); | |
| 745 } | |
| 746 /* 9[.,]OOO.. */ | |
| 747 if ((i > 1) && | |
| 748 (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") || | |
| 749 word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) && | |
| 750 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) || | |
| 751 best_choice->unichar_id(i - 2) == unichar_O)) { | |
| 752 if (best_choice->unichar_id(i - 2) == unichar_O) { | |
| 753 best_choice->set_unichar_id(unichar_0, i - 2); | |
| 754 } | |
| 755 while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O || | |
| 756 best_choice->unichar_id(i) == unichar_0)) { | |
| 757 best_choice->set_unichar_id(unichar_0, i); | |
| 758 i++; | |
| 759 } | |
| 760 i--; | |
| 761 } | |
| 762 } | |
| 763 } | |
| 764 } | |
| 765 | |
| 766 bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) { | |
| 767 return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O"); | |
| 768 } | |
| 769 | |
| 770 bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) { | |
| 771 return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0"); | |
| 772 } | |
| 773 } // namespace tesseract | |
| 774 | |
| 775 #endif // def DISABLED_LEGACY_ENGINE |
