Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/docqual.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************** | |
| 2 * File: docqual.cpp (Formerly docqual.c) | |
| 3 * Description: Document Quality Metrics | |
| 4 * Author: Phil Cheatle | |
| 5 * | |
| 6 * (C) Copyright 1994, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #include "docqual.h" | |
| 20 #include <cctype> | |
| 21 #include "reject.h" | |
| 22 #include "tesseractclass.h" | |
| 23 #include "tessvars.h" | |
| 24 | |
| 25 namespace tesseract { | |
| 26 | |
// Callback handed to ProcessMatchedBlobs: adds one to the caller's tally for
// each matched blob. The blob index is accepted but not needed for counting.
static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
  match_count = static_cast<int16_t>(match_count + 1);
}
| 30 | |
| 31 static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count, | |
| 32 int index) { | |
| 33 if (word->reject_map[index].accepted()) { | |
| 34 ++accepted_match_count; | |
| 35 } | |
| 36 ++match_count; | |
| 37 } | |
| 38 | |
| 39 static void acceptIfGoodQuality(WERD_RES *word, int index) { | |
| 40 if (word->reject_map[index].accept_if_good_quality()) { | |
| 41 word->reject_map[index].setrej_quality_accept(); | |
| 42 } | |
| 43 } | |
| 44 | |
| 45 /************************************************************************* | |
| 46 * word_blob_quality() | |
| 47 * How many blobs in the box_word are identical to those of the inword? | |
| 48 * ASSUME blobs in both initial word and box_word are in ascending order of | |
| 49 * left hand blob edge. | |
| 50 *************************************************************************/ | |
| 51 int16_t Tesseract::word_blob_quality(WERD_RES *word) { | |
| 52 int16_t match_count = 0; | |
| 53 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr && | |
| 54 !word->rebuild_word->blobs.empty()) { | |
| 55 using namespace std::placeholders; // for _1 | |
| 56 word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word, | |
| 57 std::bind(countMatchingBlobs, match_count, _1)); | |
| 58 } | |
| 59 return match_count; | |
| 60 } | |
| 61 | |
| 62 int16_t Tesseract::word_outline_errs(WERD_RES *word) { | |
| 63 int16_t err_count = 0; | |
| 64 | |
| 65 if (word->rebuild_word != nullptr) { | |
| 66 int16_t i = 0; | |
| 67 for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) { | |
| 68 TBLOB *blob = word->rebuild_word->blobs[b]; | |
| 69 err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines()); | |
| 70 i++; | |
| 71 } | |
| 72 } | |
| 73 return err_count; | |
| 74 } | |
| 75 | |
| 76 /************************************************************************* | |
| 77 * word_char_quality() | |
| 78 * Combination of blob quality and outline quality - how many good chars are | |
| 79 * there? - I.e chars which pass the blob AND outline tests. | |
| 80 *************************************************************************/ | |
| 81 void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count, | |
| 82 int16_t *accepted_match_count) { | |
| 83 *match_count = 0; | |
| 84 *accepted_match_count = 0; | |
| 85 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr && | |
| 86 !word->rebuild_word->blobs.empty()) { | |
| 87 using namespace std::placeholders; // for _1 | |
| 88 word->bln_boxes->ProcessMatchedBlobs( | |
| 89 *word->rebuild_word, | |
| 90 std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1)); | |
| 91 } | |
| 92 } | |
| 93 | |
| 94 /************************************************************************* | |
| 95 * unrej_good_chs() | |
| 96 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks | |
| 97 *************************************************************************/ | |
| 98 void Tesseract::unrej_good_chs(WERD_RES *word) { | |
| 99 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr && | |
| 100 word->rebuild_word->blobs.empty()) { | |
| 101 using namespace std::placeholders; // for _1 | |
| 102 word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word, | |
| 103 std::bind(acceptIfGoodQuality, word, _1)); | |
| 104 } | |
| 105 } | |
| 106 | |
| 107 int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) { | |
| 108 int expected_outline_count; | |
| 109 | |
| 110 if (outlines_odd.contains(c)) { | |
| 111 return 0; // Don't use this char | |
| 112 } else if (outlines_2.contains(c)) { | |
| 113 expected_outline_count = 2; | |
| 114 } else { | |
| 115 expected_outline_count = 1; | |
| 116 } | |
| 117 return abs(outline_count - expected_outline_count); | |
| 118 } | |
| 119 | |
| 120 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) { | |
| 121 if ((tessedit_good_quality_unrej && good_quality_doc)) { | |
| 122 unrej_good_quality_words(page_res_it); | |
| 123 } | |
| 124 doc_and_block_rejection(page_res_it, good_quality_doc); | |
| 125 if (unlv_tilde_crunching) { | |
| 126 tilde_crunch(page_res_it); | |
| 127 tilde_delete(page_res_it); | |
| 128 } | |
| 129 } | |
| 130 | |
| 131 /************************************************************************* | |
| 132 * unrej_good_quality_words() | |
| 133 * Accept potential rejects in words which pass the following checks: | |
| 134 * - Contains a potential reject | |
| 135 * - Word looks like a sensible alpha word. | |
| 136 * - Word segmentation is the same as the original image | |
| 137 * - All characters have the expected number of outlines | |
| 138 * NOTE - the rejection counts are recalculated after unrejection | |
| 139 * - CAN'T do it in a single pass without a bit of fiddling | |
| 140 * - keep it simple but inefficient | |
| 141 *************************************************************************/ | |
| 142 void Tesseract::unrej_good_quality_words( // unreject potential | |
| 143 PAGE_RES_IT &page_res_it) { | |
| 144 WERD_RES *word; | |
| 145 ROW_RES *current_row; | |
| 146 BLOCK_RES *current_block; | |
| 147 int i; | |
| 148 | |
| 149 page_res_it.restart_page(); | |
| 150 while (page_res_it.word() != nullptr) { | |
| 151 check_debug_pt(page_res_it.word(), 100); | |
| 152 if (bland_unrej) { | |
| 153 word = page_res_it.word(); | |
| 154 for (i = 0; i < word->reject_map.length(); i++) { | |
| 155 if (word->reject_map[i].accept_if_good_quality()) { | |
| 156 word->reject_map[i].setrej_quality_accept(); | |
| 157 } | |
| 158 } | |
| 159 page_res_it.forward(); | |
| 160 } else if ((page_res_it.row()->char_count > 0) && | |
| 161 ((page_res_it.row()->rej_count / | |
| 162 static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) { | |
| 163 word = page_res_it.word(); | |
| 164 if (word->reject_map.quality_recoverable_rejects() && | |
| 165 (tessedit_unrej_any_wd || | |
| 166 acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(), | |
| 167 word->best_choice->unichar_lengths().c_str()) != | |
| 168 AC_UNACCEPTABLE)) { | |
| 169 unrej_good_chs(word); | |
| 170 } | |
| 171 page_res_it.forward(); | |
| 172 } else { | |
| 173 // Skip to end of dodgy row. | |
| 174 current_row = page_res_it.row(); | |
| 175 while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) { | |
| 176 page_res_it.forward(); | |
| 177 } | |
| 178 } | |
| 179 check_debug_pt(page_res_it.word(), 110); | |
| 180 } | |
| 181 page_res_it.restart_page(); | |
| 182 page_res_it.page_res->char_count = 0; | |
| 183 page_res_it.page_res->rej_count = 0; | |
| 184 current_block = nullptr; | |
| 185 current_row = nullptr; | |
| 186 while (page_res_it.word() != nullptr) { | |
| 187 if (current_block != page_res_it.block()) { | |
| 188 current_block = page_res_it.block(); | |
| 189 current_block->char_count = 0; | |
| 190 current_block->rej_count = 0; | |
| 191 } | |
| 192 if (current_row != page_res_it.row()) { | |
| 193 current_row = page_res_it.row(); | |
| 194 current_row->char_count = 0; | |
| 195 current_row->rej_count = 0; | |
| 196 current_row->whole_word_rej_count = 0; | |
| 197 } | |
| 198 page_res_it.rej_stat_word(); | |
| 199 page_res_it.forward(); | |
| 200 } | |
| 201 } | |
| 202 | |
| 203 /************************************************************************* | |
| 204 * doc_and_block_rejection() | |
| 205 * | |
| 206 * If the page has too many rejects - reject all of it. | |
| 207 * If any block has too many rejects - reject all words in the block | |
| 208 *************************************************************************/ | |
| 209 | |
| 210 void Tesseract::doc_and_block_rejection( // reject big chunks | |
| 211 PAGE_RES_IT &page_res_it, bool good_quality_doc) { | |
| 212 BLOCK_RES *current_block; | |
| 213 | |
| 214 int16_t char_quality = 0; | |
| 215 int16_t accepted_char_quality; | |
| 216 | |
| 217 if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count > | |
| 218 tessedit_reject_doc_percent) { | |
| 219 reject_whole_page(page_res_it); | |
| 220 if (tessedit_debug_doc_rejection) { | |
| 221 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count, | |
| 222 page_res_it.page_res->rej_count); | |
| 223 } | |
| 224 } else { | |
| 225 if (tessedit_debug_doc_rejection) { | |
| 226 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count, | |
| 227 page_res_it.page_res->rej_count); | |
| 228 } | |
| 229 | |
| 230 /* Walk blocks testing for block rejection */ | |
| 231 | |
| 232 page_res_it.restart_page(); | |
| 233 WERD_RES *word; | |
| 234 while ((word = page_res_it.word()) != nullptr) { | |
| 235 current_block = page_res_it.block(); | |
| 236 int16_t block_no = current_block->block->pdblk.index(); | |
| 237 if (current_block->char_count > 0 && | |
| 238 (current_block->rej_count * 100.0 / current_block->char_count) > | |
| 239 tessedit_reject_block_percent) { | |
| 240 if (tessedit_debug_block_rejection) { | |
| 241 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no, | |
| 242 current_block->char_count, current_block->rej_count); | |
| 243 } | |
| 244 bool prev_word_rejected = false; | |
| 245 while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) { | |
| 246 bool rej_word; | |
| 247 if (tessedit_preserve_blk_rej_perfect_wds) { | |
| 248 rej_word = word->reject_map.reject_count() > 0 || | |
| 249 word->reject_map.length() < tessedit_preserve_min_wd_len; | |
| 250 if (rej_word && tessedit_dont_blkrej_good_wds && | |
| 251 word->reject_map.length() >= tessedit_preserve_min_wd_len && | |
| 252 acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(), | |
| 253 word->best_choice->unichar_lengths().c_str()) != | |
| 254 AC_UNACCEPTABLE) { | |
| 255 word_char_quality(word, &char_quality, &accepted_char_quality); | |
| 256 rej_word = char_quality != word->reject_map.length(); | |
| 257 } | |
| 258 } else { | |
| 259 rej_word = true; | |
| 260 } | |
| 261 if (rej_word) { | |
| 262 /* | |
| 263 Reject spacing if both current and prev words are rejected. | |
| 264 NOTE - this is NOT restricted to FUZZY spaces. - When tried this | |
| 265 generated more space errors. | |
| 266 */ | |
| 267 if (tessedit_use_reject_spaces && prev_word_rejected && | |
| 268 page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) { | |
| 269 word->reject_spaces = true; | |
| 270 } | |
| 271 word->reject_map.rej_word_block_rej(); | |
| 272 } | |
| 273 prev_word_rejected = rej_word; | |
| 274 page_res_it.forward(); | |
| 275 } | |
| 276 } else { | |
| 277 if (tessedit_debug_block_rejection) { | |
| 278 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no, | |
| 279 page_res_it.block()->char_count, page_res_it.block()->rej_count); | |
| 280 } | |
| 281 | |
| 282 /* Walk rows in block testing for row rejection */ | |
| 283 int16_t row_no = 0; | |
| 284 while (page_res_it.word() != nullptr && page_res_it.block() == current_block) { | |
| 285 ROW_RES *current_row = page_res_it.row(); | |
| 286 row_no++; | |
| 287 /* Reject whole row if: | |
| 288 fraction of chars on row which are rejected exceed a limit AND | |
| 289 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a | |
| 290 limit | |
| 291 */ | |
| 292 if (current_row->char_count > 0 && | |
| 293 (current_row->rej_count * 100.0 / current_row->char_count) > | |
| 294 tessedit_reject_row_percent && | |
| 295 (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) < | |
| 296 tessedit_whole_wd_rej_row_percent) { | |
| 297 if (tessedit_debug_block_rejection) { | |
| 298 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no, | |
| 299 current_row->char_count, current_row->rej_count); | |
| 300 } | |
| 301 bool prev_word_rejected = false; | |
| 302 while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) { | |
| 303 /* Preserve words on good docs unless they are mostly rejected*/ | |
| 304 bool rej_word; | |
| 305 if (!tessedit_row_rej_good_docs && good_quality_doc) { | |
| 306 rej_word = word->reject_map.reject_count() / | |
| 307 static_cast<float>(word->reject_map.length()) > | |
| 308 tessedit_good_doc_still_rowrej_wd; | |
| 309 } else if (tessedit_preserve_row_rej_perfect_wds) { | |
| 310 /* Preserve perfect words anyway */ | |
| 311 rej_word = word->reject_map.reject_count() > 0 || | |
| 312 word->reject_map.length() < tessedit_preserve_min_wd_len; | |
| 313 if (rej_word && tessedit_dont_rowrej_good_wds && | |
| 314 word->reject_map.length() >= tessedit_preserve_min_wd_len && | |
| 315 acceptable_word_string( | |
| 316 *word->uch_set, word->best_choice->unichar_string().c_str(), | |
| 317 word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) { | |
| 318 word_char_quality(word, &char_quality, &accepted_char_quality); | |
| 319 rej_word = char_quality != word->reject_map.length(); | |
| 320 } | |
| 321 } else { | |
| 322 rej_word = true; | |
| 323 } | |
| 324 if (rej_word) { | |
| 325 /* | |
| 326 Reject spacing if both current and prev words are rejected. | |
| 327 NOTE - this is NOT restricted to FUZZY spaces. - When tried | |
| 328 this generated more space errors. | |
| 329 */ | |
| 330 if (tessedit_use_reject_spaces && prev_word_rejected && | |
| 331 page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) { | |
| 332 word->reject_spaces = true; | |
| 333 } | |
| 334 word->reject_map.rej_word_row_rej(); | |
| 335 } | |
| 336 prev_word_rejected = rej_word; | |
| 337 page_res_it.forward(); | |
| 338 } | |
| 339 } else { | |
| 340 if (tessedit_debug_block_rejection) { | |
| 341 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no, | |
| 342 current_row->char_count, current_row->rej_count); | |
| 343 } | |
| 344 while (page_res_it.word() != nullptr && page_res_it.row() == current_row) { | |
| 345 page_res_it.forward(); | |
| 346 } | |
| 347 } | |
| 348 } | |
| 349 } | |
| 350 } | |
| 351 } | |
| 352 } | |
| 353 | |
| 354 /************************************************************************* | |
| 355 * reject_whole_page() | |
| 356 * Don't believe any of it - set the reject map to 00..00 in all words | |
| 357 * | |
| 358 *************************************************************************/ | |
| 359 | |
| 360 void reject_whole_page(PAGE_RES_IT &page_res_it) { | |
| 361 page_res_it.restart_page(); | |
| 362 while (page_res_it.word() != nullptr) { | |
| 363 page_res_it.word()->reject_map.rej_word_doc_rej(); | |
| 364 page_res_it.forward(); | |
| 365 } | |
| 366 // whole page is rejected | |
| 367 page_res_it.page_res->rejected = true; | |
| 368 } | |
| 369 | |
| 370 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) { | |
| 371 WERD_RES *word; | |
| 372 GARBAGE_LEVEL garbage_level; | |
| 373 PAGE_RES_IT copy_it; | |
| 374 bool prev_potential_marked = false; | |
| 375 bool found_terrible_word = false; | |
| 376 bool ok_dict_word; | |
| 377 | |
| 378 page_res_it.restart_page(); | |
| 379 while (page_res_it.word() != nullptr) { | |
| 380 POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block(); | |
| 381 if (pb != nullptr && !pb->IsText()) { | |
| 382 page_res_it.forward(); | |
| 383 continue; | |
| 384 } | |
| 385 word = page_res_it.word(); | |
| 386 | |
| 387 if (crunch_early_convert_bad_unlv_chs) { | |
| 388 convert_bad_unlv_chs(word); | |
| 389 } | |
| 390 | |
| 391 if (crunch_early_merge_tess_fails) { | |
| 392 word->merge_tess_fails(); | |
| 393 } | |
| 394 | |
| 395 if (word->reject_map.accept_count() != 0) { | |
| 396 found_terrible_word = false; | |
| 397 // Forget earlier potential crunches | |
| 398 prev_potential_marked = false; | |
| 399 } else { | |
| 400 ok_dict_word = safe_dict_word(word); | |
| 401 garbage_level = garbage_word(word, ok_dict_word); | |
| 402 | |
| 403 if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) { | |
| 404 if (crunch_debug > 0) { | |
| 405 tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 406 } | |
| 407 word->unlv_crunch_mode = CR_KEEP_SPACE; | |
| 408 if (prev_potential_marked) { | |
| 409 while (copy_it.word() != word) { | |
| 410 if (crunch_debug > 0) { | |
| 411 tprintf("P1 CRUNCHING: \"%s\"\n", | |
| 412 copy_it.word()->best_choice->unichar_string().c_str()); | |
| 413 } | |
| 414 copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE; | |
| 415 copy_it.forward(); | |
| 416 } | |
| 417 prev_potential_marked = false; | |
| 418 } | |
| 419 found_terrible_word = true; | |
| 420 } else if ((garbage_level != G_NEVER_CRUNCH) && | |
| 421 (potential_word_crunch(word, garbage_level, ok_dict_word))) { | |
| 422 if (found_terrible_word) { | |
| 423 if (crunch_debug > 0) { | |
| 424 tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 425 } | |
| 426 word->unlv_crunch_mode = CR_KEEP_SPACE; | |
| 427 } else if (!prev_potential_marked) { | |
| 428 copy_it = page_res_it; | |
| 429 prev_potential_marked = true; | |
| 430 if (crunch_debug > 1) { | |
| 431 tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 432 } | |
| 433 } | |
| 434 } else { | |
| 435 found_terrible_word = false; | |
| 436 // Forget earlier potential crunches | |
| 437 prev_potential_marked = false; | |
| 438 if (crunch_debug > 2) { | |
| 439 tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 440 } | |
| 441 } | |
| 442 } | |
| 443 page_res_it.forward(); | |
| 444 } | |
| 445 } | |
| 446 | |
| 447 bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) { | |
| 448 int crunch_mode = 0; | |
| 449 | |
| 450 if (word->best_choice->unichar_string().empty() || | |
| 451 (strspn(word->best_choice->unichar_string().c_str(), " ") == | |
| 452 word->best_choice->unichar_string().size())) { | |
| 453 crunch_mode = 1; | |
| 454 } else { | |
| 455 int adjusted_len = word->reject_map.length(); | |
| 456 if (adjusted_len > crunch_rating_max) { | |
| 457 adjusted_len = crunch_rating_max; | |
| 458 } | |
| 459 float rating_per_ch = word->best_choice->rating() / adjusted_len; | |
| 460 | |
| 461 if (rating_per_ch > crunch_terrible_rating) { | |
| 462 crunch_mode = 2; | |
| 463 } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) { | |
| 464 crunch_mode = 3; | |
| 465 } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) && | |
| 466 (garbage_level != G_OK)) { | |
| 467 crunch_mode = 4; | |
| 468 } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) { | |
| 469 crunch_mode = 5; | |
| 470 } | |
| 471 } | |
| 472 if (crunch_mode > 0) { | |
| 473 if (crunch_debug > 2) { | |
| 474 tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode, | |
| 475 word->best_choice->unichar_string().c_str()); | |
| 476 } | |
| 477 return true; | |
| 478 } else { | |
| 479 return false; | |
| 480 } | |
| 481 } | |
| 482 | |
| 483 bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, | |
| 484 bool ok_dict_word) { | |
| 485 float rating_per_ch; | |
| 486 int adjusted_len; | |
| 487 const char *str = word->best_choice->unichar_string().c_str(); | |
| 488 const char *lengths = word->best_choice->unichar_lengths().c_str(); | |
| 489 bool word_crunchable; | |
| 490 int poor_indicator_count = 0; | |
| 491 | |
| 492 word_crunchable = | |
| 493 !crunch_leave_accept_strings || word->reject_map.length() < 3 || | |
| 494 (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word); | |
| 495 | |
| 496 adjusted_len = word->reject_map.length(); | |
| 497 if (adjusted_len > 10) { | |
| 498 adjusted_len = 10; | |
| 499 } | |
| 500 rating_per_ch = word->best_choice->rating() / adjusted_len; | |
| 501 | |
| 502 if (rating_per_ch > crunch_pot_poor_rate) { | |
| 503 if (crunch_debug > 2) { | |
| 504 tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 505 } | |
| 506 poor_indicator_count++; | |
| 507 } | |
| 508 | |
| 509 if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) { | |
| 510 if (crunch_debug > 2) { | |
| 511 tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 512 } | |
| 513 poor_indicator_count++; | |
| 514 } | |
| 515 | |
| 516 if (garbage_level != G_OK) { | |
| 517 if (crunch_debug > 2) { | |
| 518 tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 519 } | |
| 520 poor_indicator_count++; | |
| 521 } | |
| 522 return poor_indicator_count >= crunch_pot_indicators; | |
| 523 } | |
| 524 | |
| 525 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { | |
| 526 PAGE_RES_IT copy_it; | |
| 527 bool deleting_from_bol = false; | |
| 528 bool marked_delete_point = false; | |
| 529 int16_t debug_delete_mode; | |
| 530 CRUNCH_MODE delete_mode; | |
| 531 int16_t x_debug_delete_mode; | |
| 532 CRUNCH_MODE x_delete_mode; | |
| 533 | |
| 534 page_res_it.restart_page(); | |
| 535 while (page_res_it.word() != nullptr) { | |
| 536 WERD_RES *word = page_res_it.word(); | |
| 537 | |
| 538 delete_mode = word_deletable(word, debug_delete_mode); | |
| 539 if (delete_mode != CR_NONE) { | |
| 540 if (word->word->flag(W_BOL) || deleting_from_bol) { | |
| 541 if (crunch_debug > 0) { | |
| 542 tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode, | |
| 543 word->best_choice->unichar_string().c_str()); | |
| 544 } | |
| 545 word->unlv_crunch_mode = delete_mode; | |
| 546 deleting_from_bol = true; | |
| 547 } else if (word->word->flag(W_EOL)) { | |
| 548 if (marked_delete_point) { | |
| 549 while (copy_it.word() != word) { | |
| 550 x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode); | |
| 551 if (crunch_debug > 0) { | |
| 552 tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode, | |
| 553 copy_it.word()->best_choice->unichar_string().c_str()); | |
| 554 } | |
| 555 copy_it.word()->unlv_crunch_mode = x_delete_mode; | |
| 556 copy_it.forward(); | |
| 557 } | |
| 558 } | |
| 559 if (crunch_debug > 0) { | |
| 560 tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode, | |
| 561 word->best_choice->unichar_string().c_str()); | |
| 562 } | |
| 563 word->unlv_crunch_mode = delete_mode; | |
| 564 deleting_from_bol = false; | |
| 565 marked_delete_point = false; | |
| 566 } else { | |
| 567 if (!marked_delete_point) { | |
| 568 copy_it = page_res_it; | |
| 569 marked_delete_point = true; | |
| 570 } | |
| 571 } | |
| 572 } else { | |
| 573 deleting_from_bol = false; | |
| 574 // Forget earlier potential crunches | |
| 575 marked_delete_point = false; | |
| 576 } | |
| 577 /* | |
| 578 The following step has been left till now as the tess fails are used to | |
| 579 determine if the word is deletable. | |
| 580 */ | |
| 581 if (!crunch_early_merge_tess_fails) { | |
| 582 word->merge_tess_fails(); | |
| 583 } | |
| 584 page_res_it.forward(); | |
| 585 } | |
| 586 } | |
| 587 | |
| 588 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { | |
| 589 int i; | |
| 590 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); | |
| 591 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" "); | |
| 592 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~"); | |
| 593 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^"); | |
| 594 for (i = 0; i < word_res->reject_map.length(); ++i) { | |
| 595 if (word_res->best_choice->unichar_id(i) == unichar_tilde) { | |
| 596 word_res->best_choice->set_unichar_id(unichar_dash, i); | |
| 597 if (word_res->reject_map[i].accepted()) { | |
| 598 word_res->reject_map[i].setrej_unlv_rej(); | |
| 599 } | |
| 600 } | |
| 601 if (word_res->best_choice->unichar_id(i) == unichar_pow) { | |
| 602 word_res->best_choice->set_unichar_id(unichar_space, i); | |
| 603 if (word_res->reject_map[i].accepted()) { | |
| 604 word_res->reject_map[i].setrej_unlv_rej(); | |
| 605 } | |
| 606 } | |
| 607 } | |
| 608 } | |
| 609 | |
| 610 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) { | |
| 611 enum STATES { | |
| 612 JUNK, | |
| 613 FIRST_UPPER, | |
| 614 FIRST_LOWER, | |
| 615 FIRST_NUM, | |
| 616 SUBSEQUENT_UPPER, | |
| 617 SUBSEQUENT_LOWER, | |
| 618 SUBSEQUENT_NUM | |
| 619 }; | |
| 620 const char *str = word->best_choice->unichar_string().c_str(); | |
| 621 const char *lengths = word->best_choice->unichar_lengths().c_str(); | |
| 622 STATES state = JUNK; | |
| 623 int len = 0; | |
| 624 int isolated_digits = 0; | |
| 625 int isolated_alphas = 0; | |
| 626 int bad_char_count = 0; | |
| 627 int tess_rejs = 0; | |
| 628 int dodgy_chars = 0; | |
| 629 int ok_chars; | |
| 630 UNICHAR_ID last_char = -1; | |
| 631 int alpha_repetition_count = 0; | |
| 632 int longest_alpha_repetition_count = 0; | |
| 633 int longest_lower_run_len = 0; | |
| 634 int lower_string_count = 0; | |
| 635 int longest_upper_run_len = 0; | |
| 636 int upper_string_count = 0; | |
| 637 int total_alpha_count = 0; | |
| 638 int total_digit_count = 0; | |
| 639 | |
| 640 for (; *str != '\0'; str += *(lengths++)) { | |
| 641 len++; | |
| 642 if (word->uch_set->get_isupper(str, *lengths)) { | |
| 643 total_alpha_count++; | |
| 644 switch (state) { | |
| 645 case SUBSEQUENT_UPPER: | |
| 646 case FIRST_UPPER: | |
| 647 state = SUBSEQUENT_UPPER; | |
| 648 upper_string_count++; | |
| 649 if (longest_upper_run_len < upper_string_count) { | |
| 650 longest_upper_run_len = upper_string_count; | |
| 651 } | |
| 652 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { | |
| 653 alpha_repetition_count++; | |
| 654 if (longest_alpha_repetition_count < alpha_repetition_count) { | |
| 655 longest_alpha_repetition_count = alpha_repetition_count; | |
| 656 } | |
| 657 } else { | |
| 658 last_char = word->uch_set->unichar_to_id(str, *lengths); | |
| 659 alpha_repetition_count = 1; | |
| 660 } | |
| 661 break; | |
| 662 case FIRST_NUM: | |
| 663 isolated_digits++; | |
| 664 // Fall through. | |
| 665 default: | |
| 666 state = FIRST_UPPER; | |
| 667 last_char = word->uch_set->unichar_to_id(str, *lengths); | |
| 668 alpha_repetition_count = 1; | |
| 669 upper_string_count = 1; | |
| 670 break; | |
| 671 } | |
| 672 } else if (word->uch_set->get_islower(str, *lengths)) { | |
| 673 total_alpha_count++; | |
| 674 switch (state) { | |
| 675 case SUBSEQUENT_LOWER: | |
| 676 case FIRST_LOWER: | |
| 677 state = SUBSEQUENT_LOWER; | |
| 678 lower_string_count++; | |
| 679 if (longest_lower_run_len < lower_string_count) { | |
| 680 longest_lower_run_len = lower_string_count; | |
| 681 } | |
| 682 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { | |
| 683 alpha_repetition_count++; | |
| 684 if (longest_alpha_repetition_count < alpha_repetition_count) { | |
| 685 longest_alpha_repetition_count = alpha_repetition_count; | |
| 686 } | |
| 687 } else { | |
| 688 last_char = word->uch_set->unichar_to_id(str, *lengths); | |
| 689 alpha_repetition_count = 1; | |
| 690 } | |
| 691 break; | |
| 692 case FIRST_NUM: | |
| 693 isolated_digits++; | |
| 694 // Fall through. | |
| 695 default: | |
| 696 state = FIRST_LOWER; | |
| 697 last_char = word->uch_set->unichar_to_id(str, *lengths); | |
| 698 alpha_repetition_count = 1; | |
| 699 lower_string_count = 1; | |
| 700 break; | |
| 701 } | |
| 702 } else if (word->uch_set->get_isdigit(str, *lengths)) { | |
| 703 total_digit_count++; | |
| 704 switch (state) { | |
| 705 case FIRST_NUM: | |
| 706 state = SUBSEQUENT_NUM; | |
| 707 case SUBSEQUENT_NUM: | |
| 708 break; | |
| 709 case FIRST_UPPER: | |
| 710 case FIRST_LOWER: | |
| 711 isolated_alphas++; | |
| 712 // Fall through. | |
| 713 default: | |
| 714 state = FIRST_NUM; | |
| 715 break; | |
| 716 } | |
| 717 } else { | |
| 718 if (*lengths == 1 && *str == ' ') { | |
| 719 tess_rejs++; | |
| 720 } else { | |
| 721 bad_char_count++; | |
| 722 } | |
| 723 switch (state) { | |
| 724 case FIRST_NUM: | |
| 725 isolated_digits++; | |
| 726 break; | |
| 727 case FIRST_UPPER: | |
| 728 case FIRST_LOWER: | |
| 729 isolated_alphas++; | |
| 730 default: | |
| 731 break; | |
| 732 } | |
| 733 state = JUNK; | |
| 734 } | |
| 735 } | |
| 736 | |
| 737 switch (state) { | |
| 738 case FIRST_NUM: | |
| 739 isolated_digits++; | |
| 740 break; | |
| 741 case FIRST_UPPER: | |
| 742 case FIRST_LOWER: | |
| 743 isolated_alphas++; | |
| 744 default: | |
| 745 break; | |
| 746 } | |
| 747 | |
| 748 if (crunch_include_numerals) { | |
| 749 total_alpha_count += total_digit_count - isolated_digits; | |
| 750 } | |
| 751 | |
| 752 if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len && | |
| 753 longest_alpha_repetition_count < crunch_long_repetitions) { | |
| 754 if ((crunch_accept_ok && | |
| 755 acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) || | |
| 756 longest_lower_run_len > crunch_leave_lc_strings || | |
| 757 longest_upper_run_len > crunch_leave_uc_strings) { | |
| 758 return G_NEVER_CRUNCH; | |
| 759 } | |
| 760 } | |
| 761 if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr && | |
| 762 (word->best_choice->permuter() == SYSTEM_DAWG_PERM || | |
| 763 word->best_choice->permuter() == FREQ_DAWG_PERM || | |
| 764 word->best_choice->permuter() == USER_DAWG_PERM || | |
| 765 word->best_choice->permuter() == NUMBER_PERM || | |
| 766 acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) { | |
| 767 return G_OK; | |
| 768 } | |
| 769 | |
| 770 ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs; | |
| 771 | |
| 772 if (crunch_debug > 3) { | |
| 773 tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 774 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count, | |
| 775 isolated_digits, isolated_alphas, tess_rejs); | |
| 776 } | |
| 777 if (bad_char_count == 0 && tess_rejs == 0 && | |
| 778 (len > isolated_digits + isolated_alphas || len <= 2)) { | |
| 779 return G_OK; | |
| 780 } | |
| 781 | |
| 782 if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) { | |
| 783 return G_TERRIBLE; | |
| 784 } | |
| 785 | |
| 786 if (len > 4) { | |
| 787 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas; | |
| 788 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) { | |
| 789 return G_DODGY; | |
| 790 } else { | |
| 791 return G_OK; | |
| 792 } | |
| 793 } else { | |
| 794 dodgy_chars = 2 * tess_rejs + bad_char_count; | |
| 795 if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) { | |
| 796 return G_DODGY; | |
| 797 } else { | |
| 798 return G_OK; | |
| 799 } | |
| 800 } | |
| 801 } | |
| 802 | |
| 803 /************************************************************************* | |
| 804 * word_deletable() | |
| 805 * DELETE WERDS AT ENDS OF ROWS IF | |
| 806 * Word is crunched && | |
| 807 * ( string length = 0 OR | |
| 808 * > 50% of chars are "|" (before merging) OR | |
| 809 * certainty < -10 OR | |
| 810 * rating /char > 60 OR | |
| 811 * TOP of word is more than 0.5 xht BELOW baseline OR | |
| 812 * BOTTOM of word is more than 0.5 xht ABOVE xht OR | |
| 813 * length of word < 3xht OR | |
| 814 * height of word < 0.7 xht OR | |
| 815 * height of word > 3.0 xht OR | |
| 816 * >75% of the outline BBs have longest dimension < 0.5xht | |
| 817 *************************************************************************/ | |
| 818 | |
| 819 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) { | |
| 820 int word_len = word->reject_map.length(); | |
| 821 float rating_per_ch; | |
| 822 TBOX box; // BB of word | |
| 823 | |
| 824 if (word->unlv_crunch_mode == CR_NONE) { | |
| 825 delete_mode = 0; | |
| 826 return CR_NONE; | |
| 827 } | |
| 828 | |
| 829 if (word_len == 0) { | |
| 830 delete_mode = 1; | |
| 831 return CR_DELETE; | |
| 832 } | |
| 833 | |
| 834 if (word->rebuild_word != nullptr) { | |
| 835 // Cube leaves rebuild_word nullptr. | |
| 836 box = word->rebuild_word->bounding_box(); | |
| 837 if (box.height() < crunch_del_min_ht * kBlnXHeight) { | |
| 838 delete_mode = 4; | |
| 839 return CR_DELETE; | |
| 840 } | |
| 841 | |
| 842 if (noise_outlines(word->rebuild_word)) { | |
| 843 delete_mode = 5; | |
| 844 return CR_DELETE; | |
| 845 } | |
| 846 } | |
| 847 | |
| 848 if ((failure_count(word) * 1.5) > word_len) { | |
| 849 delete_mode = 2; | |
| 850 return CR_LOOSE_SPACE; | |
| 851 } | |
| 852 | |
| 853 if (word->best_choice->certainty() < crunch_del_cert) { | |
| 854 delete_mode = 7; | |
| 855 return CR_LOOSE_SPACE; | |
| 856 } | |
| 857 | |
| 858 rating_per_ch = word->best_choice->rating() / word_len; | |
| 859 | |
| 860 if (rating_per_ch > crunch_del_rating) { | |
| 861 delete_mode = 8; | |
| 862 return CR_LOOSE_SPACE; | |
| 863 } | |
| 864 | |
| 865 if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) { | |
| 866 delete_mode = 9; | |
| 867 return CR_LOOSE_SPACE; | |
| 868 } | |
| 869 | |
| 870 if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) { | |
| 871 delete_mode = 10; | |
| 872 return CR_LOOSE_SPACE; | |
| 873 } | |
| 874 | |
| 875 if (box.height() > crunch_del_max_ht * kBlnXHeight) { | |
| 876 delete_mode = 11; | |
| 877 return CR_LOOSE_SPACE; | |
| 878 } | |
| 879 | |
| 880 if (box.width() < crunch_del_min_width * kBlnXHeight) { | |
| 881 delete_mode = 3; | |
| 882 return CR_LOOSE_SPACE; | |
| 883 } | |
| 884 | |
| 885 delete_mode = 0; | |
| 886 return CR_NONE; | |
| 887 } | |
| 888 | |
| 889 int16_t Tesseract::failure_count(WERD_RES *word) { | |
| 890 const char *str = word->best_choice->unichar_string().c_str(); | |
| 891 int tess_rejs = 0; | |
| 892 | |
| 893 for (; *str != '\0'; str++) { | |
| 894 if (*str == ' ') { | |
| 895 tess_rejs++; | |
| 896 } | |
| 897 } | |
| 898 return tess_rejs; | |
| 899 } | |
| 900 | |
| 901 bool Tesseract::noise_outlines(TWERD *word) { | |
| 902 TBOX box; // BB of outline | |
| 903 int16_t outline_count = 0; | |
| 904 int16_t small_outline_count = 0; | |
| 905 int16_t max_dimension; | |
| 906 float small_limit = kBlnXHeight * crunch_small_outlines_size; | |
| 907 | |
| 908 for (unsigned b = 0; b < word->NumBlobs(); ++b) { | |
| 909 TBLOB *blob = word->blobs[b]; | |
| 910 for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) { | |
| 911 outline_count++; | |
| 912 box = ol->bounding_box(); | |
| 913 if (box.height() > box.width()) { | |
| 914 max_dimension = box.height(); | |
| 915 } else { | |
| 916 max_dimension = box.width(); | |
| 917 } | |
| 918 if (max_dimension < small_limit) { | |
| 919 small_outline_count++; | |
| 920 } | |
| 921 } | |
| 922 } | |
| 923 return small_outline_count >= outline_count; | |
| 924 } | |
| 925 | |
| 926 } // namespace tesseract |
