PyMuPDF: comparison mupdf-source/thirdparty/tesseract/src/ccmain/fixspace.cpp @ 2:b50eed0cc0ef (upstream)
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer includes a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************** | |
| 2 * File: fixspace.cpp (Formerly fixspace.c) | |
| 3 * Description: Implements a pass over the page res, exploring the alternative | |
| 4 * spacing possibilities, trying to use context to improve the | |
| 5 * word spacing | |
| 6 * Author: Phil Cheatle | |
| 7 * | |
| 8 * (C) Copyright 1993, Hewlett-Packard Ltd. | |
| 9 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 ** you may not use this file except in compliance with the License. | |
| 11 ** You may obtain a copy of the License at | |
| 12 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 ** Unless required by applicable law or agreed to in writing, software | |
| 14 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 ** See the License for the specific language governing permissions and | |
| 17 ** limitations under the License. | |
| 18 * | |
| 19 **********************************************************************/ | |
| 20 | |
| 21 #include "fixspace.h" | |
| 22 | |
| 23 #include "blobs.h" // for TWERD, TBLOB, TESSLINE | |
| 24 #include "boxword.h" // for BoxWord | |
| 25 #include "errcode.h" // for ASSERT_HOST | |
| 26 #include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset | |
| 27 #include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST | |
| 28 #include "params.h" // for IntParam, StringParam, BoolParam, DoubleParam, ... | |
| 29 #include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM | |
| 30 #include "rect.h" // for TBOX | |
| 31 #include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB | |
| 32 #include "tesseractclass.h" // for Tesseract, TesseractStats, WordData | |
| 33 #include "tessvars.h" // for debug_fp | |
| 34 #include "tprintf.h" // for tprintf | |
| 35 #include "unicharset.h" // for UNICHARSET | |
| 36 #include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP | |
| 37 | |
| 38 #include <tesseract/ocrclass.h> // for ETEXT_DESC | |
| 39 #include <tesseract/unichar.h> // for UNICHAR_ID | |
| 40 | |
| 41 #include <cstdint> // for INT16_MAX, int16_t, int32_t | |
| 42 | |
| 43 namespace tesseract { | |
| 44 | |
| 45 class BLOCK; | |
| 46 class ROW; | |
| 47 | |
| 48 #define PERFECT_WERDS 999 | |
| 49 | |
| 50 /********************************************************************** | |
| 51 * c_blob_comparator() | |
| 52 * | |
| 53 * Blob comparator used to sort a blob list so that blobs are in increasing | |
| 54 * order of left edge. | |
| 55 **********************************************************************/ | |
| 56 | |
| 57 static int c_blob_comparator( // sort blobs | |
| 58 const void *blob1p, // ptr to ptr to blob1 | |
| 59 const void *blob2p // ptr to ptr to blob2 | |
| 60 ) { | |
| 61 const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB *const *>(blob1p); | |
| 62 const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB *const *>(blob2p); | |
| 63 | |
| 64 return blob1->bounding_box().left() - blob2->bounding_box().left(); | |
| 65 } | |
| 66 | |
| 67 /** | |
| 68 * @name fix_fuzzy_spaces() | |
| 69 * Walk over the page finding sequences of words joined by fuzzy spaces. Extract | |
| 70 * them as a sublist, process the sublist to find the optimal arrangement of | |
| 71 * spaces, then replace the sublist in the ROW_RES. | |
| 72 * | |
| 73 * @param monitor progress monitor | |
| 74 * @param word_count count of words in doc | |
| 75 * @param[out] page_res page results, updated in place | |
| 76 */ | |
| 77 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) { | |
| 78 BLOCK_RES_IT block_res_it; | |
| 79 ROW_RES_IT row_res_it; | |
| 80 WERD_RES_IT word_res_it_from; | |
| 81 WERD_RES_IT word_res_it_to; | |
| 82 WERD_RES *word_res; | |
| 83 WERD_RES_LIST fuzzy_space_words; | |
| 84 int16_t new_length; | |
| 85 bool prevent_null_wd_fixsp; // DON'T process blobless wds | |
| 86 int32_t word_index; // current word | |
| 87 | |
| 88 block_res_it.set_to_list(&page_res->block_res_list); | |
| 89 word_index = 0; | |
| 90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) { | |
| 91 row_res_it.set_to_list(&block_res_it.data()->row_res_list); | |
| 92 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) { | |
| 93 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list); | |
| 94 while (!word_res_it_from.at_last()) { | |
| 95 word_res = word_res_it_from.data(); | |
| 96 while (!word_res_it_from.at_last() && | |
| 97 !(word_res->combination || | |
| 98 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) || | |
| 99 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) { | |
| 100 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block); | |
| 101 word_res = word_res_it_from.forward(); | |
| 102 word_index++; | |
| 103 if (monitor != nullptr) { | |
| 104 monitor->ocr_alive = true; | |
| 105 monitor->progress = 90 + 5 * word_index / word_count; | |
| 106 if (monitor->deadline_exceeded() || | |
| 107 (monitor->cancel != nullptr && | |
| 108 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) { | |
| 109 return; | |
| 110 } | |
| 111 } | |
| 112 } | |
| 113 | |
| 114 if (!word_res_it_from.at_last()) { | |
| 115 word_res_it_to = word_res_it_from; | |
| 116 prevent_null_wd_fixsp = word_res->word->cblob_list()->empty(); | |
| 117 if (check_debug_pt(word_res, 60)) { | |
| 118 debug_fix_space_level.set_value(10); | |
| 119 } | |
| 120 word_res_it_to.forward(); | |
| 121 word_index++; | |
| 122 if (monitor != nullptr) { | |
| 123 monitor->ocr_alive = true; | |
| 124 monitor->progress = 90 + 5 * word_index / word_count; | |
| 125 if (monitor->deadline_exceeded() || | |
| 126 (monitor->cancel != nullptr && | |
| 127 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) { | |
| 128 return; | |
| 129 } | |
| 130 } | |
| 131 while (!word_res_it_to.at_last() && | |
| 132 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) || | |
| 133 word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) { | |
| 134 if (check_debug_pt(word_res, 60)) { | |
| 135 debug_fix_space_level.set_value(10); | |
| 136 } | |
| 137 if (word_res->word->cblob_list()->empty()) { | |
| 138 prevent_null_wd_fixsp = true; | |
| 139 } | |
| 140 word_res = word_res_it_to.forward(); | |
| 141 } | |
| 142 if (check_debug_pt(word_res, 60)) { | |
| 143 debug_fix_space_level.set_value(10); | |
| 144 } | |
| 145 if (word_res->word->cblob_list()->empty()) { | |
| 146 prevent_null_wd_fixsp = true; | |
| 147 } | |
| 148 if (prevent_null_wd_fixsp) { | |
| 149 word_res_it_from = word_res_it_to; | |
| 150 } else { | |
| 151 fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to); | |
| 152 fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row, | |
| 153 block_res_it.data()->block); | |
| 154 new_length = fuzzy_space_words.length(); | |
| 155 word_res_it_from.add_list_before(&fuzzy_space_words); | |
| 156 for (; !word_res_it_from.at_last() && new_length > 0; new_length--) { | |
| 157 word_res_it_from.forward(); | |
| 158 } | |
| 159 } | |
| 160 if (test_pt) { | |
| 161 debug_fix_space_level.set_value(0); | |
| 162 } | |
| 163 } | |
| 164 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block); | |
| 165 // Last word in row | |
| 166 } | |
| 167 } | |
| 168 } | |
| 169 } | |
| 170 | |
| 171 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) { | |
| 172 int16_t best_score; | |
| 173 WERD_RES_LIST current_perm; | |
| 174 bool improved = false; | |
| 175 | |
| 176 best_score = eval_word_spacing(best_perm); // default score | |
| 177 dump_words(best_perm, best_score, 1, improved); | |
| 178 | |
| 179 if (best_score != PERFECT_WERDS) { | |
| 180 initialise_search(best_perm, current_perm); | |
| 181 } | |
| 182 | |
| 183 while ((best_score != PERFECT_WERDS) && !current_perm.empty()) { | |
| 184 match_current_words(current_perm, row, block); | |
| 185 int16_t current_score = eval_word_spacing(current_perm); | |
| 186 dump_words(current_perm, current_score, 2, improved); | |
| 187 if (current_score > best_score) { | |
| 188 best_perm.clear(); | |
| 189 best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); | |
| 190 best_score = current_score; | |
| 191 improved = true; | |
| 192 } | |
| 193 if (current_score < PERFECT_WERDS) { | |
| 194 transform_to_next_perm(current_perm); | |
| 195 } | |
| 196 } | |
| 197 dump_words(best_perm, best_score, 3, improved); | |
| 198 } | |
| 199 | |
| 200 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) { | |
| 201 WERD_RES_IT src_it(&src_list); | |
| 202 WERD_RES_IT new_it(&new_list); | |
| 203 WERD_RES *new_wd; | |
| 204 | |
| 205 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { | |
| 206 WERD_RES *src_wd = src_it.data(); | |
| 207 if (!src_wd->combination) { | |
| 208 new_wd = WERD_RES::deep_copy(src_wd); | |
| 209 new_wd->combination = false; | |
| 210 new_wd->part_of_combo = false; | |
| 211 new_it.add_after_then_move(new_wd); | |
| 212 } | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) { | |
| 217 WERD_RES_IT word_it(&words); | |
| 218 WERD_RES *word; | |
| 219 // Since we are not using PAGE_RES to iterate over words, we need to update | |
| 220 // prev_word_best_choice_ before calling classify_word_pass2(). | |
| 221 prev_word_best_choice_ = nullptr; | |
| 222 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 223 word = word_it.data(); | |
| 224 if ((!word->part_of_combo) && (word->box_word == nullptr)) { | |
| 225 WordData word_data(block, row, word); | |
| 226 SetupWordPassN(2, &word_data); | |
| 227 classify_word_and_language(2, nullptr, &word_data); | |
| 228 } | |
| 229 prev_word_best_choice_ = word->best_choice; | |
| 230 } | |
| 231 } | |
| 232 | |
| 233 /** | |
| 234 * @name eval_word_spacing() | |
| 235 * The basic measure is the number of characters in contextually confirmed | |
| 236 * words (i.e. the word is done). | |
| 237 * If all words are contextually confirmed the evaluation is deemed perfect. | |
| 238 * | |
| 239 * Some fiddles are done to handle "1"s as these are VERY frequent causes of | |
| 240 * fuzzy spaces. The problem with the basic measure is that "561 63" would score | |
| 241 * the same as "56163", though given our knowledge that the space is fuzzy, and | |
| 242 * that there is a "1" next to the fuzzy space, we need to ensure that "56163" | |
| 243 * is preferred. | |
| 244 * | |
| 245 * The solution is to NOT COUNT the score of any word which has a digit at one | |
| 246 * end and a "1Il" as the character the other side of the space. | |
| 247 * | |
| 248 * Conversely, any character next to a "1" within a word is counted as a | |
| 249 * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 | |
| 250 * side of the "1" joined). "56163" would score 7 - all chars in a numeric word | |
| 251 * + 2 sides of a "1" joined. | |
| 252 * | |
| 253 * The joined 1 rule is applied to any word REGARDLESS of contextual | |
| 254 * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually | |
| 255 * confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2. | |
| 256 * | |
| 257 */ | |
| 258 int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) { | |
| 259 WERD_RES_IT word_res_it(&word_res_list); | |
| 260 int16_t total_score = 0; | |
| 261 int16_t word_count = 0; | |
| 262 int16_t done_word_count = 0; | |
| 263 int i; | |
| 264 int16_t offset; | |
| 265 int16_t prev_word_score = 0; | |
| 266 bool prev_word_done = false; | |
| 267 bool prev_char_1 = false; // prev ch a "1/I/l"? | |
| 268 bool prev_char_digit = false; // prev ch 2..9 or 0 | |
| 269 const char *punct_chars = "!\"`',.:;"; | |
| 270 do { | |
| 271 // current word | |
| 272 WERD_RES *word = word_res_it.data(); | |
| 273 bool word_done = fixspace_thinks_word_done(word); | |
| 274 word_count++; | |
| 275 if (word->tess_failed) { | |
| 276 total_score += prev_word_score; | |
| 277 if (prev_word_done) { | |
| 278 done_word_count++; | |
| 279 } | |
| 280 prev_word_score = 0; | |
| 281 prev_char_1 = false; | |
| 282 prev_char_digit = false; | |
| 283 prev_word_done = false; | |
| 284 } else { | |
| 285 /* | |
| 286 Can we add the prev word score and potentially count this word? | |
| 287 Yes IF it didn't end in a 1 when the first char of this word is a digit | |
| 288 AND it didn't end in a digit when the first char of this word is a 1 | |
| 289 */ | |
| 290 auto word_len = word->reject_map.length(); | |
| 291 bool current_word_ok_so_far = false; | |
| 292 if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) || | |
| 293 (prev_char_digit && | |
| 294 ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 && | |
| 295 word->best_choice->unichar_string()[0] == '1') || | |
| 296 (!word_done && | |
| 297 conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) { | |
| 298 total_score += prev_word_score; | |
| 299 if (prev_word_done) { | |
| 300 done_word_count++; | |
| 301 } | |
| 302 current_word_ok_so_far = word_done; | |
| 303 } | |
| 304 | |
| 305 if (current_word_ok_so_far) { | |
| 306 prev_word_done = true; | |
| 307 prev_word_score = word_len; | |
| 308 } else { | |
| 309 prev_word_done = false; | |
| 310 prev_word_score = 0; | |
| 311 } | |
| 312 | |
| 313 /* Add 1 to total score for every joined 1 regardless of context and | |
| 314 rejection */ | |
| 315 for (i = 0, prev_char_1 = false; i < word_len; i++) { | |
| 316 bool current_char_1 = word->best_choice->unichar_string()[i] == '1'; | |
| 317 if (prev_char_1 || (current_char_1 && (i > 0))) { | |
| 318 total_score++; | |
| 319 } | |
| 320 prev_char_1 = current_char_1; | |
| 321 } | |
| 322 | |
| 323 /* Add 1 to total score for every joined punctuation regardless of context | |
| 324 and rejection */ | |
| 325 if (tessedit_prefer_joined_punct) { | |
| 326 bool prev_char_punct; | |
| 327 for (i = 0, offset = 0, prev_char_punct = false; i < word_len; | |
| 328 offset += word->best_choice->unichar_lengths()[i++]) { | |
| 329 bool current_char_punct = | |
| 330 strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr; | |
| 331 if (prev_char_punct || (current_char_punct && i > 0)) { | |
| 332 total_score++; | |
| 333 } | |
| 334 prev_char_punct = current_char_punct; | |
| 335 } | |
| 336 } | |
| 337 prev_char_digit = digit_or_numeric_punct(word, word_len - 1); | |
| 338 for (i = 0, offset = 0; i < word_len - 1; | |
| 339 offset += word->best_choice->unichar_lengths()[i++]) { | |
| 340 ; // empty body: just advance offset to the last unichar | |
| 341 } | |
| 342 prev_char_1 = | |
| 343 ((word_done && (word->best_choice->unichar_string()[offset] == '1')) || | |
| 344 (!word_done && | |
| 345 conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset]))); | |
| 346 } | |
| 347 /* Find next word */ | |
| 348 do { | |
| 349 word_res_it.forward(); | |
| 350 } while (word_res_it.data()->part_of_combo); | |
| 351 } while (!word_res_it.at_first()); | |
| 352 total_score += prev_word_score; | |
| 353 if (prev_word_done) { | |
| 354 done_word_count++; | |
| 355 } | |
| 356 if (done_word_count == word_count) { | |
| 357 return PERFECT_WERDS; | |
| 358 } else { | |
| 359 return total_score; | |
| 360 } | |
| 361 } | |
| 362 | |
| 363 bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) { | |
| 364 int i; | |
| 365 int offset; | |
| 366 | |
| 367 for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]) { | |
| 368 ; // empty body: just advance offset to char_position | |
| 369 } | |
| 370 return ( | |
| 371 word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset, | |
| 372 word->best_choice->unichar_lengths()[i]) || | |
| 373 (word->best_choice->permuter() == NUMBER_PERM && | |
| 374 numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset]))); | |
| 375 } | |
| 376 | |
| 377 /** | |
| 378 * @name transform_to_next_perm() | |
| 379 * Examines the current word list to find the smallest word gap size. Then walks | |
| 380 * the word list closing any gaps of this size by either inserted new | |
| 381 * combination words, or extending existing ones. | |
| 382 * | |
| 383 * The routine COULD be limited to stop it building words longer than N blobs. | |
| 384 * | |
| 385 * If there are no more gaps then it DELETES the entire list and returns the | |
| 386 * empty list to cause termination. | |
| 387 */ | |
| 388 void transform_to_next_perm(WERD_RES_LIST &words) { | |
| 389 WERD_RES_IT word_it(&words); | |
| 390 WERD_RES_IT prev_word_it(&words); | |
| 391 WERD_RES *word; | |
| 392 WERD_RES *prev_word; | |
| 393 int16_t prev_right = -INT16_MAX; | |
| 394 TBOX box; | |
| 395 int16_t gap; | |
| 396 int16_t min_gap = INT16_MAX; | |
| 397 | |
| 398 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 399 word = word_it.data(); | |
| 400 if (!word->part_of_combo) { | |
| 401 box = word->word->bounding_box(); | |
| 402 if (prev_right > -INT16_MAX) { | |
| 403 gap = box.left() - prev_right; | |
| 404 if (gap < min_gap) { | |
| 405 min_gap = gap; | |
| 406 } | |
| 407 } | |
| 408 prev_right = box.right(); | |
| 409 } | |
| 410 } | |
| 411 if (min_gap < INT16_MAX) { | |
| 412 prev_right = -INT16_MAX; // back to start | |
| 413 word_it.set_to_list(&words); | |
| 414 // Note: we can't use cycle_pt due to inserted combos at start of list. | |
| 415 for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) { | |
| 416 word = word_it.data(); | |
| 417 if (!word->part_of_combo) { | |
| 418 box = word->word->bounding_box(); | |
| 419 if (prev_right > -INT16_MAX) { | |
| 420 gap = box.left() - prev_right; | |
| 421 if (gap <= min_gap) { | |
| 422 prev_word = prev_word_it.data(); | |
| 423 WERD_RES *combo; | |
| 424 if (prev_word->combination) { | |
| 425 combo = prev_word; | |
| 426 } else { | |
| 427 /* Make a new combination and insert before | |
| 428 * the first word being joined. */ | |
| 429 auto *copy_word = new WERD; | |
| 430 *copy_word = *(prev_word->word); | |
| 431 // deep copy | |
| 432 combo = new WERD_RES(copy_word); | |
| 433 combo->combination = true; | |
| 434 combo->x_height = prev_word->x_height; | |
| 435 prev_word->part_of_combo = true; | |
| 436 prev_word_it.add_before_then_move(combo); | |
| 437 } | |
| 438 combo->word->set_flag(W_EOL, word->word->flag(W_EOL)); | |
| 439 if (word->combination) { | |
| 440 combo->word->join_on(word->word); | |
| 441 // Move blobs to combo | |
| 442 // old combo no longer needed | |
| 443 delete word_it.extract(); | |
| 444 } else { | |
| 445 // Copy current wd to combo | |
| 446 combo->copy_on(word); | |
| 447 word->part_of_combo = true; | |
| 448 } | |
| 449 combo->done = false; | |
| 450 combo->ClearResults(); | |
| 451 } else { | |
| 452 prev_word_it = word_it; // catch up | |
| 453 } | |
| 454 } | |
| 455 prev_right = box.right(); | |
| 456 } | |
| 457 } | |
| 458 } else { | |
| 459 words.clear(); // signal termination | |
| 460 } | |
| 461 } | |
| 462 | |
| 463 void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) { | |
| 464 WERD_RES_IT word_res_it(&perm); | |
| 465 | |
| 466 if (debug_fix_space_level > 0) { | |
| 467 if (mode == 1) { | |
| 468 stats_.dump_words_str = ""; | |
| 469 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) { | |
| 470 if (!word_res_it.data()->part_of_combo) { | |
| 471 stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string(); | |
| 472 stats_.dump_words_str += ' '; | |
| 473 } | |
| 474 } | |
| 475 } | |
| 476 | |
| 477 if (debug_fix_space_level > 1) { | |
| 478 switch (mode) { | |
| 479 case 1: | |
| 480 tprintf("EXTRACTED (%d): \"", score); | |
| 481 break; | |
| 482 case 2: | |
| 483 tprintf("TESTED (%d): \"", score); | |
| 484 break; | |
| 485 case 3: | |
| 486 tprintf("RETURNED (%d): \"", score); | |
| 487 break; | |
| 488 } | |
| 489 | |
| 490 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) { | |
| 491 if (!word_res_it.data()->part_of_combo) { | |
| 492 tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(), | |
| 493 static_cast<int>(word_res_it.data()->best_choice->permuter())); | |
| 494 } | |
| 495 } | |
| 496 tprintf("\"\n"); | |
| 497 } else if (improved) { | |
| 498 tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str()); | |
| 499 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) { | |
| 500 if (!word_res_it.data()->part_of_combo) { | |
| 501 tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(), | |
| 502 static_cast<int>(word_res_it.data()->best_choice->permuter())); | |
| 503 } | |
| 504 } | |
| 505 tprintf("\"\n"); | |
| 506 } | |
| 507 } | |
| 508 } | |
| 509 | |
| 510 bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) { | |
| 511 if (word->done) { | |
| 512 return true; | |
| 513 } | |
| 514 | |
| 515 /* | |
| 516 Use all the standard pass 2 conditions for mode 5 in set_done() in | |
| 517 reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T | |
| 518 CARE WHETHER WE HAVE of/at on/an etc. | |
| 519 */ | |
| 520 if (fixsp_done_mode > 0 && | |
| 521 (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) || | |
| 522 fixsp_done_mode == 3) && | |
| 523 (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) && | |
| 524 ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) || | |
| 525 (word->best_choice->permuter() == FREQ_DAWG_PERM) || | |
| 526 (word->best_choice->permuter() == USER_DAWG_PERM) || | |
| 527 (word->best_choice->permuter() == NUMBER_PERM))) { | |
| 528 return true; | |
| 529 } else { | |
| 530 return false; | |
| 531 } | |
| 532 } | |
| 533 | |
| 534 /** | |
| 535 * @name fix_sp_fp_word() | |
| 536 * Test the current word to see if it can be split by deleting noise blobs. If | |
| 537 * so, do the business. | |
| 538 * Return with the iterator pointing to the same place if the word is unchanged, | |
| 539 * or the last of the replacement words. | |
| 540 */ | |
| 541 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) { | |
| 542 WERD_RES *word_res; | |
| 543 WERD_RES_LIST sub_word_list; | |
| 544 WERD_RES_IT sub_word_list_it(&sub_word_list); | |
| 545 int16_t new_length; | |
| 546 float junk; | |
| 547 | |
| 548 word_res = word_res_it.data(); | |
| 549 if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo || | |
| 550 !word_res->word->flag(W_DONT_CHOP)) { | |
| 551 return; | |
| 552 } | |
| 553 | |
| 554 auto blob_index = worst_noise_blob(word_res, &junk); | |
| 555 if (blob_index < 0) { | |
| 556 return; | |
| 557 } | |
| 558 | |
| 559 if (debug_fix_space_level > 1) { | |
| 560 tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str()); | |
| 561 } | |
| 562 word_res->word->rej_cblob_list()->sort(c_blob_comparator); | |
| 563 sub_word_list_it.add_after_stay_put(word_res_it.extract()); | |
| 564 fix_noisy_space_list(sub_word_list, row, block); | |
| 565 new_length = sub_word_list.length(); | |
| 566 word_res_it.add_list_before(&sub_word_list); | |
| 567 for (; !word_res_it.at_last() && new_length > 1; new_length--) { | |
| 568 word_res_it.forward(); | |
| 569 } | |
| 570 } | |
| 571 | |
| 572 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) { | |
| 573 int16_t best_score; | |
| 574 WERD_RES_IT best_perm_it(&best_perm); | |
| 575 WERD_RES_LIST current_perm; | |
| 576 WERD_RES_IT current_perm_it(¤t_perm); | |
| 577 WERD_RES *old_word_res; | |
| 578 int16_t current_score; | |
| 579 bool improved = false; | |
| 580 | |
| 581 best_score = fp_eval_word_spacing(best_perm); // default score | |
| 582 | |
| 583 dump_words(best_perm, best_score, 1, improved); | |
| 584 | |
| 585 old_word_res = best_perm_it.data(); | |
| 586 // Even deep_copy doesn't copy the underlying WERD unless its combination | |
| 587 // flag is true! | |
| 588 old_word_res->combination = true; // Kludge to force deep copy | |
| 589 current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res)); | |
| 590 old_word_res->combination = false; // Undo kludge | |
| 591 | |
| 592 break_noisiest_blob_word(current_perm); | |
| 593 | |
| 594 while (best_score != PERFECT_WERDS && !current_perm.empty()) { | |
| 595 match_current_words(current_perm, row, block); | |
| 596 current_score = fp_eval_word_spacing(current_perm); | |
| 597 dump_words(current_perm, current_score, 2, improved); | |
| 598 if (current_score > best_score) { | |
| 599 best_perm.clear(); | |
| 600 best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); | |
| 601 best_score = current_score; | |
| 602 improved = true; | |
| 603 } | |
| 604 if (current_score < PERFECT_WERDS) { | |
| 605 break_noisiest_blob_word(current_perm); | |
| 606 } | |
| 607 } | |
| 608 dump_words(best_perm, best_score, 3, improved); | |
| 609 } | |
| 610 | |
| 611 /** | |
| 612 * break_noisiest_blob_word() | |
| 613 * Find the word with the blob which looks like the worst noise. | |
| 614 * Break the word into two, deleting the noise blob. | |
| 615 */ | |
| 616 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) { | |
| 617 WERD_RES_IT word_it(&words); | |
| 618 WERD_RES_IT worst_word_it; | |
| 619 float worst_noise_score = 9999; | |
| 620 int worst_blob_index = -1; // Noisiest blob of noisiest wd | |
| 621 float noise_score; // of wds noisiest blob | |
| 622 WERD_RES *word_res; | |
| 623 C_BLOB_IT blob_it; | |
| 624 C_BLOB_IT rej_cblob_it; | |
| 625 C_BLOB_LIST new_blob_list; | |
| 626 C_BLOB_IT new_blob_it; | |
| 627 C_BLOB_IT new_rej_cblob_it; | |
| 628 WERD *new_word; | |
| 629 int16_t start_of_noise_blob; | |
| 630 int16_t i; | |
| 631 | |
| 632 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 633 auto blob_index = worst_noise_blob(word_it.data(), &noise_score); | |
| 634 if (blob_index > -1 && worst_noise_score > noise_score) { | |
| 635 worst_noise_score = noise_score; | |
| 636 worst_blob_index = blob_index; | |
| 637 worst_word_it = word_it; | |
| 638 } | |
| 639 } | |
| 640 if (worst_blob_index < 0) { | |
| 641 words.clear(); // signal termination | |
| 642 return; | |
| 643 } | |
| 644 | |
| 645 /* Now split the worst_word_it */ | |
| 646 | |
| 647 word_res = worst_word_it.data(); | |
| 648 | |
| 649 /* Move blobs before noise blob to a new bloblist */ | |
| 650 | |
| 651 new_blob_it.set_to_list(&new_blob_list); | |
| 652 blob_it.set_to_list(word_res->word->cblob_list()); | |
| 653 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) { | |
| 654 new_blob_it.add_after_then_move(blob_it.extract()); | |
| 655 } | |
| 656 start_of_noise_blob = blob_it.data()->bounding_box().left(); | |
| 657 delete blob_it.extract(); // throw out noise blob | |
| 658 | |
| 659 new_word = new WERD(&new_blob_list, word_res->word); | |
| 660 new_word->set_flag(W_EOL, false); | |
| 661 word_res->word->set_flag(W_BOL, false); | |
| 662 word_res->word->set_blanks(1); // After break | |
| 663 | |
| 664 new_rej_cblob_it.set_to_list(new_word->rej_cblob_list()); | |
| 665 rej_cblob_it.set_to_list(word_res->word->rej_cblob_list()); | |
| 666 for (; (!rej_cblob_it.empty() && | |
| 667 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob)); | |
| 668 rej_cblob_it.forward()) { | |
| 669 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract()); | |
| 670 } | |
| 671 | |
| 672 auto *new_word_res = new WERD_RES(new_word); | |
| 673 new_word_res->combination = true; | |
| 674 worst_word_it.add_before_then_move(new_word_res); | |
| 675 | |
| 676 word_res->ClearResults(); | |
| 677 } | |
| 678 | |
| 679 int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) { | |
| 680 float noise_score[512]; | |
| 681 int min_noise_blob; // 1st contender | |
| 682 int max_noise_blob; // last contender | |
| 683 int non_noise_count; | |
| 684 int worst_noise_blob; // Worst blob | |
| 685 float small_limit = kBlnXHeight * fixsp_small_outlines_size; | |
| 686 float non_noise_limit = kBlnXHeight * 0.8; | |
| 687 | |
| 688 if (word_res->rebuild_word == nullptr) { | |
| 689 return -1; // Can't handle cube words. | |
| 690 } | |
| 691 | |
| 692 // Normalised. | |
| 693 auto blob_count = word_res->box_word->length(); | |
| 694 ASSERT_HOST(blob_count <= 512); | |
| 695 if (blob_count < 5) { | |
| 696 return -1; // too short to split | |
| 697 } | |
| 698 | |
| 699 /* Get the noise scores for all blobs */ | |
| 700 | |
| 701 #ifndef SECURE_NAMES | |
| 702 if (debug_fix_space_level > 5) { | |
| 703 tprintf("FP fixspace Noise metrics for \"%s\": ", | |
| 704 word_res->best_choice->unichar_string().c_str()); | |
| 705 } | |
| 706 #endif | |
| 707 | |
| 708 for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) { | |
| 709 TBLOB *blob = word_res->rebuild_word->blobs[i]; | |
| 710 if (word_res->reject_map[i].accepted()) { | |
| 711 noise_score[i] = non_noise_limit; | |
| 712 } else { | |
| 713 noise_score[i] = blob_noise_score(blob); | |
| 714 } | |
| 715 | |
| 716 if (debug_fix_space_level > 5) { | |
| 717 tprintf("%1.1f ", noise_score[i]); | |
| 718 } | |
| 719 } | |
| 720 if (debug_fix_space_level > 5) { | |
| 721 tprintf("\n"); | |
| 722 } | |
| 723 | |
| 724 /* Now find the worst one which is far enough away from the end of the word */ | |
| 725 | |
| 726 non_noise_count = 0; | |
| 727 int i; | |
| 728 for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) { | |
| 729 if (noise_score[i] >= non_noise_limit) { | |
| 730 non_noise_count++; | |
| 731 } | |
| 732 } | |
| 733 if (non_noise_count < fixsp_non_noise_limit) { | |
| 734 return -1; | |
| 735 } | |
| 736 | |
| 737 min_noise_blob = i; | |
| 738 | |
| 739 non_noise_count = 0; | |
| 740 for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) { | |
| 741 if (noise_score[i] >= non_noise_limit) { | |
| 742 non_noise_count++; | |
| 743 } | |
| 744 } | |
| 745 if (non_noise_count < fixsp_non_noise_limit) { | |
| 746 return -1; | |
| 747 } | |
| 748 | |
| 749 max_noise_blob = i; | |
| 750 | |
| 751 if (min_noise_blob > max_noise_blob) { | |
| 752 return -1; | |
| 753 } | |
| 754 | |
| 755 *worst_noise_score = small_limit; | |
| 756 worst_noise_blob = -1; | |
| 757 for (auto i = min_noise_blob; i <= max_noise_blob; i++) { | |
| 758 if (noise_score[i] < *worst_noise_score) { | |
| 759 worst_noise_blob = i; | |
| 760 *worst_noise_score = noise_score[i]; | |
| 761 } | |
| 762 } | |
| 763 return worst_noise_blob; | |
| 764 } | |
| 765 | |
| 766 float Tesseract::blob_noise_score(TBLOB *blob) { | |
| 767 TBOX box; // BB of outline | |
| 768 int16_t outline_count = 0; | |
| 769 int16_t max_dimension; | |
| 770 int16_t largest_outline_dimension = 0; | |
| 771 | |
| 772 for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) { | |
| 773 outline_count++; | |
| 774 box = ol->bounding_box(); | |
| 775 if (box.height() > box.width()) { | |
| 776 max_dimension = box.height(); | |
| 777 } else { | |
| 778 max_dimension = box.width(); | |
| 779 } | |
| 780 | |
| 781 if (largest_outline_dimension < max_dimension) { | |
| 782 largest_outline_dimension = max_dimension; | |
| 783 } | |
| 784 } | |
| 785 | |
| 786 if (outline_count > 5) { | |
| 787 // penalise LOTS of blobs | |
| 788 largest_outline_dimension *= 2; | |
| 789 } | |
| 790 | |
| 791 box = blob->bounding_box(); | |
| 792 if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) { | |
| 793 // Treat the blob as more noise-like if it sits unusually high or low. | |
| 794 largest_outline_dimension /= 2; | |
| 795 } | |
| 796 | |
| 797 return largest_outline_dimension; | |
| 798 } | |
| 799 | |
| 800 void fixspace_dbg(WERD_RES *word) { | |
| 801 TBOX box = word->word->bounding_box(); | |
| 802 const bool show_map_detail = false; | |
| 803 | |
| 804 box.print(); | |
| 805 tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str()); | |
| 806 tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(), | |
| 807 word->rebuild_word->NumBlobs(), word->box_word->length()); | |
| 808 word->reject_map.print(debug_fp); | |
| 809 tprintf("\n"); | |
| 810 if (show_map_detail) { | |
| 811 tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str()); | |
| 812 for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { | |
| 813 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); | |
| 814 word->reject_map[i].full_print(debug_fp); | |
| 815 } | |
| 816 } | |
| 817 | |
| 818 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); | |
| 819 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); | |
| 820 } | |
| 821 | |
| 822 /** | |
| 823 * fp_eval_word_spacing() | |
| 824 * Evaluation function for fixed pitch word lists. | |
| 825 * | |
| 826 * Basically, count the number of "nice" characters - those which are in tess | |
| 827 * acceptable words or in dict words and are not rejected. | |
| 828 * Penalise any potential noise chars | |
| 829 */ | |
| 830 int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { | |
| 831 WERD_RES_IT word_it(&word_res_list); | |
| 832 WERD_RES *word; | |
| 833 int16_t score = 0; | |
| 834 float small_limit = kBlnXHeight * fixsp_small_outlines_size; | |
| 835 | |
| 836 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { | |
| 837 word = word_it.data(); | |
| 838 if (word->rebuild_word == nullptr) { | |
| 839 continue; // Can't handle cube words. | |
| 840 } | |
| 841 if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM || | |
| 842 word->best_choice->permuter() == FREQ_DAWG_PERM || | |
| 843 word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) { | |
| 844 auto num_blobs = word->rebuild_word->NumBlobs(); | |
| 845 UNICHAR_ID space = word->uch_set->unichar_to_id(" "); | |
| 846 for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) { | |
| 847 TBLOB *blob = word->rebuild_word->blobs[i]; | |
| 848 if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) { | |
| 849 score -= 1; // penalise possibly erroneous non-space | |
| 850 } else if (word->reject_map[i].accepted()) { | |
| 851 score++; | |
| 852 } | |
| 853 } | |
| 854 } | |
| 855 } | |
| 856 if (score < 0) { | |
| 857 score = 0; | |
| 858 } | |
| 859 return score; | |
| 860 } | |
| 861 | |
| 862 } // namespace tesseract | |
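
The comment block above eval_word_spacing() (file lines 233-257) defines a "joined 1" bonus purely by character adjacency: each side of a "1" that touches another character inside a word adds one point, regardless of context or rejection. As a minimal standalone illustration of only that part of the rule (plain C++, not Tesseract code; the helper name joined_one_score is invented here), the sketch below mirrors the counting loop at file lines 315-321 and reproduces the comment's arithmetic: "56163" earns 2 joined-1 points, "561" earns 1.

```cpp
// Hedged sketch: counts the "joined 1" bonus described in the
// eval_word_spacing() comment. Standalone C++17; not Tesseract API.
#include <iostream>
#include <string>

// One point for every character adjacency that involves a '1' inside the
// word, mirroring the loop over word->best_choice->unichar_string().
static int joined_one_score(const std::string &word) {
  int score = 0;
  bool prev_char_1 = false;
  for (std::size_t i = 0; i < word.size(); ++i) {
    const bool current_char_1 = (word[i] == '1');
    if (prev_char_1 || (current_char_1 && i > 0)) {
      ++score;
    }
    prev_char_1 = current_char_1;
  }
  return score;
}

int main() {
  std::cout << joined_one_score("56163") << "\n"; // 2 (both sides of the '1')
  std::cout << joined_one_score("561") << "\n";   // 1 (one side of the '1')
  std::cout << joined_one_score("563") << "\n";   // 0 (no '1')
}
```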
