comparison mupdf-source/thirdparty/tesseract/src/ccmain/fixspace.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: the expanded directory no longer carries a version number.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************
2 * File: fixspace.cpp (Formerly fixspace.c)
3 * Description: Implements a pass over the page res, exploring the alternative
4 * spacing possibilities, trying to use context to improve the
5 * word spacing
6 * Author: Phil Cheatle
7 *
8 * (C) Copyright 1993, Hewlett-Packard Ltd.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 **********************************************************************/
20
21 #include "fixspace.h"
22
23 #include "blobs.h" // for TWERD, TBLOB, TESSLINE
24 #include "boxword.h" // for BoxWord
25 #include "errcode.h" // for ASSERT_HOST
26 #include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
27 #include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
28 #include "params.h" // for IntParam, StringParam, BoolParam, DoubleParam, ...
29 #include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
30 #include "rect.h" // for TBOX
31 #include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
32 #include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
33 #include "tessvars.h" // for debug_fp
34 #include "tprintf.h" // for tprintf
35 #include "unicharset.h" // for UNICHARSET
36 #include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
37
38 #include <tesseract/ocrclass.h> // for ETEXT_DESC
39 #include <tesseract/unichar.h> // for UNICHAR_ID
40
41 #include <cstdint> // for INT16_MAX, int16_t, int32_t
42
43 namespace tesseract {
44
45 class BLOCK;
46 class ROW;
47
48 #define PERFECT_WERDS 999
49
50 /**********************************************************************
51 * c_blob_comparator()
52 *
53 * Blob comparator used to sort a blob list so that blobs are in increasing
54 * order of left edge.
55 **********************************************************************/
56
57 static int c_blob_comparator( // sort blobs
58 const void *blob1p, // ptr to ptr to blob1
59 const void *blob2p // ptr to ptr to blob2
60 ) {
61 const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB *const *>(blob1p);
62 const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB *const *>(blob2p);
63
64 return blob1->bounding_box().left() - blob2->bounding_box().left();
65 }
66
67 /**
68 * @name fix_fuzzy_spaces()
69 * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
70 * them as a sublist, process the sublist to find the optimal arrangement of
71 * spaces then replace the sublist in the ROW_RES.
72 *
73 * @param monitor progress monitor
74 * @param word_count count of words in doc
75 * @param[out] page_res
76 */
77 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {
78 BLOCK_RES_IT block_res_it;
79 ROW_RES_IT row_res_it;
80 WERD_RES_IT word_res_it_from;
81 WERD_RES_IT word_res_it_to;
82 WERD_RES *word_res;
83 WERD_RES_LIST fuzzy_space_words;
84 int16_t new_length;
85 bool prevent_null_wd_fixsp; // DON'T process blobless wds
86 int32_t word_index; // current word
87
88 block_res_it.set_to_list(&page_res->block_res_list);
89 word_index = 0;
90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
91 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
92 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
93 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
94 while (!word_res_it_from.at_last()) {
95 word_res = word_res_it_from.data();
96 while (!word_res_it_from.at_last() &&
97 !(word_res->combination ||
98 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
99 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
100 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
101 word_res = word_res_it_from.forward();
102 word_index++;
103 if (monitor != nullptr) {
104 monitor->ocr_alive = true;
105 monitor->progress = 90 + 5 * word_index / word_count;
106 if (monitor->deadline_exceeded() ||
107 (monitor->cancel != nullptr &&
108 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
109 return;
110 }
111 }
112 }
113
114 if (!word_res_it_from.at_last()) {
115 word_res_it_to = word_res_it_from;
116 prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();
117 if (check_debug_pt(word_res, 60)) {
118 debug_fix_space_level.set_value(10);
119 }
120 word_res_it_to.forward();
121 word_index++;
122 if (monitor != nullptr) {
123 monitor->ocr_alive = true;
124 monitor->progress = 90 + 5 * word_index / word_count;
125 if (monitor->deadline_exceeded() ||
126 (monitor->cancel != nullptr &&
127 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
128 return;
129 }
130 }
131 while (!word_res_it_to.at_last() &&
132 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
133 word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
134 if (check_debug_pt(word_res, 60)) {
135 debug_fix_space_level.set_value(10);
136 }
137 if (word_res->word->cblob_list()->empty()) {
138 prevent_null_wd_fixsp = true;
139 }
140 word_res = word_res_it_to.forward();
141 }
142 if (check_debug_pt(word_res, 60)) {
143 debug_fix_space_level.set_value(10);
144 }
145 if (word_res->word->cblob_list()->empty()) {
146 prevent_null_wd_fixsp = true;
147 }
148 if (prevent_null_wd_fixsp) {
149 word_res_it_from = word_res_it_to;
150 } else {
151 fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
152 fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,
153 block_res_it.data()->block);
154 new_length = fuzzy_space_words.length();
155 word_res_it_from.add_list_before(&fuzzy_space_words);
156 for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
157 word_res_it_from.forward();
158 }
159 }
160 if (test_pt) {
161 debug_fix_space_level.set_value(0);
162 }
163 }
164 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
165 // Last word in row
166 }
167 }
168 }
169 }
170
171 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
172 int16_t best_score;
173 WERD_RES_LIST current_perm;
174 bool improved = false;
175
176 best_score = eval_word_spacing(best_perm); // default score
177 dump_words(best_perm, best_score, 1, improved);
178
179 if (best_score != PERFECT_WERDS) {
180 initialise_search(best_perm, current_perm);
181 }
182
183 while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
184 match_current_words(current_perm, row, block);
185 int16_t current_score = eval_word_spacing(current_perm);
186 dump_words(current_perm, current_score, 2, improved);
187 if (current_score > best_score) {
188 best_perm.clear();
189 best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
190 best_score = current_score;
191 improved = true;
192 }
193 if (current_score < PERFECT_WERDS) {
194 transform_to_next_perm(current_perm);
195 }
196 }
197 dump_words(best_perm, best_score, 3, improved);
198 }
199
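The loop in fix_fuzzy_space_list() above is a plain generate-and-test search over spacing arrangements: re-recognise the current arrangement, score it, keep a copy of the best, then move to the next permutation until the score is perfect or no permutations remain. The sketch below restates only that shape, under stated assumptions: words are modelled as plain strings, and recognise/evaluate/next_perm are hypothetical callbacks standing in for match_current_words(), eval_word_spacing() and transform_to_next_perm(); it is not the upstream routine.

#include <functional>
#include <string>
#include <vector>

// Sketch of the generate-and-test loop above. next_perm is expected to
// empty the arrangement when no further merges are possible, mirroring
// how transform_to_next_perm() signals termination.
using Arrangement = std::vector<std::string>;

inline Arrangement best_spacing(Arrangement best, int perfect_score,
                                const std::function<void(Arrangement &)> &recognise,
                                const std::function<int(const Arrangement &)> &evaluate,
                                const std::function<void(Arrangement &)> &next_perm) {
  int best_score = evaluate(best); // score of the default spacing
  Arrangement current = best;
  while (best_score != perfect_score && !current.empty()) {
    recognise(current);
    const int score = evaluate(current);
    if (score > best_score) {
      best = current; // remember the best-scoring arrangement so far
      best_score = score;
    }
    if (score < perfect_score) {
      next_perm(current); // may clear the list and end the search
    }
  }
  return best;
}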
200 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
201 WERD_RES_IT src_it(&src_list);
202 WERD_RES_IT new_it(&new_list);
203 WERD_RES *new_wd;
204
205 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
206 WERD_RES *src_wd = src_it.data();
207 if (!src_wd->combination) {
208 new_wd = WERD_RES::deep_copy(src_wd);
209 new_wd->combination = false;
210 new_wd->part_of_combo = false;
211 new_it.add_after_then_move(new_wd);
212 }
213 }
214 }
215
216 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {
217 WERD_RES_IT word_it(&words);
218 WERD_RES *word;
219 // Since we are not using PAGE_RES to iterate over words, we need to update
220 // prev_word_best_choice_ before calling classify_word_pass2().
221 prev_word_best_choice_ = nullptr;
222 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
223 word = word_it.data();
224 if ((!word->part_of_combo) && (word->box_word == nullptr)) {
225 WordData word_data(block, row, word);
226 SetupWordPassN(2, &word_data);
227 classify_word_and_language(2, nullptr, &word_data);
228 }
229 prev_word_best_choice_ = word->best_choice;
230 }
231 }
232
233 /**
234 * @name eval_word_spacing()
235 * The basic measure is the number of characters in contextually confirmed
236 * words (i.e. words that are done).
237 * If all words are contextually confirmed the evaluation is deemed perfect.
238 *
239 * Some fiddles are done to handle "1"s as these are VERY frequent causes of
240 * fuzzy spaces. The problem with the basic measure is that "561 63" would score
241 * the same as "56163", though given our knowledge that the space is fuzzy, and
242 * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
243 * is preferred.
244 *
245 * The solution is to NOT COUNT the score of any word which has a digit at one
246 * end and a "1Il" as the character the other side of the space.
247 *
248 * Conversely, any character next to a "1" within a word is counted as a
249 * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1
250 * side of the "1" joined). "56163" would score 7 - all chars in a numeric word
251 * + 2 sides of a "1" joined.
252 *
253 * The joined 1 rule is applied to any word REGARDLESS of contextual
254 * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually
255 * confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.
256 *
257 */
258 int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
259 WERD_RES_IT word_res_it(&word_res_list);
260 int16_t total_score = 0;
261 int16_t word_count = 0;
262 int16_t done_word_count = 0;
263 int i;
264 int16_t offset;
265 int16_t prev_word_score = 0;
266 bool prev_word_done = false;
267 bool prev_char_1 = false; // prev ch a "1/I/l"?
268 bool prev_char_digit = false; // prev ch 2..9 or 0
269 const char *punct_chars = "!\"`',.:;";
270 do {
271 // current word
272 WERD_RES *word = word_res_it.data();
273 bool word_done = fixspace_thinks_word_done(word);
274 word_count++;
275 if (word->tess_failed) {
276 total_score += prev_word_score;
277 if (prev_word_done) {
278 done_word_count++;
279 }
280 prev_word_score = 0;
281 prev_char_1 = false;
282 prev_char_digit = false;
283 prev_word_done = false;
284 } else {
285 /*
286 Can we add the previous word's score and potentially count this word?
287 Yes, IF the previous word didn't end in a 1 when the first char of this word is a digit,
288 AND it didn't end in a digit when the first char of this word is a 1.
289 */
290 auto word_len = word->reject_map.length();
291 bool current_word_ok_so_far = false;
292 if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
293 (prev_char_digit &&
294 ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
295 word->best_choice->unichar_string()[0] == '1') ||
296 (!word_done &&
297 conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
298 total_score += prev_word_score;
299 if (prev_word_done) {
300 done_word_count++;
301 }
302 current_word_ok_so_far = word_done;
303 }
304
305 if (current_word_ok_so_far) {
306 prev_word_done = true;
307 prev_word_score = word_len;
308 } else {
309 prev_word_done = false;
310 prev_word_score = 0;
311 }
312
313 /* Add 1 to total score for every joined 1 regardless of context and
314 rejection */
315 for (i = 0, prev_char_1 = false; i < word_len; i++) {
316 bool current_char_1 = word->best_choice->unichar_string()[i] == '1';
317 if (prev_char_1 || (current_char_1 && (i > 0))) {
318 total_score++;
319 }
320 prev_char_1 = current_char_1;
321 }
322
323 /* Add 1 to total score for every joined punctuation regardless of context
324 and rejection */
325 if (tessedit_prefer_joined_punct) {
326 bool prev_char_punct;
327 for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
328 offset += word->best_choice->unichar_lengths()[i++]) {
329 bool current_char_punct =
330 strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
331 if (prev_char_punct || (current_char_punct && i > 0)) {
332 total_score++;
333 }
334 prev_char_punct = current_char_punct;
335 }
336 }
337 prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
338 for (i = 0, offset = 0; i < word_len - 1;
339 offset += word->best_choice->unichar_lengths()[i++]) {
340 ;
341 }
342 prev_char_1 =
343 ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
344 (!word_done &&
345 conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
346 }
347 /* Find next word */
348 do {
349 word_res_it.forward();
350 } while (word_res_it.data()->part_of_combo);
351 } while (!word_res_it.at_first());
352 total_score += prev_word_score;
353 if (prev_word_done) {
354 done_word_count++;
355 }
356 if (done_word_count == word_count) {
357 return PERFECT_WERDS;
358 } else {
359 return total_score;
360 }
361 }
362
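The joined-"1" bonus documented above eval_word_spacing() can be restated over a plain ASCII word. The helper below is an illustrative sketch only, not part of the upstream file, and it assumes single-byte characters, whereas the code above walks the unichar lengths of the best choice.

#include <cstddef>
#include <string>

// Sketch: +1 for every character that sits immediately next to a '1'
// inside the word, mirroring the joined-1 loop in eval_word_spacing().
static int joined_one_bonus(const std::string &word) {
  int bonus = 0;
  bool prev_char_1 = false;
  for (std::size_t i = 0; i < word.size(); ++i) {
    const bool current_char_1 = (word[i] == '1');
    if (prev_char_1 || (current_char_1 && i > 0)) {
      ++bonus;
    }
    prev_char_1 = current_char_1;
  }
  return bonus;
}

// joined_one_bonus("56163") == 2, so "56163" evaluates to 5 + 2 = 7,
// while joined_one_bonus("561") == 1 and joined_one_bonus("63") == 0,
// matching the worked examples in the comment above where "561 63"
// only reaches 4 (3 counted characters plus 1 joined side).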
363 bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
364 int i;
365 int offset;
366
367 for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]) {
368 ;
369 }
370 return (
371 word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
372 word->best_choice->unichar_lengths()[i]) ||
373 (word->best_choice->permuter() == NUMBER_PERM &&
374 numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));
375 }
376
377 /**
378 * @name transform_to_next_perm()
379 * Examines the current word list to find the smallest word gap size. Then walks
380 * the word list, closing any gaps of this size by either inserting new
381 * combination words, or extending existing ones.
382 *
383 * The routine COULD be limited to stop it building words longer than N blobs.
384 *
385 * If there are no more gaps then it DELETES the entire list and returns the
386 * empty list to cause termination.
387 */
388 void transform_to_next_perm(WERD_RES_LIST &words) {
389 WERD_RES_IT word_it(&words);
390 WERD_RES_IT prev_word_it(&words);
391 WERD_RES *word;
392 WERD_RES *prev_word;
393 int16_t prev_right = -INT16_MAX;
394 TBOX box;
395 int16_t gap;
396 int16_t min_gap = INT16_MAX;
397
398 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
399 word = word_it.data();
400 if (!word->part_of_combo) {
401 box = word->word->bounding_box();
402 if (prev_right > -INT16_MAX) {
403 gap = box.left() - prev_right;
404 if (gap < min_gap) {
405 min_gap = gap;
406 }
407 }
408 prev_right = box.right();
409 }
410 }
411 if (min_gap < INT16_MAX) {
412 prev_right = -INT16_MAX; // back to start
413 word_it.set_to_list(&words);
414 // Note: we can't use cycle_pt due to inserted combos at start of list.
415 for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {
416 word = word_it.data();
417 if (!word->part_of_combo) {
418 box = word->word->bounding_box();
419 if (prev_right > -INT16_MAX) {
420 gap = box.left() - prev_right;
421 if (gap <= min_gap) {
422 prev_word = prev_word_it.data();
423 WERD_RES *combo;
424 if (prev_word->combination) {
425 combo = prev_word;
426 } else {
427 /* Make a new combination and insert before
428 * the first word being joined. */
429 auto *copy_word = new WERD;
430 *copy_word = *(prev_word->word);
431 // deep copy
432 combo = new WERD_RES(copy_word);
433 combo->combination = true;
434 combo->x_height = prev_word->x_height;
435 prev_word->part_of_combo = true;
436 prev_word_it.add_before_then_move(combo);
437 }
438 combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
439 if (word->combination) {
440 combo->word->join_on(word->word);
441 // Move blobs to combo
442 // old combo no longer needed
443 delete word_it.extract();
444 } else {
445 // Copy current wd to combo
446 combo->copy_on(word);
447 word->part_of_combo = true;
448 }
449 combo->done = false;
450 combo->ClearResults();
451 } else {
452 prev_word_it = word_it; // catch up
453 }
454 }
455 prev_right = box.right();
456 }
457 }
458 } else {
459 words.clear(); // signal termination
460 }
461 }
462
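A minimal sketch of the gap-closing rule documented above transform_to_next_perm(), with plain (left, right) intervals standing in for the WERD bounding boxes. It only reports which neighbouring words the next permutation step would join; it is not the upstream routine, which additionally builds and reuses combination words in place.

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Sketch: find the smallest inter-word gap, then mark every gap of that
// size for joining. An empty result corresponds to the "no more gaps"
// case where the real routine clears the list to stop.
static std::vector<std::pair<std::size_t, std::size_t>>
gaps_to_close(const std::vector<std::pair<int16_t, int16_t>> &boxes) {
  std::vector<std::pair<std::size_t, std::size_t>> joins;
  int16_t min_gap = INT16_MAX;
  for (std::size_t i = 1; i < boxes.size(); ++i) {
    const auto gap = static_cast<int16_t>(boxes[i].first - boxes[i - 1].second);
    if (gap < min_gap) {
      min_gap = gap;
    }
  }
  if (min_gap == INT16_MAX) {
    return joins; // no gaps at all
  }
  for (std::size_t i = 1; i < boxes.size(); ++i) {
    const auto gap = static_cast<int16_t>(boxes[i].first - boxes[i - 1].second);
    if (gap <= min_gap) {
      joins.emplace_back(i - 1, i); // join word i-1 and word i
    }
  }
  return joins;
}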
463 void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {
464 WERD_RES_IT word_res_it(&perm);
465
466 if (debug_fix_space_level > 0) {
467 if (mode == 1) {
468 stats_.dump_words_str = "";
469 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
470 if (!word_res_it.data()->part_of_combo) {
471 stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string();
472 stats_.dump_words_str += ' ';
473 }
474 }
475 }
476
477 if (debug_fix_space_level > 1) {
478 switch (mode) {
479 case 1:
480 tprintf("EXTRACTED (%d): \"", score);
481 break;
482 case 2:
483 tprintf("TESTED (%d): \"", score);
484 break;
485 case 3:
486 tprintf("RETURNED (%d): \"", score);
487 break;
488 }
489
490 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
491 if (!word_res_it.data()->part_of_combo) {
492 tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
493 static_cast<int>(word_res_it.data()->best_choice->permuter()));
494 }
495 }
496 tprintf("\"\n");
497 } else if (improved) {
498 tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
499 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
500 if (!word_res_it.data()->part_of_combo) {
501 tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
502 static_cast<int>(word_res_it.data()->best_choice->permuter()));
503 }
504 }
505 tprintf("\"\n");
506 }
507 }
508 }
509
510 bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
511 if (word->done) {
512 return true;
513 }
514
515 /*
516 Use all the standard pass 2 conditions for mode 5 in set_done() in
517 reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
518 CARE WHETHER WE HAVE of/at on/an etc.
519 */
520 if (fixsp_done_mode > 0 &&
521 (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
522 fixsp_done_mode == 3) &&
523 (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&
524 ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
525 (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
526 (word->best_choice->permuter() == USER_DAWG_PERM) ||
527 (word->best_choice->permuter() == NUMBER_PERM))) {
528 return true;
529 } else {
530 return false;
531 }
532 }
533
534 /**
535 * @name fix_sp_fp_word()
536 * Test the current word to see if it can be split by deleting noise blobs. If
537 * so, do the business.
538 * Return with the iterator pointing to the same place if the word is unchanged,
539 * or the last of the replacement words.
540 */
541 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {
542 WERD_RES *word_res;
543 WERD_RES_LIST sub_word_list;
544 WERD_RES_IT sub_word_list_it(&sub_word_list);
545 int16_t new_length;
546 float junk;
547
548 word_res = word_res_it.data();
549 if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo ||
550 !word_res->word->flag(W_DONT_CHOP)) {
551 return;
552 }
553
554 auto blob_index = worst_noise_blob(word_res, &junk);
555 if (blob_index < 0) {
556 return;
557 }
558
559 if (debug_fix_space_level > 1) {
560 tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str());
561 }
562 word_res->word->rej_cblob_list()->sort(c_blob_comparator);
563 sub_word_list_it.add_after_stay_put(word_res_it.extract());
564 fix_noisy_space_list(sub_word_list, row, block);
565 new_length = sub_word_list.length();
566 word_res_it.add_list_before(&sub_word_list);
567 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
568 word_res_it.forward();
569 }
570 }
571
572 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
573 int16_t best_score;
574 WERD_RES_IT best_perm_it(&best_perm);
575 WERD_RES_LIST current_perm;
576 WERD_RES_IT current_perm_it(&current_perm);
577 WERD_RES *old_word_res;
578 int16_t current_score;
579 bool improved = false;
580
581 best_score = fp_eval_word_spacing(best_perm); // default score
582
583 dump_words(best_perm, best_score, 1, improved);
584
585 old_word_res = best_perm_it.data();
586 // Even deep_copy doesn't copy the underlying WERD unless its combination
587 // flag is true!
588 old_word_res->combination = true; // Kludge to force deep copy
589 current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
590 old_word_res->combination = false; // Undo kludge
591
592 break_noisiest_blob_word(current_perm);
593
594 while (best_score != PERFECT_WERDS && !current_perm.empty()) {
595 match_current_words(current_perm, row, block);
596 current_score = fp_eval_word_spacing(current_perm);
597 dump_words(current_perm, current_score, 2, improved);
598 if (current_score > best_score) {
599 best_perm.clear();
600 best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
601 best_score = current_score;
602 improved = true;
603 }
604 if (current_score < PERFECT_WERDS) {
605 break_noisiest_blob_word(current_perm);
606 }
607 }
608 dump_words(best_perm, best_score, 3, improved);
609 }
610
611 /**
612 * break_noisiest_blob_word()
613 * Find the word with the blob which looks like the worst noise.
614 * Break the word into two, deleting the noise blob.
615 */
616 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
617 WERD_RES_IT word_it(&words);
618 WERD_RES_IT worst_word_it;
619 float worst_noise_score = 9999;
620 int worst_blob_index = -1; // Noisiest blob of noisiest wd
621 float noise_score; // of wds noisiest blob
622 WERD_RES *word_res;
623 C_BLOB_IT blob_it;
624 C_BLOB_IT rej_cblob_it;
625 C_BLOB_LIST new_blob_list;
626 C_BLOB_IT new_blob_it;
627 C_BLOB_IT new_rej_cblob_it;
628 WERD *new_word;
629 int16_t start_of_noise_blob;
630 int16_t i;
631
632 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
633 auto blob_index = worst_noise_blob(word_it.data(), &noise_score);
634 if (blob_index > -1 && worst_noise_score > noise_score) {
635 worst_noise_score = noise_score;
636 worst_blob_index = blob_index;
637 worst_word_it = word_it;
638 }
639 }
640 if (worst_blob_index < 0) {
641 words.clear(); // signal termination
642 return;
643 }
644
645 /* Now split the worst_word_it */
646
647 word_res = worst_word_it.data();
648
649 /* Move blobs before noise blob to a new bloblist */
650
651 new_blob_it.set_to_list(&new_blob_list);
652 blob_it.set_to_list(word_res->word->cblob_list());
653 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
654 new_blob_it.add_after_then_move(blob_it.extract());
655 }
656 start_of_noise_blob = blob_it.data()->bounding_box().left();
657 delete blob_it.extract(); // throw out noise blob
658
659 new_word = new WERD(&new_blob_list, word_res->word);
660 new_word->set_flag(W_EOL, false);
661 word_res->word->set_flag(W_BOL, false);
662 word_res->word->set_blanks(1); // After break
663
664 new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
665 rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
666 for (; (!rej_cblob_it.empty() &&
667 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
668 rej_cblob_it.forward()) {
669 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
670 }
671
672 auto *new_word_res = new WERD_RES(new_word);
673 new_word_res->combination = true;
674 worst_word_it.add_before_then_move(new_word_res);
675
676 word_res->ClearResults();
677 }
678
679 int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
680 float noise_score[512];
681 int min_noise_blob; // 1st contender
682 int max_noise_blob; // last contender
683 int non_noise_count;
684 int worst_noise_blob; // Worst blob
685 float small_limit = kBlnXHeight * fixsp_small_outlines_size;
686 float non_noise_limit = kBlnXHeight * 0.8;
687
688 if (word_res->rebuild_word == nullptr) {
689 return -1; // Can't handle cube words.
690 }
691
692 // Normalised.
693 auto blob_count = word_res->box_word->length();
694 ASSERT_HOST(blob_count <= 512);
695 if (blob_count < 5) {
696 return -1; // too short to split
697 }
698
699 /* Get the noise scores for all blobs */
700
701 #ifndef SECURE_NAMES
702 if (debug_fix_space_level > 5) {
703 tprintf("FP fixspace Noise metrics for \"%s\": ",
704 word_res->best_choice->unichar_string().c_str());
705 }
706 #endif
707
708 for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
709 TBLOB *blob = word_res->rebuild_word->blobs[i];
710 if (word_res->reject_map[i].accepted()) {
711 noise_score[i] = non_noise_limit;
712 } else {
713 noise_score[i] = blob_noise_score(blob);
714 }
715
716 if (debug_fix_space_level > 5) {
717 tprintf("%1.1f ", noise_score[i]);
718 }
719 }
720 if (debug_fix_space_level > 5) {
721 tprintf("\n");
722 }
723
724 /* Now find the worst one which is far enough away from the end of the word */
725
726 non_noise_count = 0;
727 int i;
728 for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
729 if (noise_score[i] >= non_noise_limit) {
730 non_noise_count++;
731 }
732 }
733 if (non_noise_count < fixsp_non_noise_limit) {
734 return -1;
735 }
736
737 min_noise_blob = i;
738
739 non_noise_count = 0;
740 for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
741 if (noise_score[i] >= non_noise_limit) {
742 non_noise_count++;
743 }
744 }
745 if (non_noise_count < fixsp_non_noise_limit) {
746 return -1;
747 }
748
749 max_noise_blob = i;
750
751 if (min_noise_blob > max_noise_blob) {
752 return -1;
753 }
754
755 *worst_noise_score = small_limit;
756 worst_noise_blob = -1;
757 for (auto i = min_noise_blob; i <= max_noise_blob; i++) {
758 if (noise_score[i] < *worst_noise_score) {
759 worst_noise_blob = i;
760 *worst_noise_score = noise_score[i];
761 }
762 }
763 return worst_noise_blob;
764 }
765
766 float Tesseract::blob_noise_score(TBLOB *blob) {
767 TBOX box; // BB of outline
768 int16_t outline_count = 0;
769 int16_t max_dimension;
770 int16_t largest_outline_dimension = 0;
771
772 for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
773 outline_count++;
774 box = ol->bounding_box();
775 if (box.height() > box.width()) {
776 max_dimension = box.height();
777 } else {
778 max_dimension = box.width();
779 }
780
781 if (largest_outline_dimension < max_dimension) {
782 largest_outline_dimension = max_dimension;
783 }
784 }
785
786 if (outline_count > 5) {
787 // penalise LOTS of blobs
788 largest_outline_dimension *= 2;
789 }
790
791 box = blob->bounding_box();
792 if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) {
793 // Be lax if the blob sits unusually high or low
794 largest_outline_dimension /= 2;
795 }
796
797 return largest_outline_dimension;
798 }
799
800 void fixspace_dbg(WERD_RES *word) {
801 TBOX box = word->word->bounding_box();
802 const bool show_map_detail = false;
803
804 box.print();
805 tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
806 tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(),
807 word->rebuild_word->NumBlobs(), word->box_word->length());
808 word->reject_map.print(debug_fp);
809 tprintf("\n");
810 if (show_map_detail) {
811 tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
812 for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
813 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
814 word->reject_map[i].full_print(debug_fp);
815 }
816 }
817
818 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
819 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
820 }
821
822 /**
823 * fp_eval_word_spacing()
824 * Evaluation function for fixed pitch word lists.
825 *
826 * Basically, count the number of "nice" characters - those which are in tess
827 * acceptable words or in dict words and are not rejected.
828 * Penalise any potential noise chars
829 */
830 int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
831 WERD_RES_IT word_it(&word_res_list);
832 WERD_RES *word;
833 int16_t score = 0;
834 float small_limit = kBlnXHeight * fixsp_small_outlines_size;
835
836 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
837 word = word_it.data();
838 if (word->rebuild_word == nullptr) {
839 continue; // Can't handle cube words.
840 }
841 if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
842 word->best_choice->permuter() == FREQ_DAWG_PERM ||
843 word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
844 auto num_blobs = word->rebuild_word->NumBlobs();
845 UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
846 for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
847 TBLOB *blob = word->rebuild_word->blobs[i];
848 if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
849 score -= 1; // penalise possibly erroneous non-space
850 } else if (word->reject_map[i].accepted()) {
851 score++;
852 }
853 }
854 }
855 }
856 if (score < 0) {
857 score = 0;
858 }
859 return score;
860 }
861
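The fixed-pitch score computed above can be restated over plain per-character flags; the type and function below are hypothetical stand-ins for illustration, not Tesseract API.

#include <vector>

// Sketch of fp_eval_word_spacing(): only words that are done, tess
// accepted or dictionary words contribute; each accepted character
// scores +1, while a character recognised as a space, or whose blob
// looks like small noise, costs 1.
struct FpChar {
  bool space_or_noise; // recognised as ' ', or blob_noise_score() < small_limit
  bool accepted;       // not rejected in the reject map
};

static int fp_spacing_score(const std::vector<std::vector<FpChar>> &accepted_words) {
  int score = 0;
  for (const auto &word : accepted_words) {
    for (const auto &ch : word) {
      if (ch.space_or_noise) {
        --score; // penalise a probably erroneous non-space
      } else if (ch.accepted) {
        ++score;
      }
    }
  }
  return score < 0 ? 0 : score;
}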
862 } // namespace tesseract