comparison mupdf-source/thirdparty/tesseract/src/ccmain/docqual.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************
2 * File: docqual.cpp (Formerly docqual.c)
3 * Description: Document Quality Metrics
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1994, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include "docqual.h"
20 #include <cctype>
21 #include "reject.h"
22 #include "tesseractclass.h"
23 #include "tessvars.h"
24
25 namespace tesseract {
26
// Callback used with ProcessMatchedBlobs: adds one to the running tally each
// time it is invoked. The blob index handed to the callback is not needed.
static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
  match_count += 1;
}
30
31 static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
32 int index) {
33 if (word->reject_map[index].accepted()) {
34 ++accepted_match_count;
35 }
36 ++match_count;
37 }
38
39 static void acceptIfGoodQuality(WERD_RES *word, int index) {
40 if (word->reject_map[index].accept_if_good_quality()) {
41 word->reject_map[index].setrej_quality_accept();
42 }
43 }
44
45 /*************************************************************************
46 * word_blob_quality()
47 * How many blobs in the box_word are identical to those of the inword?
48 * ASSUME blobs in both initial word and box_word are in ascending order of
49 * left hand blob edge.
50 *************************************************************************/
51 int16_t Tesseract::word_blob_quality(WERD_RES *word) {
52 int16_t match_count = 0;
53 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
54 !word->rebuild_word->blobs.empty()) {
55 using namespace std::placeholders; // for _1
56 word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
57 std::bind(countMatchingBlobs, match_count, _1));
58 }
59 return match_count;
60 }
61
62 int16_t Tesseract::word_outline_errs(WERD_RES *word) {
63 int16_t err_count = 0;
64
65 if (word->rebuild_word != nullptr) {
66 int16_t i = 0;
67 for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
68 TBLOB *blob = word->rebuild_word->blobs[b];
69 err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
70 i++;
71 }
72 }
73 return err_count;
74 }
75
76 /*************************************************************************
77 * word_char_quality()
78 * Combination of blob quality and outline quality - how many good chars are
79 * there? - I.e chars which pass the blob AND outline tests.
80 *************************************************************************/
81 void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,
82 int16_t *accepted_match_count) {
83 *match_count = 0;
84 *accepted_match_count = 0;
85 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
86 !word->rebuild_word->blobs.empty()) {
87 using namespace std::placeholders; // for _1
88 word->bln_boxes->ProcessMatchedBlobs(
89 *word->rebuild_word,
90 std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1));
91 }
92 }
93
94 /*************************************************************************
95 * unrej_good_chs()
96 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
97 *************************************************************************/
98 void Tesseract::unrej_good_chs(WERD_RES *word) {
99 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
100 word->rebuild_word->blobs.empty()) {
101 using namespace std::placeholders; // for _1
102 word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
103 std::bind(acceptIfGoodQuality, word, _1));
104 }
105 }
106
107 int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
108 int expected_outline_count;
109
110 if (outlines_odd.contains(c)) {
111 return 0; // Don't use this char
112 } else if (outlines_2.contains(c)) {
113 expected_outline_count = 2;
114 } else {
115 expected_outline_count = 1;
116 }
117 return abs(outline_count - expected_outline_count);
118 }
119
120 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {
121 if ((tessedit_good_quality_unrej && good_quality_doc)) {
122 unrej_good_quality_words(page_res_it);
123 }
124 doc_and_block_rejection(page_res_it, good_quality_doc);
125 if (unlv_tilde_crunching) {
126 tilde_crunch(page_res_it);
127 tilde_delete(page_res_it);
128 }
129 }
130
131 /*************************************************************************
132 * unrej_good_quality_words()
133 * Accept potential rejects in words which pass the following checks:
134 * - Contains a potential reject
135 * - Word looks like a sensible alpha word.
136 * - Word segmentation is the same as the original image
137 * - All characters have the expected number of outlines
138 * NOTE - the rejection counts are recalculated after unrejection
139 * - CAN'T do it in a single pass without a bit of fiddling
140 * - keep it simple but inefficient
141 *************************************************************************/
142 void Tesseract::unrej_good_quality_words( // unreject potential
143 PAGE_RES_IT &page_res_it) {
144 WERD_RES *word;
145 ROW_RES *current_row;
146 BLOCK_RES *current_block;
147 int i;
148
149 page_res_it.restart_page();
150 while (page_res_it.word() != nullptr) {
151 check_debug_pt(page_res_it.word(), 100);
152 if (bland_unrej) {
153 word = page_res_it.word();
154 for (i = 0; i < word->reject_map.length(); i++) {
155 if (word->reject_map[i].accept_if_good_quality()) {
156 word->reject_map[i].setrej_quality_accept();
157 }
158 }
159 page_res_it.forward();
160 } else if ((page_res_it.row()->char_count > 0) &&
161 ((page_res_it.row()->rej_count /
162 static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
163 word = page_res_it.word();
164 if (word->reject_map.quality_recoverable_rejects() &&
165 (tessedit_unrej_any_wd ||
166 acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
167 word->best_choice->unichar_lengths().c_str()) !=
168 AC_UNACCEPTABLE)) {
169 unrej_good_chs(word);
170 }
171 page_res_it.forward();
172 } else {
173 // Skip to end of dodgy row.
174 current_row = page_res_it.row();
175 while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) {
176 page_res_it.forward();
177 }
178 }
179 check_debug_pt(page_res_it.word(), 110);
180 }
181 page_res_it.restart_page();
182 page_res_it.page_res->char_count = 0;
183 page_res_it.page_res->rej_count = 0;
184 current_block = nullptr;
185 current_row = nullptr;
186 while (page_res_it.word() != nullptr) {
187 if (current_block != page_res_it.block()) {
188 current_block = page_res_it.block();
189 current_block->char_count = 0;
190 current_block->rej_count = 0;
191 }
192 if (current_row != page_res_it.row()) {
193 current_row = page_res_it.row();
194 current_row->char_count = 0;
195 current_row->rej_count = 0;
196 current_row->whole_word_rej_count = 0;
197 }
198 page_res_it.rej_stat_word();
199 page_res_it.forward();
200 }
201 }
202
203 /*************************************************************************
204 * doc_and_block_rejection()
205 *
206 * If the page has too many rejects - reject all of it.
207 * If any block has too many rejects - reject all words in the block
208 *************************************************************************/
209
210 void Tesseract::doc_and_block_rejection( // reject big chunks
211 PAGE_RES_IT &page_res_it, bool good_quality_doc) {
212 BLOCK_RES *current_block;
213
214 int16_t char_quality = 0;
215 int16_t accepted_char_quality;
216
217 if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
218 tessedit_reject_doc_percent) {
219 reject_whole_page(page_res_it);
220 if (tessedit_debug_doc_rejection) {
221 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
222 page_res_it.page_res->rej_count);
223 }
224 } else {
225 if (tessedit_debug_doc_rejection) {
226 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count,
227 page_res_it.page_res->rej_count);
228 }
229
230 /* Walk blocks testing for block rejection */
231
232 page_res_it.restart_page();
233 WERD_RES *word;
234 while ((word = page_res_it.word()) != nullptr) {
235 current_block = page_res_it.block();
236 int16_t block_no = current_block->block->pdblk.index();
237 if (current_block->char_count > 0 &&
238 (current_block->rej_count * 100.0 / current_block->char_count) >
239 tessedit_reject_block_percent) {
240 if (tessedit_debug_block_rejection) {
241 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
242 current_block->char_count, current_block->rej_count);
243 }
244 bool prev_word_rejected = false;
245 while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
246 bool rej_word;
247 if (tessedit_preserve_blk_rej_perfect_wds) {
248 rej_word = word->reject_map.reject_count() > 0 ||
249 word->reject_map.length() < tessedit_preserve_min_wd_len;
250 if (rej_word && tessedit_dont_blkrej_good_wds &&
251 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
252 acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
253 word->best_choice->unichar_lengths().c_str()) !=
254 AC_UNACCEPTABLE) {
255 word_char_quality(word, &char_quality, &accepted_char_quality);
256 rej_word = char_quality != word->reject_map.length();
257 }
258 } else {
259 rej_word = true;
260 }
261 if (rej_word) {
262 /*
263 Reject spacing if both current and prev words are rejected.
264 NOTE - this is NOT restricted to FUZZY spaces. - When tried this
265 generated more space errors.
266 */
267 if (tessedit_use_reject_spaces && prev_word_rejected &&
268 page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
269 word->reject_spaces = true;
270 }
271 word->reject_map.rej_word_block_rej();
272 }
273 prev_word_rejected = rej_word;
274 page_res_it.forward();
275 }
276 } else {
277 if (tessedit_debug_block_rejection) {
278 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
279 page_res_it.block()->char_count, page_res_it.block()->rej_count);
280 }
281
282 /* Walk rows in block testing for row rejection */
283 int16_t row_no = 0;
284 while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
285 ROW_RES *current_row = page_res_it.row();
286 row_no++;
287 /* Reject whole row if:
288 fraction of chars on row which are rejected exceed a limit AND
289 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
290 limit
291 */
292 if (current_row->char_count > 0 &&
293 (current_row->rej_count * 100.0 / current_row->char_count) >
294 tessedit_reject_row_percent &&
295 (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
296 tessedit_whole_wd_rej_row_percent) {
297 if (tessedit_debug_block_rejection) {
298 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
299 current_row->char_count, current_row->rej_count);
300 }
301 bool prev_word_rejected = false;
302 while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
303 /* Preserve words on good docs unless they are mostly rejected*/
304 bool rej_word;
305 if (!tessedit_row_rej_good_docs && good_quality_doc) {
306 rej_word = word->reject_map.reject_count() /
307 static_cast<float>(word->reject_map.length()) >
308 tessedit_good_doc_still_rowrej_wd;
309 } else if (tessedit_preserve_row_rej_perfect_wds) {
310 /* Preserve perfect words anyway */
311 rej_word = word->reject_map.reject_count() > 0 ||
312 word->reject_map.length() < tessedit_preserve_min_wd_len;
313 if (rej_word && tessedit_dont_rowrej_good_wds &&
314 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
315 acceptable_word_string(
316 *word->uch_set, word->best_choice->unichar_string().c_str(),
317 word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
318 word_char_quality(word, &char_quality, &accepted_char_quality);
319 rej_word = char_quality != word->reject_map.length();
320 }
321 } else {
322 rej_word = true;
323 }
324 if (rej_word) {
325 /*
326 Reject spacing if both current and prev words are rejected.
327 NOTE - this is NOT restricted to FUZZY spaces. - When tried
328 this generated more space errors.
329 */
330 if (tessedit_use_reject_spaces && prev_word_rejected &&
331 page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
332 word->reject_spaces = true;
333 }
334 word->reject_map.rej_word_row_rej();
335 }
336 prev_word_rejected = rej_word;
337 page_res_it.forward();
338 }
339 } else {
340 if (tessedit_debug_block_rejection) {
341 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
342 current_row->char_count, current_row->rej_count);
343 }
344 while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
345 page_res_it.forward();
346 }
347 }
348 }
349 }
350 }
351 }
352 }
353
354 /*************************************************************************
355 * reject_whole_page()
356 * Don't believe any of it - set the reject map to 00..00 in all words
357 *
358 *************************************************************************/
359
360 void reject_whole_page(PAGE_RES_IT &page_res_it) {
361 page_res_it.restart_page();
362 while (page_res_it.word() != nullptr) {
363 page_res_it.word()->reject_map.rej_word_doc_rej();
364 page_res_it.forward();
365 }
366 // whole page is rejected
367 page_res_it.page_res->rejected = true;
368 }
369
370 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
371 WERD_RES *word;
372 GARBAGE_LEVEL garbage_level;
373 PAGE_RES_IT copy_it;
374 bool prev_potential_marked = false;
375 bool found_terrible_word = false;
376 bool ok_dict_word;
377
378 page_res_it.restart_page();
379 while (page_res_it.word() != nullptr) {
380 POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
381 if (pb != nullptr && !pb->IsText()) {
382 page_res_it.forward();
383 continue;
384 }
385 word = page_res_it.word();
386
387 if (crunch_early_convert_bad_unlv_chs) {
388 convert_bad_unlv_chs(word);
389 }
390
391 if (crunch_early_merge_tess_fails) {
392 word->merge_tess_fails();
393 }
394
395 if (word->reject_map.accept_count() != 0) {
396 found_terrible_word = false;
397 // Forget earlier potential crunches
398 prev_potential_marked = false;
399 } else {
400 ok_dict_word = safe_dict_word(word);
401 garbage_level = garbage_word(word, ok_dict_word);
402
403 if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {
404 if (crunch_debug > 0) {
405 tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
406 }
407 word->unlv_crunch_mode = CR_KEEP_SPACE;
408 if (prev_potential_marked) {
409 while (copy_it.word() != word) {
410 if (crunch_debug > 0) {
411 tprintf("P1 CRUNCHING: \"%s\"\n",
412 copy_it.word()->best_choice->unichar_string().c_str());
413 }
414 copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
415 copy_it.forward();
416 }
417 prev_potential_marked = false;
418 }
419 found_terrible_word = true;
420 } else if ((garbage_level != G_NEVER_CRUNCH) &&
421 (potential_word_crunch(word, garbage_level, ok_dict_word))) {
422 if (found_terrible_word) {
423 if (crunch_debug > 0) {
424 tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
425 }
426 word->unlv_crunch_mode = CR_KEEP_SPACE;
427 } else if (!prev_potential_marked) {
428 copy_it = page_res_it;
429 prev_potential_marked = true;
430 if (crunch_debug > 1) {
431 tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
432 }
433 }
434 } else {
435 found_terrible_word = false;
436 // Forget earlier potential crunches
437 prev_potential_marked = false;
438 if (crunch_debug > 2) {
439 tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
440 }
441 }
442 }
443 page_res_it.forward();
444 }
445 }
446
447 bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
448 int crunch_mode = 0;
449
450 if (word->best_choice->unichar_string().empty() ||
451 (strspn(word->best_choice->unichar_string().c_str(), " ") ==
452 word->best_choice->unichar_string().size())) {
453 crunch_mode = 1;
454 } else {
455 int adjusted_len = word->reject_map.length();
456 if (adjusted_len > crunch_rating_max) {
457 adjusted_len = crunch_rating_max;
458 }
459 float rating_per_ch = word->best_choice->rating() / adjusted_len;
460
461 if (rating_per_ch > crunch_terrible_rating) {
462 crunch_mode = 2;
463 } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
464 crunch_mode = 3;
465 } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
466 (garbage_level != G_OK)) {
467 crunch_mode = 4;
468 } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {
469 crunch_mode = 5;
470 }
471 }
472 if (crunch_mode > 0) {
473 if (crunch_debug > 2) {
474 tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
475 word->best_choice->unichar_string().c_str());
476 }
477 return true;
478 } else {
479 return false;
480 }
481 }
482
483 bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level,
484 bool ok_dict_word) {
485 float rating_per_ch;
486 int adjusted_len;
487 const char *str = word->best_choice->unichar_string().c_str();
488 const char *lengths = word->best_choice->unichar_lengths().c_str();
489 bool word_crunchable;
490 int poor_indicator_count = 0;
491
492 word_crunchable =
493 !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
494 (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);
495
496 adjusted_len = word->reject_map.length();
497 if (adjusted_len > 10) {
498 adjusted_len = 10;
499 }
500 rating_per_ch = word->best_choice->rating() / adjusted_len;
501
502 if (rating_per_ch > crunch_pot_poor_rate) {
503 if (crunch_debug > 2) {
504 tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
505 }
506 poor_indicator_count++;
507 }
508
509 if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
510 if (crunch_debug > 2) {
511 tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
512 }
513 poor_indicator_count++;
514 }
515
516 if (garbage_level != G_OK) {
517 if (crunch_debug > 2) {
518 tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
519 }
520 poor_indicator_count++;
521 }
522 return poor_indicator_count >= crunch_pot_indicators;
523 }
524
525 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
526 PAGE_RES_IT copy_it;
527 bool deleting_from_bol = false;
528 bool marked_delete_point = false;
529 int16_t debug_delete_mode;
530 CRUNCH_MODE delete_mode;
531 int16_t x_debug_delete_mode;
532 CRUNCH_MODE x_delete_mode;
533
534 page_res_it.restart_page();
535 while (page_res_it.word() != nullptr) {
536 WERD_RES *word = page_res_it.word();
537
538 delete_mode = word_deletable(word, debug_delete_mode);
539 if (delete_mode != CR_NONE) {
540 if (word->word->flag(W_BOL) || deleting_from_bol) {
541 if (crunch_debug > 0) {
542 tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
543 word->best_choice->unichar_string().c_str());
544 }
545 word->unlv_crunch_mode = delete_mode;
546 deleting_from_bol = true;
547 } else if (word->word->flag(W_EOL)) {
548 if (marked_delete_point) {
549 while (copy_it.word() != word) {
550 x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);
551 if (crunch_debug > 0) {
552 tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
553 copy_it.word()->best_choice->unichar_string().c_str());
554 }
555 copy_it.word()->unlv_crunch_mode = x_delete_mode;
556 copy_it.forward();
557 }
558 }
559 if (crunch_debug > 0) {
560 tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
561 word->best_choice->unichar_string().c_str());
562 }
563 word->unlv_crunch_mode = delete_mode;
564 deleting_from_bol = false;
565 marked_delete_point = false;
566 } else {
567 if (!marked_delete_point) {
568 copy_it = page_res_it;
569 marked_delete_point = true;
570 }
571 }
572 } else {
573 deleting_from_bol = false;
574 // Forget earlier potential crunches
575 marked_delete_point = false;
576 }
577 /*
578 The following step has been left till now as the tess fails are used to
579 determine if the word is deletable.
580 */
581 if (!crunch_early_merge_tess_fails) {
582 word->merge_tess_fails();
583 }
584 page_res_it.forward();
585 }
586 }
587
588 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
589 int i;
590 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
591 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
592 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
593 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
594 for (i = 0; i < word_res->reject_map.length(); ++i) {
595 if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
596 word_res->best_choice->set_unichar_id(unichar_dash, i);
597 if (word_res->reject_map[i].accepted()) {
598 word_res->reject_map[i].setrej_unlv_rej();
599 }
600 }
601 if (word_res->best_choice->unichar_id(i) == unichar_pow) {
602 word_res->best_choice->set_unichar_id(unichar_space, i);
603 if (word_res->reject_map[i].accepted()) {
604 word_res->reject_map[i].setrej_unlv_rej();
605 }
606 }
607 }
608 }
609
610 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
611 enum STATES {
612 JUNK,
613 FIRST_UPPER,
614 FIRST_LOWER,
615 FIRST_NUM,
616 SUBSEQUENT_UPPER,
617 SUBSEQUENT_LOWER,
618 SUBSEQUENT_NUM
619 };
620 const char *str = word->best_choice->unichar_string().c_str();
621 const char *lengths = word->best_choice->unichar_lengths().c_str();
622 STATES state = JUNK;
623 int len = 0;
624 int isolated_digits = 0;
625 int isolated_alphas = 0;
626 int bad_char_count = 0;
627 int tess_rejs = 0;
628 int dodgy_chars = 0;
629 int ok_chars;
630 UNICHAR_ID last_char = -1;
631 int alpha_repetition_count = 0;
632 int longest_alpha_repetition_count = 0;
633 int longest_lower_run_len = 0;
634 int lower_string_count = 0;
635 int longest_upper_run_len = 0;
636 int upper_string_count = 0;
637 int total_alpha_count = 0;
638 int total_digit_count = 0;
639
640 for (; *str != '\0'; str += *(lengths++)) {
641 len++;
642 if (word->uch_set->get_isupper(str, *lengths)) {
643 total_alpha_count++;
644 switch (state) {
645 case SUBSEQUENT_UPPER:
646 case FIRST_UPPER:
647 state = SUBSEQUENT_UPPER;
648 upper_string_count++;
649 if (longest_upper_run_len < upper_string_count) {
650 longest_upper_run_len = upper_string_count;
651 }
652 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
653 alpha_repetition_count++;
654 if (longest_alpha_repetition_count < alpha_repetition_count) {
655 longest_alpha_repetition_count = alpha_repetition_count;
656 }
657 } else {
658 last_char = word->uch_set->unichar_to_id(str, *lengths);
659 alpha_repetition_count = 1;
660 }
661 break;
662 case FIRST_NUM:
663 isolated_digits++;
664 // Fall through.
665 default:
666 state = FIRST_UPPER;
667 last_char = word->uch_set->unichar_to_id(str, *lengths);
668 alpha_repetition_count = 1;
669 upper_string_count = 1;
670 break;
671 }
672 } else if (word->uch_set->get_islower(str, *lengths)) {
673 total_alpha_count++;
674 switch (state) {
675 case SUBSEQUENT_LOWER:
676 case FIRST_LOWER:
677 state = SUBSEQUENT_LOWER;
678 lower_string_count++;
679 if (longest_lower_run_len < lower_string_count) {
680 longest_lower_run_len = lower_string_count;
681 }
682 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
683 alpha_repetition_count++;
684 if (longest_alpha_repetition_count < alpha_repetition_count) {
685 longest_alpha_repetition_count = alpha_repetition_count;
686 }
687 } else {
688 last_char = word->uch_set->unichar_to_id(str, *lengths);
689 alpha_repetition_count = 1;
690 }
691 break;
692 case FIRST_NUM:
693 isolated_digits++;
694 // Fall through.
695 default:
696 state = FIRST_LOWER;
697 last_char = word->uch_set->unichar_to_id(str, *lengths);
698 alpha_repetition_count = 1;
699 lower_string_count = 1;
700 break;
701 }
702 } else if (word->uch_set->get_isdigit(str, *lengths)) {
703 total_digit_count++;
704 switch (state) {
705 case FIRST_NUM:
706 state = SUBSEQUENT_NUM;
707 case SUBSEQUENT_NUM:
708 break;
709 case FIRST_UPPER:
710 case FIRST_LOWER:
711 isolated_alphas++;
712 // Fall through.
713 default:
714 state = FIRST_NUM;
715 break;
716 }
717 } else {
718 if (*lengths == 1 && *str == ' ') {
719 tess_rejs++;
720 } else {
721 bad_char_count++;
722 }
723 switch (state) {
724 case FIRST_NUM:
725 isolated_digits++;
726 break;
727 case FIRST_UPPER:
728 case FIRST_LOWER:
729 isolated_alphas++;
730 default:
731 break;
732 }
733 state = JUNK;
734 }
735 }
736
737 switch (state) {
738 case FIRST_NUM:
739 isolated_digits++;
740 break;
741 case FIRST_UPPER:
742 case FIRST_LOWER:
743 isolated_alphas++;
744 default:
745 break;
746 }
747
748 if (crunch_include_numerals) {
749 total_alpha_count += total_digit_count - isolated_digits;
750 }
751
752 if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
753 longest_alpha_repetition_count < crunch_long_repetitions) {
754 if ((crunch_accept_ok &&
755 acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
756 longest_lower_run_len > crunch_leave_lc_strings ||
757 longest_upper_run_len > crunch_leave_uc_strings) {
758 return G_NEVER_CRUNCH;
759 }
760 }
761 if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
762 (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
763 word->best_choice->permuter() == FREQ_DAWG_PERM ||
764 word->best_choice->permuter() == USER_DAWG_PERM ||
765 word->best_choice->permuter() == NUMBER_PERM ||
766 acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
767 return G_OK;
768 }
769
770 ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;
771
772 if (crunch_debug > 3) {
773 tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
774 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count,
775 isolated_digits, isolated_alphas, tess_rejs);
776 }
777 if (bad_char_count == 0 && tess_rejs == 0 &&
778 (len > isolated_digits + isolated_alphas || len <= 2)) {
779 return G_OK;
780 }
781
782 if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
783 return G_TERRIBLE;
784 }
785
786 if (len > 4) {
787 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
788 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
789 return G_DODGY;
790 } else {
791 return G_OK;
792 }
793 } else {
794 dodgy_chars = 2 * tess_rejs + bad_char_count;
795 if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
796 return G_DODGY;
797 } else {
798 return G_OK;
799 }
800 }
801 }
802
803 /*************************************************************************
804 * word_deletable()
805 * DELETE WERDS AT ENDS OF ROWS IF
806 * Word is crunched &&
807 * ( string length = 0 OR
808 * > 50% of chars are "|" (before merging) OR
809 * certainty < -10 OR
810 * rating /char > 60 OR
811 * TOP of word is more than 0.5 xht BELOW baseline OR
812 * BOTTOM of word is more than 0.5 xht ABOVE xht OR
813 * length of word < 3xht OR
814 * height of word < 0.7 xht OR
815 * height of word > 3.0 xht OR
816 * >75% of the outline BBs have longest dimension < 0.5xht
817 *************************************************************************/
818
819 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
820 int word_len = word->reject_map.length();
821 float rating_per_ch;
822 TBOX box; // BB of word
823
824 if (word->unlv_crunch_mode == CR_NONE) {
825 delete_mode = 0;
826 return CR_NONE;
827 }
828
829 if (word_len == 0) {
830 delete_mode = 1;
831 return CR_DELETE;
832 }
833
834 if (word->rebuild_word != nullptr) {
835 // Cube leaves rebuild_word nullptr.
836 box = word->rebuild_word->bounding_box();
837 if (box.height() < crunch_del_min_ht * kBlnXHeight) {
838 delete_mode = 4;
839 return CR_DELETE;
840 }
841
842 if (noise_outlines(word->rebuild_word)) {
843 delete_mode = 5;
844 return CR_DELETE;
845 }
846 }
847
848 if ((failure_count(word) * 1.5) > word_len) {
849 delete_mode = 2;
850 return CR_LOOSE_SPACE;
851 }
852
853 if (word->best_choice->certainty() < crunch_del_cert) {
854 delete_mode = 7;
855 return CR_LOOSE_SPACE;
856 }
857
858 rating_per_ch = word->best_choice->rating() / word_len;
859
860 if (rating_per_ch > crunch_del_rating) {
861 delete_mode = 8;
862 return CR_LOOSE_SPACE;
863 }
864
865 if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
866 delete_mode = 9;
867 return CR_LOOSE_SPACE;
868 }
869
870 if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
871 delete_mode = 10;
872 return CR_LOOSE_SPACE;
873 }
874
875 if (box.height() > crunch_del_max_ht * kBlnXHeight) {
876 delete_mode = 11;
877 return CR_LOOSE_SPACE;
878 }
879
880 if (box.width() < crunch_del_min_width * kBlnXHeight) {
881 delete_mode = 3;
882 return CR_LOOSE_SPACE;
883 }
884
885 delete_mode = 0;
886 return CR_NONE;
887 }
888
889 int16_t Tesseract::failure_count(WERD_RES *word) {
890 const char *str = word->best_choice->unichar_string().c_str();
891 int tess_rejs = 0;
892
893 for (; *str != '\0'; str++) {
894 if (*str == ' ') {
895 tess_rejs++;
896 }
897 }
898 return tess_rejs;
899 }
900
901 bool Tesseract::noise_outlines(TWERD *word) {
902 TBOX box; // BB of outline
903 int16_t outline_count = 0;
904 int16_t small_outline_count = 0;
905 int16_t max_dimension;
906 float small_limit = kBlnXHeight * crunch_small_outlines_size;
907
908 for (unsigned b = 0; b < word->NumBlobs(); ++b) {
909 TBLOB *blob = word->blobs[b];
910 for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
911 outline_count++;
912 box = ol->bounding_box();
913 if (box.height() > box.width()) {
914 max_dimension = box.height();
915 } else {
916 max_dimension = box.width();
917 }
918 if (max_dimension < small_limit) {
919 small_outline_count++;
920 }
921 }
922 }
923 return small_outline_count >= outline_count;
924 }
925
926 } // namespace tesseract