comparison mupdf-source/thirdparty/tesseract/src/ccmain/reject.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: reject.cpp (Formerly reject.c)
3 * Description: Rejection functions used in tessedit
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "reject.h"
25
26 #ifdef DISABLED_LEGACY_ENGINE
27
28 # include "tesseractclass.h"
29
30 namespace tesseract {
31
32 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
33 const WERD_CHOICE &word = *werd_res->best_choice;
34 int dict_word_type = werd_res->tesseract->dict_word(word);
35 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
36 }
37 } // namespace tesseract
38
39 #else
40
41 # include "control.h"
42 # include "docqual.h"
43 # include "tesseractclass.h"
44 # include "tessvars.h"
45
46 # include "helpers.h"
47
48 # include <algorithm> // for std::sort
49 # include <cctype>
50 # include <cerrno>
51 # include <cstring>
52 # include <vector> // for std::vector
53
54 namespace tesseract {
55
56 /*************************************************************************
57 * set_done()
58 *
59 * Set the done flag based on the word acceptability criteria
60 *************************************************************************/
61
62 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
63 word->done =
64 word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
65 bool word_is_ambig = word->best_choice->dangerous_ambig_found();
66 bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
67 word->best_choice->permuter() == FREQ_DAWG_PERM ||
68 word->best_choice->permuter() == USER_DAWG_PERM;
69 if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
70 one_ell_conflict(word, false)) {
71 if (tessedit_rejection_debug) {
72 tprintf("one_ell_conflict detected\n");
73 }
74 word->done = false;
75 }
76 if (word->done &&
77 ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
78 if (tessedit_rejection_debug) {
79 tprintf("non-dict or ambig word detected\n");
80 }
81 word->done = false;
82 }
83 if (tessedit_rejection_debug) {
84 tprintf("set_done(): done=%d\n", word->done);
85 word->best_choice->print("");
86 }
87 }
88
89 /*************************************************************************
90 * make_reject_map()
91 *
92 * Sets the done flag to indicate whether the resylt is acceptable.
93 *
94 * Sets a reject map for the word.
95 *************************************************************************/
96 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
97 flip_0O(word);
98 check_debug_pt(word, -1); // For trap only
99 set_done(word, pass); // Set acceptance
100 word->reject_map.initialise(word->best_choice->unichar_lengths().length());
101 reject_blanks(word);
102 /*
103 0: Rays original heuristic - the baseline
104 */
105 if (tessedit_reject_mode == 0) {
106 if (!word->done) {
107 reject_poor_matches(word);
108 }
109 } else if (tessedit_reject_mode == 5) {
110 /*
111 5: Reject I/1/l from words where there is no strong contextual confirmation;
112 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
113 and the whole of any words which are very small
114 */
115 if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
116 word->reject_map.rej_word_small_xht();
117 } else {
118 one_ell_conflict(word, true);
119 /*
120 Originally the code here just used the done flag. Now I have duplicated
121 and unpacked the conditions for setting the done flag so that each
122 mechanism can be turned on or off independently. This works WITHOUT
123 affecting the done flag setting.
124 */
125 if (rej_use_tess_accepted && !word->tess_accepted) {
126 word->reject_map.rej_word_not_tess_accepted();
127 }
128
129 if (rej_use_tess_blanks &&
130 (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
131 word->reject_map.rej_word_contains_blanks();
132 }
133
134 WERD_CHOICE *best_choice = word->best_choice;
135 if (rej_use_good_perm) {
136 if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
137 best_choice->permuter() == FREQ_DAWG_PERM ||
138 best_choice->permuter() == USER_DAWG_PERM) &&
139 (!rej_use_sensible_wd ||
140 acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
141 best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
142 // PASSED TEST
143 } else if (best_choice->permuter() == NUMBER_PERM) {
144 if (rej_alphas_in_number_perm) {
145 for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
146 offset += best_choice->unichar_lengths()[i++]) {
147 if (word->reject_map[i].accepted() &&
148 word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
149 best_choice->unichar_lengths()[i])) {
150 word->reject_map[i].setrej_bad_permuter();
151 }
152 // rej alpha
153 }
154 }
155 } else {
156 word->reject_map.rej_word_bad_permuter();
157 }
158 }
159 /* Ambig word rejection was here once !!*/
160 }
161 } else {
162 tprintf("BAD tessedit_reject_mode\n");
163 ASSERT_HOST("Fatal error encountered!" == nullptr);
164 }
165
166 if (tessedit_image_border > -1) {
167 reject_edge_blobs(word);
168 }
169
170 check_debug_pt(word, 10);
171 if (tessedit_rejection_debug) {
172 tprintf("Permuter Type = %d\n", word->best_choice->permuter());
173 tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(),
174 word->best_choice->rating());
175 tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
176 }
177
178 flip_hyphens(word);
179 check_debug_pt(word, 20);
180 }
181
182 void reject_blanks(WERD_RES *word) {
183 int16_t i;
184 int16_t offset;
185
186 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
187 offset += word->best_choice->unichar_lengths()[i], i += 1) {
188 if (word->best_choice->unichar_string()[offset] == ' ') {
189 // rej unrecognised blobs
190 word->reject_map[i].setrej_tess_failure();
191 }
192 }
193 }
194
195 void Tesseract::reject_I_1_L(WERD_RES *word) {
196 int16_t i;
197 int16_t offset;
198
199 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
200 offset += word->best_choice->unichar_lengths()[i], i += 1) {
201 if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {
202 // rej 1Il conflict
203 word->reject_map[i].setrej_1Il_conflict();
204 }
205 }
206 }
207
208 void reject_poor_matches(WERD_RES *word) {
209 float threshold = compute_reject_threshold(word->best_choice);
210 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
211 if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {
212 word->reject_map[i].setrej_tess_failure();
213 } else if (word->best_choice->certainty(i) < threshold) {
214 word->reject_map[i].setrej_poor_match();
215 }
216 }
217 }
218
219 /**********************************************************************
220 * compute_reject_threshold
221 *
222 * Set a rejection threshold for this word.
223 * Initially this is a trivial function which looks for the largest
224 * gap in the certainty value.
225 **********************************************************************/
226
227 float compute_reject_threshold(WERD_CHOICE *word) {
228 float threshold; // rejection threshold
229 float bestgap = 0.0f; // biggest gap
230 float gapstart; // bottom of gap
231
232 auto blob_count = word->length();
233 std::vector<float> ratings;
234 ratings.reserve(blob_count);
235 for (unsigned i = 0; i < blob_count; ++i) {
236 ratings.push_back(word->certainty(i));
237 }
238 std::sort(ratings.begin(), ratings.end());
239 gapstart = ratings[0] - 1; // all reject if none better
240 if (blob_count >= 3) {
241 for (unsigned index = 0; index < blob_count - 1; index++) {
242 if (ratings[index + 1] - ratings[index] > bestgap) {
243 bestgap = ratings[index + 1] - ratings[index];
244 // find biggest
245 gapstart = ratings[index];
246 }
247 }
248 }
249 threshold = gapstart + bestgap / 2;
250
251 return threshold;
252 }
253
254 /*************************************************************************
255 * reject_edge_blobs()
256 *
257 * If the word is perilously close to the edge of the image, reject those blobs
258 * in the word which are too close to the edge as they could be clipped.
259 *************************************************************************/
260 void Tesseract::reject_edge_blobs(WERD_RES *word) {
261 TBOX word_box = word->word->bounding_box();
262 // Use the box_word as it is already denormed back to image coordinates.
263 int blobcount = word->box_word->length();
264
265 if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||
266 word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
267 word_box.top() + tessedit_image_border > ImageHeight() - 1) {
268 ASSERT_HOST(word->reject_map.length() == blobcount);
269 for (int blobindex = 0; blobindex < blobcount; blobindex++) {
270 TBOX blob_box = word->box_word->BlobBox(blobindex);
271 if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||
272 blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
273 blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
274 word->reject_map[blobindex].setrej_edge_char();
275 // Close to edge
276 }
277 }
278 }
279 }
280
281 /**********************************************************************
282 * one_ell_conflict()
283 *
284 * Identify words where there is a potential I/l/1 error.
285 * - A bundle of contextual heuristics!
286 **********************************************************************/
287 bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
288 const char *word;
289 const char *lengths;
290 int16_t word_len; // its length
291 int16_t first_alphanum_index_;
292 int16_t first_alphanum_offset_;
293 int16_t i;
294 int16_t offset;
295 bool non_conflict_set_char; // non conf set a/n?
296 ACCEPTABLE_WERD_TYPE word_type;
297 bool dict_perm_type;
298 bool dict_word_ok;
299 int dict_word_type;
300
301 word = word_res->best_choice->unichar_string().c_str();
302 lengths = word_res->best_choice->unichar_lengths().c_str();
303 word_len = strlen(lengths);
304 /*
305 If there are no occurrences of the conflict set characters then the word
306 is OK.
307 */
308 if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
309 return false;
310 }
311
312 /*
313 There is a conflict if there are NO other (confirmed) alphanumerics apart
314 from those in the conflict set.
315 */
316
317 for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
318 offset += lengths[i++]) {
319 non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
320 word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
321 !conflict_set_I_l_1.contains(word[offset]);
322 }
323 if (!non_conflict_set_char) {
324 if (update_map) {
325 reject_I_1_L(word_res);
326 }
327 return true;
328 }
329
330 /*
331 If the word is accepted by a dawg permuter, and the first alpha character
332 is "I" or "l", check to see if the alternative is also a dawg word. If it
333 is, then there is a potential error otherwise the word is ok.
334 */
335
336 dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
337 (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
338 (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
339 (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
340 dict_word_type = dict_word(*(word_res->best_choice));
341 dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
342
343 if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
344 (dict_perm_type && dict_word_ok)) {
345 first_alphanum_index_ = first_alphanum_index(word, lengths);
346 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
347 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
348 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
349 if (safe_dict_word(word_res) > 0) {
350 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
351 if (update_map) {
352 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
353 }
354 return true;
355 } else {
356 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
357 return false;
358 }
359 }
360
361 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
362 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
363 if (safe_dict_word(word_res) > 0) {
364 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
365 if (update_map) {
366 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
367 }
368 return true;
369 } else {
370 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
371 return false;
372 }
373 }
374 return false;
375 }
376
377 /*
378 NEW 1Il code. The old code relied on permuter types too much. In fact,
379 tess will use TOP_CHOICE permute for good things like "palette".
380 In this code the string is examined independently to see if it looks like
381 a well formed word.
382 */
383
384 /*
385 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
386 dictionary word.
387 */
388 first_alphanum_index_ = first_alphanum_index(word, lengths);
389 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
390 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
391 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
392 if (safe_dict_word(word_res) > 0) {
393 return false;
394 } else {
395 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
396 }
397 } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
398 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
399 if (safe_dict_word(word_res) > 0) {
400 return false;
401 } else {
402 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
403 }
404 }
405 /*
406 For strings containing digits:
407 If there are no alphas OR the numeric permuter liked the word,
408 reject any non 1 conflict chs
409 Else reject all conflict chs
410 */
411 if (word_contains_non_1_digit(word, lengths)) {
412 bool allow_1s =
413 (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
414
415 int16_t offset;
416 bool conflict = false;
417 for (i = 0, offset = 0; word[offset] != '\0';
418 offset += word_res->best_choice->unichar_lengths()[i++]) {
419 if ((!allow_1s || (word[offset] != '1')) &&
420 conflict_set_I_l_1.contains(word[offset])) {
421 if (update_map) {
422 word_res->reject_map[i].setrej_1Il_conflict();
423 }
424 conflict = true;
425 }
426 }
427 return conflict;
428 }
429 /*
430 For anything else. See if it conforms to an acceptable word type. If so,
431 treat accordingly.
432 */
433 word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
434 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
435 first_alphanum_index_ = first_alphanum_index(word, lengths);
436 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
437 if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
438 if (update_map) {
439 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
440 }
441 return true;
442 } else {
443 return false;
444 }
445 } else if (word_type == AC_UPPER_CASE) {
446 return false;
447 } else {
448 if (update_map) {
449 reject_I_1_L(word_res);
450 }
451 return true;
452 }
453 }
454
455 int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {
456 int16_t i;
457 int16_t offset;
458
459 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
460 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
461 unicharset.get_isdigit(word + offset, word_lengths[i])) {
462 return i;
463 }
464 }
465 return -1;
466 }
467
468 int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {
469 int16_t i;
470 int16_t offset;
471
472 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
473 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
474 unicharset.get_isdigit(word + offset, word_lengths[i])) {
475 return offset;
476 }
477 }
478 return -1;
479 }
480
481 int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {
482 int16_t i;
483 int16_t offset;
484 int16_t count = 0;
485
486 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
487 if (unicharset.get_isalpha(word + offset, word_lengths[i])) {
488 count++;
489 }
490 }
491 return count;
492 }
493
494 bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {
495 int16_t i;
496 int16_t offset;
497
498 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
499 if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&
500 (word_lengths[i] != 1 || word[offset] != '1')) {
501 return true;
502 }
503 }
504 return false;
505 }
506
507 /*************************************************************************
508 * dont_allow_1Il()
509 * Don't unreject LONE accepted 1Il conflict set chars
510 *************************************************************************/
511 void Tesseract::dont_allow_1Il(WERD_RES *word) {
512 int word_len = word->reject_map.length();
513 const char *s = word->best_choice->unichar_string().c_str();
514 const char *lengths = word->best_choice->unichar_lengths().c_str();
515 bool accepted_1Il = false;
516
517 for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
518 if (word->reject_map[i].accepted()) {
519 if (conflict_set_I_l_1.contains(s[offset])) {
520 accepted_1Il = true;
521 } else {
522 if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
523 word->uch_set->get_isdigit(s + offset, lengths[i])) {
524 return; // >=1 non 1Il ch accepted
525 }
526 }
527 }
528 }
529 if (!accepted_1Il) {
530 return; // Nothing to worry about
531 }
532
533 for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
534 if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
535 word->reject_map[i].setrej_postNN_1Il();
536 }
537 }
538 }
539
540 int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
541 int count = 0;
542 const WERD_CHOICE *best_choice = word_res->best_choice;
543 for (unsigned i = 0; i < word_res->reject_map.length(); ++i) {
544 if ((word_res->reject_map[i].accepted()) &&
545 (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
546 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
547 count++;
548 }
549 }
550 return count;
551 }
552
553 // reject all if most rejected.
554 void Tesseract::reject_mostly_rejects(WERD_RES *word) {
555 /* Reject the whole of the word if the fraction of rejects exceeds a limit */
556
557 if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
558 rej_whole_of_mostly_reject_word_fract) {
559 word->reject_map.rej_word_mostly_rej();
560 }
561 }
562
563 bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
564 if (word->best_choice->unichar_lengths().length() <= 1) {
565 return false;
566 }
567
568 if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) {
569 return false;
570 }
571
572 UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
573 for (unsigned i = 1; i < word->best_choice->length(); ++i) {
574 if (word->best_choice->unichar_id(i) != uch_id) {
575 return false;
576 }
577 }
578
579 int16_t char_quality;
580 int16_t accepted_char_quality;
581 word_char_quality(word, &char_quality, &accepted_char_quality);
582
583 if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&
584 (char_quality == accepted_char_quality)) {
585 return true;
586 } else {
587 return false;
588 }
589 }
590
591 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
592 const WERD_CHOICE &word = *werd_res->best_choice;
593 int dict_word_type = werd_res->tesseract->dict_word(word);
594 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
595 }
596
597 // Note: After running this function word_res->ratings
598 // might not contain the right BLOB_CHOICE corresponding to each character
599 // in word_res->best_choice.
600 void Tesseract::flip_hyphens(WERD_RES *word_res) {
601 WERD_CHOICE *best_choice = word_res->best_choice;
602 int prev_right = -9999;
603 int next_left;
604 TBOX out_box;
605 float aspect_ratio;
606
607 if (tessedit_lower_flip_hyphen <= 1) {
608 return;
609 }
610
611 auto num_blobs = word_res->rebuild_word->NumBlobs();
612 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
613 for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
614 TBLOB *blob = word_res->rebuild_word->blobs[i];
615 out_box = blob->bounding_box();
616 if (i + 1 == num_blobs) {
617 next_left = 9999;
618 } else {
619 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
620 }
621 // Don't touch small or touching blobs - it is too dangerous.
622 if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&
623 (out_box.right() < next_left)) {
624 aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
625 if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
626 if (aspect_ratio >= tessedit_upper_flip_hyphen &&
627 word_res->uch_set->contains_unichar_id(unichar_dash) &&
628 word_res->uch_set->get_enabled(unichar_dash)) {
629 /* Certain HYPHEN */
630 best_choice->set_unichar_id(unichar_dash, i);
631 if (word_res->reject_map[i].rejected()) {
632 word_res->reject_map[i].setrej_hyphen_accept();
633 }
634 }
635 if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
636 // Suspected HYPHEN
637 word_res->reject_map[i].setrej_hyphen();
638 }
639 } else if (best_choice->unichar_id(i) == unichar_dash) {
640 if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
641 word_res->reject_map[i].setrej_hyphen_accept();
642 }
643 // Certain HYPHEN
644
645 if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
646 // Suspected HYPHEN
647 word_res->reject_map[i].setrej_hyphen();
648 }
649 }
650 }
651 prev_right = out_box.right();
652 }
653 }
654
655 // Note: After running this function word_res->ratings
656 // might not contain the right BLOB_CHOICE corresponding to each character
657 // in word_res->best_choice.
658 void Tesseract::flip_0O(WERD_RES *word_res) {
659 WERD_CHOICE *best_choice = word_res->best_choice;
660 TBOX out_box;
661
662 if (!tessedit_flip_0O) {
663 return;
664 }
665
666 auto num_blobs = word_res->rebuild_word->NumBlobs();
667 for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
668 TBLOB *blob = word_res->rebuild_word->blobs[i];
669 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
670 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
671 out_box = blob->bounding_box();
672 if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
673 (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
674 return; // Beware words with sub/superscripts
675 }
676 }
677 }
678 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
679 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
680 if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
681 unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
682 return; // 0 or O are not present/enabled in unicharset
683 }
684 for (unsigned i = 1; i < best_choice->length(); ++i) {
685 if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
686 /* A0A */
687 if ((i + 1) < best_choice->length() &&
688 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
689 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
690 best_choice->set_unichar_id(unichar_O, i);
691 }
692 /* A00A */
693 if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
694 (i + 1) < best_choice->length() &&
695 (best_choice->unichar_id(i + 1) == unichar_0 ||
696 best_choice->unichar_id(i + 1) == unichar_O) &&
697 (i + 2) < best_choice->length() &&
698 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
699 best_choice->set_unichar_id(unichar_O, i);
700 i++;
701 }
702 /* AA0<non digit or end of word> */
703 if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
704 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
705 (((i + 1) < best_choice->length() &&
706 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
707 !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
708 !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
709 (i == best_choice->length() - 1))) {
710 best_choice->set_unichar_id(unichar_O, i);
711 }
712 /* 9O9 */
713 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
714 (i + 1) < best_choice->length() &&
715 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
716 best_choice->set_unichar_id(unichar_0, i);
717 }
718 /* 9OOO */
719 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
720 (i + 2) < best_choice->length() &&
721 (best_choice->unichar_id(i + 1) == unichar_0 ||
722 best_choice->unichar_id(i + 1) == unichar_O) &&
723 (best_choice->unichar_id(i + 2) == unichar_0 ||
724 best_choice->unichar_id(i + 2) == unichar_O)) {
725 best_choice->set_unichar_id(unichar_0, i);
726 best_choice->set_unichar_id(unichar_0, i + 1);
727 best_choice->set_unichar_id(unichar_0, i + 2);
728 i += 2;
729 }
730 /* 9OO<non upper> */
731 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
732 (i + 2) < best_choice->length() &&
733 (best_choice->unichar_id(i + 1) == unichar_0 ||
734 best_choice->unichar_id(i + 1) == unichar_O) &&
735 !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
736 best_choice->set_unichar_id(unichar_0, i);
737 best_choice->set_unichar_id(unichar_0, i + 1);
738 i++;
739 }
740 /* 9O<non upper> */
741 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
742 (i + 1) < best_choice->length() &&
743 !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
744 best_choice->set_unichar_id(unichar_0, i);
745 }
746 /* 9[.,]OOO.. */
747 if ((i > 1) &&
748 (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
749 word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
750 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
751 best_choice->unichar_id(i - 2) == unichar_O)) {
752 if (best_choice->unichar_id(i - 2) == unichar_O) {
753 best_choice->set_unichar_id(unichar_0, i - 2);
754 }
755 while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
756 best_choice->unichar_id(i) == unichar_0)) {
757 best_choice->set_unichar_id(unichar_0, i);
758 i++;
759 }
760 i--;
761 }
762 }
763 }
764 }
765
766 bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
767 return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
768 }
769
770 bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
771 return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
772 }
773 } // namespace tesseract
774
775 #endif // def DISABLED_LEGACY_ENGINE