Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccmain/reject.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccmain/reject.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,775 @@ +/********************************************************************** + * File: reject.cpp (Formerly reject.c) + * Description: Rejection functions used in tessedit + * Author: Phil Cheatle + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "reject.h" + +#ifdef DISABLED_LEGACY_ENGINE + +# include "tesseractclass.h" + +namespace tesseract { + +int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) { + const WERD_CHOICE &word = *werd_res->best_choice; + int dict_word_type = werd_res->tesseract->dict_word(word); + return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type; +} +} // namespace tesseract + +#else + +# include "control.h" +# include "docqual.h" +# include "tesseractclass.h" +# include "tessvars.h" + +# include "helpers.h" + +# include <algorithm> // for std::sort +# include <cctype> +# include <cerrno> +# include <cstring> +# include <vector> // for std::vector + +namespace tesseract { + +/************************************************************************* + * set_done() + * + * Set the done flag based on the word acceptability criteria + *************************************************************************/ + +void Tesseract::set_done(WERD_RES *word, int16_t pass) { + word->done = + word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr); + bool word_is_ambig = word->best_choice->dangerous_ambig_found(); + bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM || + word->best_choice->permuter() == FREQ_DAWG_PERM || + word->best_choice->permuter() == USER_DAWG_PERM; + if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) && + one_ell_conflict(word, false)) { + if (tessedit_rejection_debug) { + tprintf("one_ell_conflict detected\n"); + } + word->done = false; + } + if (word->done && + ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) { + if (tessedit_rejection_debug) { + tprintf("non-dict or ambig word detected\n"); + } + word->done = false; + } + if (tessedit_rejection_debug) { + tprintf("set_done(): done=%d\n", word->done); + word->best_choice->print(""); + } +} + +/************************************************************************* + * make_reject_map() + * + * Sets the done flag to indicate whether the resylt is acceptable. + * + * Sets a reject map for the word. + *************************************************************************/ +void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) { + flip_0O(word); + check_debug_pt(word, -1); // For trap only + set_done(word, pass); // Set acceptance + word->reject_map.initialise(word->best_choice->unichar_lengths().length()); + reject_blanks(word); + /* +0: Rays original heuristic - the baseline +*/ + if (tessedit_reject_mode == 0) { + if (!word->done) { + reject_poor_matches(word); + } + } else if (tessedit_reject_mode == 5) { + /* +5: Reject I/1/l from words where there is no strong contextual confirmation; + the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); + and the whole of any words which are very small +*/ + if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { + word->reject_map.rej_word_small_xht(); + } else { + one_ell_conflict(word, true); + /* + Originally the code here just used the done flag. Now I have duplicated + and unpacked the conditions for setting the done flag so that each + mechanism can be turned on or off independently. This works WITHOUT + affecting the done flag setting. +*/ + if (rej_use_tess_accepted && !word->tess_accepted) { + word->reject_map.rej_word_not_tess_accepted(); + } + + if (rej_use_tess_blanks && + (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) { + word->reject_map.rej_word_contains_blanks(); + } + + WERD_CHOICE *best_choice = word->best_choice; + if (rej_use_good_perm) { + if ((best_choice->permuter() == SYSTEM_DAWG_PERM || + best_choice->permuter() == FREQ_DAWG_PERM || + best_choice->permuter() == USER_DAWG_PERM) && + (!rej_use_sensible_wd || + acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(), + best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) { + // PASSED TEST + } else if (best_choice->permuter() == NUMBER_PERM) { + if (rej_alphas_in_number_perm) { + for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0'; + offset += best_choice->unichar_lengths()[i++]) { + if (word->reject_map[i].accepted() && + word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset, + best_choice->unichar_lengths()[i])) { + word->reject_map[i].setrej_bad_permuter(); + } + // rej alpha + } + } + } else { + word->reject_map.rej_word_bad_permuter(); + } + } + /* Ambig word rejection was here once !!*/ + } + } else { + tprintf("BAD tessedit_reject_mode\n"); + ASSERT_HOST("Fatal error encountered!" == nullptr); + } + + if (tessedit_image_border > -1) { + reject_edge_blobs(word); + } + + check_debug_pt(word, 10); + if (tessedit_rejection_debug) { + tprintf("Permuter Type = %d\n", word->best_choice->permuter()); + tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(), + word->best_choice->rating()); + tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); + } + + flip_hyphens(word); + check_debug_pt(word, 20); +} + +void reject_blanks(WERD_RES *word) { + int16_t i; + int16_t offset; + + for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; + offset += word->best_choice->unichar_lengths()[i], i += 1) { + if (word->best_choice->unichar_string()[offset] == ' ') { + // rej unrecognised blobs + word->reject_map[i].setrej_tess_failure(); + } + } +} + +void Tesseract::reject_I_1_L(WERD_RES *word) { + int16_t i; + int16_t offset; + + for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; + offset += word->best_choice->unichar_lengths()[i], i += 1) { + if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) { + // rej 1Il conflict + word->reject_map[i].setrej_1Il_conflict(); + } + } +} + +void reject_poor_matches(WERD_RES *word) { + float threshold = compute_reject_threshold(word->best_choice); + for (unsigned i = 0; i < word->best_choice->length(); ++i) { + if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) { + word->reject_map[i].setrej_tess_failure(); + } else if (word->best_choice->certainty(i) < threshold) { + word->reject_map[i].setrej_poor_match(); + } + } +} + +/********************************************************************** + * compute_reject_threshold + * + * Set a rejection threshold for this word. + * Initially this is a trivial function which looks for the largest + * gap in the certainty value. + **********************************************************************/ + +float compute_reject_threshold(WERD_CHOICE *word) { + float threshold; // rejection threshold + float bestgap = 0.0f; // biggest gap + float gapstart; // bottom of gap + + auto blob_count = word->length(); + std::vector<float> ratings; + ratings.reserve(blob_count); + for (unsigned i = 0; i < blob_count; ++i) { + ratings.push_back(word->certainty(i)); + } + std::sort(ratings.begin(), ratings.end()); + gapstart = ratings[0] - 1; // all reject if none better + if (blob_count >= 3) { + for (unsigned index = 0; index < blob_count - 1; index++) { + if (ratings[index + 1] - ratings[index] > bestgap) { + bestgap = ratings[index + 1] - ratings[index]; + // find biggest + gapstart = ratings[index]; + } + } + } + threshold = gapstart + bestgap / 2; + + return threshold; +} + +/************************************************************************* + * reject_edge_blobs() + * + * If the word is perilously close to the edge of the image, reject those blobs + * in the word which are too close to the edge as they could be clipped. + *************************************************************************/ +void Tesseract::reject_edge_blobs(WERD_RES *word) { + TBOX word_box = word->word->bounding_box(); + // Use the box_word as it is already denormed back to image coordinates. + int blobcount = word->box_word->length(); + + if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border || + word_box.right() + tessedit_image_border > ImageWidth() - 1 || + word_box.top() + tessedit_image_border > ImageHeight() - 1) { + ASSERT_HOST(word->reject_map.length() == blobcount); + for (int blobindex = 0; blobindex < blobcount; blobindex++) { + TBOX blob_box = word->box_word->BlobBox(blobindex); + if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border || + blob_box.right() + tessedit_image_border > ImageWidth() - 1 || + blob_box.top() + tessedit_image_border > ImageHeight() - 1) { + word->reject_map[blobindex].setrej_edge_char(); + // Close to edge + } + } + } +} + +/********************************************************************** + * one_ell_conflict() + * + * Identify words where there is a potential I/l/1 error. + * - A bundle of contextual heuristics! + **********************************************************************/ +bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) { + const char *word; + const char *lengths; + int16_t word_len; // its length + int16_t first_alphanum_index_; + int16_t first_alphanum_offset_; + int16_t i; + int16_t offset; + bool non_conflict_set_char; // non conf set a/n? + ACCEPTABLE_WERD_TYPE word_type; + bool dict_perm_type; + bool dict_word_ok; + int dict_word_type; + + word = word_res->best_choice->unichar_string().c_str(); + lengths = word_res->best_choice->unichar_lengths().c_str(); + word_len = strlen(lengths); + /* + If there are no occurrences of the conflict set characters then the word + is OK. +*/ + if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) { + return false; + } + + /* + There is a conflict if there are NO other (confirmed) alphanumerics apart + from those in the conflict set. +*/ + + for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char; + offset += lengths[i++]) { + non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) || + word_res->uch_set->get_isdigit(word + offset, lengths[i])) && + !conflict_set_I_l_1.contains(word[offset]); + } + if (!non_conflict_set_char) { + if (update_map) { + reject_I_1_L(word_res); + } + return true; + } + + /* + If the word is accepted by a dawg permuter, and the first alpha character + is "I" or "l", check to see if the alternative is also a dawg word. If it + is, then there is a potential error otherwise the word is ok. +*/ + + dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) || + (word_res->best_choice->permuter() == USER_DAWG_PERM) || + (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) || + (word_res->best_choice->permuter() == FREQ_DAWG_PERM); + dict_word_type = dict_word(*(word_res->best_choice)); + dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM)); + + if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) || + (dict_perm_type && dict_word_ok)) { + first_alphanum_index_ = first_alphanum_index(word, lengths); + first_alphanum_offset_ = first_alphanum_offset(word, lengths); + if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; + if (safe_dict_word(word_res) > 0) { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; + if (update_map) { + word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict(); + } + return true; + } else { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; + return false; + } + } + + if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; + if (safe_dict_word(word_res) > 0) { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; + if (update_map) { + word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict(); + } + return true; + } else { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; + return false; + } + } + return false; + } + + /* + NEW 1Il code. The old code relied on permuter types too much. In fact, + tess will use TOP_CHOICE permute for good things like "palette". + In this code the string is examined independently to see if it looks like + a well formed word. +*/ + + /* + REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a + dictionary word. +*/ + first_alphanum_index_ = first_alphanum_index(word, lengths); + first_alphanum_offset_ = first_alphanum_offset(word, lengths); + if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; + if (safe_dict_word(word_res) > 0) { + return false; + } else { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; + } + } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; + if (safe_dict_word(word_res) > 0) { + return false; + } else { + word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; + } + } + /* + For strings containing digits: + If there are no alphas OR the numeric permuter liked the word, + reject any non 1 conflict chs + Else reject all conflict chs +*/ + if (word_contains_non_1_digit(word, lengths)) { + bool allow_1s = + (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM); + + int16_t offset; + bool conflict = false; + for (i = 0, offset = 0; word[offset] != '\0'; + offset += word_res->best_choice->unichar_lengths()[i++]) { + if ((!allow_1s || (word[offset] != '1')) && + conflict_set_I_l_1.contains(word[offset])) { + if (update_map) { + word_res->reject_map[i].setrej_1Il_conflict(); + } + conflict = true; + } + } + return conflict; + } + /* + For anything else. See if it conforms to an acceptable word type. If so, + treat accordingly. +*/ + word_type = acceptable_word_string(*word_res->uch_set, word, lengths); + if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { + first_alphanum_index_ = first_alphanum_index(word, lengths); + first_alphanum_offset_ = first_alphanum_offset(word, lengths); + if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) { + if (update_map) { + word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict(); + } + return true; + } else { + return false; + } + } else if (word_type == AC_UPPER_CASE) { + return false; + } else { + if (update_map) { + reject_I_1_L(word_res); + } + return true; + } +} + +int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) { + int16_t i; + int16_t offset; + + for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { + if (unicharset.get_isalpha(word + offset, word_lengths[i]) || + unicharset.get_isdigit(word + offset, word_lengths[i])) { + return i; + } + } + return -1; +} + +int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) { + int16_t i; + int16_t offset; + + for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { + if (unicharset.get_isalpha(word + offset, word_lengths[i]) || + unicharset.get_isdigit(word + offset, word_lengths[i])) { + return offset; + } + } + return -1; +} + +int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) { + int16_t i; + int16_t offset; + int16_t count = 0; + + for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { + if (unicharset.get_isalpha(word + offset, word_lengths[i])) { + count++; + } + } + return count; +} + +bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) { + int16_t i; + int16_t offset; + + for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { + if (unicharset.get_isdigit(word + offset, word_lengths[i]) && + (word_lengths[i] != 1 || word[offset] != '1')) { + return true; + } + } + return false; +} + +/************************************************************************* + * dont_allow_1Il() + * Don't unreject LONE accepted 1Il conflict set chars + *************************************************************************/ +void Tesseract::dont_allow_1Il(WERD_RES *word) { + int word_len = word->reject_map.length(); + const char *s = word->best_choice->unichar_string().c_str(); + const char *lengths = word->best_choice->unichar_lengths().c_str(); + bool accepted_1Il = false; + + for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) { + if (word->reject_map[i].accepted()) { + if (conflict_set_I_l_1.contains(s[offset])) { + accepted_1Il = true; + } else { + if (word->uch_set->get_isalpha(s + offset, lengths[i]) || + word->uch_set->get_isdigit(s + offset, lengths[i])) { + return; // >=1 non 1Il ch accepted + } + } + } + } + if (!accepted_1Il) { + return; // Nothing to worry about + } + + for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) { + if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) { + word->reject_map[i].setrej_postNN_1Il(); + } + } +} + +int16_t Tesseract::count_alphanums(WERD_RES *word_res) { + int count = 0; + const WERD_CHOICE *best_choice = word_res->best_choice; + for (unsigned i = 0; i < word_res->reject_map.length(); ++i) { + if ((word_res->reject_map[i].accepted()) && + (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) || + word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) { + count++; + } + } + return count; +} + +// reject all if most rejected. +void Tesseract::reject_mostly_rejects(WERD_RES *word) { + /* Reject the whole of the word if the fraction of rejects exceeds a limit */ + + if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >= + rej_whole_of_mostly_reject_word_fract) { + word->reject_map.rej_word_mostly_rej(); + } +} + +bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { + if (word->best_choice->unichar_lengths().length() <= 1) { + return false; + } + + if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) { + return false; + } + + UNICHAR_ID uch_id = word->best_choice->unichar_id(0); + for (unsigned i = 1; i < word->best_choice->length(); ++i) { + if (word->best_choice->unichar_id(i) != uch_id) { + return false; + } + } + + int16_t char_quality; + int16_t accepted_char_quality; + word_char_quality(word, &char_quality, &accepted_char_quality); + + if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) && + (char_quality == accepted_char_quality)) { + return true; + } else { + return false; + } +} + +int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) { + const WERD_CHOICE &word = *werd_res->best_choice; + int dict_word_type = werd_res->tesseract->dict_word(word); + return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type; +} + +// Note: After running this function word_res->ratings +// might not contain the right BLOB_CHOICE corresponding to each character +// in word_res->best_choice. +void Tesseract::flip_hyphens(WERD_RES *word_res) { + WERD_CHOICE *best_choice = word_res->best_choice; + int prev_right = -9999; + int next_left; + TBOX out_box; + float aspect_ratio; + + if (tessedit_lower_flip_hyphen <= 1) { + return; + } + + auto num_blobs = word_res->rebuild_word->NumBlobs(); + UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); + for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) { + TBLOB *blob = word_res->rebuild_word->blobs[i]; + out_box = blob->bounding_box(); + if (i + 1 == num_blobs) { + next_left = 9999; + } else { + next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left(); + } + // Don't touch small or touching blobs - it is too dangerous. + if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) && + (out_box.right() < next_left)) { + aspect_ratio = out_box.width() / static_cast<float>(out_box.height()); + if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) { + if (aspect_ratio >= tessedit_upper_flip_hyphen && + word_res->uch_set->contains_unichar_id(unichar_dash) && + word_res->uch_set->get_enabled(unichar_dash)) { + /* Certain HYPHEN */ + best_choice->set_unichar_id(unichar_dash, i); + if (word_res->reject_map[i].rejected()) { + word_res->reject_map[i].setrej_hyphen_accept(); + } + } + if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) { + // Suspected HYPHEN + word_res->reject_map[i].setrej_hyphen(); + } + } else if (best_choice->unichar_id(i) == unichar_dash) { + if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) { + word_res->reject_map[i].setrej_hyphen_accept(); + } + // Certain HYPHEN + + if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) { + // Suspected HYPHEN + word_res->reject_map[i].setrej_hyphen(); + } + } + } + prev_right = out_box.right(); + } +} + +// Note: After running this function word_res->ratings +// might not contain the right BLOB_CHOICE corresponding to each character +// in word_res->best_choice. +void Tesseract::flip_0O(WERD_RES *word_res) { + WERD_CHOICE *best_choice = word_res->best_choice; + TBOX out_box; + + if (!tessedit_flip_0O) { + return; + } + + auto num_blobs = word_res->rebuild_word->NumBlobs(); + for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) { + TBLOB *blob = word_res->rebuild_word->blobs[i]; + if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) || + word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) { + out_box = blob->bounding_box(); + if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) || + (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) { + return; // Beware words with sub/superscripts + } + } + } + UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0"); + UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O"); + if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) || + unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) { + return; // 0 or O are not present/enabled in unicharset + } + for (unsigned i = 1; i < best_choice->length(); ++i) { + if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) { + /* A0A */ + if ((i + 1) < best_choice->length() && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) { + best_choice->set_unichar_id(unichar_O, i); + } + /* A00A */ + if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) && + (i + 1) < best_choice->length() && + (best_choice->unichar_id(i + 1) == unichar_0 || + best_choice->unichar_id(i + 1) == unichar_O) && + (i + 2) < best_choice->length() && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) { + best_choice->set_unichar_id(unichar_O, i); + i++; + } + /* AA0<non digit or end of word> */ + if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) && + non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) && + (((i + 1) < best_choice->length() && + !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) && + !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") && + !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) || + (i == best_choice->length() - 1))) { + best_choice->set_unichar_id(unichar_O, i); + } + /* 9O9 */ + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) && + (i + 1) < best_choice->length() && + non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) { + best_choice->set_unichar_id(unichar_0, i); + } + /* 9OOO */ + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) && + (i + 2) < best_choice->length() && + (best_choice->unichar_id(i + 1) == unichar_0 || + best_choice->unichar_id(i + 1) == unichar_O) && + (best_choice->unichar_id(i + 2) == unichar_0 || + best_choice->unichar_id(i + 2) == unichar_O)) { + best_choice->set_unichar_id(unichar_0, i); + best_choice->set_unichar_id(unichar_0, i + 1); + best_choice->set_unichar_id(unichar_0, i + 2); + i += 2; + } + /* 9OO<non upper> */ + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) && + (i + 2) < best_choice->length() && + (best_choice->unichar_id(i + 1) == unichar_0 || + best_choice->unichar_id(i + 1) == unichar_O) && + !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) { + best_choice->set_unichar_id(unichar_0, i); + best_choice->set_unichar_id(unichar_0, i + 1); + i++; + } + /* 9O<non upper> */ + if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) && + (i + 1) < best_choice->length() && + !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) { + best_choice->set_unichar_id(unichar_0, i); + } + /* 9[.,]OOO.. */ + if ((i > 1) && + (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") || + word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) && + (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) || + best_choice->unichar_id(i - 2) == unichar_O)) { + if (best_choice->unichar_id(i - 2) == unichar_O) { + best_choice->set_unichar_id(unichar_0, i - 2); + } + while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O || + best_choice->unichar_id(i) == unichar_0)) { + best_choice->set_unichar_id(unichar_0, i); + i++; + } + i--; + } + } + } +} + +bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) { + return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O"); +} + +bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) { + return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0"); +} +} // namespace tesseract + +#endif // def DISABLED_LEGACY_ENGINE
