Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/dict/permdawg.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 * | |
| 3 * File: permdawg.cpp (Formerly permdawg.c) | |
| 4 * Description: Scale word choices by a dictionary | |
| 5 * Author: Mark Seaman, OCR Technology | |
| 6 * | |
| 7 * (c) Copyright 1987, Hewlett-Packard Company. | |
| 8 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 ** you may not use this file except in compliance with the License. | |
| 10 ** You may obtain a copy of the License at | |
| 11 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 ** Unless required by applicable law or agreed to in writing, software | |
| 13 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 ** See the License for the specific language governing permissions and | |
| 16 ** limitations under the License. | |
| 17 * | |
| 18 *****************************************************************************/ | |
| 19 /*---------------------------------------------------------------------- | |
| 20 I n c l u d e s | |
| 21 ----------------------------------------------------------------------*/ | |
| 22 | |
| 23 #include "dawg.h" | |
| 24 #include "params.h" | |
| 25 #include "stopper.h" | |
| 26 #include "tprintf.h" | |
| 27 | |
| 28 #include <algorithm> | |
| 29 #include <cctype> | |
| 30 #include "dict.h" | |
| 31 | |
| 32 /*---------------------------------------------------------------------- | |
| 33 F u n c t i o n s | |
| 34 ----------------------------------------------------------------------*/ | |
| 35 namespace tesseract { | |
| 36 | |
| 37 /** | |
| 38 * @name go_deeper_dawg_fxn | |
| 39 * | |
| 40 * If the choice being composed so far could be a dictionary word | |
| 41 * keep exploring choices. | |
| 42 */ | |
| 43 void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 44 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, | |
| 45 bool word_ending, WERD_CHOICE *word, float certainties[], | |
| 46 float *limit, WERD_CHOICE *best_choice, int *attempts_left, | |
| 47 void *void_more_args) { | |
| 48 auto *more_args = static_cast<DawgArgs *>(void_more_args); | |
| 49 word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1); | |
| 50 int word_index = word->length() - 1; | |
| 51 if (best_choice->rating() < *limit) { | |
| 52 return; | |
| 53 } | |
| 54 // Look up char in DAWG | |
| 55 | |
| 56 // If the current unichar is an ngram first try calling | |
| 57 // letter_is_okay() for each unigram it contains separately. | |
| 58 UNICHAR_ID orig_uch_id = word->unichar_id(word_index); | |
| 59 bool checked_unigrams = false; | |
| 60 if (getUnicharset().get_isngram(orig_uch_id)) { | |
| 61 if (dawg_debug_level) { | |
| 62 tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str()); | |
| 63 } | |
| 64 int num_unigrams = 0; | |
| 65 word->remove_last_unichar_id(); | |
| 66 std::vector<UNICHAR_ID> encoding; | |
| 67 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); | |
| 68 // Since the string came out of the unicharset, failure is impossible. | |
| 69 ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr)); | |
| 70 bool unigrams_ok = true; | |
| 71 // Construct DawgArgs that reflect the current state. | |
| 72 DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs); | |
| 73 DawgPositionVector unigram_updated_dawgs; | |
| 74 DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter); | |
| 75 // Check unigrams in the ngram with letter_is_okay(). | |
| 76 for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) { | |
| 77 UNICHAR_ID uch_id = encoding[i]; | |
| 78 ASSERT_HOST(uch_id != INVALID_UNICHAR_ID); | |
| 79 ++num_unigrams; | |
| 80 word->append_unichar_id(uch_id, 1, 0.0, 0.0); | |
| 81 unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(), | |
| 82 word->unichar_id(word_index + num_unigrams - 1), | |
| 83 word_ending && i == encoding.size() - 1); | |
| 84 (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); | |
| 85 if (dawg_debug_level) { | |
| 86 tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(), | |
| 87 unigrams_ok ? "OK" : "not OK"); | |
| 88 } | |
| 89 } | |
| 90 // Restore the word and copy the updated dawg state if needed. | |
| 91 while (num_unigrams-- > 0) { | |
| 92 word->remove_last_unichar_id(); | |
| 93 } | |
| 94 word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0); | |
| 95 if (unigrams_ok) { | |
| 96 checked_unigrams = true; | |
| 97 more_args->permuter = unigram_dawg_args.permuter; | |
| 98 *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs); | |
| 99 } | |
| 100 } | |
| 101 | |
| 102 // Check which dawgs from the dawgs_ vector contain the word | |
| 103 // up to and including the current unichar. | |
| 104 if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(), | |
| 105 word->unichar_id(word_index), word_ending)) { | |
| 106 // Add a new word choice | |
| 107 if (word_ending) { | |
| 108 if (dawg_debug_level) { | |
| 109 tprintf("found word = %s\n", word->debug_string().c_str()); | |
| 110 } | |
| 111 if (strcmp(output_ambig_words_file.c_str(), "") != 0) { | |
| 112 if (output_ambig_words_file_ == nullptr) { | |
| 113 output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+"); | |
| 114 if (output_ambig_words_file_ == nullptr) { | |
| 115 tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str()); | |
| 116 exit(1); | |
| 117 } | |
| 118 std::string word_str; | |
| 119 word->string_and_lengths(&word_str, nullptr); | |
| 120 word_str += " "; | |
| 121 fprintf(output_ambig_words_file_, "%s", word_str.c_str()); | |
| 122 } | |
| 123 std::string word_str; | |
| 124 word->string_and_lengths(&word_str, nullptr); | |
| 125 word_str += " "; | |
| 126 fprintf(output_ambig_words_file_, "%s", word_str.c_str()); | |
| 127 } | |
| 128 WERD_CHOICE *adjusted_word = word; | |
| 129 adjusted_word->set_permuter(more_args->permuter); | |
| 130 update_best_choice(*adjusted_word, best_choice); | |
| 131 } else { // search the next letter | |
| 132 // Make updated_* point to the next entries in the DawgPositionVector | |
| 133 // arrays (that were originally created in dawg_permute_and_select) | |
| 134 ++(more_args->updated_dawgs); | |
| 135 // Make active_dawgs and constraints point to the updated ones. | |
| 136 ++(more_args->active_dawgs); | |
| 137 permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, | |
| 138 certainties, limit, best_choice, attempts_left, more_args); | |
| 139 // Restore previous state to explore another letter in this position. | |
| 140 --(more_args->updated_dawgs); | |
| 141 --(more_args->active_dawgs); | |
| 142 } | |
| 143 } else { | |
| 144 if (dawg_debug_level) { | |
| 145 tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str()); | |
| 146 } | |
| 147 } | |
| 148 } | |
| 149 | |
| 150 /** | |
| 151 * dawg_permute_and_select | |
| 152 * | |
| 153 * Recursively explore all the possible character combinations in | |
| 154 * the given char_choices. Use go_deeper_dawg_fxn() to search all the | |
| 155 * dawgs in the dawgs_ vector in parallel and discard invalid words. | |
| 156 * | |
| 157 * Allocate and return a WERD_CHOICE with the best valid word found. | |
| 158 */ | |
| 159 WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 160 float rating_limit) { | |
| 161 auto *best_choice = new WERD_CHOICE(&getUnicharset()); | |
| 162 best_choice->make_bad(); | |
| 163 best_choice->set_rating(rating_limit); | |
| 164 if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) { | |
| 165 return best_choice; | |
| 166 } | |
| 167 auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1]; | |
| 168 init_active_dawgs(&(active_dawgs[0]), true); | |
| 169 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); | |
| 170 WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH); | |
| 171 | |
| 172 float certainties[MAX_WERD_LENGTH]; | |
| 173 this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn; | |
| 174 int attempts_left = max_permuter_attempts; | |
| 175 permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr, | |
| 176 &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args); | |
| 177 delete[] active_dawgs; | |
| 178 return best_choice; | |
| 179 } | |
| 180 | |
| 181 /** | |
| 182 * permute_choices | |
| 183 * | |
| 184 * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST | |
| 185 * with the given char_choice_index in char_choices. | |
| 186 */ | |
| 187 void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 188 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, | |
| 189 WERD_CHOICE *word, float certainties[], float *limit, | |
| 190 WERD_CHOICE *best_choice, int *attempts_left, void *more_args) { | |
| 191 if (debug) { | |
| 192 tprintf( | |
| 193 "%s permute_choices: char_choice_index=%d" | |
| 194 " limit=%g rating=%g, certainty=%g word=%s\n", | |
| 195 debug, char_choice_index, *limit, word->rating(), word->certainty(), | |
| 196 word->debug_string().c_str()); | |
| 197 } | |
| 198 if (static_cast<unsigned>(char_choice_index) < char_choices.size()) { | |
| 199 BLOB_CHOICE_IT blob_choice_it; | |
| 200 blob_choice_it.set_to_list(char_choices.at(char_choice_index)); | |
| 201 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) { | |
| 202 (*attempts_left)--; | |
| 203 append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index, | |
| 204 prev_char_frag_info, word, certainties, limit, best_choice, attempts_left, | |
| 205 more_args); | |
| 206 if (*attempts_left <= 0) { | |
| 207 if (debug) { | |
| 208 tprintf("permute_choices(): attempts_left is 0\n"); | |
| 209 } | |
| 210 break; | |
| 211 } | |
| 212 } | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 /** | |
| 217 * append_choices | |
| 218 * | |
| 219 * Checks to see whether or not the next choice is worth appending to | |
| 220 * the word being generated. If so then keeps going deeper into the word. | |
| 221 * | |
| 222 * This function assumes that Dict::go_deeper_fxn_ is set. | |
| 223 */ | |
| 224 void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, | |
| 225 const BLOB_CHOICE &blob_choice, int char_choice_index, | |
| 226 const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, | |
| 227 float certainties[], float *limit, WERD_CHOICE *best_choice, | |
| 228 int *attempts_left, void *more_args) { | |
| 229 auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1); | |
| 230 | |
| 231 // Deal with fragments. | |
| 232 CHAR_FRAGMENT_INFO char_frag_info; | |
| 233 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(), | |
| 234 prev_char_frag_info, debug, word_ending, &char_frag_info)) { | |
| 235 return; // blob_choice must be an invalid fragment | |
| 236 } | |
| 237 // Search the next letter if this character is a fragment. | |
| 238 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { | |
| 239 permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties, | |
| 240 limit, best_choice, attempts_left, more_args); | |
| 241 return; | |
| 242 } | |
| 243 | |
| 244 // Add the next unichar. | |
| 245 float old_rating = word->rating(); | |
| 246 float old_certainty = word->certainty(); | |
| 247 uint8_t old_permuter = word->permuter(); | |
| 248 certainties[word->length()] = char_frag_info.certainty; | |
| 249 word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments, | |
| 250 char_frag_info.rating, char_frag_info.certainty); | |
| 251 | |
| 252 // Explore the next unichar. | |
| 253 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending, | |
| 254 word, certainties, limit, best_choice, attempts_left, more_args); | |
| 255 | |
| 256 // Remove the unichar we added to explore other choices in it's place. | |
| 257 word->remove_last_unichar_id(); | |
| 258 word->set_rating(old_rating); | |
| 259 word->set_certainty(old_certainty); | |
| 260 word->set_permuter(old_permuter); | |
| 261 } | |
| 262 | |
| 263 /** | |
| 264 * @name fragment_state | |
| 265 * | |
| 266 * Given the current char choice and information about previously seen | |
| 267 * fragments, determines whether adjacent character fragments are | |
| 268 * present and whether they can be concatenated. | |
| 269 * | |
| 270 * The given prev_char_frag_info contains: | |
| 271 * - fragment: if not nullptr contains information about immediately | |
| 272 * preceding fragmented character choice | |
| 273 * - num_fragments: number of fragments that have been used so far | |
| 274 * to construct a character | |
| 275 * - certainty: certainty of the current choice or minimum | |
| 276 * certainty of all fragments concatenated so far | |
| 277 * - rating: rating of the current choice or sum of fragment | |
| 278 * ratings concatenated so far | |
| 279 * | |
| 280 * The output char_frag_info is filled in as follows: | |
| 281 * - character: is set to be nullptr if the choice is a non-matching | |
| 282 * or non-ending fragment piece; is set to unichar of the given choice | |
| 283 * if it represents a regular character or a matching ending fragment | |
| 284 * - fragment,num_fragments,certainty,rating are set as described above | |
| 285 * | |
| 286 * @returns false if a non-matching fragment is discovered, true otherwise. | |
| 287 */ | |
| 288 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, | |
| 289 const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, | |
| 290 int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) { | |
| 291 const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id); | |
| 292 const CHAR_FRAGMENT *prev_fragment = | |
| 293 prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr; | |
| 294 | |
| 295 // Print debug info for fragments. | |
| 296 if (debug && (prev_fragment || this_fragment)) { | |
| 297 tprintf("%s check fragments: choice=%s word_ending=%d\n", debug, | |
| 298 getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending); | |
| 299 if (prev_fragment) { | |
| 300 tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str()); | |
| 301 } | |
| 302 if (this_fragment) { | |
| 303 tprintf("this_fragment %s\n", this_fragment->to_string().c_str()); | |
| 304 } | |
| 305 } | |
| 306 | |
| 307 char_frag_info->unichar_id = curr_unichar_id; | |
| 308 char_frag_info->fragment = this_fragment; | |
| 309 char_frag_info->rating = curr_rating; | |
| 310 char_frag_info->certainty = curr_certainty; | |
| 311 char_frag_info->num_fragments = 1; | |
| 312 if (prev_fragment && !this_fragment) { | |
| 313 if (debug) { | |
| 314 tprintf("Skip choice with incomplete fragment\n"); | |
| 315 } | |
| 316 return false; | |
| 317 } | |
| 318 if (this_fragment) { | |
| 319 // We are dealing with a fragment. | |
| 320 char_frag_info->unichar_id = INVALID_UNICHAR_ID; | |
| 321 if (prev_fragment) { | |
| 322 if (!this_fragment->is_continuation_of(prev_fragment)) { | |
| 323 if (debug) { | |
| 324 tprintf("Non-matching fragment piece\n"); | |
| 325 } | |
| 326 return false; | |
| 327 } | |
| 328 if (this_fragment->is_ending()) { | |
| 329 char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar()); | |
| 330 char_frag_info->fragment = nullptr; | |
| 331 if (debug) { | |
| 332 tprintf("Built character %s from fragments\n", | |
| 333 getUnicharset().debug_str(char_frag_info->unichar_id).c_str()); | |
| 334 } | |
| 335 } else { | |
| 336 if (debug) { | |
| 337 tprintf("Record fragment continuation\n"); | |
| 338 } | |
| 339 char_frag_info->fragment = this_fragment; | |
| 340 } | |
| 341 // Update certainty and rating. | |
| 342 char_frag_info->rating = prev_char_frag_info->rating + curr_rating; | |
| 343 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1; | |
| 344 char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty); | |
| 345 } else { | |
| 346 if (this_fragment->is_beginning()) { | |
| 347 if (debug) { | |
| 348 tprintf("Record fragment beginning\n"); | |
| 349 } | |
| 350 } else { | |
| 351 if (debug) { | |
| 352 tprintf("Non-starting fragment piece with no prev_fragment\n"); | |
| 353 } | |
| 354 return false; | |
| 355 } | |
| 356 } | |
| 357 } | |
| 358 if (word_ending && char_frag_info->fragment) { | |
| 359 if (debug) { | |
| 360 tprintf("Word cannot end with a fragment\n"); | |
| 361 } | |
| 362 return false; | |
| 363 } | |
| 364 return true; | |
| 365 } | |
| 366 | |
| 367 } // namespace tesseract |
