comparison mupdf-source/thirdparty/tesseract/src/dict/permdawg.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************************
2 *
3 * File: permdawg.cpp (Formerly permdawg.c)
4 * Description: Scale word choices by a dictionary
5 * Author: Mark Seaman, OCR Technology
6 *
7 * (c) Copyright 1987, Hewlett-Packard Company.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 *****************************************************************************/
19 /*----------------------------------------------------------------------
20 I n c l u d e s
21 ----------------------------------------------------------------------*/
22
23 #include "dawg.h"
24 #include "params.h"
25 #include "stopper.h"
26 #include "tprintf.h"
27
28 #include <algorithm>
29 #include <cctype>
30 #include "dict.h"
31
32 /*----------------------------------------------------------------------
33 F u n c t i o n s
34 ----------------------------------------------------------------------*/
35 namespace tesseract {
36
37 /**
38 * @name go_deeper_dawg_fxn
39 *
40 * If the choice being composed so far could be a dictionary word
41 * keep exploring choices.
42 */
43 void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
44 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
45 bool word_ending, WERD_CHOICE *word, float certainties[],
46 float *limit, WERD_CHOICE *best_choice, int *attempts_left,
47 void *void_more_args) {
48 auto *more_args = static_cast<DawgArgs *>(void_more_args);
49 word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
50 int word_index = word->length() - 1;
51 if (best_choice->rating() < *limit) {
52 return;
53 }
54 // Look up char in DAWG
55
56 // If the current unichar is an ngram first try calling
57 // letter_is_okay() for each unigram it contains separately.
58 UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
59 bool checked_unigrams = false;
60 if (getUnicharset().get_isngram(orig_uch_id)) {
61 if (dawg_debug_level) {
62 tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
63 }
64 int num_unigrams = 0;
65 word->remove_last_unichar_id();
66 std::vector<UNICHAR_ID> encoding;
67 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68 // Since the string came out of the unicharset, failure is impossible.
69 ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
70 bool unigrams_ok = true;
71 // Construct DawgArgs that reflect the current state.
72 DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73 DawgPositionVector unigram_updated_dawgs;
74 DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
75 // Check unigrams in the ngram with letter_is_okay().
76 for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
77 UNICHAR_ID uch_id = encoding[i];
78 ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
79 ++num_unigrams;
80 word->append_unichar_id(uch_id, 1, 0.0, 0.0);
81 unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
82 word->unichar_id(word_index + num_unigrams - 1),
83 word_ending && i == encoding.size() - 1);
84 (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
85 if (dawg_debug_level) {
86 tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
87 unigrams_ok ? "OK" : "not OK");
88 }
89 }
90 // Restore the word and copy the updated dawg state if needed.
91 while (num_unigrams-- > 0) {
92 word->remove_last_unichar_id();
93 }
94 word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
95 if (unigrams_ok) {
96 checked_unigrams = true;
97 more_args->permuter = unigram_dawg_args.permuter;
98 *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
99 }
100 }
101
102 // Check which dawgs from the dawgs_ vector contain the word
103 // up to and including the current unichar.
104 if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
105 word->unichar_id(word_index), word_ending)) {
106 // Add a new word choice
107 if (word_ending) {
108 if (dawg_debug_level) {
109 tprintf("found word = %s\n", word->debug_string().c_str());
110 }
111 if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
112 if (output_ambig_words_file_ == nullptr) {
113 output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
114 if (output_ambig_words_file_ == nullptr) {
115 tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
116 exit(1);
117 }
118 std::string word_str;
119 word->string_and_lengths(&word_str, nullptr);
120 word_str += " ";
121 fprintf(output_ambig_words_file_, "%s", word_str.c_str());
122 }
123 std::string word_str;
124 word->string_and_lengths(&word_str, nullptr);
125 word_str += " ";
126 fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127 }
128 WERD_CHOICE *adjusted_word = word;
129 adjusted_word->set_permuter(more_args->permuter);
130 update_best_choice(*adjusted_word, best_choice);
131 } else { // search the next letter
132 // Make updated_* point to the next entries in the DawgPositionVector
133 // arrays (that were originally created in dawg_permute_and_select)
134 ++(more_args->updated_dawgs);
135 // Make active_dawgs and constraints point to the updated ones.
136 ++(more_args->active_dawgs);
137 permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
138 certainties, limit, best_choice, attempts_left, more_args);
139 // Restore previous state to explore another letter in this position.
140 --(more_args->updated_dawgs);
141 --(more_args->active_dawgs);
142 }
143 } else {
144 if (dawg_debug_level) {
145 tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
146 }
147 }
148 }
149
150 /**
151 * dawg_permute_and_select
152 *
153 * Recursively explore all the possible character combinations in
154 * the given char_choices. Use go_deeper_dawg_fxn() to search all the
155 * dawgs in the dawgs_ vector in parallel and discard invalid words.
156 *
157 * Allocate and return a WERD_CHOICE with the best valid word found.
158 */
159 WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
160 float rating_limit) {
161 auto *best_choice = new WERD_CHOICE(&getUnicharset());
162 best_choice->make_bad();
163 best_choice->set_rating(rating_limit);
164 if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
165 return best_choice;
166 }
167 auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
168 init_active_dawgs(&(active_dawgs[0]), true);
169 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
170 WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
171
172 float certainties[MAX_WERD_LENGTH];
173 this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
174 int attempts_left = max_permuter_attempts;
175 permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
176 &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
177 delete[] active_dawgs;
178 return best_choice;
179 }
180
181 /**
182 * permute_choices
183 *
184 * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
185 * with the given char_choice_index in char_choices.
186 */
187 void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
188 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
189 WERD_CHOICE *word, float certainties[], float *limit,
190 WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
191 if (debug) {
192 tprintf(
193 "%s permute_choices: char_choice_index=%d"
194 " limit=%g rating=%g, certainty=%g word=%s\n",
195 debug, char_choice_index, *limit, word->rating(), word->certainty(),
196 word->debug_string().c_str());
197 }
198 if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
199 BLOB_CHOICE_IT blob_choice_it;
200 blob_choice_it.set_to_list(char_choices.at(char_choice_index));
201 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
202 (*attempts_left)--;
203 append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
204 prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
205 more_args);
206 if (*attempts_left <= 0) {
207 if (debug) {
208 tprintf("permute_choices(): attempts_left is 0\n");
209 }
210 break;
211 }
212 }
213 }
214 }
215
216 /**
217 * append_choices
218 *
219 * Checks to see whether or not the next choice is worth appending to
220 * the word being generated. If so then keeps going deeper into the word.
221 *
222 * This function assumes that Dict::go_deeper_fxn_ is set.
223 */
224 void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
225 const BLOB_CHOICE &blob_choice, int char_choice_index,
226 const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
227 float certainties[], float *limit, WERD_CHOICE *best_choice,
228 int *attempts_left, void *more_args) {
229 auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
230
231 // Deal with fragments.
232 CHAR_FRAGMENT_INFO char_frag_info;
233 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
234 prev_char_frag_info, debug, word_ending, &char_frag_info)) {
235 return; // blob_choice must be an invalid fragment
236 }
237 // Search the next letter if this character is a fragment.
238 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
239 permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
240 limit, best_choice, attempts_left, more_args);
241 return;
242 }
243
244 // Add the next unichar.
245 float old_rating = word->rating();
246 float old_certainty = word->certainty();
247 uint8_t old_permuter = word->permuter();
248 certainties[word->length()] = char_frag_info.certainty;
249 word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
250 char_frag_info.rating, char_frag_info.certainty);
251
252 // Explore the next unichar.
253 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
254 word, certainties, limit, best_choice, attempts_left, more_args);
255
256 // Remove the unichar we added to explore other choices in it's place.
257 word->remove_last_unichar_id();
258 word->set_rating(old_rating);
259 word->set_certainty(old_certainty);
260 word->set_permuter(old_permuter);
261 }
262
263 /**
264 * @name fragment_state
265 *
266 * Given the current char choice and information about previously seen
267 * fragments, determines whether adjacent character fragments are
268 * present and whether they can be concatenated.
269 *
270 * The given prev_char_frag_info contains:
271 * - fragment: if not nullptr contains information about immediately
272 * preceding fragmented character choice
273 * - num_fragments: number of fragments that have been used so far
274 * to construct a character
275 * - certainty: certainty of the current choice or minimum
276 * certainty of all fragments concatenated so far
277 * - rating: rating of the current choice or sum of fragment
278 * ratings concatenated so far
279 *
280 * The output char_frag_info is filled in as follows:
281 * - character: is set to be nullptr if the choice is a non-matching
282 * or non-ending fragment piece; is set to unichar of the given choice
283 * if it represents a regular character or a matching ending fragment
284 * - fragment,num_fragments,certainty,rating are set as described above
285 *
286 * @returns false if a non-matching fragment is discovered, true otherwise.
287 */
288 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
289 const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
290 int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {
291 const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
292 const CHAR_FRAGMENT *prev_fragment =
293 prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
294
295 // Print debug info for fragments.
296 if (debug && (prev_fragment || this_fragment)) {
297 tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
298 getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
299 if (prev_fragment) {
300 tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
301 }
302 if (this_fragment) {
303 tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
304 }
305 }
306
307 char_frag_info->unichar_id = curr_unichar_id;
308 char_frag_info->fragment = this_fragment;
309 char_frag_info->rating = curr_rating;
310 char_frag_info->certainty = curr_certainty;
311 char_frag_info->num_fragments = 1;
312 if (prev_fragment && !this_fragment) {
313 if (debug) {
314 tprintf("Skip choice with incomplete fragment\n");
315 }
316 return false;
317 }
318 if (this_fragment) {
319 // We are dealing with a fragment.
320 char_frag_info->unichar_id = INVALID_UNICHAR_ID;
321 if (prev_fragment) {
322 if (!this_fragment->is_continuation_of(prev_fragment)) {
323 if (debug) {
324 tprintf("Non-matching fragment piece\n");
325 }
326 return false;
327 }
328 if (this_fragment->is_ending()) {
329 char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
330 char_frag_info->fragment = nullptr;
331 if (debug) {
332 tprintf("Built character %s from fragments\n",
333 getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
334 }
335 } else {
336 if (debug) {
337 tprintf("Record fragment continuation\n");
338 }
339 char_frag_info->fragment = this_fragment;
340 }
341 // Update certainty and rating.
342 char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
343 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
344 char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
345 } else {
346 if (this_fragment->is_beginning()) {
347 if (debug) {
348 tprintf("Record fragment beginning\n");
349 }
350 } else {
351 if (debug) {
352 tprintf("Non-starting fragment piece with no prev_fragment\n");
353 }
354 return false;
355 }
356 }
357 }
358 if (word_ending && char_frag_info->fragment) {
359 if (debug) {
360 tprintf("Word cannot end with a fragment\n");
361 }
362 return false;
363 }
364 return true;
365 }
366
367 } // namespace tesseract