comparison mupdf-source/thirdparty/tesseract/src/dict/dawg.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /********************************************************************************
2 *
3 * File: dawg.cpp (Formerly dawg.c)
4 * Description: Use a Directed Acyclic Word Graph
5 * Author: Mark Seaman, OCR Technology
6 *
7 * (c) Copyright 1987, Hewlett-Packard Company.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 *********************************************************************************/
19 /*----------------------------------------------------------------------
20 I n c l u d e s
21 ----------------------------------------------------------------------*/
22
23 #include "dawg.h"
24
25 #include "dict.h"
26 #include "helpers.h"
27 #include "tprintf.h"
28
29 #include <memory>
30
31 /*----------------------------------------------------------------------
32 F u n c t i o n s f o r D a w g
33 ----------------------------------------------------------------------*/
34 namespace tesseract {
35
36 // Destructor.
37 // It is defined here, so the compiler can create a single vtable
38 // instead of weak vtables in every compilation unit.
39 Dawg::~Dawg() = default;
40
41 bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
42 bool requires_complete) const {
43 if (word.empty()) {
44 return !requires_complete;
45 }
46 NODE_REF node = 0;
47 int end_index = word.length() - 1;
48 for (int i = 0; i < end_index; i++) {
49 EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
50 if (edge == NO_EDGE) {
51 return false;
52 }
53 if ((node = next_node(edge)) == 0) {
54 // This only happens if all words following this edge terminate --
55 // there are no larger words. See Trie::add_word_to_dawg()
56 return false;
57 }
58 }
59 // Now check the last character.
60 return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
61 NO_EDGE;
62 }
63
64 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
65 return prefix_in_dawg(word, true);
66 }
67
68 int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset,
69 bool enable_wildcard) const {
70 if (filename == nullptr) {
71 return 0;
72 }
73
74 FILE *word_file;
75 char string[CHARS_PER_LINE];
76 int misses = 0;
77 UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
78
79 word_file = fopen(filename, "r");
80 if (word_file == nullptr) {
81 tprintf("Error: Could not open file %s\n", filename);
82 ASSERT_HOST(word_file);
83 }
84
85 while (fgets(string, CHARS_PER_LINE, word_file) != nullptr) {
86 chomp_string(string); // remove newline
87 WERD_CHOICE word(string, unicharset);
88 if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
89 if (!match_words(&word, 0, 0,
90 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
91 tprintf("Missing word: %s\n", string);
92 ++misses;
93 }
94 } else {
95 tprintf("Failed to create a valid word from %s\n", string);
96 }
97 }
98 fclose(word_file);
99 // Make sure the user sees this with fprintf instead of tprintf.
100 if (debug_level_) {
101 tprintf("Number of lost words=%d\n", misses);
102 }
103 return misses;
104 }
105
106 void Dawg::iterate_words(const UNICHARSET &unicharset,
107 std::function<void(const WERD_CHOICE *)> cb) const {
108 WERD_CHOICE word(&unicharset);
109 iterate_words_rec(word, 0, cb);
110 }
111
112 static void CallWithUTF8(const std::function<void(const char *)> &cb,
113 const WERD_CHOICE *wc) {
114 std::string s;
115 wc->string_and_lengths(&s, nullptr);
116 cb(s.c_str());
117 }
118
119 void Dawg::iterate_words(const UNICHARSET &unicharset,
120 const std::function<void(const char *)> &cb) const {
121 using namespace std::placeholders; // for _1
122 std::function<void(const WERD_CHOICE *)> shim(
123 std::bind(CallWithUTF8, cb, _1));
124 WERD_CHOICE word(&unicharset);
125 iterate_words_rec(word, 0, shim);
126 }
127
128 void Dawg::iterate_words_rec(
129 const WERD_CHOICE &word_so_far, NODE_REF to_explore,
130 const std::function<void(const WERD_CHOICE *)> &cb) const {
131 NodeChildVector children;
132 this->unichar_ids_of(to_explore, &children, false);
133 for (auto &i : children) {
134 WERD_CHOICE next_word(word_so_far);
135 next_word.append_unichar_id(i.unichar_id, 1, 0.0, 0.0);
136 if (this->end_of_word(i.edge_ref)) {
137 cb(&next_word);
138 }
139 NODE_REF next = next_node(i.edge_ref);
140 if (next != 0) {
141 iterate_words_rec(next_word, next, cb);
142 }
143 }
144 }
145
146 bool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node,
147 UNICHAR_ID wildcard) const {
148 if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
149 bool any_matched = false;
150 NodeChildVector vec;
151 this->unichar_ids_of(node, &vec, false);
152 for (auto &i : vec) {
153 word->set_unichar_id(i.unichar_id, index);
154 if (match_words(word, index, node, wildcard)) {
155 any_matched = true;
156 }
157 }
158 word->set_unichar_id(wildcard, index);
159 return any_matched;
160 } else {
161 auto word_end = index == word->length() - 1;
162 auto edge = edge_char_of(node, word->unichar_id(index), word_end);
163 if (edge != NO_EDGE) { // normal edge in DAWG
164 node = next_node(edge);
165 if (word_end) {
166 if (debug_level_ > 1) {
167 word->print("match_words() found: ");
168 }
169 return true;
170 } else if (node != 0) {
171 return match_words(word, index + 1, node, wildcard);
172 }
173 }
174 }
175 return false;
176 }
177
178 void Dawg::init(int unicharset_size) {
179 ASSERT_HOST(unicharset_size > 0);
180 unicharset_size_ = unicharset_size;
181 // Set bit masks. We will use the value unicharset_size_ as a null char, so
182 // the actual number of unichars is unicharset_size_ + 1.
183 flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
184 next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;
185 letter_mask_ = ~(~0ull << flag_start_bit_);
186 next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);
187 flags_mask_ = ~(letter_mask_ | next_node_mask_);
188 }
189
190 /*----------------------------------------------------------------------
191 F u n c t i o n s f o r S q u i s h e d D a w g
192 ----------------------------------------------------------------------*/
193
194 SquishedDawg::~SquishedDawg() {
195 delete[] edges_;
196 }
197
198 EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
199 bool word_end) const {
200 EDGE_REF edge = node;
201 if (node == 0) { // binary search
202 EDGE_REF start = 0;
203 EDGE_REF end = num_forward_edges_in_node0 - 1;
204 int compare;
205 while (start <= end) {
206 edge = (start + end) >> 1; // (start + end) / 2
207 compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id,
208 edges_[edge]);
209 if (compare == 0) { // given == vec[k]
210 return edge;
211 } else if (compare == 1) { // given > vec[k]
212 start = edge + 1;
213 } else { // given < vec[k]
214 end = edge - 1;
215 }
216 }
217 } else { // linear search
218 if (edge != NO_EDGE && edge_occupied(edge)) {
219 do {
220 if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
221 (!word_end || end_of_word_from_edge_rec(edges_[edge]))) {
222 return (edge);
223 }
224 } while (!last_edge(edge++));
225 }
226 }
227 return (NO_EDGE); // not found
228 }
229
230 int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
231 EDGE_REF edge = node;
232 int32_t num = 0;
233
234 if (forward_edge(edge)) {
235 do {
236 num++;
237 } while (!last_edge(edge++));
238 }
239
240 return (num);
241 }
242
243 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
244 if (node == NO_EDGE) {
245 return; // nothing to print
246 }
247
248 EDGE_REF edge = node;
249 const char *forward_string = "FORWARD";
250 const char *backward_string = " ";
251
252 const char *last_string = "LAST";
253 const char *not_last_string = " ";
254
255 const char *eow_string = "EOW";
256 const char *not_eow_string = " ";
257
258 const char *direction;
259 const char *is_last;
260 const char *eow;
261
262 UNICHAR_ID unichar_id;
263
264 if (edge_occupied(edge)) {
265 do {
266 direction = forward_edge(edge) ? forward_string : backward_string;
267 is_last = last_edge(edge) ? last_string : not_last_string;
268 eow = end_of_word(edge) ? eow_string : not_eow_string;
269
270 unichar_id = edge_letter(edge);
271 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
272 edge, next_node(edge), unichar_id, direction, is_last, eow);
273
274 if (edge - node > max_num_edges) {
275 return;
276 }
277 } while (!last_edge(edge++));
278
279 if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) {
280 do {
281 direction = forward_edge(edge) ? forward_string : backward_string;
282 is_last = last_edge(edge) ? last_string : not_last_string;
283 eow = end_of_word(edge) ? eow_string : not_eow_string;
284
285 unichar_id = edge_letter(edge);
286 tprintf(REFFORMAT " : next = " REFFORMAT
287 ", unichar_id = %d, %s %s %s\n",
288 edge, next_node(edge), unichar_id, direction, is_last, eow);
289
290 if (edge - node > MAX_NODE_EDGES_DISPLAY) {
291 return;
292 }
293 } while (!last_edge(edge++));
294 }
295 } else {
296 tprintf(REFFORMAT " : no edges in this node\n", node);
297 }
298 tprintf("\n");
299 }
300
301 void SquishedDawg::print_edge(EDGE_REF edge) const {
302 if (edge == NO_EDGE) {
303 tprintf("NO_EDGE\n");
304 } else {
305 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = '%d', %s %s %s\n",
306 edge, next_node(edge), edge_letter(edge),
307 (forward_edge(edge) ? "FORWARD" : " "),
308 (last_edge(edge) ? "LAST" : " "),
309 (end_of_word(edge) ? "EOW" : ""));
310 }
311 }
312
313 bool SquishedDawg::read_squished_dawg(TFile *file) {
314 if (debug_level_) {
315 tprintf("Reading squished dawg\n");
316 }
317
318 // Read the magic number and check that it matches kDawgMagicNumber, as
319 // auto-endian fixing should make sure it is always correct.
320 int16_t magic;
321 if (!file->DeSerialize(&magic)) {
322 return false;
323 }
324 if (magic != kDawgMagicNumber) {
325 tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
326 return false;
327 }
328
329 int32_t unicharset_size;
330 if (!file->DeSerialize(&unicharset_size)) {
331 return false;
332 }
333 if (!file->DeSerialize(&num_edges_)) {
334 return false;
335 }
336 ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
337 Dawg::init(unicharset_size);
338
339 edges_ = new EDGE_RECORD[num_edges_];
340 if (!file->DeSerialize(&edges_[0], num_edges_)) {
341 return false;
342 }
343 if (debug_level_ > 2) {
344 tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
345 type_, lang_.c_str(), perm_, unicharset_size_, num_edges_);
346 for (EDGE_REF edge = 0; edge < num_edges_; ++edge) {
347 print_edge(edge);
348 }
349 }
350 return true;
351 }
352
353 std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
354 int32_t *num_nodes) const {
355 EDGE_REF edge;
356 std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
357 int32_t node_counter;
358 int32_t num_edges;
359
360 for (edge = 0; edge < num_edges_; edge++) { // init all slots
361 node_map[edge] = -1;
362 }
363
364 node_counter = num_forward_edges(0);
365
366 *num_nodes = 0;
367 for (edge = 0; edge < num_edges_; edge++) { // search all slots
368
369 if (forward_edge(edge)) {
370 (*num_nodes)++; // count nodes links
371 node_map[edge] = (edge ? node_counter : 0);
372 num_edges = num_forward_edges(edge);
373 if (edge != 0) {
374 node_counter += num_edges;
375 }
376 edge += num_edges;
377 if (edge >= num_edges_) {
378 break;
379 }
380 if (backward_edge(edge)) {
381 while (!last_edge(edge++)) {
382 ;
383 }
384 }
385 edge--;
386 }
387 }
388 return node_map;
389 }
390
391 bool SquishedDawg::write_squished_dawg(TFile *file) {
392 EDGE_REF edge;
393 int32_t num_edges;
394 int32_t node_count = 0;
395 EDGE_REF old_index;
396 EDGE_RECORD temp_record;
397
398 if (debug_level_) {
399 tprintf("write_squished_dawg\n");
400 }
401
402 std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
403
404 // Write the magic number to help detecting a change in endianness.
405 int16_t magic = kDawgMagicNumber;
406 if (!file->Serialize(&magic)) {
407 return false;
408 }
409 if (!file->Serialize(&unicharset_size_)) {
410 return false;
411 }
412
413 // Count the number of edges in this Dawg.
414 num_edges = 0;
415 for (edge = 0; edge < num_edges_; edge++) {
416 if (forward_edge(edge)) {
417 num_edges++;
418 }
419 }
420
421 // Write edge count to file.
422 if (!file->Serialize(&num_edges)) {
423 return false;
424 }
425
426 if (debug_level_) {
427 tprintf("%d nodes in DAWG\n", node_count);
428 tprintf("%d edges in DAWG\n", num_edges);
429 }
430
431 for (edge = 0; edge < num_edges_; edge++) {
432 if (forward_edge(edge)) { // write forward edges
433 do {
434 old_index = next_node_from_edge_rec(edges_[edge]);
435 set_next_node(edge, node_map[old_index]);
436 temp_record = edges_[edge];
437 if (!file->Serialize(&temp_record)) {
438 return false;
439 }
440 set_next_node(edge, old_index);
441 } while (!last_edge(edge++));
442
443 if (edge >= num_edges_) {
444 break;
445 }
446 if (backward_edge(edge)) { // skip back links
447 while (!last_edge(edge++)) {
448 ;
449 }
450 }
451
452 edge--;
453 }
454 }
455 return true;
456 }
457
458 } // namespace tesseract