Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/dict/dawg.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /******************************************************************************** | |
| 2 * | |
| 3 * File: dawg.cpp (Formerly dawg.c) | |
| 4 * Description: Use a Directed Acyclic Word Graph | |
| 5 * Author: Mark Seaman, OCR Technology | |
| 6 * | |
| 7 * (c) Copyright 1987, Hewlett-Packard Company. | |
| 8 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 ** you may not use this file except in compliance with the License. | |
| 10 ** You may obtain a copy of the License at | |
| 11 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 ** Unless required by applicable law or agreed to in writing, software | |
| 13 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 ** See the License for the specific language governing permissions and | |
| 16 ** limitations under the License. | |
| 17 * | |
| 18 *********************************************************************************/ | |
| 19 /*---------------------------------------------------------------------- | |
| 20 I n c l u d e s | |
| 21 ----------------------------------------------------------------------*/ | |
| 22 | |
| 23 #include "dawg.h" | |
| 24 | |
| 25 #include "dict.h" | |
| 26 #include "helpers.h" | |
| 27 #include "tprintf.h" | |
| 28 | |
| 29 #include <memory> | |
| 30 | |
| 31 /*---------------------------------------------------------------------- | |
| 32 F u n c t i o n s f o r D a w g | |
| 33 ----------------------------------------------------------------------*/ | |
| 34 namespace tesseract { | |
| 35 | |
| 36 // Destructor. | |
| 37 // It is defined here, so the compiler can create a single vtable | |
| 38 // instead of weak vtables in every compilation unit. | |
| 39 Dawg::~Dawg() = default; | |
| 40 | |
| 41 bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, | |
| 42 bool requires_complete) const { | |
| 43 if (word.empty()) { | |
| 44 return !requires_complete; | |
| 45 } | |
| 46 NODE_REF node = 0; | |
| 47 int end_index = word.length() - 1; | |
| 48 for (int i = 0; i < end_index; i++) { | |
| 49 EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); | |
| 50 if (edge == NO_EDGE) { | |
| 51 return false; | |
| 52 } | |
| 53 if ((node = next_node(edge)) == 0) { | |
| 54 // This only happens if all words following this edge terminate -- | |
| 55 // there are no larger words. See Trie::add_word_to_dawg() | |
| 56 return false; | |
| 57 } | |
| 58 } | |
| 59 // Now check the last character. | |
| 60 return edge_char_of(node, word.unichar_id(end_index), requires_complete) != | |
| 61 NO_EDGE; | |
| 62 } | |
| 63 | |
| 64 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const { | |
| 65 return prefix_in_dawg(word, true); | |
| 66 } | |
| 67 | |
| 68 int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset, | |
| 69 bool enable_wildcard) const { | |
| 70 if (filename == nullptr) { | |
| 71 return 0; | |
| 72 } | |
| 73 | |
| 74 FILE *word_file; | |
| 75 char string[CHARS_PER_LINE]; | |
| 76 int misses = 0; | |
| 77 UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); | |
| 78 | |
| 79 word_file = fopen(filename, "r"); | |
| 80 if (word_file == nullptr) { | |
| 81 tprintf("Error: Could not open file %s\n", filename); | |
| 82 ASSERT_HOST(word_file); | |
| 83 } | |
| 84 | |
| 85 while (fgets(string, CHARS_PER_LINE, word_file) != nullptr) { | |
| 86 chomp_string(string); // remove newline | |
| 87 WERD_CHOICE word(string, unicharset); | |
| 88 if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { | |
| 89 if (!match_words(&word, 0, 0, | |
| 90 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { | |
| 91 tprintf("Missing word: %s\n", string); | |
| 92 ++misses; | |
| 93 } | |
| 94 } else { | |
| 95 tprintf("Failed to create a valid word from %s\n", string); | |
| 96 } | |
| 97 } | |
| 98 fclose(word_file); | |
| 99 // Make sure the user sees this with fprintf instead of tprintf. | |
| 100 if (debug_level_) { | |
| 101 tprintf("Number of lost words=%d\n", misses); | |
| 102 } | |
| 103 return misses; | |
| 104 } | |
| 105 | |
| 106 void Dawg::iterate_words(const UNICHARSET &unicharset, | |
| 107 std::function<void(const WERD_CHOICE *)> cb) const { | |
| 108 WERD_CHOICE word(&unicharset); | |
| 109 iterate_words_rec(word, 0, cb); | |
| 110 } | |
| 111 | |
| 112 static void CallWithUTF8(const std::function<void(const char *)> &cb, | |
| 113 const WERD_CHOICE *wc) { | |
| 114 std::string s; | |
| 115 wc->string_and_lengths(&s, nullptr); | |
| 116 cb(s.c_str()); | |
| 117 } | |
| 118 | |
| 119 void Dawg::iterate_words(const UNICHARSET &unicharset, | |
| 120 const std::function<void(const char *)> &cb) const { | |
| 121 using namespace std::placeholders; // for _1 | |
| 122 std::function<void(const WERD_CHOICE *)> shim( | |
| 123 std::bind(CallWithUTF8, cb, _1)); | |
| 124 WERD_CHOICE word(&unicharset); | |
| 125 iterate_words_rec(word, 0, shim); | |
| 126 } | |
| 127 | |
| 128 void Dawg::iterate_words_rec( | |
| 129 const WERD_CHOICE &word_so_far, NODE_REF to_explore, | |
| 130 const std::function<void(const WERD_CHOICE *)> &cb) const { | |
| 131 NodeChildVector children; | |
| 132 this->unichar_ids_of(to_explore, &children, false); | |
| 133 for (auto &i : children) { | |
| 134 WERD_CHOICE next_word(word_so_far); | |
| 135 next_word.append_unichar_id(i.unichar_id, 1, 0.0, 0.0); | |
| 136 if (this->end_of_word(i.edge_ref)) { | |
| 137 cb(&next_word); | |
| 138 } | |
| 139 NODE_REF next = next_node(i.edge_ref); | |
| 140 if (next != 0) { | |
| 141 iterate_words_rec(next_word, next, cb); | |
| 142 } | |
| 143 } | |
| 144 } | |
| 145 | |
| 146 bool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, | |
| 147 UNICHAR_ID wildcard) const { | |
| 148 if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) { | |
| 149 bool any_matched = false; | |
| 150 NodeChildVector vec; | |
| 151 this->unichar_ids_of(node, &vec, false); | |
| 152 for (auto &i : vec) { | |
| 153 word->set_unichar_id(i.unichar_id, index); | |
| 154 if (match_words(word, index, node, wildcard)) { | |
| 155 any_matched = true; | |
| 156 } | |
| 157 } | |
| 158 word->set_unichar_id(wildcard, index); | |
| 159 return any_matched; | |
| 160 } else { | |
| 161 auto word_end = index == word->length() - 1; | |
| 162 auto edge = edge_char_of(node, word->unichar_id(index), word_end); | |
| 163 if (edge != NO_EDGE) { // normal edge in DAWG | |
| 164 node = next_node(edge); | |
| 165 if (word_end) { | |
| 166 if (debug_level_ > 1) { | |
| 167 word->print("match_words() found: "); | |
| 168 } | |
| 169 return true; | |
| 170 } else if (node != 0) { | |
| 171 return match_words(word, index + 1, node, wildcard); | |
| 172 } | |
| 173 } | |
| 174 } | |
| 175 return false; | |
| 176 } | |
| 177 | |
| 178 void Dawg::init(int unicharset_size) { | |
| 179 ASSERT_HOST(unicharset_size > 0); | |
| 180 unicharset_size_ = unicharset_size; | |
| 181 // Set bit masks. We will use the value unicharset_size_ as a null char, so | |
| 182 // the actual number of unichars is unicharset_size_ + 1. | |
| 183 flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0)); | |
| 184 next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS; | |
| 185 letter_mask_ = ~(~0ull << flag_start_bit_); | |
| 186 next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS); | |
| 187 flags_mask_ = ~(letter_mask_ | next_node_mask_); | |
| 188 } | |
| 189 | |
| 190 /*---------------------------------------------------------------------- | |
| 191 F u n c t i o n s f o r S q u i s h e d D a w g | |
| 192 ----------------------------------------------------------------------*/ | |
| 193 | |
| 194 SquishedDawg::~SquishedDawg() { | |
| 195 delete[] edges_; | |
| 196 } | |
| 197 | |
| 198 EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, | |
| 199 bool word_end) const { | |
| 200 EDGE_REF edge = node; | |
| 201 if (node == 0) { // binary search | |
| 202 EDGE_REF start = 0; | |
| 203 EDGE_REF end = num_forward_edges_in_node0 - 1; | |
| 204 int compare; | |
| 205 while (start <= end) { | |
| 206 edge = (start + end) >> 1; // (start + end) / 2 | |
| 207 compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id, | |
| 208 edges_[edge]); | |
| 209 if (compare == 0) { // given == vec[k] | |
| 210 return edge; | |
| 211 } else if (compare == 1) { // given > vec[k] | |
| 212 start = edge + 1; | |
| 213 } else { // given < vec[k] | |
| 214 end = edge - 1; | |
| 215 } | |
| 216 } | |
| 217 } else { // linear search | |
| 218 if (edge != NO_EDGE && edge_occupied(edge)) { | |
| 219 do { | |
| 220 if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) && | |
| 221 (!word_end || end_of_word_from_edge_rec(edges_[edge]))) { | |
| 222 return (edge); | |
| 223 } | |
| 224 } while (!last_edge(edge++)); | |
| 225 } | |
| 226 } | |
| 227 return (NO_EDGE); // not found | |
| 228 } | |
| 229 | |
| 230 int32_t SquishedDawg::num_forward_edges(NODE_REF node) const { | |
| 231 EDGE_REF edge = node; | |
| 232 int32_t num = 0; | |
| 233 | |
| 234 if (forward_edge(edge)) { | |
| 235 do { | |
| 236 num++; | |
| 237 } while (!last_edge(edge++)); | |
| 238 } | |
| 239 | |
| 240 return (num); | |
| 241 } | |
| 242 | |
| 243 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const { | |
| 244 if (node == NO_EDGE) { | |
| 245 return; // nothing to print | |
| 246 } | |
| 247 | |
| 248 EDGE_REF edge = node; | |
| 249 const char *forward_string = "FORWARD"; | |
| 250 const char *backward_string = " "; | |
| 251 | |
| 252 const char *last_string = "LAST"; | |
| 253 const char *not_last_string = " "; | |
| 254 | |
| 255 const char *eow_string = "EOW"; | |
| 256 const char *not_eow_string = " "; | |
| 257 | |
| 258 const char *direction; | |
| 259 const char *is_last; | |
| 260 const char *eow; | |
| 261 | |
| 262 UNICHAR_ID unichar_id; | |
| 263 | |
| 264 if (edge_occupied(edge)) { | |
| 265 do { | |
| 266 direction = forward_edge(edge) ? forward_string : backward_string; | |
| 267 is_last = last_edge(edge) ? last_string : not_last_string; | |
| 268 eow = end_of_word(edge) ? eow_string : not_eow_string; | |
| 269 | |
| 270 unichar_id = edge_letter(edge); | |
| 271 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n", | |
| 272 edge, next_node(edge), unichar_id, direction, is_last, eow); | |
| 273 | |
| 274 if (edge - node > max_num_edges) { | |
| 275 return; | |
| 276 } | |
| 277 } while (!last_edge(edge++)); | |
| 278 | |
| 279 if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) { | |
| 280 do { | |
| 281 direction = forward_edge(edge) ? forward_string : backward_string; | |
| 282 is_last = last_edge(edge) ? last_string : not_last_string; | |
| 283 eow = end_of_word(edge) ? eow_string : not_eow_string; | |
| 284 | |
| 285 unichar_id = edge_letter(edge); | |
| 286 tprintf(REFFORMAT " : next = " REFFORMAT | |
| 287 ", unichar_id = %d, %s %s %s\n", | |
| 288 edge, next_node(edge), unichar_id, direction, is_last, eow); | |
| 289 | |
| 290 if (edge - node > MAX_NODE_EDGES_DISPLAY) { | |
| 291 return; | |
| 292 } | |
| 293 } while (!last_edge(edge++)); | |
| 294 } | |
| 295 } else { | |
| 296 tprintf(REFFORMAT " : no edges in this node\n", node); | |
| 297 } | |
| 298 tprintf("\n"); | |
| 299 } | |
| 300 | |
| 301 void SquishedDawg::print_edge(EDGE_REF edge) const { | |
| 302 if (edge == NO_EDGE) { | |
| 303 tprintf("NO_EDGE\n"); | |
| 304 } else { | |
| 305 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = '%d', %s %s %s\n", | |
| 306 edge, next_node(edge), edge_letter(edge), | |
| 307 (forward_edge(edge) ? "FORWARD" : " "), | |
| 308 (last_edge(edge) ? "LAST" : " "), | |
| 309 (end_of_word(edge) ? "EOW" : "")); | |
| 310 } | |
| 311 } | |
| 312 | |
| 313 bool SquishedDawg::read_squished_dawg(TFile *file) { | |
| 314 if (debug_level_) { | |
| 315 tprintf("Reading squished dawg\n"); | |
| 316 } | |
| 317 | |
| 318 // Read the magic number and check that it matches kDawgMagicNumber, as | |
| 319 // auto-endian fixing should make sure it is always correct. | |
| 320 int16_t magic; | |
| 321 if (!file->DeSerialize(&magic)) { | |
| 322 return false; | |
| 323 } | |
| 324 if (magic != kDawgMagicNumber) { | |
| 325 tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber); | |
| 326 return false; | |
| 327 } | |
| 328 | |
| 329 int32_t unicharset_size; | |
| 330 if (!file->DeSerialize(&unicharset_size)) { | |
| 331 return false; | |
| 332 } | |
| 333 if (!file->DeSerialize(&num_edges_)) { | |
| 334 return false; | |
| 335 } | |
| 336 ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty | |
| 337 Dawg::init(unicharset_size); | |
| 338 | |
| 339 edges_ = new EDGE_RECORD[num_edges_]; | |
| 340 if (!file->DeSerialize(&edges_[0], num_edges_)) { | |
| 341 return false; | |
| 342 } | |
| 343 if (debug_level_ > 2) { | |
| 344 tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", | |
| 345 type_, lang_.c_str(), perm_, unicharset_size_, num_edges_); | |
| 346 for (EDGE_REF edge = 0; edge < num_edges_; ++edge) { | |
| 347 print_edge(edge); | |
| 348 } | |
| 349 } | |
| 350 return true; | |
| 351 } | |
| 352 | |
| 353 std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map( | |
| 354 int32_t *num_nodes) const { | |
| 355 EDGE_REF edge; | |
| 356 std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]); | |
| 357 int32_t node_counter; | |
| 358 int32_t num_edges; | |
| 359 | |
| 360 for (edge = 0; edge < num_edges_; edge++) { // init all slots | |
| 361 node_map[edge] = -1; | |
| 362 } | |
| 363 | |
| 364 node_counter = num_forward_edges(0); | |
| 365 | |
| 366 *num_nodes = 0; | |
| 367 for (edge = 0; edge < num_edges_; edge++) { // search all slots | |
| 368 | |
| 369 if (forward_edge(edge)) { | |
| 370 (*num_nodes)++; // count nodes links | |
| 371 node_map[edge] = (edge ? node_counter : 0); | |
| 372 num_edges = num_forward_edges(edge); | |
| 373 if (edge != 0) { | |
| 374 node_counter += num_edges; | |
| 375 } | |
| 376 edge += num_edges; | |
| 377 if (edge >= num_edges_) { | |
| 378 break; | |
| 379 } | |
| 380 if (backward_edge(edge)) { | |
| 381 while (!last_edge(edge++)) { | |
| 382 ; | |
| 383 } | |
| 384 } | |
| 385 edge--; | |
| 386 } | |
| 387 } | |
| 388 return node_map; | |
| 389 } | |
| 390 | |
| 391 bool SquishedDawg::write_squished_dawg(TFile *file) { | |
| 392 EDGE_REF edge; | |
| 393 int32_t num_edges; | |
| 394 int32_t node_count = 0; | |
| 395 EDGE_REF old_index; | |
| 396 EDGE_RECORD temp_record; | |
| 397 | |
| 398 if (debug_level_) { | |
| 399 tprintf("write_squished_dawg\n"); | |
| 400 } | |
| 401 | |
| 402 std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count)); | |
| 403 | |
| 404 // Write the magic number to help detecting a change in endianness. | |
| 405 int16_t magic = kDawgMagicNumber; | |
| 406 if (!file->Serialize(&magic)) { | |
| 407 return false; | |
| 408 } | |
| 409 if (!file->Serialize(&unicharset_size_)) { | |
| 410 return false; | |
| 411 } | |
| 412 | |
| 413 // Count the number of edges in this Dawg. | |
| 414 num_edges = 0; | |
| 415 for (edge = 0; edge < num_edges_; edge++) { | |
| 416 if (forward_edge(edge)) { | |
| 417 num_edges++; | |
| 418 } | |
| 419 } | |
| 420 | |
| 421 // Write edge count to file. | |
| 422 if (!file->Serialize(&num_edges)) { | |
| 423 return false; | |
| 424 } | |
| 425 | |
| 426 if (debug_level_) { | |
| 427 tprintf("%d nodes in DAWG\n", node_count); | |
| 428 tprintf("%d edges in DAWG\n", num_edges); | |
| 429 } | |
| 430 | |
| 431 for (edge = 0; edge < num_edges_; edge++) { | |
| 432 if (forward_edge(edge)) { // write forward edges | |
| 433 do { | |
| 434 old_index = next_node_from_edge_rec(edges_[edge]); | |
| 435 set_next_node(edge, node_map[old_index]); | |
| 436 temp_record = edges_[edge]; | |
| 437 if (!file->Serialize(&temp_record)) { | |
| 438 return false; | |
| 439 } | |
| 440 set_next_node(edge, old_index); | |
| 441 } while (!last_edge(edge++)); | |
| 442 | |
| 443 if (edge >= num_edges_) { | |
| 444 break; | |
| 445 } | |
| 446 if (backward_edge(edge)) { // skip back links | |
| 447 while (!last_edge(edge++)) { | |
| 448 ; | |
| 449 } | |
| 450 } | |
| 451 | |
| 452 edge--; | |
| 453 } | |
| 454 } | |
| 455 return true; | |
| 456 } | |
| 457 | |
| 458 } // namespace tesseract |
