Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/dict/trie.cpp @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 /****************************************************************************** | |
| 2 * | |
| 3 * File: trie.cpp (Formerly trie.c) | |
| 4 * Description: Functions to build a trie data structure. | |
| 5 * Author: Mark Seaman, OCR Technology | |
| 6 * | |
| 7 * (c) Copyright 1987, Hewlett-Packard Company. | |
| 8 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 ** you may not use this file except in compliance with the License. | |
| 10 ** You may obtain a copy of the License at | |
| 11 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 ** Unless required by applicable law or agreed to in writing, software | |
| 13 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 ** See the License for the specific language governing permissions and | |
| 16 ** limitations under the License. | |
| 17 * | |
| 18 *****************************************************************************/ | |
| 19 /*---------------------------------------------------------------------- | |
| 20 I n c l u d e s | |
| 21 ----------------------------------------------------------------------*/ | |
| 22 | |
| 23 #include "trie.h" | |
| 24 | |
| 25 #include "dawg.h" | |
| 26 #include "dict.h" | |
| 27 #include "helpers.h" | |
| 28 #include "kdpair.h" | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
// Printable names for the RTLReversePolicy enum values, indexed by the enum.
// Each string must spell the corresponding enum constant exactly; the first
// one previously read "RRP_DO_NO_REVERSE" (missing the "T"), which did not
// match the RRP_DO_NOT_REVERSE naming used by the other entries.
const char kDoNotReverse[] = "RRP_DO_NOT_REVERSE";
const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
const char kForceReverse[] = "RRP_FORCE_REVERSE";

// Parallel to the RTLReversePolicy enum; see get_reverse_policy_name().
const char *const RTLReversePolicyNames[] = {kDoNotReverse, kReverseIfHasRTL, kForceReverse};
| 37 | |
// Sentinel "unichars" that stand for whole character classes inside user
// patterns (see read_pattern_list / initialize_patterns). Code points from
// the U+2000 block are used so they are unlikely to collide with real text
// unichars in the unicharset — NOTE(review): presumably chosen for that
// reason; confirm no language data legitimately uses U+2000..U+2005.
const char Trie::kAlphaPatternUnicode[] = "\u2000";
const char Trie::kDigitPatternUnicode[] = "\u2001";
const char Trie::kAlphanumPatternUnicode[] = "\u2002";
const char Trie::kPuncPatternUnicode[] = "\u2003";
const char Trie::kLowerPatternUnicode[] = "\u2004";
const char Trie::kUpperPatternUnicode[] = "\u2005";
| 44 | |
| 45 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { | |
| 46 return RTLReversePolicyNames[reverse_policy]; | |
| 47 } | |
| 48 | |
| 49 // Reset the Trie to empty. | |
| 50 void Trie::clear() { | |
| 51 for (auto node : nodes_) { | |
| 52 delete node; | |
| 53 } | |
| 54 nodes_.clear(); | |
| 55 root_back_freelist_.clear(); | |
| 56 num_edges_ = 0; | |
| 57 new_dawg_node(); // Need to allocate node 0. | |
| 58 } | |
| 59 | |
// Searches node_ref's edge list in the given direction for an edge matching
// (next_node, word_end, unichar_id); on success stores the edge's address in
// *edge_ptr and its index in *edge_index and returns true.
// NOTE(review): matching semantics (e.g. whether NO_EDGE in next_node acts
// as a wildcard) live in edge_rec_match / given_greater_than_edge_rec,
// declared elsewhere — confirm in trie.h.
// The root node's forward edge list is kept sorted by add_edge_linkage, so
// only that list is binary-searched; all others are scanned linearly.
bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
                        UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr,
                        EDGE_INDEX *edge_index) const {
  if (debug_level_ == 3) {
    tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
            " direction %d word_end %d unichar_id %d, exploring node:\n",
            node_ref, next_node, direction, word_end, unichar_id);
    if (node_ref != NO_EDGE) {
      print_node(node_ref, nodes_[node_ref]->forward_edges.size());
    }
  }
  if (node_ref == NO_EDGE) {
    return false;
  }
  assert(static_cast<size_t>(node_ref) < nodes_.size());
  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? nodes_[node_ref]->forward_edges
                                                 : nodes_[node_ref]->backward_edges;
  int vec_size = vec.size();
  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
    EDGE_INDEX start = 0;
    EDGE_INDEX end = vec_size - 1;
    EDGE_INDEX k;
    int compare;
    while (start <= end) {
      k = (start + end) >> 1; // (start + end) / 2
      // Three-way comparison of the given key against vec[k].
      compare = given_greater_than_edge_rec(next_node, word_end, unichar_id, vec[k]);
      if (compare == 0) { // given == vec[k]
        *edge_ptr = &(vec[k]);
        *edge_index = k;
        return true;
      } else if (compare == 1) { // given > vec[k]
        start = k + 1;
      } else { // given < vec[k]
        end = k - 1;
      }
    }
  } else { // linear search
    for (int i = 0; i < vec_size; ++i) {
      EDGE_RECORD &edge_rec = vec[i];
      if (edge_rec_match(next_node, word_end, unichar_id, next_node_from_edge_rec(edge_rec),
                         end_of_word_from_edge_rec(edge_rec), unichar_id_from_edge_rec(edge_rec))) {
        *edge_ptr = &(edge_rec);
        *edge_index = i;
        return true;
      }
    }
  }
  return false; // not found
}
| 109 | |
// Adds an edge from node1 (in the given direction) carrying unichar_id and
// the marker/word-end flags, pointing at node2. Always returns true.
// Root-node (node 0) special cases:
//  - its forward list is kept sorted, so the insertion point is found first
//    (edge_char_of binary-searches this list);
//  - its backward list re-uses slots of killed edges via root_back_freelist_
//    instead of growing, to avoid O(n) erase/insert on this huge list.
bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, int direction,
                            bool word_end, UNICHAR_ID unichar_id) {
  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges)
                                                 : &(nodes_[node1]->backward_edges);
  unsigned search_index;
  if (node1 == 0 && direction == FORWARD_EDGE) {
    search_index = 0; // find the index to make the add sorted
    while (search_index < vec->size() &&
           given_greater_than_edge_rec(node2, word_end, unichar_id, (*vec)[search_index]) == 1) {
      search_index++;
    }
  } else {
    search_index = vec->size(); // add is unsorted, so index does not matter
  }
  EDGE_RECORD edge_rec;
  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
  if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
    // Recycle the slot of a previously killed root backward edge.
    EDGE_INDEX edge_index = root_back_freelist_.back();
    root_back_freelist_.pop_back();
    (*vec)[edge_index] = edge_rec;
  } else if (search_index < vec->size()) {
    vec->insert(vec->begin() + search_index, edge_rec);
  } else {
    vec->push_back(edge_rec);
  }
  if (debug_level_ > 1) {
    tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
    print_edge_rec(edge_rec);
    tprintf("\n");
  }
  num_edges_++;
  return true;
}
| 143 | |
// Marks the forward edge *edge_ptr and its mirror backward edge (the one
// leaving the_next_node with the same unichar_id) as an end of word, and
// propagates the repetition marker flag to both when marker_flag is set.
void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool marker_flag,
                           UNICHAR_ID unichar_id) {
  EDGE_RECORD *back_edge_ptr;
  EDGE_INDEX back_edge_index;
  // If the trie is consistent, the matching backward edge must exist.
  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, unichar_id, &back_edge_ptr,
                           &back_edge_index));
  if (marker_flag) {
    *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
    *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
  }
  // Mark both directions as end of word.
  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}
| 158 | |
// Adds word to the trie, creating nodes/edges as needed. repetitions, when
// non-null, is parallel to the word and marks unichars that may repeat
// (used for user patterns). Returns false — and on an internal add failure
// clears the whole trie — if the word is empty, contains out-of-range
// unichar ids, or an edge/node could not be added.
// Strategy: walk existing edges as far as they match (still_finding_chars),
// then grow a fresh chain of nodes for the remaining suffix, and finally
// terminate the word with a link to node 0 or by flagging an existing edge.
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
  if (word.length() <= 0) {
    return false; // can't add empty words
  }
  if (repetitions != nullptr) {
    ASSERT_HOST(repetitions->size() == word.length());
  }
  // Make sure the word does not contain invalid unchar ids.
  for (unsigned i = 0; i < word.length(); ++i) {
    if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) {
      return false;
    }
  }

  EDGE_RECORD *edge_ptr;
  NODE_REF last_node = 0;  // node reached so far; 0 is the root
  NODE_REF the_next_node;
  bool marker_flag = false;
  EDGE_INDEX edge_index;
  int32_t still_finding_chars = true;  // still matching an existing path
  int32_t word_end = false;
  bool add_failed = false;
  bool found;

  if (debug_level_ > 1) {
    word.print("\nAdding word: ");
  }

  UNICHAR_ID unichar_id;
  unsigned i;
  // All characters except the last one.
  for (i = 0; i < word.length() - 1; ++i) {
    unichar_id = word.unichar_id(i);
    marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
    if (debug_level_ > 1) {
      tprintf("Adding letter %d\n", unichar_id);
    }
    if (still_finding_chars) {
      found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
                           &edge_index);
      if (found && debug_level_ > 1) {
        tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", edge_index, last_node);
      }
      if (!found) {
        still_finding_chars = false;
      } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
        // We hit the end of an existing word, but the new word is longer.
        // In this case we have to disconnect the existing word from the
        // backwards root node, mark the current position as end-of-word
        // and add new nodes for the increased length. Disconnecting the
        // existing word from the backwards root node requires a linear
        // search, so it is much faster to add the longest words first,
        // to avoid having to come here.
        word_end = true;
        still_finding_chars = false;
        remove_edge(last_node, 0, word_end, unichar_id);
      } else {
        // We have to add a new branch here for the new word.
        if (marker_flag) {
          set_marker_flag_in_edge_rec(edge_ptr);
        }
        last_node = next_node_from_edge_rec(*edge_ptr);
      }
    }
    if (!still_finding_chars) {
      // Grow a fresh node for this character of the suffix.
      the_next_node = new_dawg_node();
      if (debug_level_ > 1) {
        tprintf("adding node " REFFORMAT "\n", the_next_node);
      }
      if (the_next_node == 0) {
        add_failed = true;
        break;
      }
      if (!add_new_edge(last_node, the_next_node, marker_flag, word_end, unichar_id)) {
        add_failed = true;
        break;
      }
      word_end = false;  // end-of-word flag applies only to the first new edge
      last_node = the_next_node;
    }
  }
  // The last character: terminate the word.
  the_next_node = 0;
  unichar_id = word.unichar_id(i);
  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
  if (debug_level_ > 1) {
    tprintf("Adding letter %d\n", unichar_id);
  }
  if (still_finding_chars &&
      edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, unichar_id, &edge_ptr, &edge_index)) {
    // An extension of this word already exists in the trie, so we
    // only have to add the ending flags in both directions.
    add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), marker_flag, unichar_id);
  } else {
    // Add a link to node 0. All leaves connect to node 0 so the back links can
    // be used in reduction to a dawg. This root backward node has one edge
    // entry for every word, (except prefixes of longer words) so it is huge.
    if (!add_failed && !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) {
      add_failed = true;
    }
  }
  if (add_failed) {
    // Partial inserts would corrupt the structure, so start over.
    tprintf("Re-initializing document dictionary...\n");
    clear();
    return false;
  } else {
    return true;
  }
}
| 266 | |
| 267 NODE_REF Trie::new_dawg_node() { | |
| 268 auto *node = new TRIE_NODE_RECORD(); | |
| 269 nodes_.push_back(node); | |
| 270 return nodes_.size() - 1; | |
| 271 } | |
| 272 | |
| 273 bool Trie::read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, | |
| 274 Trie::RTLReversePolicy reverse_policy) { | |
| 275 std::vector<std::string> word_list; | |
| 276 if (!read_word_list(filename, &word_list)) { | |
| 277 return false; | |
| 278 } | |
| 279 std::sort(word_list.begin(), word_list.end(), | |
| 280 [](auto &s1, auto &s2) { return s1.size() > s2.size(); }); | |
| 281 return add_word_list(word_list, unicharset, reverse_policy); | |
| 282 } | |
| 283 | |
| 284 bool Trie::read_word_list(const char *filename, std::vector<std::string> *words) { | |
| 285 FILE *word_file; | |
| 286 char line_str[CHARS_PER_LINE]; | |
| 287 int word_count = 0; | |
| 288 | |
| 289 word_file = fopen(filename, "rb"); | |
| 290 if (word_file == nullptr) { | |
| 291 return false; | |
| 292 } | |
| 293 | |
| 294 while (fgets(line_str, sizeof(line_str), word_file) != nullptr) { | |
| 295 chomp_string(line_str); // remove newline | |
| 296 std::string word_str(line_str); | |
| 297 ++word_count; | |
| 298 if (debug_level_ && word_count % 10000 == 0) { | |
| 299 tprintf("Read %d words so far\n", word_count); | |
| 300 } | |
| 301 words->push_back(word_str); | |
| 302 } | |
| 303 if (debug_level_) { | |
| 304 tprintf("Read %d words total.\n", word_count); | |
| 305 } | |
| 306 fclose(word_file); | |
| 307 return true; | |
| 308 } | |
| 309 | |
| 310 bool Trie::add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset, | |
| 311 Trie::RTLReversePolicy reverse_policy) { | |
| 312 for (const auto &i : words) { | |
| 313 WERD_CHOICE word(i.c_str(), unicharset); | |
| 314 if (word.empty() || word.contains_unichar_id(INVALID_UNICHAR_ID)) { | |
| 315 continue; | |
| 316 } | |
| 317 if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && word.has_rtl_unichar_id()) || | |
| 318 reverse_policy == RRP_FORCE_REVERSE) { | |
| 319 word.reverse_and_mirror_unichar_ids(); | |
| 320 } | |
| 321 if (!word_in_dawg(word)) { | |
| 322 add_word_to_dawg(word); | |
| 323 if (!word_in_dawg(word)) { | |
| 324 tprintf("Error: word '%s' not in DAWG after adding it\n", i.c_str()); | |
| 325 return false; | |
| 326 } | |
| 327 } | |
| 328 } | |
| 329 return true; | |
| 330 } | |
| 331 | |
| 332 void Trie::initialize_patterns(UNICHARSET *unicharset) { | |
| 333 unicharset->unichar_insert(kAlphaPatternUnicode); | |
| 334 alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode); | |
| 335 unicharset->unichar_insert(kDigitPatternUnicode); | |
| 336 digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode); | |
| 337 unicharset->unichar_insert(kAlphanumPatternUnicode); | |
| 338 alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode); | |
| 339 unicharset->unichar_insert(kPuncPatternUnicode); | |
| 340 punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode); | |
| 341 unicharset->unichar_insert(kLowerPatternUnicode); | |
| 342 lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode); | |
| 343 unicharset->unichar_insert(kUpperPatternUnicode); | |
| 344 upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode); | |
| 345 initialized_patterns_ = true; | |
| 346 unicharset_size_ = unicharset->size(); | |
| 347 } | |
| 348 | |
| 349 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, | |
| 350 std::vector<UNICHAR_ID> *vec) const { | |
| 351 bool is_alpha = unicharset.get_isalpha(unichar_id); | |
| 352 if (is_alpha) { | |
| 353 vec->push_back(alpha_pattern_); | |
| 354 vec->push_back(alphanum_pattern_); | |
| 355 if (unicharset.get_islower(unichar_id)) { | |
| 356 vec->push_back(lower_pattern_); | |
| 357 } else if (unicharset.get_isupper(unichar_id)) { | |
| 358 vec->push_back(upper_pattern_); | |
| 359 } | |
| 360 } | |
| 361 if (unicharset.get_isdigit(unichar_id)) { | |
| 362 vec->push_back(digit_pattern_); | |
| 363 if (!is_alpha) { | |
| 364 vec->push_back(alphanum_pattern_); | |
| 365 } | |
| 366 } | |
| 367 if (unicharset.get_ispunctuation(unichar_id)) { | |
| 368 vec->push_back(punc_pattern_); | |
| 369 } | |
| 370 } | |
| 371 | |
| 372 UNICHAR_ID Trie::character_class_to_pattern(char ch) { | |
| 373 if (ch == 'c') { | |
| 374 return alpha_pattern_; | |
| 375 } else if (ch == 'd') { | |
| 376 return digit_pattern_; | |
| 377 } else if (ch == 'n') { | |
| 378 return alphanum_pattern_; | |
| 379 } else if (ch == 'p') { | |
| 380 return punc_pattern_; | |
| 381 } else if (ch == 'a') { | |
| 382 return lower_pattern_; | |
| 383 } else if (ch == 'A') { | |
| 384 return upper_pattern_; | |
| 385 } else { | |
| 386 return INVALID_UNICHAR_ID; | |
| 387 } | |
| 388 } | |
| 389 | |
// Reads user patterns from filename and inserts them into the trie.
// Pattern language (per line): ordinary unichars are matched literally;
// "\c" "\d" "\n" "\p" "\a" "\A" stand for character classes (see
// character_class_to_pattern); "\\" is a literal backslash; a trailing
// "\*" after a unichar marks it as repeatable (recorded in the parallel
// repetitions vector passed to add_word_to_dawg).
// Requires initialize_patterns() to have been called first.
bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset) {
  if (!initialized_patterns_) {
    tprintf("please call initialize_patterns() before read_pattern_list()\n");
    return false;
  }

  FILE *pattern_file = fopen(filename, "rb");
  if (pattern_file == nullptr) {
    tprintf("Error opening pattern file %s\n", filename);
    return false;
  }

  int pattern_count = 0;
  char string[CHARS_PER_LINE];
  while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
    chomp_string(string); // remove newline
    // Parse the pattern and construct a unichar id vector.
    // Record the number of repetitions of each unichar in the parallel vector.
    WERD_CHOICE word(&unicharset);
    std::vector<bool> repetitions_vec;
    const char *str_ptr = string;
    int step = unicharset.step(str_ptr); // bytes consumed by the next unichar
    bool failed = false;
    while (step > 0) {
      UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
      if (step == 1 && *str_ptr == '\\') {
        ++str_ptr; // consume the backslash, then dispatch on what follows
        if (*str_ptr == '\\') { // regular '\' unichar that was escaped
          curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
        } else {
#if 0 // TODO: This code should be enabled if kSaneNumConcreteChars != 0.
          if (word.length() < kSaneNumConcreteChars) {
            tprintf(
                "Please provide at least %d concrete characters at the"
                " beginning of the pattern\n",
                kSaneNumConcreteChars);
            failed = true;
            break;
          }
#endif
          // Parse character class from expression.
          curr_unichar_id = character_class_to_pattern(*str_ptr);
        }
      } else {
        // Plain literal unichar.
        curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
      }
      if (curr_unichar_id == INVALID_UNICHAR_ID) {
        failed = true;
        break; // failed to parse this pattern
      }
      word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
      repetitions_vec.push_back(false);
      str_ptr += step;
      step = unicharset.step(str_ptr);
      // Check if there is a repetition pattern specified after this unichar.
      if (step == 1 && *str_ptr == '\\' && *(str_ptr + 1) == '*') {
        repetitions_vec[repetitions_vec.size() - 1] = true;
        str_ptr += 2;
        step = unicharset.step(str_ptr);
      }
    }
    if (failed) {
      // Malformed lines are reported and skipped, not fatal.
      tprintf("Invalid user pattern %s\n", string);
      continue;
    }
    // Insert the pattern into the trie.
    if (debug_level_ > 2) {
      tprintf("Inserting expanded user pattern %s\n", word.debug_string().c_str());
    }
    if (!this->word_in_dawg(word)) {
      this->add_word_to_dawg(word, &repetitions_vec);
      if (!this->word_in_dawg(word)) {
        tprintf("Error: failed to insert pattern '%s'\n", string);
      }
    }
    ++pattern_count;
  }
  if (debug_level_) {
    tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
  }
  fclose(pattern_file);
  return true;
}
| 473 | |
// Removes the edge from node1 matching (node2, direction, word_end,
// unichar_id); asserts that it exists. The root's backward edge list is not
// compacted: the slot is zeroed with KillEdge and its index pushed onto
// root_back_freelist_ for re-use (erase on that huge list would be O(n)).
void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
                               UNICHAR_ID unichar_id) {
  EDGE_RECORD *edge_ptr = nullptr;
  EDGE_INDEX edge_index = 0;
  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, unichar_id, &edge_ptr, &edge_index));
  if (debug_level_ > 1) {
    tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
    print_edge_rec(*edge_ptr);
    tprintf("\n");
  }
  if (direction == FORWARD_EDGE) {
    nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
  } else if (node1 == 0) {
    // Root backward list: mark dead and remember the free slot.
    KillEdge(&nodes_[node1]->backward_edges[edge_index]);
    root_back_freelist_.push_back(edge_index);
  } else {
    nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
  }
  --num_edges_;
}
| 494 | |
// Some optimizations employed in add_word_to_dawg and trie_to_dawg:
// 1 Avoid insertion sorting or bubble sorting the tail root node
//   (back links on node 0, a list of all the leaves.). The node is
//   huge, and sorting it with n^2 time is terrible.
// 2 Avoid using vector::erase on the tail root node.
//   (a) During add of words to the trie, zero-out the unichars and
//       keep a freelist of spaces to re-use.
//   (b) During reduction, just zero-out the unichars of deleted back
//       links, skipping zero entries while searching.
// 3 Avoid linear search of the tail root node. This has to be done when
//   a suffix is added to an existing word. Adding words by decreasing
//   length avoids this problem entirely. Words can still be added in
//   any order, but it is faster to add the longest first.
//
// Reduces this trie to a DAWG and converts it to a SquishedDawg, which is
// returned to the caller. NOTE(review): the returned SquishedDawg
// presumably takes ownership of the heap-allocated edge_array — confirm in
// dawg.h; nothing here frees it.
SquishedDawg *Trie::trie_to_dawg() {
  root_back_freelist_.clear(); // Will be invalidated by trie_to_dawg.
  if (debug_level_ > 2) {
    print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
  }
  std::vector<bool> reduced_nodes(nodes_.size());
  this->reduce_node_input(0, reduced_nodes);

  if (debug_level_ > 2) {
    print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
  }
  // Build a translation map from node indices in nodes_ vector to
  // their target indices in EDGE_ARRAY.
  std::vector<NODE_REF> node_ref_map(nodes_.size() + 1);
  unsigned i;
  // node_ref_map[i] = running total of forward edges in nodes_[0..i-1].
  for (i = 0; i < nodes_.size(); ++i) {
    node_ref_map[i + 1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
  }
  int num_forward_edges = node_ref_map[i];

  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
  auto edge_array = new EDGE_RECORD[num_forward_edges];
  EDGE_ARRAY edge_array_ptr = edge_array;
  for (i = 0; i < nodes_.size(); ++i) {
    TRIE_NODE_RECORD *node_ptr = nodes_[i];
    int end = node_ptr->forward_edges.size();
    for (int j = 0; j < end; ++j) {
      EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
      NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
      ASSERT_HOST(static_cast<size_t>(node_ref) < nodes_.size());
      UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
      link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
                end_of_word_from_edge_rec(edge_rec), unichar_id);
      if (j == end - 1) {
        // The last edge of each node carries the marker flag.
        set_marker_flag_in_edge_rec(edge_array_ptr);
      }
      ++edge_array_ptr;
    }
  }

  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_,
                          debug_level_);
}
| 552 | |
// Merges the subtree reached via edge2 into the one reached via edge1 (both
// are backward edges of node whose targets were found equivalent): every
// edge touching next_node2 is re-pointed at next_node1, then next_node2's
// edge lists are emptied. The caller is responsible for killing the
// backward link from node to next_node2. Always returns true.
bool Trie::eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
                                     const EDGE_RECORD &edge2) {
  if (debug_level_ > 1) {
    tprintf("\nCollapsing node %" PRIi64 ":\n", node);
    print_node(node, MAX_NODE_EDGES_DISPLAY);
    tprintf("Candidate edges: ");
    print_edge_rec(edge1);
    tprintf(", ");
    print_edge_rec(edge2);
    tprintf("\n\n");
  }
  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
  // Translate all edges going to/from next_node2 to go to/from next_node1.
  EDGE_RECORD *edge_ptr = nullptr;
  EDGE_INDEX edge_index;
  // The backward link in node to next_node2 will be zeroed out by the caller.
  // Copy all the backward links in next_node2 to node next_node1
  for (unsigned i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
    const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
    NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
    int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
    bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
    add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, curr_word_end,
                     curr_unichar_id);
    // Relocate the corresponding forward edge in curr_next_node
    ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, curr_word_end,
                             curr_unichar_id, &edge_ptr, &edge_index));
    set_next_node_in_edge_rec(edge_ptr, next_node1);
  }
  int next_node2_num_edges =
      (next_node2_ptr->forward_edges.size() + next_node2_ptr->backward_edges.size());
  if (debug_level_ > 1) {
    tprintf("removed %d edges from node " REFFORMAT "\n", next_node2_num_edges, next_node2);
  }
  // next_node2 is now orphaned; drop all its edges (the node record itself
  // stays in nodes_ with empty edge lists).
  next_node2_ptr->forward_edges.clear();
  next_node2_ptr->backward_edges.clear();
  num_edges_ -= next_node2_num_edges;
  return true;
}
| 595 | |
// Scans node's backward edges starting at edge_index — they are sorted by
// unichar id, so all edges for unichar_id are contiguous — and collapses
// pairs of equivalent edges via eliminate_redundant_edges. Eliminated edges
// are killed in place (see the freelist strategy notes above trie_to_dawg).
// Returns true if any merge happened, so the caller can re-run until a
// fixed point is reached.
bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
                                 EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes) {
  if (debug_level_ > 1) {
    tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
  }
  // Compare each of the edge pairs with the given unichar_id.
  bool did_something = false;
  for (unsigned i = edge_index; i < backward_edges->size() - 1; ++i) {
    // Find the first edge that can be eliminated.
    UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
    while (i < backward_edges->size()) {
      if (!DeadEdge((*backward_edges)[i])) {
        curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
        if (curr_unichar_id != unichar_id) {
          // Past the run of edges for unichar_id (list is sorted) — done.
          return did_something;
        }
        if (can_be_eliminated((*backward_edges)[i])) {
          break;
        }
      }
      ++i;
    }
    if (i == backward_edges->size()) {
      break;
    }
    const EDGE_RECORD &edge_rec = (*backward_edges)[i];
    // Compare it to the rest of the edges with the given unichar_id.
    for (auto j = i + 1; j < backward_edges->size(); ++j) {
      const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
      if (DeadEdge(next_edge_rec)) {
        continue;
      }
      UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
      if (next_id != unichar_id) {
        break;
      }
      if (end_of_word_from_edge_rec(next_edge_rec) == end_of_word_from_edge_rec(edge_rec) &&
          can_be_eliminated(next_edge_rec) &&
          eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
        // The merge changed edge_rec's target node, so it must be
        // re-examined by a later reduce_node_input pass.
        reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
        did_something = true;
        KillEdge(&(*backward_edges)[j]);
      }
    }
  }
  return did_something;
}
| 643 | |
| 644 void Trie::sort_edges(EDGE_VECTOR *edges) { | |
| 645 int num_edges = edges->size(); | |
| 646 if (num_edges <= 1) { | |
| 647 return; | |
| 648 } | |
| 649 std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec; | |
| 650 sort_vec.reserve(num_edges); | |
| 651 for (int i = 0; i < num_edges; ++i) { | |
| 652 sort_vec.emplace_back(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]); | |
| 653 } | |
| 654 std::sort(sort_vec.begin(), sort_vec.end()); | |
| 655 for (int i = 0; i < num_edges; ++i) { | |
| 656 (*edges)[i] = sort_vec[i].data(); | |
| 657 } | |
| 658 } | |
| 659 | |
| 660 void Trie::reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes) { | |
| 661 EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges; | |
| 662 sort_edges(&backward_edges); | |
| 663 if (debug_level_ > 1) { | |
| 664 tprintf("reduce_node_input(node=" REFFORMAT ")\n", node); | |
| 665 print_node(node, MAX_NODE_EDGES_DISPLAY); | |
| 666 } | |
| 667 | |
| 668 EDGE_INDEX edge_index = 0; | |
| 669 while (static_cast<size_t>(edge_index) < backward_edges.size()) { | |
| 670 if (DeadEdge(backward_edges[edge_index])) { | |
| 671 continue; | |
| 672 } | |
| 673 UNICHAR_ID unichar_id = unichar_id_from_edge_rec(backward_edges[edge_index]); | |
| 674 while (reduce_lettered_edges(edge_index, unichar_id, node, &backward_edges, reduced_nodes)) { | |
| 675 ; | |
| 676 } | |
| 677 while (static_cast<size_t>(++edge_index) < backward_edges.size()) { | |
| 678 UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]); | |
| 679 if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) { | |
| 680 break; | |
| 681 } | |
| 682 } | |
| 683 } | |
| 684 reduced_nodes[node] = true; // mark as reduced | |
| 685 | |
| 686 if (debug_level_ > 1) { | |
| 687 tprintf("Node " REFFORMAT " after reduction:\n", node); | |
| 688 print_node(node, MAX_NODE_EDGES_DISPLAY); | |
| 689 } | |
| 690 | |
| 691 for (auto &backward_edge : backward_edges) { | |
| 692 if (DeadEdge(backward_edge)) { | |
| 693 continue; | |
| 694 } | |
| 695 NODE_REF next_node = next_node_from_edge_rec(backward_edge); | |
| 696 if (next_node != 0 && !reduced_nodes[next_node]) { | |
| 697 reduce_node_input(next_node, reduced_nodes); | |
| 698 } | |
| 699 } | |
| 700 } | |
| 701 | |
// Debug helper: prints up to max_num_edges of node's forward edges (first
// line, prefixed with the node ref and edge counts) and backward edges
// (second line, indented). Dead edges are skipped. A trailing "..." marks a
// truncated list.
void Trie::print_node(NODE_REF node, int max_num_edges) const {
  if (node == NO_EDGE) {
    return; // nothing to print
  }
  TRIE_NODE_RECORD *node_ptr = nodes_[node];
  int num_fwd = node_ptr->forward_edges.size();
  int num_bkw = node_ptr->backward_edges.size();
  EDGE_VECTOR *vec;
  // dir 0 = forward edges, dir 1 = backward edges.
  for (int dir = 0; dir < 2; ++dir) {
    if (dir == 0) {
      vec = &(node_ptr->forward_edges);
      tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
    } else {
      vec = &(node_ptr->backward_edges);
      tprintf("\t");
    }
    // i is inspected after the loop to detect truncation, so it is declared
    // outside the for statement.
    int i;
    for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && i < max_num_edges; ++i) {
      if (DeadEdge((*vec)[i])) {
        continue;
      }
      print_edge_rec((*vec)[i]);
      tprintf(" ");
    }
    if (dir == 0 ? i < num_fwd : i < num_bkw) {
      tprintf("...");
    }
    tprintf("\n");
  }
}
| 732 | |
| 733 } // namespace tesseract |
