Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/dict/dawg.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 * | |
| 3 * File: dawg.h | |
| 4 * Description: Definition of a class that represents Directed Acyclic Word | |
| 5 * Graph (DAWG), functions to build and manipulate the DAWG. | |
| 6 * Author: Mark Seaman, SW Productivity | |
| 7 * | |
| 8 * (c) Copyright 1987, Hewlett-Packard Company. | |
| 9 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 ** you may not use this file except in compliance with the License. | |
| 11 ** You may obtain a copy of the License at | |
| 12 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 ** Unless required by applicable law or agreed to in writing, software | |
| 14 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 ** See the License for the specific language governing permissions and | |
| 17 ** limitations under the License. | |
| 18 * | |
| 19 *****************************************************************************/ | |
| 20 | |
| 21 #ifndef DICT_DAWG_H_ | |
| 22 #define DICT_DAWG_H_ | |
| 23 | |
| 24 /*---------------------------------------------------------------------- | |
| 25 I n c l u d e s | |
| 26 ----------------------------------------------------------------------*/ | |
| 27 | |
| 28 #include <cinttypes> // for PRId64 | |
| 29 #include <functional> // for std::function | |
| 30 #include <memory> | |
| 31 #include "elst.h" | |
| 32 #include "params.h" | |
| 33 #include "ratngs.h" | |
| 34 | |
| 35 #ifndef __GNUC__ | |
| 36 # ifdef _WIN32 | |
| 37 # define NO_EDGE static_cast<int64_t>(0xffffffffffffffffi64) | |
| 38 # endif /*_WIN32*/ | |
| 39 #else | |
| 40 # define NO_EDGE static_cast<int64_t>(0xffffffffffffffffll) | |
| 41 #endif /*__GNUC__*/ | |
| 42 | |
| 43 namespace tesseract { | |
| 44 | |
| 45 class UNICHARSET; | |
| 46 | |
| 47 using EDGE_RECORD = uint64_t; | |
| 48 using EDGE_ARRAY = EDGE_RECORD *; | |
| 49 using EDGE_REF = int64_t; | |
| 50 using NODE_REF = int64_t; | |
| 51 using NODE_MAP = EDGE_REF *; | |
| 52 | |
| 53 struct NodeChild { | |
| 54 UNICHAR_ID unichar_id; | |
| 55 EDGE_REF edge_ref; | |
| 56 NodeChild(UNICHAR_ID id, EDGE_REF ref) : unichar_id(id), edge_ref(ref) {} | |
| 57 NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} | |
| 58 }; | |
| 59 | |
| 60 using NodeChildVector = std::vector<NodeChild>; | |
| 61 using SuccessorList = std::vector<int>; | |
| 62 using SuccessorListsVector = std::vector<SuccessorList *>; | |
| 63 | |
| 64 enum DawgType { | |
| 65 DAWG_TYPE_PUNCTUATION, | |
| 66 DAWG_TYPE_WORD, | |
| 67 DAWG_TYPE_NUMBER, | |
| 68 DAWG_TYPE_PATTERN, | |
| 69 | |
| 70 DAWG_TYPE_COUNT // number of enum entries | |
| 71 }; | |
| 72 | |
| 73 /*---------------------------------------------------------------------- | |
| 74 C o n s t a n t s | |
| 75 ----------------------------------------------------------------------*/ | |
| 76 | |
| 77 #define FORWARD_EDGE static_cast<int32_t>(0) | |
| 78 #define BACKWARD_EDGE static_cast<int32_t>(1) | |
| 79 #define MAX_NODE_EDGES_DISPLAY static_cast<int64_t>(100) | |
| 80 #define MARKER_FLAG static_cast<int64_t>(1) | |
| 81 #define DIRECTION_FLAG static_cast<int64_t>(2) | |
| 82 #define WERD_END_FLAG static_cast<int64_t>(4) | |
| 83 #define LETTER_START_BIT 0 | |
| 84 #define NUM_FLAG_BITS 3 | |
| 85 #define REFFORMAT "%" PRId64 | |
| 86 | |
| 87 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = { | |
| 88 {false, true, true, false}, // for DAWG_TYPE_PUNCTUATION | |
| 89 {true, false, false, false}, // for DAWG_TYPE_WORD | |
| 90 {true, false, false, false}, // for DAWG_TYPE_NUMBER | |
| 91 {false, false, false, false}, // for DAWG_TYPE_PATTERN | |
| 92 }; | |
| 93 | |
| 94 static const char kWildcard[] = "*"; | |
| 95 | |
| 96 /*---------------------------------------------------------------------- | |
| 97 C l a s s e s a n d S t r u c t s | |
| 98 ----------------------------------------------------------------------*/ | |
| 99 // | |
| 100 /// Abstract class (an interface) that declares methods needed by the | |
| 101 /// various tesseract classes to operate on SquishedDawg and Trie objects. | |
| 102 /// | |
| 103 /// This class initializes all the edge masks (since their usage by | |
| 104 /// SquishedDawg and Trie is identical) and implements simple accessors | |
| 105 /// for each of the fields encoded in an EDGE_RECORD. | |
| 106 /// This class also implements word_in_dawg() and check_for_words() | |
| 107 /// (since they use only the public methods of SquishedDawg and Trie | |
| 108 /// classes that are inherited from the Dawg base class). | |
| 109 // | |
| 110 class TESS_API Dawg { | |
| 111 public: | |
| 112 /// Magic number to determine endianness when reading the Dawg from file. | |
| 113 static const int16_t kDawgMagicNumber = 42; | |
| 114 /// A special unichar id that indicates that any appropriate pattern | |
| 115 /// (e.g.dictionary word, 0-9 digit, etc) can be inserted instead | |
| 116 /// Used for expressing patterns in punctuation and number Dawgs. | |
| 117 static const UNICHAR_ID kPatternUnicharID = 0; | |
| 118 | |
| 119 inline DawgType type() const { | |
| 120 return type_; | |
| 121 } | |
| 122 inline const std::string &lang() const { | |
| 123 return lang_; | |
| 124 } | |
| 125 inline PermuterType permuter() const { | |
| 126 return perm_; | |
| 127 } | |
| 128 | |
| 129 virtual ~Dawg(); | |
| 130 | |
| 131 /// Returns true if the given word is in the Dawg. | |
| 132 bool word_in_dawg(const WERD_CHOICE &word) const; | |
| 133 | |
| 134 // Returns true if the given word prefix is not contraindicated by the dawg. | |
| 135 // If requires_complete is true, then the exact complete word must be present. | |
| 136 bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const; | |
| 137 | |
| 138 /// Checks the Dawg for the words that are listed in the requested file. | |
| 139 /// Returns the number of words in the given file missing from the Dawg. | |
| 140 int check_for_words(const char *filename, const UNICHARSET &unicharset, | |
| 141 bool enable_wildcard) const; | |
| 142 | |
| 143 // For each word in the Dawg, call the given (permanent) callback with the | |
| 144 // text (UTF-8) version of the word. | |
| 145 void iterate_words(const UNICHARSET &unicharset, | |
| 146 std::function<void(const WERD_CHOICE *)> cb) const; | |
| 147 | |
| 148 // For each word in the Dawg, call the given (permanent) callback with the | |
| 149 // text (UTF-8) version of the word. | |
| 150 void iterate_words(const UNICHARSET &unicharset, | |
| 151 const std::function<void(const char *)> &cb) const; | |
| 152 | |
| 153 // Pure virtual function that should be implemented by the derived classes. | |
| 154 | |
| 155 /// Returns the edge that corresponds to the letter out of this node. | |
| 156 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, | |
| 157 bool word_end) const = 0; | |
| 158 | |
| 159 /// Fills the given NodeChildVector with all the unichar ids (and the | |
| 160 /// corresponding EDGE_REFs) for which there is an edge out of this node. | |
| 161 virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, | |
| 162 bool word_end) const = 0; | |
| 163 | |
| 164 /// Returns the next node visited by following the edge | |
| 165 /// indicated by the given EDGE_REF. | |
| 166 virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0; | |
| 167 | |
| 168 /// Returns true if the edge indicated by the given EDGE_REF | |
| 169 /// marks the end of a word. | |
| 170 virtual bool end_of_word(EDGE_REF edge_ref) const = 0; | |
| 171 | |
| 172 /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. | |
| 173 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0; | |
| 174 | |
| 175 /// Prints the contents of the node indicated by the given NODE_REF. | |
| 176 /// At most max_num_edges will be printed. | |
| 177 virtual void print_node(NODE_REF node, int max_num_edges) const = 0; | |
| 178 | |
| 179 /// Fills vec with unichar ids that represent the character classes | |
| 180 /// of the given unichar_id. | |
| 181 virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, | |
| 182 const UNICHARSET &unicharset, | |
| 183 std::vector<UNICHAR_ID> *vec) const { | |
| 184 (void)unichar_id; | |
| 185 (void)unicharset; | |
| 186 (void)vec; | |
| 187 } | |
| 188 | |
| 189 /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has | |
| 190 /// a self loop and the given unichar_id matches the unichar_id stored in the | |
| 191 /// EDGE_RECORD, returns NO_EDGE otherwise. | |
| 192 virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, | |
| 193 bool word_end) const { | |
| 194 (void)edge_ref; | |
| 195 (void)unichar_id; | |
| 196 (void)word_end; | |
| 197 return false; | |
| 198 } | |
| 199 | |
| 200 protected: | |
| 201 Dawg(DawgType type, const std::string &lang, PermuterType perm, | |
| 202 int debug_level) | |
| 203 : lang_(lang), | |
| 204 type_(type), | |
| 205 perm_(perm), | |
| 206 unicharset_size_(0), | |
| 207 debug_level_(debug_level) {} | |
| 208 | |
| 209 /// Returns the next node visited by following this edge. | |
| 210 inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { | |
| 211 return ((edge_rec & next_node_mask_) >> next_node_start_bit_); | |
| 212 } | |
| 213 /// Returns the marker flag of this edge. | |
| 214 inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const { | |
| 215 return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0; | |
| 216 } | |
| 217 /// Returns the direction flag of this edge. | |
| 218 inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const { | |
| 219 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? BACKWARD_EDGE | |
| 220 : FORWARD_EDGE; | |
| 221 } | |
| 222 /// Returns true if this edge marks the end of a word. | |
| 223 inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const { | |
| 224 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0; | |
| 225 } | |
| 226 /// Returns UNICHAR_ID recorded in this edge. | |
| 227 inline UNICHAR_ID unichar_id_from_edge_rec( | |
| 228 const EDGE_RECORD &edge_rec) const { | |
| 229 return ((edge_rec & letter_mask_) >> LETTER_START_BIT); | |
| 230 } | |
| 231 /// Sets the next node link for this edge in the Dawg. | |
| 232 inline void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value) { | |
| 233 *edge_rec &= (~next_node_mask_); | |
| 234 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_); | |
| 235 } | |
| 236 /// Sets this edge record to be the last one in a sequence of edges. | |
| 237 inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) { | |
| 238 *edge_rec |= (MARKER_FLAG << flag_start_bit_); | |
| 239 } | |
| 240 /// Sequentially compares the given values of unichar ID, next node | |
| 241 /// and word end marker with the values in the given EDGE_RECORD. | |
| 242 /// Returns: 1 if at any step the given input value exceeds | |
| 243 /// that of edge_rec (and all the values already | |
| 244 /// checked are the same) | |
| 245 /// 0 if edge_rec_match() returns true | |
| 246 /// -1 otherwise | |
| 247 inline int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, | |
| 248 UNICHAR_ID unichar_id, | |
| 249 const EDGE_RECORD &edge_rec) const { | |
| 250 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec); | |
| 251 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec); | |
| 252 bool curr_word_end = end_of_word_from_edge_rec(edge_rec); | |
| 253 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node, | |
| 254 curr_word_end, curr_unichar_id)) { | |
| 255 return 0; | |
| 256 } | |
| 257 if (unichar_id > curr_unichar_id) { | |
| 258 return 1; | |
| 259 } | |
| 260 if (unichar_id == curr_unichar_id) { | |
| 261 if (next_node > curr_next_node) { | |
| 262 return 1; | |
| 263 } | |
| 264 if (next_node == curr_next_node) { | |
| 265 if (word_end > curr_word_end) { | |
| 266 return 1; | |
| 267 } | |
| 268 } | |
| 269 } | |
| 270 return -1; | |
| 271 } | |
| 272 /// Returns true if all the values are equal (any value matches | |
| 273 /// next_node if next_node == NO_EDGE, any value matches word_end | |
| 274 /// if word_end is false). | |
| 275 inline bool edge_rec_match(NODE_REF next_node, bool word_end, | |
| 276 UNICHAR_ID unichar_id, NODE_REF other_next_node, | |
| 277 bool other_word_end, | |
| 278 UNICHAR_ID other_unichar_id) const { | |
| 279 return ((unichar_id == other_unichar_id) && | |
| 280 (next_node == NO_EDGE || next_node == other_next_node) && | |
| 281 (!word_end || (word_end == other_word_end))); | |
| 282 } | |
| 283 | |
| 284 /// Sets unicharset_size_. | |
| 285 /// Initializes the values of various masks from unicharset_size_. | |
| 286 void init(int unicharset_size); | |
| 287 | |
| 288 /// Matches all of the words that are represented by this string. | |
| 289 /// If wildcard is set to something other than INVALID_UNICHAR_ID, | |
| 290 /// the *'s in this string are interpreted as wildcards. | |
| 291 /// WERD_CHOICE param is not passed by const so that wildcard searches | |
| 292 /// can modify it and work without having to copy WERD_CHOICEs. | |
| 293 bool match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, | |
| 294 UNICHAR_ID wildcard) const; | |
| 295 | |
| 296 // Recursively iterate over all words in a dawg (see public iterate_words). | |
| 297 void iterate_words_rec( | |
| 298 const WERD_CHOICE &word_so_far, NODE_REF to_explore, | |
| 299 const std::function<void(const WERD_CHOICE *)> &cb) const; | |
| 300 | |
| 301 // Member Variables. | |
| 302 std::string lang_; | |
| 303 DawgType type_; | |
| 304 /// Permuter code that should be used if the word is found in this Dawg. | |
| 305 PermuterType perm_; | |
| 306 // Variables to construct various edge masks. Formerly: | |
| 307 // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64 | |
| 308 // #define FLAGS_MASK (int64_t) 0x0000000700000000i64 | |
| 309 // #define LETTER_MASK (int64_t) 0x00000000ffffffffi64 | |
| 310 uint64_t next_node_mask_ = 0; | |
| 311 uint64_t flags_mask_ = 0; | |
| 312 uint64_t letter_mask_ = 0; | |
| 313 int unicharset_size_; | |
| 314 int flag_start_bit_ = 0; | |
| 315 int next_node_start_bit_ = 0; | |
| 316 // Level of debug statements to print to stdout. | |
| 317 int debug_level_; | |
| 318 }; | |
| 319 | |
| 320 // | |
| 321 // DawgPosition keeps track of where we are in the primary dawg we're searching | |
| 322 // as well as where we may be in the "punctuation dawg" which may provide | |
| 323 // surrounding context. | |
| 324 // | |
| 325 // Example: | |
| 326 // punctuation dawg -- space is the "pattern character" | |
| 327 // " " // no punctuation | |
| 328 // "' '" // leading and trailing apostrophes | |
| 329 // " '" // trailing apostrophe | |
| 330 // word dawg: | |
| 331 // "cat" | |
| 332 // "cab" | |
| 333 // "cat's" | |
| 334 // | |
| 335 // DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp) | |
| 336 // | |
| 337 // DawgPosition(-1, NO_EDGE, p, pe, false) | |
| 338 // We're in the punctuation dawg, no other dawg has been started. | |
| 339 // (1) If there's a pattern edge as a punc dawg child of us, | |
| 340 // for each punc-following dawg starting with ch, produce: | |
| 341 // Result: DawgPosition(k, w, p', false) | |
| 342 // (2) If there's a valid continuation in the punc dawg, produce: | |
| 343 // Result: DawgPosition(-k, NO_EDGE, p', false) | |
| 344 // | |
| 345 // DawgPosition(k, w, -1, NO_EDGE, false) | |
| 346 // We're in dawg k. Going back to punctuation dawg is not an option. | |
| 347 // Follow ch in dawg k. | |
| 348 // | |
| 349 // DawgPosition(k, w, p, pe, false) | |
| 350 // We're in dawg k. Continue in dawg k and/or go back to the punc dawg. | |
| 351 // If ending, check that the punctuation dawg is also ok to end here. | |
| 352 // | |
| 353 // DawgPosition(k, w, p, pe true) | |
| 354 // We're back in the punctuation dawg. Continuing there is the only option. | |
| 355 struct DawgPosition { | |
| 356 DawgPosition() = default; | |
| 357 DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref, | |
| 358 bool backtopunc) | |
| 359 : dawg_ref(dawgref), | |
| 360 punc_ref(puncref), | |
| 361 dawg_index(dawg_idx), | |
| 362 punc_index(punc_idx), | |
| 363 back_to_punc(backtopunc) {} | |
| 364 bool operator==(const DawgPosition &other) const { | |
| 365 return dawg_index == other.dawg_index && dawg_ref == other.dawg_ref && | |
| 366 punc_index == other.punc_index && punc_ref == other.punc_ref && | |
| 367 back_to_punc == other.back_to_punc; | |
| 368 } | |
| 369 | |
| 370 EDGE_REF dawg_ref = NO_EDGE; | |
| 371 EDGE_REF punc_ref = NO_EDGE; | |
| 372 int8_t dawg_index = -1; | |
| 373 int8_t punc_index = -1; | |
| 374 // Have we returned to the punc dawg at the end of the word? | |
| 375 bool back_to_punc = false; | |
| 376 }; | |
| 377 | |
| 378 class DawgPositionVector : public std::vector<DawgPosition> { | |
| 379 public: | |
| 380 /// Adds an entry for the given dawg_index with the given node to the vec. | |
| 381 /// Returns false if the same entry already exists in the vector, | |
| 382 /// true otherwise. | |
| 383 inline bool add_unique(const DawgPosition &new_pos, bool debug, | |
| 384 const char *debug_msg) { | |
| 385 for (auto &&position : *this) { | |
| 386 if (position == new_pos) { | |
| 387 return false; | |
| 388 } | |
| 389 } | |
| 390 push_back(new_pos); | |
| 391 if (debug) { | |
| 392 tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n", debug_msg, | |
| 393 new_pos.dawg_index, new_pos.dawg_ref, new_pos.punc_ref, | |
| 394 new_pos.back_to_punc ? " returned" : ""); | |
| 395 } | |
| 396 return true; | |
| 397 } | |
| 398 }; | |
| 399 | |
| 400 // | |
| 401 /// Concrete class that can operate on a compacted (squished) Dawg (read, | |
| 402 /// search and write to file). This class is read-only in the sense that | |
| 403 /// new words cannot be added to an instance of SquishedDawg. | |
| 404 /// The underlying representation of the nodes and edges in SquishedDawg | |
| 405 /// is stored as a contiguous EDGE_ARRAY (read from file or given as an | |
| 406 /// argument to the constructor). | |
| 407 // | |
| 408 class TESS_API SquishedDawg : public Dawg { | |
| 409 public: | |
| 410 SquishedDawg(DawgType type, const std::string &lang, PermuterType perm, | |
| 411 int debug_level) | |
| 412 : Dawg(type, lang, perm, debug_level) {} | |
| 413 SquishedDawg(const char *filename, DawgType type, const std::string &lang, | |
| 414 PermuterType perm, int debug_level) | |
| 415 : Dawg(type, lang, perm, debug_level) { | |
| 416 TFile file; | |
| 417 ASSERT_HOST(file.Open(filename, nullptr)); | |
| 418 ASSERT_HOST(read_squished_dawg(&file)); | |
| 419 num_forward_edges_in_node0 = num_forward_edges(0); | |
| 420 } | |
| 421 SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, | |
| 422 const std::string &lang, PermuterType perm, int unicharset_size, | |
| 423 int debug_level) | |
| 424 : Dawg(type, lang, perm, debug_level), | |
| 425 edges_(edges), | |
| 426 num_edges_(num_edges) { | |
| 427 init(unicharset_size); | |
| 428 num_forward_edges_in_node0 = num_forward_edges(0); | |
| 429 if (debug_level > 3) { | |
| 430 print_all("SquishedDawg:"); | |
| 431 } | |
| 432 } | |
| 433 ~SquishedDawg() override; | |
| 434 | |
| 435 // Loads using the given TFile. Returns false on failure. | |
| 436 bool Load(TFile *fp) { | |
| 437 if (!read_squished_dawg(fp)) { | |
| 438 return false; | |
| 439 } | |
| 440 num_forward_edges_in_node0 = num_forward_edges(0); | |
| 441 return true; | |
| 442 } | |
| 443 | |
| 444 int NumEdges() { | |
| 445 return num_edges_; | |
| 446 } | |
| 447 | |
| 448 /// Returns the edge that corresponds to the letter out of this node. | |
| 449 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, | |
| 450 bool word_end) const override; | |
| 451 | |
| 452 /// Fills the given NodeChildVector with all the unichar ids (and the | |
| 453 /// corresponding EDGE_REFs) for which there is an edge out of this node. | |
| 454 void unichar_ids_of(NODE_REF node, NodeChildVector *vec, | |
| 455 bool word_end) const override { | |
| 456 EDGE_REF edge = node; | |
| 457 if (!edge_occupied(edge) || edge == NO_EDGE) { | |
| 458 return; | |
| 459 } | |
| 460 assert(forward_edge(edge)); // we don't expect any backward edges to | |
| 461 do { // be present when this function is called | |
| 462 if (!word_end || end_of_word_from_edge_rec(edges_[edge])) { | |
| 463 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); | |
| 464 } | |
| 465 } while (!last_edge(edge++)); | |
| 466 } | |
| 467 | |
| 468 /// Returns the next node visited by following the edge | |
| 469 /// indicated by the given EDGE_REF. | |
| 470 NODE_REF next_node(EDGE_REF edge) const override { | |
| 471 return next_node_from_edge_rec((edges_[edge])); | |
| 472 } | |
| 473 | |
| 474 /// Returns true if the edge indicated by the given EDGE_REF | |
| 475 /// marks the end of a word. | |
| 476 bool end_of_word(EDGE_REF edge_ref) const override { | |
| 477 return end_of_word_from_edge_rec((edges_[edge_ref])); | |
| 478 } | |
| 479 | |
| 480 /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. | |
| 481 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override { | |
| 482 return unichar_id_from_edge_rec((edges_[edge_ref])); | |
| 483 } | |
| 484 | |
| 485 /// Prints the contents of the node indicated by the given NODE_REF. | |
| 486 /// At most max_num_edges will be printed. | |
| 487 void print_node(NODE_REF node, int max_num_edges) const override; | |
| 488 | |
| 489 /// Writes the squished/reduced Dawg to a file. | |
| 490 bool write_squished_dawg(TFile *file); | |
| 491 | |
| 492 /// Opens the file with the given filename and writes the | |
| 493 /// squished/reduced Dawg to the file. | |
| 494 bool write_squished_dawg(const char *filename) { | |
| 495 TFile file; | |
| 496 file.OpenWrite(nullptr); | |
| 497 if (!this->write_squished_dawg(&file)) { | |
| 498 tprintf("Error serializing %s\n", filename); | |
| 499 return false; | |
| 500 } | |
| 501 if (!file.CloseWrite(filename, nullptr)) { | |
| 502 tprintf("Error writing file %s\n", filename); | |
| 503 return false; | |
| 504 } | |
| 505 return true; | |
| 506 } | |
| 507 | |
| 508 private: | |
| 509 /// Sets the next node link for this edge. | |
| 510 inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) { | |
| 511 set_next_node_in_edge_rec(&(edges_[edge_ref]), value); | |
| 512 } | |
| 513 /// Sets the edge to be empty. | |
| 514 inline void set_empty_edge(EDGE_REF edge_ref) { | |
| 515 (edges_[edge_ref] = next_node_mask_); | |
| 516 } | |
| 517 /// Goes through all the edges and clears each one out. | |
| 518 inline void clear_all_edges() { | |
| 519 for (int edge = 0; edge < num_edges_; edge++) { | |
| 520 set_empty_edge(edge); | |
| 521 } | |
| 522 } | |
| 523 /// Clears the last flag of this edge. | |
| 524 inline void clear_marker_flag(EDGE_REF edge_ref) { | |
| 525 (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_)); | |
| 526 } | |
| 527 /// Returns true if this edge is in the forward direction. | |
| 528 inline bool forward_edge(EDGE_REF edge_ref) const { | |
| 529 return (edge_occupied(edge_ref) && | |
| 530 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); | |
| 531 } | |
| 532 /// Returns true if this edge is in the backward direction. | |
| 533 inline bool backward_edge(EDGE_REF edge_ref) const { | |
| 534 return (edge_occupied(edge_ref) && | |
| 535 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); | |
| 536 } | |
| 537 /// Returns true if the edge spot in this location is occupied. | |
| 538 inline bool edge_occupied(EDGE_REF edge_ref) const { | |
| 539 return (edges_[edge_ref] != next_node_mask_); | |
| 540 } | |
| 541 /// Returns true if this edge is the last edge in a sequence. | |
| 542 inline bool last_edge(EDGE_REF edge_ref) const { | |
| 543 return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0; | |
| 544 } | |
| 545 | |
| 546 /// Counts and returns the number of forward edges in this node. | |
| 547 int32_t num_forward_edges(NODE_REF node) const; | |
| 548 | |
| 549 /// Reads SquishedDawg from a file. | |
| 550 bool read_squished_dawg(TFile *file); | |
| 551 | |
| 552 /// Prints the contents of an edge indicated by the given EDGE_REF. | |
| 553 void print_edge(EDGE_REF edge) const; | |
| 554 | |
| 555 /// Prints the contents of the SquishedDawg. | |
| 556 void print_all(const char *msg) { | |
| 557 tprintf("\n__________________________\n%s\n", msg); | |
| 558 for (int i = 0; i < num_edges_; ++i) { | |
| 559 print_edge(i); | |
| 560 } | |
| 561 tprintf("__________________________\n"); | |
| 562 } | |
| 563 /// Constructs a mapping from the memory node indices to disk node indices. | |
| 564 std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes) const; | |
| 565 | |
| 566 // Member variables. | |
| 567 EDGE_ARRAY edges_ = nullptr; | |
| 568 int32_t num_edges_ = 0; | |
| 569 int num_forward_edges_in_node0 = 0; | |
| 570 }; | |
| 571 | |
| 572 } // namespace tesseract | |
| 573 | |
| 574 #endif // DICT_DAWG_H_ |
