comparison mupdf-source/thirdparty/tesseract/src/dict/dawg.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************************
2 *
3 * File: dawg.h
4 * Description: Definition of a class that represents Directed Acyclic Word
5 * Graph (DAWG), functions to build and manipulate the DAWG.
6 * Author: Mark Seaman, SW Productivity
7 *
8 * (c) Copyright 1987, Hewlett-Packard Company.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 *****************************************************************************/
20
21 #ifndef DICT_DAWG_H_
22 #define DICT_DAWG_H_
23
24 /*----------------------------------------------------------------------
25 I n c l u d e s
26 ----------------------------------------------------------------------*/
27
28 #include <cinttypes> // for PRId64
29 #include <functional> // for std::function
30 #include <memory>
31 #include "elst.h"
32 #include "params.h"
33 #include "ratngs.h"
34
35 #ifndef __GNUC__
36 # ifdef _WIN32
37 # define NO_EDGE static_cast<int64_t>(0xffffffffffffffffi64)
38 # endif /*_WIN32*/
39 #else
40 # define NO_EDGE static_cast<int64_t>(0xffffffffffffffffll)
41 #endif /*__GNUC__*/
42
43 namespace tesseract {
44
45 class UNICHARSET;
46
47 using EDGE_RECORD = uint64_t;
48 using EDGE_ARRAY = EDGE_RECORD *;
49 using EDGE_REF = int64_t;
50 using NODE_REF = int64_t;
51 using NODE_MAP = EDGE_REF *;
52
53 struct NodeChild {
54 UNICHAR_ID unichar_id;
55 EDGE_REF edge_ref;
56 NodeChild(UNICHAR_ID id, EDGE_REF ref) : unichar_id(id), edge_ref(ref) {}
57 NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
58 };
59
60 using NodeChildVector = std::vector<NodeChild>;
61 using SuccessorList = std::vector<int>;
62 using SuccessorListsVector = std::vector<SuccessorList *>;
63
64 enum DawgType {
65 DAWG_TYPE_PUNCTUATION,
66 DAWG_TYPE_WORD,
67 DAWG_TYPE_NUMBER,
68 DAWG_TYPE_PATTERN,
69
70 DAWG_TYPE_COUNT // number of enum entries
71 };
72
73 /*----------------------------------------------------------------------
74 C o n s t a n t s
75 ----------------------------------------------------------------------*/
76
77 #define FORWARD_EDGE static_cast<int32_t>(0)
78 #define BACKWARD_EDGE static_cast<int32_t>(1)
79 #define MAX_NODE_EDGES_DISPLAY static_cast<int64_t>(100)
80 #define MARKER_FLAG static_cast<int64_t>(1)
81 #define DIRECTION_FLAG static_cast<int64_t>(2)
82 #define WERD_END_FLAG static_cast<int64_t>(4)
83 #define LETTER_START_BIT 0
84 #define NUM_FLAG_BITS 3
85 #define REFFORMAT "%" PRId64
86
87 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
88 {false, true, true, false}, // for DAWG_TYPE_PUNCTUATION
89 {true, false, false, false}, // for DAWG_TYPE_WORD
90 {true, false, false, false}, // for DAWG_TYPE_NUMBER
91 {false, false, false, false}, // for DAWG_TYPE_PATTERN
92 };
93
94 static const char kWildcard[] = "*";
95
96 /*----------------------------------------------------------------------
97 C l a s s e s a n d S t r u c t s
98 ----------------------------------------------------------------------*/
99 //
100 /// Abstract class (an interface) that declares methods needed by the
101 /// various tesseract classes to operate on SquishedDawg and Trie objects.
102 ///
103 /// This class initializes all the edge masks (since their usage by
104 /// SquishedDawg and Trie is identical) and implements simple accessors
105 /// for each of the fields encoded in an EDGE_RECORD.
106 /// This class also implements word_in_dawg() and check_for_words()
107 /// (since they use only the public methods of SquishedDawg and Trie
108 /// classes that are inherited from the Dawg base class).
109 //
110 class TESS_API Dawg {
111 public:
112 /// Magic number to determine endianness when reading the Dawg from file.
113 static const int16_t kDawgMagicNumber = 42;
114 /// A special unichar id that indicates that any appropriate pattern
115 /// (e.g.dictionary word, 0-9 digit, etc) can be inserted instead
116 /// Used for expressing patterns in punctuation and number Dawgs.
117 static const UNICHAR_ID kPatternUnicharID = 0;
118
119 inline DawgType type() const {
120 return type_;
121 }
122 inline const std::string &lang() const {
123 return lang_;
124 }
125 inline PermuterType permuter() const {
126 return perm_;
127 }
128
129 virtual ~Dawg();
130
131 /// Returns true if the given word is in the Dawg.
132 bool word_in_dawg(const WERD_CHOICE &word) const;
133
134 // Returns true if the given word prefix is not contraindicated by the dawg.
135 // If requires_complete is true, then the exact complete word must be present.
136 bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
137
138 /// Checks the Dawg for the words that are listed in the requested file.
139 /// Returns the number of words in the given file missing from the Dawg.
140 int check_for_words(const char *filename, const UNICHARSET &unicharset,
141 bool enable_wildcard) const;
142
143 // For each word in the Dawg, call the given (permanent) callback with the
144 // text (UTF-8) version of the word.
145 void iterate_words(const UNICHARSET &unicharset,
146 std::function<void(const WERD_CHOICE *)> cb) const;
147
148 // For each word in the Dawg, call the given (permanent) callback with the
149 // text (UTF-8) version of the word.
150 void iterate_words(const UNICHARSET &unicharset,
151 const std::function<void(const char *)> &cb) const;
152
153 // Pure virtual function that should be implemented by the derived classes.
154
155 /// Returns the edge that corresponds to the letter out of this node.
156 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
157 bool word_end) const = 0;
158
159 /// Fills the given NodeChildVector with all the unichar ids (and the
160 /// corresponding EDGE_REFs) for which there is an edge out of this node.
161 virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
162 bool word_end) const = 0;
163
164 /// Returns the next node visited by following the edge
165 /// indicated by the given EDGE_REF.
166 virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
167
168 /// Returns true if the edge indicated by the given EDGE_REF
169 /// marks the end of a word.
170 virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
171
172 /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
173 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
174
175 /// Prints the contents of the node indicated by the given NODE_REF.
176 /// At most max_num_edges will be printed.
177 virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
178
179 /// Fills vec with unichar ids that represent the character classes
180 /// of the given unichar_id.
181 virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
182 const UNICHARSET &unicharset,
183 std::vector<UNICHAR_ID> *vec) const {
184 (void)unichar_id;
185 (void)unicharset;
186 (void)vec;
187 }
188
189 /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
190 /// a self loop and the given unichar_id matches the unichar_id stored in the
191 /// EDGE_RECORD, returns NO_EDGE otherwise.
192 virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id,
193 bool word_end) const {
194 (void)edge_ref;
195 (void)unichar_id;
196 (void)word_end;
197 return false;
198 }
199
200 protected:
201 Dawg(DawgType type, const std::string &lang, PermuterType perm,
202 int debug_level)
203 : lang_(lang),
204 type_(type),
205 perm_(perm),
206 unicharset_size_(0),
207 debug_level_(debug_level) {}
208
209 /// Returns the next node visited by following this edge.
210 inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
211 return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
212 }
213 /// Returns the marker flag of this edge.
214 inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
215 return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
216 }
217 /// Returns the direction flag of this edge.
218 inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
219 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? BACKWARD_EDGE
220 : FORWARD_EDGE;
221 }
222 /// Returns true if this edge marks the end of a word.
223 inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
224 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
225 }
226 /// Returns UNICHAR_ID recorded in this edge.
227 inline UNICHAR_ID unichar_id_from_edge_rec(
228 const EDGE_RECORD &edge_rec) const {
229 return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
230 }
231 /// Sets the next node link for this edge in the Dawg.
232 inline void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value) {
233 *edge_rec &= (~next_node_mask_);
234 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
235 }
236 /// Sets this edge record to be the last one in a sequence of edges.
237 inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
238 *edge_rec |= (MARKER_FLAG << flag_start_bit_);
239 }
240 /// Sequentially compares the given values of unichar ID, next node
241 /// and word end marker with the values in the given EDGE_RECORD.
242 /// Returns: 1 if at any step the given input value exceeds
243 /// that of edge_rec (and all the values already
244 /// checked are the same)
245 /// 0 if edge_rec_match() returns true
246 /// -1 otherwise
247 inline int given_greater_than_edge_rec(NODE_REF next_node, bool word_end,
248 UNICHAR_ID unichar_id,
249 const EDGE_RECORD &edge_rec) const {
250 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
251 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
252 bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
253 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
254 curr_word_end, curr_unichar_id)) {
255 return 0;
256 }
257 if (unichar_id > curr_unichar_id) {
258 return 1;
259 }
260 if (unichar_id == curr_unichar_id) {
261 if (next_node > curr_next_node) {
262 return 1;
263 }
264 if (next_node == curr_next_node) {
265 if (word_end > curr_word_end) {
266 return 1;
267 }
268 }
269 }
270 return -1;
271 }
272 /// Returns true if all the values are equal (any value matches
273 /// next_node if next_node == NO_EDGE, any value matches word_end
274 /// if word_end is false).
275 inline bool edge_rec_match(NODE_REF next_node, bool word_end,
276 UNICHAR_ID unichar_id, NODE_REF other_next_node,
277 bool other_word_end,
278 UNICHAR_ID other_unichar_id) const {
279 return ((unichar_id == other_unichar_id) &&
280 (next_node == NO_EDGE || next_node == other_next_node) &&
281 (!word_end || (word_end == other_word_end)));
282 }
283
284 /// Sets unicharset_size_.
285 /// Initializes the values of various masks from unicharset_size_.
286 void init(int unicharset_size);
287
288 /// Matches all of the words that are represented by this string.
289 /// If wildcard is set to something other than INVALID_UNICHAR_ID,
290 /// the *'s in this string are interpreted as wildcards.
291 /// WERD_CHOICE param is not passed by const so that wildcard searches
292 /// can modify it and work without having to copy WERD_CHOICEs.
293 bool match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node,
294 UNICHAR_ID wildcard) const;
295
296 // Recursively iterate over all words in a dawg (see public iterate_words).
297 void iterate_words_rec(
298 const WERD_CHOICE &word_so_far, NODE_REF to_explore,
299 const std::function<void(const WERD_CHOICE *)> &cb) const;
300
301 // Member Variables.
302 std::string lang_;
303 DawgType type_;
304 /// Permuter code that should be used if the word is found in this Dawg.
305 PermuterType perm_;
306 // Variables to construct various edge masks. Formerly:
307 // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64
308 // #define FLAGS_MASK (int64_t) 0x0000000700000000i64
309 // #define LETTER_MASK (int64_t) 0x00000000ffffffffi64
310 uint64_t next_node_mask_ = 0;
311 uint64_t flags_mask_ = 0;
312 uint64_t letter_mask_ = 0;
313 int unicharset_size_;
314 int flag_start_bit_ = 0;
315 int next_node_start_bit_ = 0;
316 // Level of debug statements to print to stdout.
317 int debug_level_;
318 };
319
320 //
321 // DawgPosition keeps track of where we are in the primary dawg we're searching
322 // as well as where we may be in the "punctuation dawg" which may provide
323 // surrounding context.
324 //
325 // Example:
326 // punctuation dawg -- space is the "pattern character"
327 // " " // no punctuation
328 // "' '" // leading and trailing apostrophes
329 // " '" // trailing apostrophe
330 // word dawg:
331 // "cat"
332 // "cab"
333 // "cat's"
334 //
335 // DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
336 //
337 // DawgPosition(-1, NO_EDGE, p, pe, false)
338 // We're in the punctuation dawg, no other dawg has been started.
339 // (1) If there's a pattern edge as a punc dawg child of us,
340 // for each punc-following dawg starting with ch, produce:
341 // Result: DawgPosition(k, w, p', false)
342 // (2) If there's a valid continuation in the punc dawg, produce:
343 // Result: DawgPosition(-k, NO_EDGE, p', false)
344 //
345 // DawgPosition(k, w, -1, NO_EDGE, false)
346 // We're in dawg k. Going back to punctuation dawg is not an option.
347 // Follow ch in dawg k.
348 //
349 // DawgPosition(k, w, p, pe, false)
350 // We're in dawg k. Continue in dawg k and/or go back to the punc dawg.
351 // If ending, check that the punctuation dawg is also ok to end here.
352 //
353 // DawgPosition(k, w, p, pe true)
354 // We're back in the punctuation dawg. Continuing there is the only option.
355 struct DawgPosition {
356 DawgPosition() = default;
357 DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref,
358 bool backtopunc)
359 : dawg_ref(dawgref),
360 punc_ref(puncref),
361 dawg_index(dawg_idx),
362 punc_index(punc_idx),
363 back_to_punc(backtopunc) {}
364 bool operator==(const DawgPosition &other) const {
365 return dawg_index == other.dawg_index && dawg_ref == other.dawg_ref &&
366 punc_index == other.punc_index && punc_ref == other.punc_ref &&
367 back_to_punc == other.back_to_punc;
368 }
369
370 EDGE_REF dawg_ref = NO_EDGE;
371 EDGE_REF punc_ref = NO_EDGE;
372 int8_t dawg_index = -1;
373 int8_t punc_index = -1;
374 // Have we returned to the punc dawg at the end of the word?
375 bool back_to_punc = false;
376 };
377
378 class DawgPositionVector : public std::vector<DawgPosition> {
379 public:
380 /// Adds an entry for the given dawg_index with the given node to the vec.
381 /// Returns false if the same entry already exists in the vector,
382 /// true otherwise.
383 inline bool add_unique(const DawgPosition &new_pos, bool debug,
384 const char *debug_msg) {
385 for (auto &&position : *this) {
386 if (position == new_pos) {
387 return false;
388 }
389 }
390 push_back(new_pos);
391 if (debug) {
392 tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n", debug_msg,
393 new_pos.dawg_index, new_pos.dawg_ref, new_pos.punc_ref,
394 new_pos.back_to_punc ? " returned" : "");
395 }
396 return true;
397 }
398 };
399
400 //
401 /// Concrete class that can operate on a compacted (squished) Dawg (read,
402 /// search and write to file). This class is read-only in the sense that
403 /// new words cannot be added to an instance of SquishedDawg.
404 /// The underlying representation of the nodes and edges in SquishedDawg
405 /// is stored as a contiguous EDGE_ARRAY (read from file or given as an
406 /// argument to the constructor).
407 //
408 class TESS_API SquishedDawg : public Dawg {
409 public:
410 SquishedDawg(DawgType type, const std::string &lang, PermuterType perm,
411 int debug_level)
412 : Dawg(type, lang, perm, debug_level) {}
413 SquishedDawg(const char *filename, DawgType type, const std::string &lang,
414 PermuterType perm, int debug_level)
415 : Dawg(type, lang, perm, debug_level) {
416 TFile file;
417 ASSERT_HOST(file.Open(filename, nullptr));
418 ASSERT_HOST(read_squished_dawg(&file));
419 num_forward_edges_in_node0 = num_forward_edges(0);
420 }
421 SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
422 const std::string &lang, PermuterType perm, int unicharset_size,
423 int debug_level)
424 : Dawg(type, lang, perm, debug_level),
425 edges_(edges),
426 num_edges_(num_edges) {
427 init(unicharset_size);
428 num_forward_edges_in_node0 = num_forward_edges(0);
429 if (debug_level > 3) {
430 print_all("SquishedDawg:");
431 }
432 }
433 ~SquishedDawg() override;
434
435 // Loads using the given TFile. Returns false on failure.
436 bool Load(TFile *fp) {
437 if (!read_squished_dawg(fp)) {
438 return false;
439 }
440 num_forward_edges_in_node0 = num_forward_edges(0);
441 return true;
442 }
443
444 int NumEdges() {
445 return num_edges_;
446 }
447
448 /// Returns the edge that corresponds to the letter out of this node.
449 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
450 bool word_end) const override;
451
452 /// Fills the given NodeChildVector with all the unichar ids (and the
453 /// corresponding EDGE_REFs) for which there is an edge out of this node.
454 void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
455 bool word_end) const override {
456 EDGE_REF edge = node;
457 if (!edge_occupied(edge) || edge == NO_EDGE) {
458 return;
459 }
460 assert(forward_edge(edge)); // we don't expect any backward edges to
461 do { // be present when this function is called
462 if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
463 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
464 }
465 } while (!last_edge(edge++));
466 }
467
468 /// Returns the next node visited by following the edge
469 /// indicated by the given EDGE_REF.
470 NODE_REF next_node(EDGE_REF edge) const override {
471 return next_node_from_edge_rec((edges_[edge]));
472 }
473
474 /// Returns true if the edge indicated by the given EDGE_REF
475 /// marks the end of a word.
476 bool end_of_word(EDGE_REF edge_ref) const override {
477 return end_of_word_from_edge_rec((edges_[edge_ref]));
478 }
479
480 /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
481 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
482 return unichar_id_from_edge_rec((edges_[edge_ref]));
483 }
484
485 /// Prints the contents of the node indicated by the given NODE_REF.
486 /// At most max_num_edges will be printed.
487 void print_node(NODE_REF node, int max_num_edges) const override;
488
489 /// Writes the squished/reduced Dawg to a file.
490 bool write_squished_dawg(TFile *file);
491
492 /// Opens the file with the given filename and writes the
493 /// squished/reduced Dawg to the file.
494 bool write_squished_dawg(const char *filename) {
495 TFile file;
496 file.OpenWrite(nullptr);
497 if (!this->write_squished_dawg(&file)) {
498 tprintf("Error serializing %s\n", filename);
499 return false;
500 }
501 if (!file.CloseWrite(filename, nullptr)) {
502 tprintf("Error writing file %s\n", filename);
503 return false;
504 }
505 return true;
506 }
507
508 private:
509 /// Sets the next node link for this edge.
510 inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
511 set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
512 }
513 /// Sets the edge to be empty.
514 inline void set_empty_edge(EDGE_REF edge_ref) {
515 (edges_[edge_ref] = next_node_mask_);
516 }
517 /// Goes through all the edges and clears each one out.
518 inline void clear_all_edges() {
519 for (int edge = 0; edge < num_edges_; edge++) {
520 set_empty_edge(edge);
521 }
522 }
523 /// Clears the last flag of this edge.
524 inline void clear_marker_flag(EDGE_REF edge_ref) {
525 (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
526 }
527 /// Returns true if this edge is in the forward direction.
528 inline bool forward_edge(EDGE_REF edge_ref) const {
529 return (edge_occupied(edge_ref) &&
530 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
531 }
532 /// Returns true if this edge is in the backward direction.
533 inline bool backward_edge(EDGE_REF edge_ref) const {
534 return (edge_occupied(edge_ref) &&
535 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
536 }
537 /// Returns true if the edge spot in this location is occupied.
538 inline bool edge_occupied(EDGE_REF edge_ref) const {
539 return (edges_[edge_ref] != next_node_mask_);
540 }
541 /// Returns true if this edge is the last edge in a sequence.
542 inline bool last_edge(EDGE_REF edge_ref) const {
543 return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
544 }
545
546 /// Counts and returns the number of forward edges in this node.
547 int32_t num_forward_edges(NODE_REF node) const;
548
549 /// Reads SquishedDawg from a file.
550 bool read_squished_dawg(TFile *file);
551
552 /// Prints the contents of an edge indicated by the given EDGE_REF.
553 void print_edge(EDGE_REF edge) const;
554
555 /// Prints the contents of the SquishedDawg.
556 void print_all(const char *msg) {
557 tprintf("\n__________________________\n%s\n", msg);
558 for (int i = 0; i < num_edges_; ++i) {
559 print_edge(i);
560 }
561 tprintf("__________________________\n");
562 }
563 /// Constructs a mapping from the memory node indices to disk node indices.
564 std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes) const;
565
566 // Member variables.
567 EDGE_ARRAY edges_ = nullptr;
568 int32_t num_edges_ = 0;
569 int num_forward_edges_in_node0 = 0;
570 };
571
572 } // namespace tesseract
573
574 #endif // DICT_DAWG_H_