Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/dict/dawg.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/dict/dawg.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,458 @@ +/******************************************************************************** + * + * File: dawg.cpp (Formerly dawg.c) + * Description: Use a Directed Acyclic Word Graph + * Author: Mark Seaman, OCR Technology + * + * (c) Copyright 1987, Hewlett-Packard Company. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *********************************************************************************/ +/*---------------------------------------------------------------------- + I n c l u d e s +----------------------------------------------------------------------*/ + +#include "dawg.h" + +#include "dict.h" +#include "helpers.h" +#include "tprintf.h" + +#include <memory> + +/*---------------------------------------------------------------------- + F u n c t i o n s f o r D a w g +----------------------------------------------------------------------*/ +namespace tesseract { + +// Destructor. +// It is defined here, so the compiler can create a single vtable +// instead of weak vtables in every compilation unit. +Dawg::~Dawg() = default; + +bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, + bool requires_complete) const { + if (word.empty()) { + return !requires_complete; + } + NODE_REF node = 0; + int end_index = word.length() - 1; + for (int i = 0; i < end_index; i++) { + EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); + if (edge == NO_EDGE) { + return false; + } + if ((node = next_node(edge)) == 0) { + // This only happens if all words following this edge terminate -- + // there are no larger words. See Trie::add_word_to_dawg() + return false; + } + } + // Now check the last character. + return edge_char_of(node, word.unichar_id(end_index), requires_complete) != + NO_EDGE; +} + +bool Dawg::word_in_dawg(const WERD_CHOICE &word) const { + return prefix_in_dawg(word, true); +} + +int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset, + bool enable_wildcard) const { + if (filename == nullptr) { + return 0; + } + + FILE *word_file; + char string[CHARS_PER_LINE]; + int misses = 0; + UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); + + word_file = fopen(filename, "r"); + if (word_file == nullptr) { + tprintf("Error: Could not open file %s\n", filename); + ASSERT_HOST(word_file); + } + + while (fgets(string, CHARS_PER_LINE, word_file) != nullptr) { + chomp_string(string); // remove newline + WERD_CHOICE word(string, unicharset); + if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { + if (!match_words(&word, 0, 0, + enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { + tprintf("Missing word: %s\n", string); + ++misses; + } + } else { + tprintf("Failed to create a valid word from %s\n", string); + } + } + fclose(word_file); + // Make sure the user sees this with fprintf instead of tprintf. + if (debug_level_) { + tprintf("Number of lost words=%d\n", misses); + } + return misses; +} + +void Dawg::iterate_words(const UNICHARSET &unicharset, + std::function<void(const WERD_CHOICE *)> cb) const { + WERD_CHOICE word(&unicharset); + iterate_words_rec(word, 0, cb); +} + +static void CallWithUTF8(const std::function<void(const char *)> &cb, + const WERD_CHOICE *wc) { + std::string s; + wc->string_and_lengths(&s, nullptr); + cb(s.c_str()); +} + +void Dawg::iterate_words(const UNICHARSET &unicharset, + const std::function<void(const char *)> &cb) const { + using namespace std::placeholders; // for _1 + std::function<void(const WERD_CHOICE *)> shim( + std::bind(CallWithUTF8, cb, _1)); + WERD_CHOICE word(&unicharset); + iterate_words_rec(word, 0, shim); +} + +void Dawg::iterate_words_rec( + const WERD_CHOICE &word_so_far, NODE_REF to_explore, + const std::function<void(const WERD_CHOICE *)> &cb) const { + NodeChildVector children; + this->unichar_ids_of(to_explore, &children, false); + for (auto &i : children) { + WERD_CHOICE next_word(word_so_far); + next_word.append_unichar_id(i.unichar_id, 1, 0.0, 0.0); + if (this->end_of_word(i.edge_ref)) { + cb(&next_word); + } + NODE_REF next = next_node(i.edge_ref); + if (next != 0) { + iterate_words_rec(next_word, next, cb); + } + } +} + +bool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, + UNICHAR_ID wildcard) const { + if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) { + bool any_matched = false; + NodeChildVector vec; + this->unichar_ids_of(node, &vec, false); + for (auto &i : vec) { + word->set_unichar_id(i.unichar_id, index); + if (match_words(word, index, node, wildcard)) { + any_matched = true; + } + } + word->set_unichar_id(wildcard, index); + return any_matched; + } else { + auto word_end = index == word->length() - 1; + auto edge = edge_char_of(node, word->unichar_id(index), word_end); + if (edge != NO_EDGE) { // normal edge in DAWG + node = next_node(edge); + if (word_end) { + if (debug_level_ > 1) { + word->print("match_words() found: "); + } + return true; + } else if (node != 0) { + return match_words(word, index + 1, node, wildcard); + } + } + } + return false; +} + +void Dawg::init(int unicharset_size) { + ASSERT_HOST(unicharset_size > 0); + unicharset_size_ = unicharset_size; + // Set bit masks. We will use the value unicharset_size_ as a null char, so + // the actual number of unichars is unicharset_size_ + 1. + flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0)); + next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS; + letter_mask_ = ~(~0ull << flag_start_bit_); + next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS); + flags_mask_ = ~(letter_mask_ | next_node_mask_); +} + +/*---------------------------------------------------------------------- + F u n c t i o n s f o r S q u i s h e d D a w g +----------------------------------------------------------------------*/ + +SquishedDawg::~SquishedDawg() { + delete[] edges_; +} + +EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, + bool word_end) const { + EDGE_REF edge = node; + if (node == 0) { // binary search + EDGE_REF start = 0; + EDGE_REF end = num_forward_edges_in_node0 - 1; + int compare; + while (start <= end) { + edge = (start + end) >> 1; // (start + end) / 2 + compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id, + edges_[edge]); + if (compare == 0) { // given == vec[k] + return edge; + } else if (compare == 1) { // given > vec[k] + start = edge + 1; + } else { // given < vec[k] + end = edge - 1; + } + } + } else { // linear search + if (edge != NO_EDGE && edge_occupied(edge)) { + do { + if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) && + (!word_end || end_of_word_from_edge_rec(edges_[edge]))) { + return (edge); + } + } while (!last_edge(edge++)); + } + } + return (NO_EDGE); // not found +} + +int32_t SquishedDawg::num_forward_edges(NODE_REF node) const { + EDGE_REF edge = node; + int32_t num = 0; + + if (forward_edge(edge)) { + do { + num++; + } while (!last_edge(edge++)); + } + + return (num); +} + +void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const { + if (node == NO_EDGE) { + return; // nothing to print + } + + EDGE_REF edge = node; + const char *forward_string = "FORWARD"; + const char *backward_string = " "; + + const char *last_string = "LAST"; + const char *not_last_string = " "; + + const char *eow_string = "EOW"; + const char *not_eow_string = " "; + + const char *direction; + const char *is_last; + const char *eow; + + UNICHAR_ID unichar_id; + + if (edge_occupied(edge)) { + do { + direction = forward_edge(edge) ? forward_string : backward_string; + is_last = last_edge(edge) ? last_string : not_last_string; + eow = end_of_word(edge) ? eow_string : not_eow_string; + + unichar_id = edge_letter(edge); + tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n", + edge, next_node(edge), unichar_id, direction, is_last, eow); + + if (edge - node > max_num_edges) { + return; + } + } while (!last_edge(edge++)); + + if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) { + do { + direction = forward_edge(edge) ? forward_string : backward_string; + is_last = last_edge(edge) ? last_string : not_last_string; + eow = end_of_word(edge) ? eow_string : not_eow_string; + + unichar_id = edge_letter(edge); + tprintf(REFFORMAT " : next = " REFFORMAT + ", unichar_id = %d, %s %s %s\n", + edge, next_node(edge), unichar_id, direction, is_last, eow); + + if (edge - node > MAX_NODE_EDGES_DISPLAY) { + return; + } + } while (!last_edge(edge++)); + } + } else { + tprintf(REFFORMAT " : no edges in this node\n", node); + } + tprintf("\n"); +} + +void SquishedDawg::print_edge(EDGE_REF edge) const { + if (edge == NO_EDGE) { + tprintf("NO_EDGE\n"); + } else { + tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = '%d', %s %s %s\n", + edge, next_node(edge), edge_letter(edge), + (forward_edge(edge) ? "FORWARD" : " "), + (last_edge(edge) ? "LAST" : " "), + (end_of_word(edge) ? "EOW" : "")); + } +} + +bool SquishedDawg::read_squished_dawg(TFile *file) { + if (debug_level_) { + tprintf("Reading squished dawg\n"); + } + + // Read the magic number and check that it matches kDawgMagicNumber, as + // auto-endian fixing should make sure it is always correct. + int16_t magic; + if (!file->DeSerialize(&magic)) { + return false; + } + if (magic != kDawgMagicNumber) { + tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber); + return false; + } + + int32_t unicharset_size; + if (!file->DeSerialize(&unicharset_size)) { + return false; + } + if (!file->DeSerialize(&num_edges_)) { + return false; + } + ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty + Dawg::init(unicharset_size); + + edges_ = new EDGE_RECORD[num_edges_]; + if (!file->DeSerialize(&edges_[0], num_edges_)) { + return false; + } + if (debug_level_ > 2) { + tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", + type_, lang_.c_str(), perm_, unicharset_size_, num_edges_); + for (EDGE_REF edge = 0; edge < num_edges_; ++edge) { + print_edge(edge); + } + } + return true; +} + +std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map( + int32_t *num_nodes) const { + EDGE_REF edge; + std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]); + int32_t node_counter; + int32_t num_edges; + + for (edge = 0; edge < num_edges_; edge++) { // init all slots + node_map[edge] = -1; + } + + node_counter = num_forward_edges(0); + + *num_nodes = 0; + for (edge = 0; edge < num_edges_; edge++) { // search all slots + + if (forward_edge(edge)) { + (*num_nodes)++; // count nodes links + node_map[edge] = (edge ? node_counter : 0); + num_edges = num_forward_edges(edge); + if (edge != 0) { + node_counter += num_edges; + } + edge += num_edges; + if (edge >= num_edges_) { + break; + } + if (backward_edge(edge)) { + while (!last_edge(edge++)) { + ; + } + } + edge--; + } + } + return node_map; +} + +bool SquishedDawg::write_squished_dawg(TFile *file) { + EDGE_REF edge; + int32_t num_edges; + int32_t node_count = 0; + EDGE_REF old_index; + EDGE_RECORD temp_record; + + if (debug_level_) { + tprintf("write_squished_dawg\n"); + } + + std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count)); + + // Write the magic number to help detecting a change in endianness. + int16_t magic = kDawgMagicNumber; + if (!file->Serialize(&magic)) { + return false; + } + if (!file->Serialize(&unicharset_size_)) { + return false; + } + + // Count the number of edges in this Dawg. + num_edges = 0; + for (edge = 0; edge < num_edges_; edge++) { + if (forward_edge(edge)) { + num_edges++; + } + } + + // Write edge count to file. + if (!file->Serialize(&num_edges)) { + return false; + } + + if (debug_level_) { + tprintf("%d nodes in DAWG\n", node_count); + tprintf("%d edges in DAWG\n", num_edges); + } + + for (edge = 0; edge < num_edges_; edge++) { + if (forward_edge(edge)) { // write forward edges + do { + old_index = next_node_from_edge_rec(edges_[edge]); + set_next_node(edge, node_map[old_index]); + temp_record = edges_[edge]; + if (!file->Serialize(&temp_record)) { + return false; + } + set_next_node(edge, old_index); + } while (!last_edge(edge++)); + + if (edge >= num_edges_) { + break; + } + if (backward_edge(edge)) { // skip back links + while (!last_edge(edge++)) { + ; + } + } + + edge--; + } + } + return true; +} + +} // namespace tesseract
