Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/dict/dawg.cpp comparison

comparison mupdf-source/thirdparty/tesseract/src/dict/dawg.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+/********************************************************************************
+*
+* File:         dawg.cpp  (Formerly dawg.c)
+* Description:  Use a Directed Acyclic Word Graph
+* Author:       Mark Seaman, OCR Technology
+*
+* (c) Copyright 1987, Hewlett-Packard Company.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*
+*********************************************************************************/
+/*----------------------------------------------------------------------
+I n c l u d e s
+----------------------------------------------------------------------*/
+#include "dawg.h"
+#include "dict.h"
+#include "helpers.h"
+#include "tprintf.h"
+#include <memory>
+/*----------------------------------------------------------------------
+F u n c t i o n s   f o r   D a w g
+----------------------------------------------------------------------*/
+namespace tesseract {
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+Dawg::~Dawg() = default;
+bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
+bool requires_complete) const {
+if (word.empty()) {
+return !requires_complete;
+}
+NODE_REF node = 0;
+int end_index = word.length() - 1;
+for (int i = 0; i < end_index; i++) {
+EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
+if (edge == NO_EDGE) {
+return false;
+}
+if ((node = next_node(edge)) == 0) {
+// This only happens if all words following this edge terminate --
+// there are no larger words.  See Trie::add_word_to_dawg()
+return false;
+}
+}
+// Now check the last character.
+return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
+NO_EDGE;
+}
+bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
+return prefix_in_dawg(word, true);
+}
+int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset,
+bool enable_wildcard) const {
+if (filename == nullptr) {
+return 0;
+}
+FILE *word_file;
+char string[CHARS_PER_LINE];
+int misses = 0;
+UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
+word_file = fopen(filename, "r");
+if (word_file == nullptr) {
+tprintf("Error: Could not open file %s\n", filename);
+ASSERT_HOST(word_file);
+}
+while (fgets(string, CHARS_PER_LINE, word_file) != nullptr) {
+chomp_string(string); // remove newline
+WERD_CHOICE word(string, unicharset);
+if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
+if (!match_words(&word, 0, 0,
+enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
+tprintf("Missing word: %s\n", string);
+++misses;
+}
+} else {
+tprintf("Failed to create a valid word from %s\n", string);
+}
+}
+fclose(word_file);
+// Make sure the user sees this with fprintf instead of tprintf.
+if (debug_level_) {
+tprintf("Number of lost words=%d\n", misses);
+}
+return misses;
+}
+void Dawg::iterate_words(const UNICHARSET &unicharset,
+std::function<void(const WERD_CHOICE *)> cb) const {
+WERD_CHOICE word(&unicharset);
+iterate_words_rec(word, 0, cb);
+}
+static void CallWithUTF8(const std::function<void(const char *)> &cb,
+const WERD_CHOICE *wc) {
+std::string s;
+wc->string_and_lengths(&s, nullptr);
+cb(s.c_str());
+}
+void Dawg::iterate_words(const UNICHARSET &unicharset,
+const std::function<void(const char *)> &cb) const {
+using namespace std::placeholders; // for _1
+std::function<void(const WERD_CHOICE *)> shim(
+std::bind(CallWithUTF8, cb, _1));
+WERD_CHOICE word(&unicharset);
+iterate_words_rec(word, 0, shim);
+}
+void Dawg::iterate_words_rec(
+const WERD_CHOICE &word_so_far, NODE_REF to_explore,
+const std::function<void(const WERD_CHOICE *)> &cb) const {
+NodeChildVector children;
+this->unichar_ids_of(to_explore, &children, false);
+for (auto &i : children) {
+WERD_CHOICE next_word(word_so_far);
+next_word.append_unichar_id(i.unichar_id, 1, 0.0, 0.0);
+if (this->end_of_word(i.edge_ref)) {
+cb(&next_word);
+}
+NODE_REF next = next_node(i.edge_ref);
+if (next != 0) {
+iterate_words_rec(next_word, next, cb);
+}
+}
+}
+bool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node,
+UNICHAR_ID wildcard) const {
+if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
+bool any_matched = false;
+NodeChildVector vec;
+this->unichar_ids_of(node, &vec, false);
+for (auto &i : vec) {
+word->set_unichar_id(i.unichar_id, index);
+if (match_words(word, index, node, wildcard)) {
+any_matched = true;
+}
+}
+word->set_unichar_id(wildcard, index);
+return any_matched;
+} else {
+auto word_end = index == word->length() - 1;
+auto edge = edge_char_of(node, word->unichar_id(index), word_end);
+if (edge != NO_EDGE) { // normal edge in DAWG
+node = next_node(edge);
+if (word_end) {
+if (debug_level_ > 1) {
+word->print("match_words() found: ");
+}
+return true;
+} else if (node != 0) {
+return match_words(word, index + 1, node, wildcard);
+}
+}
+}
+return false;
+}
+void Dawg::init(int unicharset_size) {
+ASSERT_HOST(unicharset_size > 0);
+unicharset_size_ = unicharset_size;
+// Set bit masks. We will use the value unicharset_size_ as a null char, so
+// the actual number of unichars is unicharset_size_ + 1.
+flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
+next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;
+letter_mask_ = ~(~0ull << flag_start_bit_);
+next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);
+flags_mask_ = ~(letter_mask_ | next_node_mask_);
+}
+/*----------------------------------------------------------------------
+F u n c t i o n s   f o r   S q u i s h e d    D a w g
+----------------------------------------------------------------------*/
+SquishedDawg::~SquishedDawg() {
+delete[] edges_;
+}
+EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
+bool word_end) const {
+EDGE_REF edge = node;
+if (node == 0) { // binary search
+EDGE_REF start = 0;
+EDGE_REF end = num_forward_edges_in_node0 - 1;
+int compare;
+while (start <= end) {
+edge = (start + end) >> 1; // (start + end) / 2
+compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id,
+edges_[edge]);
+if (compare == 0) { // given == vec[k]
+return edge;
+} else if (compare == 1) { // given > vec[k]
+start = edge + 1;
+} else { // given < vec[k]
+end = edge - 1;
+}
+}
+} else { // linear search
+if (edge != NO_EDGE && edge_occupied(edge)) {
+do {
+if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
+(!word_end || end_of_word_from_edge_rec(edges_[edge]))) {
+return (edge);
+}
+} while (!last_edge(edge++));
+}
+}
+return (NO_EDGE); // not found
+}
+int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
+EDGE_REF edge = node;
+int32_t num = 0;
+if (forward_edge(edge)) {
+do {
+num++;
+} while (!last_edge(edge++));
+}
+return (num);
+}
+void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
+if (node == NO_EDGE) {
+return; // nothing to print
+}
+EDGE_REF edge = node;
+const char *forward_string = "FORWARD";
+const char *backward_string = "       ";
+const char *last_string = "LAST";
+const char *not_last_string = "    ";
+const char *eow_string = "EOW";
+const char *not_eow_string = "   ";
+const char *direction;
+const char *is_last;
+const char *eow;
+UNICHAR_ID unichar_id;
+if (edge_occupied(edge)) {
+do {
+direction = forward_edge(edge) ? forward_string : backward_string;
+is_last = last_edge(edge) ? last_string : not_last_string;
+eow = end_of_word(edge) ? eow_string : not_eow_string;
+unichar_id = edge_letter(edge);
+tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
+edge, next_node(edge), unichar_id, direction, is_last, eow);
+if (edge - node > max_num_edges) {
+return;
+}
+} while (!last_edge(edge++));
+if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) {
+do {
+direction = forward_edge(edge) ? forward_string : backward_string;
+is_last = last_edge(edge) ? last_string : not_last_string;
+eow = end_of_word(edge) ? eow_string : not_eow_string;
+unichar_id = edge_letter(edge);
+tprintf(REFFORMAT " : next = " REFFORMAT
+", unichar_id = %d, %s %s %s\n",
+edge, next_node(edge), unichar_id, direction, is_last, eow);
+if (edge - node > MAX_NODE_EDGES_DISPLAY) {
+return;
+}
+} while (!last_edge(edge++));
+}
+} else {
+tprintf(REFFORMAT " : no edges in this node\n", node);
+}
+tprintf("\n");
+}
+void SquishedDawg::print_edge(EDGE_REF edge) const {
+if (edge == NO_EDGE) {
+tprintf("NO_EDGE\n");
+} else {
+tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = '%d', %s %s %s\n",
+edge, next_node(edge), edge_letter(edge),
+(forward_edge(edge) ? "FORWARD" : "       "),
+(last_edge(edge) ? "LAST" : "    "),
+(end_of_word(edge) ? "EOW" : ""));
+}
+}
+bool SquishedDawg::read_squished_dawg(TFile *file) {
+if (debug_level_) {
+tprintf("Reading squished dawg\n");
+}
+// Read the magic number and check that it matches kDawgMagicNumber, as
+// auto-endian fixing should make sure it is always correct.
+int16_t magic;
+if (!file->DeSerialize(&magic)) {
+return false;
+}
+if (magic != kDawgMagicNumber) {
+tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
+return false;
+}
+int32_t unicharset_size;
+if (!file->DeSerialize(&unicharset_size)) {
+return false;
+}
+if (!file->DeSerialize(&num_edges_)) {
+return false;
+}
+ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
+Dawg::init(unicharset_size);
+edges_ = new EDGE_RECORD[num_edges_];
+if (!file->DeSerialize(&edges_[0], num_edges_)) {
+return false;
+}
+if (debug_level_ > 2) {
+tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
+type_, lang_.c_str(), perm_, unicharset_size_, num_edges_);
+for (EDGE_REF edge = 0; edge < num_edges_; ++edge) {
+print_edge(edge);
+}
+}
+return true;
+}
+std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
+int32_t *num_nodes) const {
+EDGE_REF edge;
+std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
+int32_t node_counter;
+int32_t num_edges;
+for (edge = 0; edge < num_edges_; edge++) { // init all slots
+node_map[edge] = -1;
+}
+node_counter = num_forward_edges(0);
+*num_nodes = 0;
+for (edge = 0; edge < num_edges_; edge++) { // search all slots
+if (forward_edge(edge)) {
+(*num_nodes)++; // count nodes links
+node_map[edge] = (edge ? node_counter : 0);
+num_edges = num_forward_edges(edge);
+if (edge != 0) {
+node_counter += num_edges;
+}
+edge += num_edges;
+if (edge >= num_edges_) {
+break;
+}
+if (backward_edge(edge)) {
+while (!last_edge(edge++)) {
+;
+}
+}
+edge--;
+}
+}
+return node_map;
+}
+bool SquishedDawg::write_squished_dawg(TFile *file) {
+EDGE_REF edge;
+int32_t num_edges;
+int32_t node_count = 0;
+EDGE_REF old_index;
+EDGE_RECORD temp_record;
+if (debug_level_) {
+tprintf("write_squished_dawg\n");
+}
+std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
+// Write the magic number to help detecting a change in endianness.
+int16_t magic = kDawgMagicNumber;
+if (!file->Serialize(&magic)) {
+return false;
+}
+if (!file->Serialize(&unicharset_size_)) {
+return false;
+}
+// Count the number of edges in this Dawg.
+num_edges = 0;
+for (edge = 0; edge < num_edges_; edge++) {
+if (forward_edge(edge)) {
+num_edges++;
+}
+}
+// Write edge count to file.
+if (!file->Serialize(&num_edges)) {
+return false;
+}
+if (debug_level_) {
+tprintf("%d nodes in DAWG\n", node_count);
+tprintf("%d edges in DAWG\n", num_edges);
+}
+for (edge = 0; edge < num_edges_; edge++) {
+if (forward_edge(edge)) { // write forward edges
+do {
+old_index = next_node_from_edge_rec(edges_[edge]);
+set_next_node(edge, node_map[old_index]);
+temp_record = edges_[edge];
+if (!file->Serialize(&temp_record)) {
+return false;
+}
+set_next_node(edge, old_index);
+} while (!last_edge(edge++));
+if (edge >= num_edges_) {
+break;
+}
+if (backward_edge(edge)) { // skip back links
+while (!last_edge(edge++)) {
+;
+}
+}
+edge--;
+}
+}
+return true;
+}
+} // namespace tesseract

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/dict/dawg.cpp @ 2:b50eed0cc0ef upstream