Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/dict/permdawg.cpp comparison

comparison mupdf-source/thirdparty/tesseract/src/dict/permdawg.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+/******************************************************************************
+*
+* File:         permdawg.cpp  (Formerly permdawg.c)
+* Description:  Scale word choices by a dictionary
+* Author:       Mark Seaman, OCR Technology
+*
+* (c) Copyright 1987, Hewlett-Packard Company.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*
+*****************************************************************************/
+/*----------------------------------------------------------------------
+I n c l u d e s
+----------------------------------------------------------------------*/
+#include "dawg.h"
+#include "params.h"
+#include "stopper.h"
+#include "tprintf.h"
+#include <algorithm>
+#include <cctype>
+#include "dict.h"
+/*----------------------------------------------------------------------
+F u n c t i o n s
+----------------------------------------------------------------------*/
+namespace tesseract {
+/**
+* @name go_deeper_dawg_fxn
+*
+* If the choice being composed so far could be a dictionary word
+* keep exploring choices.
+*/
+void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+bool word_ending, WERD_CHOICE *word, float certainties[],
+float *limit, WERD_CHOICE *best_choice, int *attempts_left,
+void *void_more_args) {
+auto *more_args = static_cast<DawgArgs *>(void_more_args);
+word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
+int word_index = word->length() - 1;
+if (best_choice->rating() < *limit) {
+return;
+}
+// Look up char in DAWG
+// If the current unichar is an ngram first try calling
+// letter_is_okay() for each unigram it contains separately.
+UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
+bool checked_unigrams = false;
+if (getUnicharset().get_isngram(orig_uch_id)) {
+if (dawg_debug_level) {
+tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
+}
+int num_unigrams = 0;
+word->remove_last_unichar_id();
+std::vector<UNICHAR_ID> encoding;
+const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
+// Since the string came out of the unicharset, failure is impossible.
+ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
+bool unigrams_ok = true;
+// Construct DawgArgs that reflect the current state.
+DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
+DawgPositionVector unigram_updated_dawgs;
+DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
+// Check unigrams in the ngram with letter_is_okay().
+for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
+UNICHAR_ID uch_id = encoding[i];
+ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
+++num_unigrams;
+word->append_unichar_id(uch_id, 1, 0.0, 0.0);
+unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
+word->unichar_id(word_index + num_unigrams - 1),
+word_ending && i == encoding.size() - 1);
+(*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
+if (dawg_debug_level) {
+tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
+unigrams_ok ? "OK" : "not OK");
+}
+}
+// Restore the word and copy the updated dawg state if needed.
+while (num_unigrams-- > 0) {
+word->remove_last_unichar_id();
+}
+word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
+if (unigrams_ok) {
+checked_unigrams = true;
+more_args->permuter = unigram_dawg_args.permuter;
+*(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
+}
+}
+// Check which dawgs from the dawgs_ vector contain the word
+// up to and including the current unichar.
+if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
+word->unichar_id(word_index), word_ending)) {
+// Add a new word choice
+if (word_ending) {
+if (dawg_debug_level) {
+tprintf("found word = %s\n", word->debug_string().c_str());
+}
+if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
+if (output_ambig_words_file_ == nullptr) {
+output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
+if (output_ambig_words_file_ == nullptr) {
+tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
+exit(1);
+}
+std::string word_str;
+word->string_and_lengths(&word_str, nullptr);
+word_str += " ";
+fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+}
+std::string word_str;
+word->string_and_lengths(&word_str, nullptr);
+word_str += " ";
+fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+}
+WERD_CHOICE *adjusted_word = word;
+adjusted_word->set_permuter(more_args->permuter);
+update_best_choice(*adjusted_word, best_choice);
+} else { // search the next letter
+// Make updated_* point to the next entries in the DawgPositionVector
+// arrays (that were originally created in dawg_permute_and_select)
+++(more_args->updated_dawgs);
+// Make active_dawgs and constraints point to the updated ones.
+++(more_args->active_dawgs);
+permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
+certainties, limit, best_choice, attempts_left, more_args);
+// Restore previous state to explore another letter in this position.
+--(more_args->updated_dawgs);
+--(more_args->active_dawgs);
+}
+} else {
+if (dawg_debug_level) {
+tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
+}
+}
+}
+/**
+* dawg_permute_and_select
+*
+* Recursively explore all the possible character combinations in
+* the given char_choices. Use go_deeper_dawg_fxn() to search all the
+* dawgs in the dawgs_ vector in parallel and discard invalid words.
+*
+* Allocate and return a WERD_CHOICE with the best valid word found.
+*/
+WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
+float rating_limit) {
+auto *best_choice = new WERD_CHOICE(&getUnicharset());
+best_choice->make_bad();
+best_choice->set_rating(rating_limit);
+if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
+return best_choice;
+}
+auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
+init_active_dawgs(&(active_dawgs[0]), true);
+DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
+WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
+float certainties[MAX_WERD_LENGTH];
+this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
+int attempts_left = max_permuter_attempts;
+permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
+&word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
+delete[] active_dawgs;
+return best_choice;
+}
+/**
+* permute_choices
+*
+* Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
+* with the given char_choice_index in char_choices.
+*/
+void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+WERD_CHOICE *word, float certainties[], float *limit,
+WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
+if (debug) {
+tprintf(
+"%s permute_choices: char_choice_index=%d"
+" limit=%g rating=%g, certainty=%g word=%s\n",
+debug, char_choice_index, *limit, word->rating(), word->certainty(),
+word->debug_string().c_str());
+}
+if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
+BLOB_CHOICE_IT blob_choice_it;
+blob_choice_it.set_to_list(char_choices.at(char_choice_index));
+for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
+(*attempts_left)--;
+append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
+prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
+more_args);
+if (*attempts_left <= 0) {
+if (debug) {
+tprintf("permute_choices(): attempts_left is 0\n");
+}
+break;
+}
+}
+}
+}
+/**
+* append_choices
+*
+* Checks to see whether or not the next choice is worth appending to
+* the word being generated. If so then keeps going deeper into the word.
+*
+* This function assumes that Dict::go_deeper_fxn_ is set.
+*/
+void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+const BLOB_CHOICE &blob_choice, int char_choice_index,
+const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
+float certainties[], float *limit, WERD_CHOICE *best_choice,
+int *attempts_left, void *more_args) {
+auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
+// Deal with fragments.
+CHAR_FRAGMENT_INFO char_frag_info;
+if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
+prev_char_frag_info, debug, word_ending, &char_frag_info)) {
+return; // blob_choice must be an invalid fragment
+}
+// Search the next letter if this character is a fragment.
+if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
+permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
+limit, best_choice, attempts_left, more_args);
+return;
+}
+// Add the next unichar.
+float old_rating = word->rating();
+float old_certainty = word->certainty();
+uint8_t old_permuter = word->permuter();
+certainties[word->length()] = char_frag_info.certainty;
+word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
+char_frag_info.rating, char_frag_info.certainty);
+// Explore the next unichar.
+(this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
+word, certainties, limit, best_choice, attempts_left, more_args);
+// Remove the unichar we added to explore other choices in it's place.
+word->remove_last_unichar_id();
+word->set_rating(old_rating);
+word->set_certainty(old_certainty);
+word->set_permuter(old_permuter);
+}
+/**
+* @name fragment_state
+*
+* Given the current char choice and information about previously seen
+* fragments, determines whether adjacent character fragments are
+* present and whether they can be concatenated.
+*
+* The given prev_char_frag_info contains:
+* - fragment: if not nullptr contains information about immediately
+*   preceding fragmented character choice
+* - num_fragments: number of fragments that have been used so far
+*   to construct a character
+* - certainty: certainty of the current choice or minimum
+*   certainty of all fragments concatenated so far
+* - rating: rating of the current choice or sum of fragment
+*   ratings concatenated so far
+*
+* The output char_frag_info is filled in as follows:
+* - character: is set to be nullptr if the choice is a non-matching
+*   or non-ending fragment piece; is set to unichar of the given choice
+*   if it represents a regular character or a matching ending fragment
+* - fragment,num_fragments,certainty,rating are set as described above
+*
+* @returns false if a non-matching fragment is discovered, true otherwise.
+*/
+bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
+const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
+int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {
+const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
+const CHAR_FRAGMENT *prev_fragment =
+prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
+// Print debug info for fragments.
+if (debug && (prev_fragment || this_fragment)) {
+tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
+getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
+if (prev_fragment) {
+tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
+}
+if (this_fragment) {
+tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
+}
+}
+char_frag_info->unichar_id = curr_unichar_id;
+char_frag_info->fragment = this_fragment;
+char_frag_info->rating = curr_rating;
+char_frag_info->certainty = curr_certainty;
+char_frag_info->num_fragments = 1;
+if (prev_fragment && !this_fragment) {
+if (debug) {
+tprintf("Skip choice with incomplete fragment\n");
+}
+return false;
+}
+if (this_fragment) {
+// We are dealing with a fragment.
+char_frag_info->unichar_id = INVALID_UNICHAR_ID;
+if (prev_fragment) {
+if (!this_fragment->is_continuation_of(prev_fragment)) {
+if (debug) {
+tprintf("Non-matching fragment piece\n");
+}
+return false;
+}
+if (this_fragment->is_ending()) {
+char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
+char_frag_info->fragment = nullptr;
+if (debug) {
+tprintf("Built character %s from fragments\n",
+getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
+}
+} else {
+if (debug) {
+tprintf("Record fragment continuation\n");
+}
+char_frag_info->fragment = this_fragment;
+}
+// Update certainty and rating.
+char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
+char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
+char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
+} else {
+if (this_fragment->is_beginning()) {
+if (debug) {
+tprintf("Record fragment beginning\n");
+}
+} else {
+if (debug) {
+tprintf("Non-starting fragment piece with no prev_fragment\n");
+}
+return false;
+}
+}
+}
+if (word_ending && char_frag_info->fragment) {
+if (debug) {
+tprintf("Word cannot end with a fragment\n");
+}
+return false;
+}
+return true;
+}
+} // namespace tesseract

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/dict/permdawg.cpp @ 2:b50eed0cc0ef upstream