diff mupdf-source/thirdparty/tesseract/src/dict/permdawg.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/dict/permdawg.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,367 @@
+/******************************************************************************
+ *
+ * File:         permdawg.cpp  (Formerly permdawg.c)
+ * Description:  Scale word choices by a dictionary
+ * Author:       Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+/*----------------------------------------------------------------------
+              I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include "dawg.h"
+#include "params.h"
+#include "stopper.h"
+#include "tprintf.h"
+
+#include <algorithm>
+#include <cctype>
+#include "dict.h"
+
+/*----------------------------------------------------------------------
+              F u n c t i o n s
+----------------------------------------------------------------------*/
+namespace tesseract {
+
+/**
+ * @name go_deeper_dawg_fxn
+ *
+ * If the choice being composed so far could be a dictionary word
+ * keep exploring choices.
+ */
+void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+                              int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+                              bool word_ending, WERD_CHOICE *word, float certainties[],
+                              float *limit, WERD_CHOICE *best_choice, int *attempts_left,
+                              void *void_more_args) {
+  auto *more_args = static_cast<DawgArgs *>(void_more_args);
+  word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
+  int word_index = word->length() - 1;
+  if (best_choice->rating() < *limit) {
+    return;
+  }
+  // Look up char in DAWG
+
+  // If the current unichar is an ngram first try calling
+  // letter_is_okay() for each unigram it contains separately.
+  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
+  bool checked_unigrams = false;
+  if (getUnicharset().get_isngram(orig_uch_id)) {
+    if (dawg_debug_level) {
+      tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
+    }
+    int num_unigrams = 0;
+    word->remove_last_unichar_id();
+    std::vector<UNICHAR_ID> encoding;
+    const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
+    // Since the string came out of the unicharset, failure is impossible.
+    ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
+    bool unigrams_ok = true;
+    // Construct DawgArgs that reflect the current state.
+    DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
+    DawgPositionVector unigram_updated_dawgs;
+    DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
+    // Check unigrams in the ngram with letter_is_okay().
+    for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
+      UNICHAR_ID uch_id = encoding[i];
+      ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
+      ++num_unigrams;
+      word->append_unichar_id(uch_id, 1, 0.0, 0.0);
+      unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
+                                             word->unichar_id(word_index + num_unigrams - 1),
+                                             word_ending && i == encoding.size() - 1);
+      (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
+      if (dawg_debug_level) {
+        tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
+                unigrams_ok ? "OK" : "not OK");
+      }
+    }
+    // Restore the word and copy the updated dawg state if needed.
+    while (num_unigrams-- > 0) {
+      word->remove_last_unichar_id();
+    }
+    word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
+    if (unigrams_ok) {
+      checked_unigrams = true;
+      more_args->permuter = unigram_dawg_args.permuter;
+      *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
+    }
+  }
+
+  // Check which dawgs from the dawgs_ vector contain the word
+  // up to and including the current unichar.
+  if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
+                                                   word->unichar_id(word_index), word_ending)) {
+    // Add a new word choice
+    if (word_ending) {
+      if (dawg_debug_level) {
+        tprintf("found word = %s\n", word->debug_string().c_str());
+      }
+      if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
+        if (output_ambig_words_file_ == nullptr) {
+          output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
+          if (output_ambig_words_file_ == nullptr) {
+            tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
+            exit(1);
+          }
+          std::string word_str;
+          word->string_and_lengths(&word_str, nullptr);
+          word_str += " ";
+          fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+        }
+        std::string word_str;
+        word->string_and_lengths(&word_str, nullptr);
+        word_str += " ";
+        fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+      }
+      WERD_CHOICE *adjusted_word = word;
+      adjusted_word->set_permuter(more_args->permuter);
+      update_best_choice(*adjusted_word, best_choice);
+    } else { // search the next letter
+      // Make updated_* point to the next entries in the DawgPositionVector
+      // arrays (that were originally created in dawg_permute_and_select)
+      ++(more_args->updated_dawgs);
+      // Make active_dawgs and constraints point to the updated ones.
+      ++(more_args->active_dawgs);
+      permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
+                      certainties, limit, best_choice, attempts_left, more_args);
+      // Restore previous state to explore another letter in this position.
+      --(more_args->updated_dawgs);
+      --(more_args->active_dawgs);
+    }
+  } else {
+    if (dawg_debug_level) {
+      tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
+    }
+  }
+}
+
+/**
+ * dawg_permute_and_select
+ *
+ * Recursively explore all the possible character combinations in
+ * the given char_choices. Use go_deeper_dawg_fxn() to search all the
+ * dawgs in the dawgs_ vector in parallel and discard invalid words.
+ *
+ * Allocate and return a WERD_CHOICE with the best valid word found.
+ */
+WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
+                                           float rating_limit) {
+  auto *best_choice = new WERD_CHOICE(&getUnicharset());
+  best_choice->make_bad();
+  best_choice->set_rating(rating_limit);
+  if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
+    return best_choice;
+  }
+  auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
+  init_active_dawgs(&(active_dawgs[0]), true);
+  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
+  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
+
+  float certainties[MAX_WERD_LENGTH];
+  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
+  int attempts_left = max_permuter_attempts;
+  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
+                  &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
+  delete[] active_dawgs;
+  return best_choice;
+}
+
+/**
+ * permute_choices
+ *
+ * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
+ * with the given char_choice_index in char_choices.
+ */
+void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+                           int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+                           WERD_CHOICE *word, float certainties[], float *limit,
+                           WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
+  if (debug) {
+    tprintf(
+        "%s permute_choices: char_choice_index=%d"
+        " limit=%g rating=%g, certainty=%g word=%s\n",
+        debug, char_choice_index, *limit, word->rating(), word->certainty(),
+        word->debug_string().c_str());
+  }
+  if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
+    BLOB_CHOICE_IT blob_choice_it;
+    blob_choice_it.set_to_list(char_choices.at(char_choice_index));
+    for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
+      (*attempts_left)--;
+      append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
+                     prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
+                     more_args);
+      if (*attempts_left <= 0) {
+        if (debug) {
+          tprintf("permute_choices(): attempts_left is 0\n");
+        }
+        break;
+      }
+    }
+  }
+}
+
+/**
+ * append_choices
+ *
+ * Checks to see whether or not the next choice is worth appending to
+ * the word being generated. If so then keeps going deeper into the word.
+ *
+ * This function assumes that Dict::go_deeper_fxn_ is set.
+ */
+void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+                          const BLOB_CHOICE &blob_choice, int char_choice_index,
+                          const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
+                          float certainties[], float *limit, WERD_CHOICE *best_choice,
+                          int *attempts_left, void *more_args) {
+  auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
+
+  // Deal with fragments.
+  CHAR_FRAGMENT_INFO char_frag_info;
+  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
+                           prev_char_frag_info, debug, word_ending, &char_frag_info)) {
+    return; // blob_choice must be an invalid fragment
+  }
+  // Search the next letter if this character is a fragment.
+  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
+    permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
+                    limit, best_choice, attempts_left, more_args);
+    return;
+  }
+
+  // Add the next unichar.
+  float old_rating = word->rating();
+  float old_certainty = word->certainty();
+  uint8_t old_permuter = word->permuter();
+  certainties[word->length()] = char_frag_info.certainty;
+  word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
+                                          char_frag_info.rating, char_frag_info.certainty);
+
+  // Explore the next unichar.
+  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
+                          word, certainties, limit, best_choice, attempts_left, more_args);
+
+  // Remove the unichar we added to explore other choices in it's place.
+  word->remove_last_unichar_id();
+  word->set_rating(old_rating);
+  word->set_certainty(old_certainty);
+  word->set_permuter(old_permuter);
+}
+
+/**
+ * @name fragment_state
+ *
+ * Given the current char choice and information about previously seen
+ * fragments, determines whether adjacent character fragments are
+ * present and whether they can be concatenated.
+ *
+ * The given prev_char_frag_info contains:
+ * - fragment: if not nullptr contains information about immediately
+ *   preceding fragmented character choice
+ * - num_fragments: number of fragments that have been used so far
+ *   to construct a character
+ * - certainty: certainty of the current choice or minimum
+ *   certainty of all fragments concatenated so far
+ * - rating: rating of the current choice or sum of fragment
+ *   ratings concatenated so far
+ *
+ * The output char_frag_info is filled in as follows:
+ * - character: is set to be nullptr if the choice is a non-matching
+ *   or non-ending fragment piece; is set to unichar of the given choice
+ *   if it represents a regular character or a matching ending fragment
+ * - fragment,num_fragments,certainty,rating are set as described above
+ *
+ * @returns false if a non-matching fragment is discovered, true otherwise.
+ */
+bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
+                               const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
+                               int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {
+  const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
+  const CHAR_FRAGMENT *prev_fragment =
+      prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
+
+  // Print debug info for fragments.
+  if (debug && (prev_fragment || this_fragment)) {
+    tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
+            getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
+    if (prev_fragment) {
+      tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
+    }
+    if (this_fragment) {
+      tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
+    }
+  }
+
+  char_frag_info->unichar_id = curr_unichar_id;
+  char_frag_info->fragment = this_fragment;
+  char_frag_info->rating = curr_rating;
+  char_frag_info->certainty = curr_certainty;
+  char_frag_info->num_fragments = 1;
+  if (prev_fragment && !this_fragment) {
+    if (debug) {
+      tprintf("Skip choice with incomplete fragment\n");
+    }
+    return false;
+  }
+  if (this_fragment) {
+    // We are dealing with a fragment.
+    char_frag_info->unichar_id = INVALID_UNICHAR_ID;
+    if (prev_fragment) {
+      if (!this_fragment->is_continuation_of(prev_fragment)) {
+        if (debug) {
+          tprintf("Non-matching fragment piece\n");
+        }
+        return false;
+      }
+      if (this_fragment->is_ending()) {
+        char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
+        char_frag_info->fragment = nullptr;
+        if (debug) {
+          tprintf("Built character %s from fragments\n",
+                  getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
+        }
+      } else {
+        if (debug) {
+          tprintf("Record fragment continuation\n");
+        }
+        char_frag_info->fragment = this_fragment;
+      }
+      // Update certainty and rating.
+      char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
+      char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
+      char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
+    } else {
+      if (this_fragment->is_beginning()) {
+        if (debug) {
+          tprintf("Record fragment beginning\n");
+        }
+      } else {
+        if (debug) {
+          tprintf("Non-starting fragment piece with no prev_fragment\n");
+        }
+        return false;
+      }
+    }
+  }
+  if (word_ending && char_frag_info->fragment) {
+    if (debug) {
+      tprintf("Word cannot end with a fragment\n");
+    }
+    return false;
+  }
+  return true;
+}
+
+} // namespace tesseract