diff mupdf-source/thirdparty/tesseract/src/ccmain/tfacepp.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccmain/tfacepp.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,299 @@
+/**********************************************************************
+ * File:        tfacepp.cpp  (Formerly tface++.c)
+ * Description: C++ side of the C/C++ Tess/Editor interface.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cmath>
+
+#include "blamer.h"
+#include "errcode.h"
+#include "ratngs.h"
+#include "reject.h"
+#include "tesseractclass.h"
+#include "werd.h"
+
+#define MAX_UNDIVIDED_LENGTH 24
+
+/**********************************************************************
+ * recog_word
+ *
+ * Convert the word to tess form and pass it to the tess segmenter.
+ * Convert the output back to editor form.
+ **********************************************************************/
+namespace tesseract {
+// Recognizes one word and post-processes the result.
+//
+// Steps, in order:
+//  1. If wordrec_skip_no_truth_words is set and the word has no blamer bundle,
+//     or its bundle reports IRR_NO_TRUTH, flag the word as failed and return.
+//  2. Run recog_word_recursive() and rebuild the box word.
+//  3. Assert that best_choice and box_word have the same length and that the
+//     segmentation states are valid for the ratings matrix.
+//  4. If tessedit_override_permuter is set and the current permuter is not a
+//     dictionary (DAWG) permuter, adopt the permuter reported by dict_word()
+//     when that one is a dictionary permuter and alpha_count() is positive.
+//  5. Set word->tess_failed; on failure also reinitialise the reject map.
+//
+// word: the word to recognize. Its chopped_word must hold at least one blob
+//       (asserted below).
+void Tesseract::recog_word(WERD_RES *word) {
+  if (wordrec_skip_no_truth_words &&
+      (word->blamer_bundle == nullptr ||
+       word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
+    if (classify_debug_level) {
+      tprintf("No truth for word - skipping\n");
+    }
+    word->tess_failed = true;
+    return;
+  }
+  ASSERT_HOST(!word->chopped_word->blobs.empty());
+  recog_word_recursive(word);
+  word->SetupBoxWord();
+  ASSERT_HOST(static_cast<unsigned>(word->best_choice->length()) == word->box_word->length());
+  // Check that the ratings matrix size matches the sum of all the
+  // segmentation states.
+  if (!word->StatesAllValid()) {
+    // The trailing newline keeps this message on its own line instead of
+    // running into the DebugWordChoices() output that follows.
+    tprintf("Not all words have valid states relative to ratings matrix!!\n");
+    word->DebugWordChoices(true, nullptr);
+    ASSERT_HOST(word->StatesAllValid());
+  }
+  if (tessedit_override_permuter) {
+    /* Override the permuter type if a straight dictionary check disagrees. */
+    // True for the three dictionary (DAWG) permuter codes.
+    const auto is_dawg_permuter = [](uint8_t perm) {
+      return perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == USER_DAWG_PERM;
+    };
+    const uint8_t perm_type = word->best_choice->permuter();
+    if (!is_dawg_permuter(perm_type)) {
+      const uint8_t real_dict_perm_type = dict_word(*word->best_choice);
+      if (is_dawg_permuter(real_dict_perm_type) &&
+          (alpha_count(word->best_choice->unichar_string().c_str(),
+                       word->best_choice->unichar_lengths().c_str()) > 0)) {
+        word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
+      }
+    }
+    if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {
+      tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter());
+    }
+  }
+  // Factored out from control.cpp
+  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
+  // NOTE(review): best_choice has already been dereferenced above, so the
+  // nullptr test below looks redundant at this point; kept as-is.
+  // The word counts as failed when the choice is null, empty, or when the
+  // leading run of spaces in its string is as long as the choice itself.
+  if (word->best_choice == nullptr || word->best_choice->empty() ||
+      strspn(word->best_choice->unichar_string().c_str(), " ") ==
+          word->best_choice->length()) {
+    word->tess_failed = true;
+    word->reject_map.initialise(word->box_word->length());
+    word->reject_map.rej_word_tess_failure();
+  } else {
+    word->tess_failed = false;
+  }
+}
+
+/**********************************************************************
+ * recog_word_recursive
+ *
+ * Recognize a word. A word with more than MAX_UNDIVIDED_LENGTH blobs in
+ * its chopped_word is handed to split_and_recog_word(); anything else
+ * goes through cc_recog(). Afterwards best_choice is reconciled with the
+ * blob count of rebuild_word: an over-long choice is marked bad and
+ * reported, and a short one is padded with space unichars.
+ **********************************************************************/
+void Tesseract::recog_word_recursive(WERD_RES *word) {
+  // Blob count of the input; reused below for the output blob count.
+  auto blob_count = word->chopped_word->NumBlobs();
+  if (blob_count > MAX_UNDIVIDED_LENGTH) {
+    split_and_recog_word(word);
+    return;
+  }
+
+  cc_recog(word);
+  blob_count = word->rebuild_word->NumBlobs();
+  auto *choice = word->best_choice;
+
+  // More characters than blobs should never happen: mark the choice bad
+  // and report where the word is.
+  if (choice->length() > blob_count) {
+    choice->make_bad();
+    tprintf(
+        "recog_word: Discarded long string \"%s\""
+        " (%d characters vs %d blobs)\n",
+        choice->unichar_string().c_str(), choice->length(), blob_count);
+    tprintf("Word is at:");
+    word->word->bounding_box().print();
+  }
+
+  // Fewer characters than blobs: append spaces until the counts agree.
+  if (choice->length() < blob_count) {
+    const UNICHAR_ID filler_id = unicharset.unichar_to_id(" ");
+    do {
+      choice->append_unichar_id(filler_id, 1, 0.0, choice->certainty());
+    } while (choice->length() < blob_count);
+  }
+}
+
+/**********************************************************************
+ * split_and_recog_word
+ *
+ * Cut the word in two at the widest horizontal gap between adjacent
+ * blobs of the chopped_word, recognize each half separately, and merge
+ * the two results back into word.
+ **********************************************************************/
+void Tesseract::split_and_recog_word(WERD_RES *word) {
+  // Look at every pair of neighbouring blobs and remember the index of the
+  // right-hand blob of the pair with the largest gap. The comparison is
+  // strict, so the earliest pair wins a tie.
+  int widest_gap = -INT32_MAX;
+  int cut_index = 0;
+  const auto num_blobs = word->chopped_word->NumBlobs();
+  for (unsigned right = 1; right < num_blobs; ++right) {
+    TBOX left_box = word->chopped_word->blobs[right - 1]->bounding_box();
+    TBOX right_box = word->chopped_word->blobs[right]->bounding_box();
+    const int gap = right_box.left() - left_box.right();
+    if (gap <= widest_gap) {
+      continue;
+    }
+    widest_gap = gap;
+    cut_index = right;
+  }
+  // The scan must have selected a pair.
+  ASSERT_HOST(cut_index > 0);
+
+  // Split in place: word keeps the left part, right_half receives the rest.
+  WERD_RES *right_half = nullptr;
+  BlamerBundle *saved_bundle = nullptr;
+  split_word(word, cut_index, &right_half, &saved_bundle);
+
+  // Recognize left part, then right part.
+  recog_word_recursive(word);
+  recog_word_recursive(right_half);
+
+  // Stitch the two results (and the saved blamer bundle) back together.
+  join_words(word, right_half, saved_bundle);
+}
+
+/**********************************************************************
+ * split_word
+ *
+ * Split a given WERD_RES in place into two smaller words for recognition.
+ * split_pt is the index of the first blob to go in the second word.
+ * The underlying word is left alone, only the TWERD (and subsequent data)
+ * are split up.  orig_blamer_bundle is set to the original blamer bundle,
+ * and will now be owned by the caller.  New blamer bundles are forged for the
+ * two pieces.
+ *
+ * word:               in/out; keeps blobs [0, split_pt) on return.
+ * split_pt:           must satisfy 0 < split_pt < number of chopped blobs.
+ * right_piece:        out; newly allocated WERD_RES holding the remaining
+ *                     blobs.
+ * orig_blamer_bundle: out; copy of word's blamer bundle taken before the
+ *                     split, or nullptr if word had none.
+ **********************************************************************/
+void Tesseract::split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece,
+                           BlamerBundle **orig_blamer_bundle) const {
+  // Both pieces must end up with at least one blob.
+  ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
+
+  // Save a copy of the blamer bundle so we can try to reconstruct it below.
+  BlamerBundle *orig_bb = word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
+
+  // The right piece starts out as a copy of the whole word.
+  auto *word2 = new WERD_RES(*word);
+
+  // blow away the copied chopped_word, as we want to work with
+  // the blobs from the input chopped_word so seam_arrays can be merged.
+  TWERD *chopped = word->chopped_word;
+  auto *chopped2 = new TWERD;
+  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
+  // Hand the blob entries from split_pt onwards to the new TWERD...
+  for (auto i = split_pt; i < chopped->NumBlobs(); ++i) {
+    chopped2->blobs.push_back(chopped->blobs[i]);
+  }
+  // ...and truncate the original so it keeps only the first split_pt entries.
+  chopped->blobs.resize(split_pt);
+  // Detach the TWERD from word before ClearResults() is called below.
+  // NOTE(review): presumably so ClearResults() does not free it - confirm.
+  word->chopped_word = nullptr;
+  // Discard the chopped_word that came with the WERD_RES copy.
+  delete word2->chopped_word;
+  word2->chopped_word = nullptr;
+
+  // Take the unicharset reference before the results are cleared.
+  // NOTE(review): presumably ClearResults() resets uch_set - confirm.
+  const UNICHARSET &unicharset = *word->uch_set;
+  word->ClearResults();
+  word2->ClearResults();
+  // Install the two halves and rebuild the basic data from them.
+  word->chopped_word = chopped;
+  word2->chopped_word = chopped2;
+  word->SetupBasicsFromChoppedWord(unicharset);
+  word2->SetupBasicsFromChoppedWord(unicharset);
+
+  // Try to adjust the blamer bundle.
+  if (orig_bb != nullptr) {
+    // TODO(rays) Looks like a leak to me.
+    // orig_bb should take, rather than copy.
+    word->blamer_bundle = new BlamerBundle();
+    word2->blamer_bundle = new BlamerBundle();
+    // The split is described by the right edge of the last left-hand blob
+    // and the left edge of the first right-hand blob.
+    orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
+                         word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer,
+                         word->blamer_bundle, word2->blamer_bundle);
+  }
+
+  *right_piece = word2;
+  *orig_blamer_bundle = orig_bb;
+}
+
+/**********************************************************************
+ * join_words
+ *
+ * The opposite of split_word():
+ *  join word2 (including any recognized data / seam array / etc)
+ *  onto the right of word and then delete word2.
+ *  Also, if orig_bb is provided, stitch it back into word.
+ *
+ * word:    in/out; left piece, receives the combined result.
+ * word2:   right piece; its contents are moved into word and the object
+ *          is deleted before returning.
+ * orig_bb: blamer bundle saved by split_word(), or nullptr. When non-null
+ *          it replaces word->blamer_bundle.
+ **********************************************************************/
+void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const {
+  // Boxes on either side of the join, taken before the blob lists are merged.
+  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
+  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
+  // Tack the word2 outputs onto the end of the word outputs.
+  word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
+  word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
+  // Empty word2's blob lists now that word holds the entries.
+  // NOTE(review): presumably so deleting word2 does not free them - confirm.
+  word2->chopped_word->blobs.clear();
+  word2->rebuild_word->blobs.clear();
+  // Location for the joining seam: horizontally midway between the two
+  // boxes, vertically the mean of their four top/bottom edges.
+  TPOINT split_pt;
+  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
+  split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4;
+  // Move the word2 seams onto the end of the word1 seam_array.
+  // Since the seam list is one element short, an empty seam marking the
+  // end of the last blob in the first word is needed first.
+  word->seam_array.push_back(new SEAM(0.0f, split_pt));
+  word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
+  word2->seam_array.clear();
+  // Fix widths and gaps.
+  word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
+  word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
+  // Fix the ratings matrix.
+  int rat1 = word->ratings->dimension();
+  int rat2 = word2->ratings->dimension();
+  word->ratings->AttachOnCorner(word2->ratings);
+  // The combined matrix must have the sum of the two dimensions.
+  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
+  word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
+  // Append the word choices.
+  *word->raw_choice += *word2->raw_choice;
+
+  // How many alt choices from each should we try to get?
+  const int kAltsPerPiece = 2;
+  // When do we start throwing away extra alt choices?
+  const int kTooManyAltChoices = 100;
+
+  // Construct the cartesian product of the best_choices of word(1) and word2.
+  WERD_CHOICE_LIST joined_choices;
+  WERD_CHOICE_IT jc_it(&joined_choices);
+  WERD_CHOICE_IT bc1_it(&word->best_choices);
+  WERD_CHOICE_IT bc2_it(&word2->best_choices);
+  // Snapshot of the word1 choice count; also the starting total, since each
+  // existing word1 choice is extended in place further down.
+  int num_word1_choices = word->best_choices.length();
+  int total_joined_choices = num_word1_choices;
+  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
+  // word2 choices, and put them in the joined_choices list. The 1st word2
+  // choice gets added to the original word1 choices in-place after we have
+  // finished with them.
+  int bc2_index = 1;
+  // Step past the first word2 choice and stop once the iterator is back at it.
+  // NOTE(review): relies on forward() wrapping around the list - confirm.
+  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
+    // Once the total reaches the cap, word2 choices beyond index
+    // kAltsPerPiece are no longer combined.
+    if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) {
+      break;
+    }
+    int bc1_index = 0;
+    for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) {
+      // Same cap, applied to the word1 side.
+      if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) {
+        break;
+      }
+      // New choice = copy of the word1 choice with the word2 choice appended.
+      auto *wc = new WERD_CHOICE(*bc1_it.data());
+      *wc += *bc2_it.data();
+      jc_it.add_after_then_move(wc);
+      ++total_joined_choices;
+    }
+  }
+  // Now that we've filled in as many alternates as we want, paste the best
+  // choice for word2 onto the original word alt_choices.
+  bc1_it.move_to_first();
+  bc2_it.move_to_first();
+  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
+    *bc1_it.data() += *bc2_it.data();
+  }
+  // Append the newly built combinations after the last original choice.
+  bc1_it.move_to_last();
+  bc1_it.add_list_after(&joined_choices);
+
+  // Restore the pointer to original blamer bundle and combine blamer
+  // information recorded in the splits.
+  if (orig_bb != nullptr) {
+    orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);
+    // The left piece's temporary bundle is freed here.
+    // NOTE(review): word2's bundle is presumably freed by `delete word2`
+    // below - confirm.
+    delete word->blamer_bundle;
+    word->blamer_bundle = orig_bb;
+  }
+  // Rebuild the box word for the joined result and size the reject map to it.
+  word->SetupBoxWord();
+  word->reject_map.initialise(word->box_word->length());
+  delete word2;
+}
+
+} // namespace tesseract