diff mupdf-source/thirdparty/tesseract/src/ccstruct/blamer.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/blamer.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,350 @@
+///////////////////////////////////////////////////////////////////////
+// File:        blamer.h
+// Description: Module allowing precise error causes to be allocated.
+// Author:      Rike Antonova
+// Refactored:  Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
+#define TESSERACT_CCSTRUCT_BLAMER_H_
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+#include "boxword.h" // for BoxWord
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra...
+#endif                                 //  ndef DISABLED_LEGACY_ENGINE
+#include "ratngs.h"                    // for BLOB_CHOICE_LIST (ptr only)
+#include "rect.h"                      // for TBOX
+#include "tprintf.h"                   // for tprintf
+
+#include <tesseract/unichar.h> // for UNICHAR_ID
+
+#include <cstdint> // for int16_t
+#include <cstring> // for memcpy
+#include <vector>  // for std::vector
+
+namespace tesseract {
+
+// Forward declarations of types that the declarations below refer to by
+// pointer or by reference.
+class DENORM;
+class MATRIX;
+class UNICHARSET;
+class WERD_RES;
+
+// NOTE(review): BlamerBundle::MatrixPositionCorrect() reads the col/row
+// members of MATRIX_COORD inline, so the complete type presumably arrives
+// through one of the includes above - confirm.
+struct MATRIX_COORD;
+struct TWERD;
+
+class LMPainPoints;
+
+// NOTE(review): not referenced anywhere in this header; presumably the
+// bounding box tolerance used by the blamer implementation - confirm against
+// the users of this constant.
+static const int16_t kBlamerBoxTolerance = 5;
+
+// Enum for expressing the source of error.
+// The enumerators are unscoped and numbered consecutively from zero, so
+// IRR_NUM_REASONS equals the number of real reasons listed before it.
+// Note: Please update kIncorrectResultReasonNames when modifying this enum.
+enum IncorrectResultReason {
+  // The text recorded in best choice == truth text
+  IRR_CORRECT,
+  // Either: Top choice is incorrect and is a dictionary word (language model
+  // is unlikely to help correct such errors, so blame the classifier).
+  // Or: the correct unichar was not included in the shortlist produced by the
+  // classifier at all.
+  IRR_CLASSIFIER,
+  // Chopper has not found one or more splits that correspond to the correct
+  // character bounding boxes recorded in BlamerBundle::truth_word.
+  IRR_CHOPPER,
+  // Classifier did include correct unichars for each blob in the correct
+  // segmentation, however its rating could have been too bad to allow the
+  // language model to pull out the correct choice. On the other hand the
+  // strength of the language model might have been too weak to favor the
+  // correct answer, thus we call this case a classifier-language model
+  // tradeoff error.
+  IRR_CLASS_LM_TRADEOFF,
+  // Page layout failed to produce the correct bounding box. Blame page layout
+  // if the truth was not found for the word, which implies that the bounding
+  // box of the word was incorrect (no truth word had a similar bounding box).
+  IRR_PAGE_LAYOUT,
+  // SegSearch heuristic prevented one or more blobs from the correct
+  // segmentation state from being classified (e.g. the blob was too wide).
+  IRR_SEGSEARCH_HEUR,
+  // The correct segmentation state was not explored because of poor SegSearch
+  // pain point prioritization. We blame SegSearch pain point prioritization
+  // if the best rating of a choice constructed from correct segmentation is
+  // better than that of the best choice (i.e. if we got to explore the correct
+  // segmentation state, language model would have picked the correct choice).
+  IRR_SEGSEARCH_PP,
+  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
+  // and thus use the old language model (permuters).
+  // TODO(antonova): integrate the new language model with chopper
+  IRR_CLASS_OLD_LM_TRADEOFF,
+  // If there is an incorrect adaptive template match with a better score than
+  // a correct one (either pre-trained or adapted), mark this as adaption error.
+  IRR_ADAPTION,
+  // split_and_recog_word() failed to find a suitable split in truth.
+  IRR_NO_TRUTH_SPLIT,
+  // Truth is not available for this word (e.g. when words in corrected content
+  // file are turned into ~~~~ because an appropriate alignment was not found).
+  IRR_NO_TRUTH,
+  // The text recorded in best choice != truth text, but none of the above
+  // reasons are set.
+  IRR_UNKNOWN,
+
+  // Number of reasons above; not a reason itself. Must stay last.
+  IRR_NUM_REASONS
+};
+
+// Blamer-related information to determine the source of errors.
+//
+// A bundle stores the ground truth of one word (text and bounding boxes)
+// together with the results of the blame analysis: the reason the result is
+// considered incorrect, debug text, the correct segmentation path and an
+// optional serialized search lattice. The lattice is kept in a heap array
+// owned by the bundle and released by the destructor.
+struct BlamerBundle {
+  // Returns the name of the given reason (defined out of line).
+  static const char *IncorrectReasonName(IncorrectResultReason irr);
+  // Creates a bundle without character boxes, with reason IRR_CORRECT and
+  // without a lattice. ClearResults() resets all remaining result fields.
+  BlamerBundle()
+      : truth_has_char_boxes_(false)
+      , incorrect_result_reason_(IRR_CORRECT)
+      , lattice_data_(nullptr) {
+    ClearResults();
+  }
+  // Copies the truth and the results of other. debug_, misadaption_debug_ and
+  // the params training bundle are not copied; they stay default-constructed.
+  // The lattice members are initialized up front so that CopyResults() never
+  // frees an indeterminate pointer and never leaves the size unset.
+  BlamerBundle(const BlamerBundle &other)
+      : lattice_size_(0)
+      , lattice_data_(nullptr) {
+    this->CopyTruth(other);
+    this->CopyResults(other);
+  }
+  // Releases the owned lattice buffer (delete[] on nullptr is a no-op).
+  ~BlamerBundle() {
+    delete[] lattice_data_;
+  }
+
+  // Accessors.
+  // Returns the concatenation of all truth_text_ entries in order.
+  std::string TruthString() const {
+    std::string truth_str;
+    for (const auto &text : truth_text_) {
+      truth_str += text;
+    }
+    return truth_str;
+  }
+  IncorrectResultReason incorrect_result_reason() const {
+    return incorrect_result_reason_;
+  }
+  // Returns true if the reason is IRR_NO_TRUTH or IRR_PAGE_LAYOUT.
+  bool NoTruth() const {
+    return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT;
+  }
+  // Returns true if either debug string is non-empty.
+  bool HasDebugInfo() const {
+    return debug_.length() > 0 || misadaption_debug_.length() > 0;
+  }
+  const std::string &debug() const {
+    return debug_;
+  }
+  const std::string &misadaption_debug() const {
+    return misadaption_debug_;
+  }
+  // Records rating as the best correctly segmented rating if it is lower
+  // than the value stored so far.
+  void UpdateBestRating(float rating) {
+    if (rating < best_correctly_segmented_rating_) {
+      best_correctly_segmented_rating_ = rating;
+    }
+  }
+  // Returns the number of entries in the correct segmentation path.
+  int correct_segmentation_length() const {
+    return static_cast<int>(correct_segmentation_cols_.size());
+  }
+  // Returns true if the given ratings matrix col,row position is included
+  // in the correct segmentation path at the given index.
+  // index is not range checked; it must be a valid index into the path, i.e.
+  // in [0, correct_segmentation_length()).
+  bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) const {
+    return correct_segmentation_cols_[index] == coord.col &&
+           correct_segmentation_rows_[index] == coord.row;
+  }
+  void set_best_choice_is_dict_and_top_choice(bool value) {
+    best_choice_is_dict_and_top_choice_ = value;
+  }
+  // Returns the owned lattice buffer, or nullptr if none is stored.
+  const char *lattice_data() const {
+    return lattice_data_;
+  }
+  int lattice_size() const {
+    return lattice_size_; // size of lattice_data in bytes
+  }
+  // Stores a private copy of the size bytes starting at data, replacing any
+  // previously stored lattice. A null data pointer or a negative size stores
+  // no lattice (nullptr, size 0).
+  void set_lattice_data(const char *data, int size) {
+    char *new_data = nullptr;
+    int new_size = 0;
+    if (data != nullptr && size >= 0) {
+      new_data = new char[size];
+      memcpy(new_data, data, size);
+      new_size = size;
+    }
+    // Free the old buffer only after the copy has been made: this keeps the
+    // bundle valid if new throws and is safe when data points into the
+    // buffer that is being replaced.
+    delete[] lattice_data_;
+    lattice_data_ = new_data;
+    lattice_size_ = new_size;
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  const tesseract::ParamsTrainingBundle &params_training_bundle() const {
+    return params_training_bundle_;
+  }
+  // Adds a new ParamsTrainingHypothesis to the current hypothesis list.
+  void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo) {
+    params_training_bundle_.AddHypothesis(hypo);
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+  // Functions to setup the blamer.
+  // Whole word string, whole word bounding box.
+  void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box);
+  // Single "character" string, "character" bounding box.
+  // May be called multiple times to indicate the characters in a word.
+  void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box);
+  // Marks that there is something wrong with the truth text, like it contains
+  // reject characters.
+  void SetRejectedTruth();
+
+  // Returns true if the provided word_choice is correct.
+  bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const;
+
+  // Resets all result fields and frees the lattice. The truth fields and
+  // misadaption_debug_ are left alone, and a reason for which NoTruth() is
+  // true is preserved instead of being reset to IRR_CORRECT.
+  void ClearResults() {
+    norm_truth_word_.DeleteAllBoxes();
+    norm_box_tolerance_ = 0;
+    if (!NoTruth()) {
+      incorrect_result_reason_ = IRR_CORRECT;
+    }
+    debug_ = "";
+    segsearch_is_looking_for_blame_ = false;
+    best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
+    correct_segmentation_cols_.clear();
+    correct_segmentation_rows_.clear();
+    best_choice_is_dict_and_top_choice_ = false;
+    delete[] lattice_data_;
+    lattice_data_ = nullptr;
+    lattice_size_ = 0;
+  }
+  // Copies the truth fields of other. The reason is taken over only if
+  // other.NoTruth() is true; otherwise it becomes IRR_CORRECT.
+  void CopyTruth(const BlamerBundle &other) {
+    truth_has_char_boxes_ = other.truth_has_char_boxes_;
+    truth_word_ = other.truth_word_;
+    truth_text_ = other.truth_text_;
+    incorrect_result_reason_ = (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
+  }
+  // Copies the result fields of other, including a deep copy of its lattice.
+  // The debug strings are not copied. Any lattice previously held by *this
+  // is released.
+  void CopyResults(const BlamerBundle &other) {
+    norm_truth_word_ = other.norm_truth_word_;
+    norm_box_tolerance_ = other.norm_box_tolerance_;
+    incorrect_result_reason_ = other.incorrect_result_reason_;
+    segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
+    best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
+    correct_segmentation_cols_ = other.correct_segmentation_cols_;
+    correct_segmentation_rows_ = other.correct_segmentation_rows_;
+    best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_;
+    char *new_data = nullptr;
+    int new_size = 0;
+    if (other.lattice_data_ != nullptr) {
+      new_data = new char[other.lattice_size_];
+      memcpy(new_data, other.lattice_data_, other.lattice_size_);
+      new_size = other.lattice_size_;
+    }
+    // Release the old buffer after the new one exists, so that neither a
+    // throwing new nor a self-copy can leave a dangling pointer, and always
+    // set the size so it never stays indeterminate or stale.
+    delete[] lattice_data_;
+    lattice_data_ = new_data;
+    lattice_size_ = new_size;
+  }
+  // Returns the reason as a string (defined out of line).
+  const char *IncorrectReason() const;
+
+  // Appends choice and truth details to the given debug string.
+  void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug);
+
+  // Sets up the norm_truth_word from truth_word using the given DENORM.
+  void SetupNormTruthWord(const DENORM &denorm);
+
+  // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
+  // bundles) where the right edge of the left-hand word is word1_right,
+  // and the left edge of the right-hand word is word2_left.
+  void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
+                   BlamerBundle *bundle2) const;
+  // "Joins" the blames from bundle1 and bundle2 into *this.
+  void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug);
+
+  // If a blob with the same bounding box as one of the truth character
+  // bounding boxes is not classified as the corresponding truth character
+  // blames character classifier for incorrect answer.
+  void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
+                       const BLOB_CHOICE_LIST &choices, bool debug);
+
+  // Checks whether chops were made at all the character bounding box
+  // boundaries in word->truth_word. If not - blames the chopper for an
+  // incorrect answer.
+  void SetChopperBlame(const WERD_RES *word, bool debug);
+  // Blames the classifier or the language model if, after running only the
+  // chopper, best_choice is incorrect and no blame has been yet set.
+  // Blames the classifier if best_choice is classifier's top choice and is a
+  // dictionary word (i.e. language model could not have helped).
+  // Otherwise, blames the language model (formerly permuter word adjustment).
+  void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
+                                  bool valid_permuter, bool debug);
+  // Sets up the correct_segmentation_* to mark the correct bounding boxes.
+  void SetupCorrectSegmentation(const TWERD *word, bool debug);
+
+  // Returns true if a guided segmentation search is needed.
+  bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
+  // Setup ready to guide the segmentation search to the correct segmentation.
+  void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id,
+                        bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points,
+                        double max_char_wh_ratio, WERD_RES *word_res);
+  // Returns true if the guided segsearch is in progress.
+  bool GuidedSegsearchStillGoing() const;
+  // The segmentation search has ended. Sets the blame appropriately.
+  void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str);
+
+  // If the bundle is null or still does not indicate the correct result,
+  // fix it and use some backup reason for the blame.
+  static void LastChanceBlame(bool debug, WERD_RES *word);
+
+  // Sets the misadaption debug if this word is incorrect, as this word is
+  // being adapted to.
+  void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
+
+private:
+  // Copy assignment is not supported (currently unused, therefore deleted).
+  BlamerBundle &operator=(const BlamerBundle &other) = delete;
+  // Records irr as the reason and rebuilds debug_ as
+  // "<reason> to blame: " followed by the details that FillDebugString()
+  // appends for msg and choice. Prints the result if debug is true.
+  void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice,
+                bool debug) {
+    incorrect_result_reason_ = irr;
+    debug_ = IncorrectReason();
+    debug_ += " to blame: ";
+    FillDebugString(msg, choice, debug_);
+    if (debug) {
+      tprintf("SetBlame(): %s", debug_.c_str());
+    }
+  }
+
+private:
+  // Note: constructor initializer lists must follow the declaration order
+  // of the members below.
+  // Set to true when bounding boxes for individual unichars are recorded.
+  bool truth_has_char_boxes_;
+  // Variables used by the segmentation search when looking for the blame.
+  // Set to true while segmentation search is continued after the usual
+  // termination condition in order to look for the blame.
+  bool segsearch_is_looking_for_blame_;
+  // Set to true if best choice is a dictionary word and
+  // classifier's top choice.
+  bool best_choice_is_dict_and_top_choice_;
+  // Tolerance for bounding box comparisons in normalized space.
+  int norm_box_tolerance_;
+  // The true_word (in the original image coordinate space) contains ground
+  // truth bounding boxes for this WERD_RES.
+  tesseract::BoxWord truth_word_;
+  // Same as above, but in normalized coordinates
+  // (filled in by WERD_RES::SetupForRecognition()).
+  tesseract::BoxWord norm_truth_word_;
+  // Contains ground truth unichar for each of the bounding boxes in truth_word.
+  std::vector<std::string> truth_text_;
+  // The reason for incorrect OCR result.
+  IncorrectResultReason incorrect_result_reason_;
+  // Debug text associated with the blame.
+  std::string debug_;
+  // Misadaption debug information (filled in if this word was misadapted to).
+  std::string misadaption_debug_;
+  // Vectors populated by SegSearch to indicate column and row indices that
+  // correspond to blobs with correct bounding boxes.
+  std::vector<int> correct_segmentation_cols_;
+  std::vector<int> correct_segmentation_rows_;
+  // Best rating for correctly segmented path
+  // (set and used by SegSearch when looking for blame).
+  float best_correctly_segmented_rating_;
+  int lattice_size_; // size of lattice_data in bytes
+  // Serialized segmentation search lattice (owned, allocated with new[]).
+  char *lattice_data_;
+  // Information about hypotheses (paths) explored by the segmentation search.
+#ifndef DISABLED_LEGACY_ENGINE
+  tesseract::ParamsTrainingBundle params_training_bundle_;
+#endif // ndef DISABLED_LEGACY_ENGINE
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCSTRUCT_BLAMER_H_