Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/blamer.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: blamer.h | |
| 3 // Description: Module allowing precise error causes to be allocated. | |
| 4 // Author: Rike Antonova | |
| 5 // Refactored: Ray Smith | |
| 6 // | |
| 7 // (C) Copyright 2013, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 /////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 #ifndef TESSERACT_CCSTRUCT_BLAMER_H_ | |
| 21 #define TESSERACT_CCSTRUCT_BLAMER_H_ | |
| 22 | |
| 23 #ifdef HAVE_CONFIG_H | |
| 24 # include "config_auto.h" // DISABLED_LEGACY_ENGINE | |
| 25 #endif | |
| 26 #include "boxword.h" // for BoxWord | |
| 27 #ifndef DISABLED_LEGACY_ENGINE | |
| 28 # include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra... | |
| 29 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 30 #include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only) | |
| 31 #include "rect.h" // for TBOX | |
| 32 #include "tprintf.h" // for tprintf | |
| 33 | |
| 34 #include <tesseract/unichar.h> // for UNICHAR_ID | |
| 35 | |
| 36 #include <cstdint> // for int16_t | |
| 37 #include <cstring> // for memcpy | |
| 38 #include <vector> // for std::vector | |
| 39 | |
| 40 namespace tesseract { | |
| 41 | |
| 42 class DENORM; | |
| 43 class MATRIX; | |
| 44 class UNICHARSET; | |
| 45 class WERD_RES; | |
| 46 | |
| 47 struct MATRIX_COORD; | |
| 48 struct TWERD; | |
| 49 | |
| 50 class LMPainPoints; | |
| 51 | |
| 52 static const int16_t kBlamerBoxTolerance = 5; | |
| 53 | |
| 54 // Enum for expressing the source of error. | |
| 55 // Note: Please update kIncorrectResultReasonNames when modifying this enum. | |
| 56 enum IncorrectResultReason { | |
| 57 // The text recorded in best choice == truth text | |
| 58 IRR_CORRECT, | |
| 59 // Either: Top choice is incorrect and is a dictionary word (language model | |
| 60 // is unlikely to help correct such errors, so blame the classifier). | |
| 61 // Or: the correct unichar was not included in shortlist produced by the | |
| 62 // classifier at all. | |
| 63 IRR_CLASSIFIER, | |
| 64 // Chopper has not found one or more splits that correspond to the correct | |
| 65 // character bounding boxes recorded in BlamerBundle::truth_word. | |
| 66 IRR_CHOPPER, | |
| 67 // Classifier did include correct unichars for each blob in the correct | |
| 68 // segmentation, however its rating could have been too bad to allow the | |
| 69 // language model to pull out the correct choice. On the other hand the | |
| 70 // strength of the language model might have been too weak to favor the | |
| 71 // correct answer; thus we call this case a classifier-language model | |
| 72 // tradeoff error. | |
| 73 IRR_CLASS_LM_TRADEOFF, | |
| 74 // Page layout failed to produce the correct bounding box. Blame page layout | |
| 75 // if the truth was not found for the word, which implies that the bounding | |
| 76 // box of the word was incorrect (no truth word had a similar bounding box). | |
| 77 IRR_PAGE_LAYOUT, | |
| 78 // SegSearch heuristic prevented one or more blobs from the correct | |
| 79 // segmentation state from being classified (e.g. the blob was too wide). | |
| 80 IRR_SEGSEARCH_HEUR, | |
| 81 // The correct segmentation state was not explored because of poor SegSearch | |
| 82 // pain point prioritization. We blame SegSearch pain point prioritization | |
| 83 // if the best rating of a choice constructed from correct segmentation is | |
| 84 // better than that of the best choice (i.e. if we got to explore the correct | |
| 85 // segmentation state, language model would have picked the correct choice). | |
| 86 IRR_SEGSEARCH_PP, | |
| 87 // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word, | |
| 88 // and thus use the old language model (permuters). | |
| 89 // TODO(antonova): integrate the new language model with chopper | |
| 90 IRR_CLASS_OLD_LM_TRADEOFF, | |
| 91 // If there is an incorrect adaptive template match with a better score than | |
| 92 // a correct one (either pre-trained or adapted), mark this as adaption error. | |
| 93 IRR_ADAPTION, | |
| 94 // split_and_recog_word() failed to find a suitable split in truth. | |
| 95 IRR_NO_TRUTH_SPLIT, | |
| 96 // Truth is not available for this word (e.g. when words in corrected content | |
| 97 // file are turned into ~~~~ because an appropriate alignment was not found). | |
| 98 IRR_NO_TRUTH, | |
| 99 // The text recorded in best choice != truth text, but none of the above | |
| 100 // reasons are set. | |
| 101 IRR_UNKNOWN, | |
| 102 | |
| 103 IRR_NUM_REASONS | |
| 104 }; | |
| 105 | |
| 106 // Blamer-related information to determine the source of errors. | |
| 107 struct BlamerBundle { | |
| 108 static const char *IncorrectReasonName(IncorrectResultReason irr); | |
| 109 BlamerBundle() | |
| 110 : truth_has_char_boxes_(false) | |
| 111 , incorrect_result_reason_(IRR_CORRECT) | |
| 112 , lattice_data_(nullptr) { | |
| 113 ClearResults(); | |
| 114 } | |
| 115 BlamerBundle(const BlamerBundle &other) { | |
| 116 this->CopyTruth(other); | |
| 117 this->CopyResults(other); | |
| 118 } | |
| 119 ~BlamerBundle() { | |
| 120 delete[] lattice_data_; | |
| 121 } | |
| 122 | |
| 123 // Accessors. | |
| 124 std::string TruthString() const { | |
| 125 std::string truth_str; | |
| 126 for (auto &text : truth_text_) { | |
| 127 truth_str += text; | |
| 128 } | |
| 129 return truth_str; | |
| 130 } | |
| 131 IncorrectResultReason incorrect_result_reason() const { | |
| 132 return incorrect_result_reason_; | |
| 133 } | |
| 134 bool NoTruth() const { | |
| 135 return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT; | |
| 136 } | |
| 137 bool HasDebugInfo() const { | |
| 138 return debug_.length() > 0 || misadaption_debug_.length() > 0; | |
| 139 } | |
| 140 const std::string &debug() const { | |
| 141 return debug_; | |
| 142 } | |
| 143 const std::string &misadaption_debug() const { | |
| 144 return misadaption_debug_; | |
| 145 } | |
| 146 void UpdateBestRating(float rating) { | |
| 147 if (rating < best_correctly_segmented_rating_) { | |
| 148 best_correctly_segmented_rating_ = rating; | |
| 149 } | |
| 150 } | |
| 151 int correct_segmentation_length() const { | |
| 152 return correct_segmentation_cols_.size(); | |
| 153 } | |
| 154 // Returns true if the given ratings matrix col,row position is included | |
| 155 // in the correct segmentation path at the given index. | |
| 156 bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) { | |
| 157 return correct_segmentation_cols_[index] == coord.col && | |
| 158 correct_segmentation_rows_[index] == coord.row; | |
| 159 } | |
| 160 void set_best_choice_is_dict_and_top_choice(bool value) { | |
| 161 best_choice_is_dict_and_top_choice_ = value; | |
| 162 } | |
| 163 const char *lattice_data() const { | |
| 164 return lattice_data_; | |
| 165 } | |
| 166 int lattice_size() const { | |
| 167 return lattice_size_; // size of lattice_data in bytes | |
| 168 } | |
| 169 void set_lattice_data(const char *data, int size) { | |
| 170 lattice_size_ = size; | |
| 171 delete[] lattice_data_; | |
| 172 lattice_data_ = new char[lattice_size_]; | |
| 173 memcpy(lattice_data_, data, lattice_size_); | |
| 174 } | |
| 175 #ifndef DISABLED_LEGACY_ENGINE | |
| 176 const tesseract::ParamsTrainingBundle &params_training_bundle() const { | |
| 177 return params_training_bundle_; | |
| 178 } | |
| 179 // Adds a new ParamsTrainingHypothesis to the current hypothesis list. | |
| 180 void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo) { | |
| 181 params_training_bundle_.AddHypothesis(hypo); | |
| 182 } | |
| 183 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 184 | |
| 185 // Functions to setup the blamer. | |
| 186 // Whole word string, whole word bounding box. | |
| 187 void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box); | |
| 188 // Single "character" string, "character" bounding box. | |
| 189 // May be called multiple times to indicate the characters in a word. | |
| 190 void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box); | |
| 191 // Marks that there is something wrong with the truth text, like it contains | |
| 192 // reject characters. | |
| 193 void SetRejectedTruth(); | |
| 194 | |
| 195 // Returns true if the provided word_choice is correct. | |
| 196 bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const; | |
| 197 | |
| 198 void ClearResults() { | |
| 199 norm_truth_word_.DeleteAllBoxes(); | |
| 200 norm_box_tolerance_ = 0; | |
| 201 if (!NoTruth()) { | |
| 202 incorrect_result_reason_ = IRR_CORRECT; | |
| 203 } | |
| 204 debug_ = ""; | |
| 205 segsearch_is_looking_for_blame_ = false; | |
| 206 best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating; | |
| 207 correct_segmentation_cols_.clear(); | |
| 208 correct_segmentation_rows_.clear(); | |
| 209 best_choice_is_dict_and_top_choice_ = false; | |
| 210 delete[] lattice_data_; | |
| 211 lattice_data_ = nullptr; | |
| 212 lattice_size_ = 0; | |
| 213 } | |
| 214 void CopyTruth(const BlamerBundle &other) { | |
| 215 truth_has_char_boxes_ = other.truth_has_char_boxes_; | |
| 216 truth_word_ = other.truth_word_; | |
| 217 truth_text_ = other.truth_text_; | |
| 218 incorrect_result_reason_ = (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT); | |
| 219 } | |
| 220 void CopyResults(const BlamerBundle &other) { | |
| 221 norm_truth_word_ = other.norm_truth_word_; | |
| 222 norm_box_tolerance_ = other.norm_box_tolerance_; | |
| 223 incorrect_result_reason_ = other.incorrect_result_reason_; | |
| 224 segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_; | |
| 225 best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_; | |
| 226 correct_segmentation_cols_ = other.correct_segmentation_cols_; | |
| 227 correct_segmentation_rows_ = other.correct_segmentation_rows_; | |
| 228 best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_; | |
| 229 if (other.lattice_data_ != nullptr) { | |
| 230 lattice_data_ = new char[other.lattice_size_]; | |
| 231 memcpy(lattice_data_, other.lattice_data_, other.lattice_size_); | |
| 232 lattice_size_ = other.lattice_size_; | |
| 233 } else { | |
| 234 lattice_data_ = nullptr; | |
| 235 } | |
| 236 } | |
| 237 const char *IncorrectReason() const; | |
| 238 | |
| 239 // Appends choice and truth details to the given debug string. | |
| 240 void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug); | |
| 241 | |
| 242 // Sets up the norm_truth_word from truth_word using the given DENORM. | |
| 243 void SetupNormTruthWord(const DENORM &denorm); | |
| 244 | |
| 245 // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty | |
| 246 // bundles) where the right edge of the left-hand word is word1_right, | |
| 247 // and the left edge of the right-hand word is word2_left. | |
| 248 void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, | |
| 249 BlamerBundle *bundle2) const; | |
| 250 // "Joins" the blames from bundle1 and bundle2 into *this. | |
| 251 void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug); | |
| 252 | |
| 253 // If a blob with the same bounding box as one of the truth character | |
| 254 // bounding boxes is not classified as the corresponding truth character, | |
| 255 // blames the character classifier for the incorrect answer. | |
| 256 void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, | |
| 257 const BLOB_CHOICE_LIST &choices, bool debug); | |
| 258 | |
| 259 // Checks whether chops were made at all the character bounding box | |
| 260 // boundaries in word->truth_word. If not - blames the chopper for an | |
| 261 // incorrect answer. | |
| 262 void SetChopperBlame(const WERD_RES *word, bool debug); | |
| 263 // Blames the classifier or the language model if, after running only the | |
| 264 // chopper, best_choice is incorrect and no blame has been yet set. | |
| 265 // Blames the classifier if best_choice is classifier's top choice and is a | |
| 266 // dictionary word (i.e. language model could not have helped). | |
| 267 // Otherwise, blames the language model (formerly permuter word adjustment). | |
| 268 void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, | |
| 269 bool valid_permuter, bool debug); | |
| 270 // Sets up the correct_segmentation_* to mark the correct bounding boxes. | |
| 271 void SetupCorrectSegmentation(const TWERD *word, bool debug); | |
| 272 | |
| 273 // Returns true if a guided segmentation search is needed. | |
| 274 bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const; | |
| 275 // Setup ready to guide the segmentation search to the correct segmentation. | |
| 276 void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, | |
| 277 bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points, | |
| 278 double max_char_wh_ratio, WERD_RES *word_res); | |
| 279 // Returns true if the guided segsearch is in progress. | |
| 280 bool GuidedSegsearchStillGoing() const; | |
| 281 // The segmentation search has ended. Sets the blame appropriately. | |
| 282 void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str); | |
| 283 | |
| 284 // If the bundle is null or still does not indicate the correct result, | |
| 285 // fix it and use some backup reason for the blame. | |
| 286 static void LastChanceBlame(bool debug, WERD_RES *word); | |
| 287 | |
| 288 // Sets the misadaption debug if this word is incorrect, as this word is | |
| 289 // being adapted to. | |
| 290 void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug); | |
| 291 | |
| 292 private: | |
| 293 // Copy assignment operator (currently unused, therefore deleted). | |
| 294 BlamerBundle &operator=(const BlamerBundle &other) = delete; | |
| 295 void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice, | |
| 296 bool debug) { | |
| 297 incorrect_result_reason_ = irr; | |
| 298 debug_ = IncorrectReason(); | |
| 299 debug_ += " to blame: "; | |
| 300 FillDebugString(msg, choice, debug_); | |
| 301 if (debug) { | |
| 302 tprintf("SetBlame(): %s", debug_.c_str()); | |
| 303 } | |
| 304 } | |
| 305 | |
| 306 private: | |
| 307 // Set to true when bounding boxes for individual unichars are recorded. | |
| 308 bool truth_has_char_boxes_; | |
| 309 // Variables used by the segmentation search when looking for the blame. | |
| 310 // Set to true while segmentation search is continued after the usual | |
| 311 // termination condition in order to look for the blame. | |
| 312 bool segsearch_is_looking_for_blame_; | |
| 313 // Set to true if best choice is a dictionary word and | |
| 314 // classifier's top choice. | |
| 315 bool best_choice_is_dict_and_top_choice_; | |
| 316 // Tolerance for bounding box comparisons in normalized space. | |
| 317 int norm_box_tolerance_; | |
| 318 // The truth_word (in the original image coordinate space) contains ground | |
| 319 // truth bounding boxes for this WERD_RES. | |
| 320 tesseract::BoxWord truth_word_; | |
| 321 // Same as above, but in normalized coordinates | |
| 322 // (filled in by WERD_RES::SetupForRecognition()). | |
| 323 tesseract::BoxWord norm_truth_word_; | |
| 324 // Contains ground truth unichar for each of the bounding boxes in truth_word. | |
| 325 std::vector<std::string> truth_text_; | |
| 326 // The reason for incorrect OCR result. | |
| 327 IncorrectResultReason incorrect_result_reason_; | |
| 328 // Debug text associated with the blame. | |
| 329 std::string debug_; | |
| 330 // Misadaption debug information (filled in if this word was misadapted to). | |
| 331 std::string misadaption_debug_; | |
| 332 // Vectors populated by SegSearch to indicate column and row indices that | |
| 333 // correspond to blobs with correct bounding boxes. | |
| 334 std::vector<int> correct_segmentation_cols_; | |
| 335 std::vector<int> correct_segmentation_rows_; | |
| 336 // Best rating for correctly segmented path | |
| 337 // (set and used by SegSearch when looking for blame). | |
| 338 float best_correctly_segmented_rating_; | |
| 339 int lattice_size_; // size of lattice_data in bytes | |
| 340 // Serialized segmentation search lattice. | |
| 341 char *lattice_data_; | |
| 342 // Information about hypotheses (paths) explored by the segmentation search. | |
| 343 #ifndef DISABLED_LEGACY_ENGINE | |
| 344 tesseract::ParamsTrainingBundle params_training_bundle_; | |
| 345 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 346 }; | |
| 347 | |
| 348 } // namespace tesseract | |
| 349 | |
| 350 #endif // TESSERACT_CCSTRUCT_BLAMER_H_ |
