Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/classify/classify.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: classify.cpp | |
| 3 // Description: classify class. | |
| 4 // Author: Samuel Charron | |
| 5 // | |
| 6 // (C) Copyright 2006, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #include "classify.h" | |
| 20 | |
| 21 #ifdef DISABLED_LEGACY_ENGINE | |
| 22 | |
| 23 # include <string.h> | |
| 24 | |
| 25 namespace tesseract { | |
| 26 | |
| 27 Classify::Classify() | |
| 28 : INT_MEMBER(classify_debug_level, 0, "Classify debug level", this->params()) | |
| 29 , | |
| 30 | |
| 31 BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].", this->params()) | |
| 32 , | |
| 33 | |
| 34 double_MEMBER(classify_max_rating_ratio, 1.5, "Veto ratio between classifier ratings", | |
| 35 this->params()) | |
| 36 , | |
| 37 | |
| 38 double_MEMBER(classify_max_certainty_margin, 5.5, | |
| 39 "Veto difference between classifier certainties", this->params()) | |
| 40 , | |
| 41 | |
| 42 dict_(this) {} | |
| 43 | |
| 44 Classify::~Classify() {} | |
| 45 | |
| 46 } // namespace tesseract | |
| 47 | |
| 48 #else // DISABLED_LEGACY_ENGINE not defined | |
| 49 | |
| 50 # include <cstring> | |
| 51 # include "fontinfo.h" | |
| 52 # include "intproto.h" | |
| 53 # include "mfoutline.h" | |
| 54 # include "scrollview.h" | |
| 55 # include "shapeclassifier.h" | |
| 56 # include "shapetable.h" | |
| 57 # include "unicity_table.h" | |
| 58 | |
| 59 namespace tesseract { | |
| 60 Classify::Classify() | |
| 61 : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping", this->params()) | |
| 62 , BOOL_MEMBER(prioritize_division, false, "Prioritize blob division over chopping", | |
| 63 this->params()) | |
| 64 , BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier", this->params()) | |
| 65 , INT_MEMBER(classify_debug_level, 0, "Classify debug level", this->params()) | |
| 66 , INT_MEMBER(classify_norm_method, character, "Normalization Method ...", this->params()) | |
| 67 , double_MEMBER(classify_char_norm_range, 0.2, "Character Normalization Range ...", | |
| 68 this->params()) | |
| 69 , double_MEMBER(classify_max_rating_ratio, 1.5, "Veto ratio between classifier ratings", | |
| 70 this->params()) | |
| 71 , double_MEMBER(classify_max_certainty_margin, 5.5, | |
| 72 "Veto difference between classifier certainties", this->params()) | |
| 73 , BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching", this->params()) | |
| 74 , BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching", this->params()) | |
| 75 , BOOL_MEMBER(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier", this->params()) | |
| 76 , BOOL_MEMBER(classify_use_pre_adapted_templates, 0, "Use pre-adapted classifier templates", | |
| 77 this->params()) | |
| 78 , BOOL_MEMBER(classify_save_adapted_templates, 0, "Save adapted templates to a file", | |
| 79 this->params()) | |
| 80 , BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger", this->params()) | |
| 81 , BOOL_MEMBER(classify_nonlinear_norm, 0, "Non-linear stroke-density normalization", | |
| 82 this->params()) | |
| 83 , INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()) | |
| 84 , INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()) | |
| 85 , INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ", this->params()) | |
| 86 , double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)", this->params()) | |
| 87 , double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)", this->params()) | |
| 88 , double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)", this->params()) | |
| 89 , double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)", this->params()) | |
| 90 , double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)", this->params()) | |
| 91 , double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length", this->params()) | |
| 92 , INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes", this->params()) | |
| 93 , INT_MEMBER(matcher_min_examples_for_prototyping, 3, "Reliable Config Threshold", | |
| 94 this->params()) | |
| 95 , INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5, | |
| 96 "Enable adaption even if the ambiguities have not been seen", this->params()) | |
| 97 , double_MEMBER(matcher_clustering_max_angle_delta, 0.015, | |
| 98 "Maximum angle delta for prototype clustering", this->params()) | |
| 99 , double_MEMBER(classify_misfit_junk_penalty, 0.0, | |
| 100 "Penalty to apply when a non-alnum is vertically out of " | |
| 101 "its expected textline position", | |
| 102 this->params()) | |
| 103 , double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()) | |
| 104 , double_MEMBER(tessedit_class_miss_scale, 0.00390625, "Scale factor for features not used", | |
| 105 this->params()) | |
| 106 , double_MEMBER(classify_adapted_pruning_factor, 2.5, | |
| 107 "Prune poor adapted results this much worse than best result", this->params()) | |
| 108 , double_MEMBER(classify_adapted_pruning_threshold, -1.0, | |
| 109 "Threshold at which classify_adapted_pruning_factor starts", this->params()) | |
| 110 , INT_MEMBER(classify_adapt_proto_threshold, 230, | |
| 111 "Threshold for good protos during adaptive 0-255", this->params()) | |
| 112 , INT_MEMBER(classify_adapt_feature_threshold, 230, | |
| 113 "Threshold for good features during adaptive 0-255", this->params()) | |
| 114 , BOOL_MEMBER(disable_character_fragments, true, | |
| 115 "Do not include character fragments in the" | |
| 116 " results of the classifier", | |
| 117 this->params()) | |
| 118 , double_MEMBER(classify_character_fragments_garbage_certainty_threshold, -3.0, | |
| 119 "Exclude fragments that do not look like whole" | |
| 120 " characters from training and adaption", | |
| 121 this->params()) | |
| 122 , BOOL_MEMBER(classify_debug_character_fragments, false, | |
| 123 "Bring up graphical debugging windows for fragments training", this->params()) | |
| 124 , BOOL_MEMBER(matcher_debug_separate_windows, false, | |
| 125 "Use two different windows for debugging the matching: " | |
| 126 "One for the protos and one for the features.", | |
| 127 this->params()) | |
| 128 , STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning", this->params()) | |
| 129 , INT_MEMBER(classify_class_pruner_threshold, 229, "Class Pruner Threshold 0-255", | |
| 130 this->params()) | |
| 131 , INT_MEMBER(classify_class_pruner_multiplier, 15, | |
| 132 "Class Pruner Multiplier 0-255: ", this->params()) | |
| 133 , INT_MEMBER(classify_cp_cutoff_strength, 7, | |
| 134 "Class Pruner CutoffStrength: ", this->params()) | |
| 135 , INT_MEMBER(classify_integer_matcher_multiplier, 10, | |
| 136 "Integer Matcher Multiplier 0-255: ", this->params()) | |
| 137 , BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].", | |
| 138 this->params()) | |
| 139 , double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size", this->params()) | |
| 140 , double_MEMBER(speckle_rating_penalty, 10.0, "Penalty to add to worst rating for noise", | |
| 141 this->params()) | |
| 142 , im_(&classify_debug_level) | |
| 143 , dict_(this) { | |
| 144 using namespace std::placeholders; // for _1, _2 | |
| 145 fontinfo_table_.set_clear_callback(std::bind(FontInfoDeleteCallback, _1)); | |
| 146 | |
| 147 InitFeatureDefs(&feature_defs_); | |
| 148 } | |
| 149 | |
| 150 Classify::~Classify() { | |
| 151 EndAdaptiveClassifier(); | |
| 152 #ifndef GRAPHICS_DISABLED | |
| 153 delete learn_debug_win_; | |
| 154 delete learn_fragmented_word_debug_win_; | |
| 155 delete learn_fragments_debug_win_; | |
| 156 #endif | |
| 157 } | |
| 158 | |
| 159 // Takes ownership of the given classifier, and uses it for future calls | |
| 160 // to CharNormClassifier. | |
| 161 void Classify::SetStaticClassifier(ShapeClassifier *static_classifier) { | |
| 162 delete static_classifier_; | |
| 163 static_classifier_ = static_classifier; | |
| 164 } | |
| 165 | |
| 166 // Moved from speckle.cpp | |
| 167 // Adds a noise classification result that is a bit worse than the worst | |
| 168 // current result, or the worst possible result if no current results. | |
| 169 void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) { | |
| 170 BLOB_CHOICE_IT bc_it(choices); | |
| 171 // If there is no classifier result, we will use the worst possible certainty | |
| 172 // and corresponding rating. | |
| 173 float certainty = -getDict().certainty_scale; | |
| 174 float rating = rating_scale * blob_length; | |
| 175 if (!choices->empty() && blob_length > 0) { | |
| 176 bc_it.move_to_last(); | |
| 177 BLOB_CHOICE *worst_choice = bc_it.data(); | |
| 178 // Add speckle_rating_penalty to worst rating, matching old value. | |
| 179 rating = worst_choice->rating() + speckle_rating_penalty; | |
| 180 // Compute the rating to correspond to the certainty. (Used to be kept | |
| 181 // the same, but that messes up the language model search.) | |
| 182 certainty = -rating * getDict().certainty_scale / (rating_scale * blob_length); | |
| 183 } | |
| 184 auto *blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty, -1, 0.0f, FLT_MAX, 0, | |
| 185 BCC_SPECKLE_CLASSIFIER); | |
| 186 bc_it.add_to_end(blob_choice); | |
| 187 } | |
| 188 | |
| 189 // Returns true if the blob is small enough to be a large speckle. | |
| 190 bool Classify::LargeSpeckle(const TBLOB &blob) { | |
| 191 double speckle_size = kBlnXHeight * speckle_large_max_size; | |
| 192 TBOX bbox = blob.bounding_box(); | |
| 193 return bbox.width() < speckle_size && bbox.height() < speckle_size; | |
| 194 } | |
| 195 | |
| 196 } // namespace tesseract | |
| 197 | |
| 198 #endif // def DISABLED_LEGACY_ENGINE |
