Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/classify/intmatcher.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 ** Filename: intmatcher.cpp | |
| 3 ** Purpose: Generic high level classification routines. | |
| 4 ** Author: Robert Moss | |
| 5 ** (c) Copyright Hewlett-Packard Company, 1988. | |
| 6 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 ** you may not use this file except in compliance with the License. | |
| 8 ** You may obtain a copy of the License at | |
| 9 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 ** Unless required by applicable law or agreed to in writing, software | |
| 11 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 ** See the License for the specific language governing permissions and | |
| 14 ** limitations under the License. | |
| 15 ******************************************************************************/ | |
| 16 | |
| 17 // Include automatically generated configuration file if running autoconf. | |
| 18 #ifdef HAVE_CONFIG_H | |
| 19 # include "config_auto.h" | |
| 20 #endif | |
| 21 | |
| 22 #include "intmatcher.h" | |
| 23 | |
| 24 #include "classify.h" | |
| 25 #include "float2int.h" | |
| 26 #include "fontinfo.h" | |
| 27 #include "intproto.h" | |
| 28 #include "scrollview.h" | |
| 29 #include "shapetable.h" | |
| 30 | |
| 31 #include "helpers.h" | |
| 32 | |
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
| 35 | |
| 36 namespace tesseract { | |
| 37 | |
| 38 /*---------------------------------------------------------------------------- | |
| 39 Global Data Definitions and Declarations | |
| 40 ----------------------------------------------------------------------------*/ | |
| 41 // Parameters of the sigmoid used to convert similarity to evidence in the | |
| 42 // similarity_evidence_table_ that is used to convert distance metric to an | |
| 43 // 8 bit evidence value in the secondary matcher. (See IntMatcher::Init). | |
| 44 const float IntegerMatcher::kSEExponentialMultiplier = 0.0f; | |
| 45 const float IntegerMatcher::kSimilarityCenter = 0.0075f; | |
| 46 | |
| 47 static const uint8_t offset_table[] = { | |
| 48 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, | |
| 49 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, | |
| 50 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, | |
| 51 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, | |
| 52 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, | |
| 53 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, | |
| 54 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, | |
| 55 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, | |
| 56 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; | |
| 57 | |
| 58 static const uint8_t next_table[] = { | |
| 59 0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e, | |
| 60 0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e, | |
| 61 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e, | |
| 62 0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e, | |
| 63 0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e, | |
| 64 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e, | |
| 65 0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e, | |
| 66 0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e, | |
| 67 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e, | |
| 68 0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e, | |
| 69 0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae, | |
| 70 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe, | |
| 71 0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce, | |
| 72 0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde, | |
| 73 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee, | |
| 74 0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe}; | |
| 75 | |
| 76 // See http://b/19318793 (#6) for a complete discussion. | |
| 77 | |
| 78 /** | |
| 79 * Sort Key array in ascending order using heap sort | |
| 80 * algorithm. Also sort Index array that is tied to | |
| 81 * the key array. | |
| 82 * @param n Number of elements to sort | |
| 83 * @param ra Key array [1..n] | |
| 84 * @param rb Index array [1..n] | |
| 85 */ | |
| 86 static void HeapSort(int n, int ra[], int rb[]) { | |
| 87 int i, rra, rrb; | |
| 88 int l, j, ir; | |
| 89 | |
| 90 l = (n >> 1) + 1; | |
| 91 ir = n; | |
| 92 for (;;) { | |
| 93 if (l > 1) { | |
| 94 rra = ra[--l]; | |
| 95 rrb = rb[l]; | |
| 96 } else { | |
| 97 rra = ra[ir]; | |
| 98 rrb = rb[ir]; | |
| 99 ra[ir] = ra[1]; | |
| 100 rb[ir] = rb[1]; | |
| 101 if (--ir == 1) { | |
| 102 ra[1] = rra; | |
| 103 rb[1] = rrb; | |
| 104 return; | |
| 105 } | |
| 106 } | |
| 107 i = l; | |
| 108 j = l << 1; | |
| 109 while (j <= ir) { | |
| 110 if (j < ir && ra[j] < ra[j + 1]) { | |
| 111 ++j; | |
| 112 } | |
| 113 if (rra < ra[j]) { | |
| 114 ra[i] = ra[j]; | |
| 115 rb[i] = rb[j]; | |
| 116 j += (i = j); | |
| 117 } else { | |
| 118 j = ir + 1; | |
| 119 } | |
| 120 } | |
| 121 ra[i] = rra; | |
| 122 rb[i] = rrb; | |
| 123 } | |
| 124 } | |
| 125 | |
| 126 // Encapsulation of the intermediate data and computations made by the class | |
| 127 // pruner. The class pruner implements a simple linear classifier on binary | |
| 128 // features by heavily quantizing the feature space, and applying | |
| 129 // NUM_BITS_PER_CLASS (2)-bit weights to the features. Lack of resolution in | |
| 130 // weights is compensated by a non-constant bias that is dependent on the | |
| 131 // number of features present. | |
| 132 class ClassPruner { | |
| 133 public: | |
| 134 ClassPruner(int max_classes) { | |
| 135 // The unrolled loop in ComputeScores means that the array sizes need to | |
| 136 // be rounded up so that the array is big enough to accommodate the extra | |
| 137 // entries accessed by the unrolling. Each pruner word is of sized | |
| 138 // BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are | |
| 139 // BITS_PER_WERD / NUM_BITS_PER_CLASS entries. | |
| 140 // See ComputeScores. | |
| 141 max_classes_ = max_classes; | |
| 142 rounded_classes_ = | |
| 143 RoundUp(max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS); | |
| 144 class_count_ = new int[rounded_classes_]; | |
| 145 norm_count_ = new int[rounded_classes_]; | |
| 146 sort_key_ = new int[rounded_classes_ + 1]; | |
| 147 sort_index_ = new int[rounded_classes_ + 1]; | |
| 148 for (int i = 0; i < rounded_classes_; i++) { | |
| 149 class_count_[i] = 0; | |
| 150 } | |
| 151 pruning_threshold_ = 0; | |
| 152 num_features_ = 0; | |
| 153 num_classes_ = 0; | |
| 154 } | |
| 155 | |
| 156 ~ClassPruner() { | |
| 157 delete[] class_count_; | |
| 158 delete[] norm_count_; | |
| 159 delete[] sort_key_; | |
| 160 delete[] sort_index_; | |
| 161 } | |
| 162 | |
| 163 /// Computes the scores for every class in the character set, by summing the | |
| 164 /// weights for each feature and stores the sums internally in class_count_. | |
| 165 void ComputeScores(const INT_TEMPLATES_STRUCT *int_templates, int num_features, | |
| 166 const INT_FEATURE_STRUCT *features) { | |
| 167 num_features_ = num_features; | |
| 168 auto num_pruners = int_templates->NumClassPruners; | |
| 169 for (int f = 0; f < num_features; ++f) { | |
| 170 const INT_FEATURE_STRUCT *feature = &features[f]; | |
| 171 // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS. | |
| 172 int x = feature->X * NUM_CP_BUCKETS >> 8; | |
| 173 int y = feature->Y * NUM_CP_BUCKETS >> 8; | |
| 174 int theta = feature->Theta * NUM_CP_BUCKETS >> 8; | |
| 175 int class_id = 0; | |
| 176 // Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so | |
| 177 // we need a collection of them, indexed by pruner_set. | |
| 178 for (unsigned pruner_set = 0; pruner_set < num_pruners; ++pruner_set) { | |
| 179 // Look up quantized feature in a 3-D array, an array of weights for | |
| 180 // each class. | |
| 181 const uint32_t *pruner_word_ptr = int_templates->ClassPruners[pruner_set]->p[x][y][theta]; | |
| 182 for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) { | |
| 183 uint32_t pruner_word = *pruner_word_ptr++; | |
| 184 // This inner loop is unrolled to speed up the ClassPruner. | |
| 185 // Currently gcc would not unroll it unless it is set to O3 | |
| 186 // level of optimization or -funroll-loops is specified. | |
| 187 /* | |
| 188 uint32_t class_mask = (1 << NUM_BITS_PER_CLASS) - 1; | |
| 189 for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) { | |
| 190 class_count_[class_id++] += pruner_word & class_mask; | |
| 191 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 192 } | |
| 193 */ | |
| 194 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 195 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 196 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 197 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 198 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 199 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 200 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 201 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 202 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 203 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 204 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 205 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 206 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 207 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 208 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 209 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 210 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 211 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 212 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 213 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 214 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 215 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 216 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 217 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 218 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 219 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 220 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 221 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 222 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 223 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 224 class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK; | |
| 225 } | |
| 226 } | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 /// Adjusts the scores according to the number of expected features. Used | |
| 231 /// in lieu of a constant bias, this penalizes classes that expect more | |
| 232 /// features than there are present. Thus an actual c will score higher for c | |
| 233 /// than e, even though almost all the features match e as well as c, because | |
| 234 /// e expects more features to be present. | |
| 235 void AdjustForExpectedNumFeatures(const uint16_t *expected_num_features, int cutoff_strength) { | |
| 236 for (int class_id = 0; class_id < max_classes_; ++class_id) { | |
| 237 if (num_features_ < expected_num_features[class_id]) { | |
| 238 int deficit = expected_num_features[class_id] - num_features_; | |
| 239 class_count_[class_id] -= | |
| 240 class_count_[class_id] * deficit / (num_features_ * cutoff_strength + deficit); | |
| 241 } | |
| 242 } | |
| 243 } | |
| 244 | |
| 245 /// Zeros the scores for classes disabled in the unicharset. | |
| 246 /// Implements the black-list to recognize a subset of the character set. | |
| 247 void DisableDisabledClasses(const UNICHARSET &unicharset) { | |
| 248 for (int class_id = 0; class_id < max_classes_; ++class_id) { | |
| 249 if (!unicharset.get_enabled(class_id)) { | |
| 250 class_count_[class_id] = 0; // This char is disabled! | |
| 251 } | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 /** Zeros the scores of fragments. */ | |
| 256 void DisableFragments(const UNICHARSET &unicharset) { | |
| 257 for (int class_id = 0; class_id < max_classes_; ++class_id) { | |
| 258 // Do not include character fragments in the class pruner | |
| 259 // results if disable_character_fragments is true. | |
| 260 if (unicharset.get_fragment(class_id)) { | |
| 261 class_count_[class_id] = 0; | |
| 262 } | |
| 263 } | |
| 264 } | |
| 265 | |
| 266 /// Normalizes the counts for xheight, putting the normalized result in | |
| 267 /// norm_count_. Applies a simple subtractive penalty for incorrect vertical | |
| 268 /// position provided by the normalization_factors array, indexed by | |
| 269 /// character class, and scaled by the norm_multiplier. | |
| 270 void NormalizeForXheight(int norm_multiplier, const uint8_t *normalization_factors) { | |
| 271 for (int class_id = 0; class_id < max_classes_; class_id++) { | |
| 272 norm_count_[class_id] = | |
| 273 class_count_[class_id] - ((norm_multiplier * normalization_factors[class_id]) >> 8); | |
| 274 } | |
| 275 } | |
| 276 | |
| 277 /** The nop normalization copies the class_count_ array to norm_count_. */ | |
| 278 void NoNormalization() { | |
| 279 for (int class_id = 0; class_id < max_classes_; class_id++) { | |
| 280 norm_count_[class_id] = class_count_[class_id]; | |
| 281 } | |
| 282 } | |
| 283 | |
| 284 /// Prunes the classes using <the maximum count> * pruning_factor/256 as a | |
| 285 /// threshold for keeping classes. If max_of_non_fragments, then ignore | |
| 286 /// fragments in computing the maximum count. | |
| 287 void PruneAndSort(int pruning_factor, int keep_this, bool max_of_non_fragments, | |
| 288 const UNICHARSET &unicharset) { | |
| 289 int max_count = 0; | |
| 290 for (int c = 0; c < max_classes_; ++c) { | |
| 291 if (norm_count_[c] > max_count && | |
| 292 // This additional check is added in order to ensure that | |
| 293 // the classifier will return at least one non-fragmented | |
| 294 // character match. | |
| 295 // TODO(daria): verify that this helps accuracy and does not | |
| 296 // hurt performance. | |
| 297 (!max_of_non_fragments || !unicharset.get_fragment(c))) { | |
| 298 max_count = norm_count_[c]; | |
| 299 } | |
| 300 } | |
| 301 // Prune Classes. | |
| 302 pruning_threshold_ = (max_count * pruning_factor) >> 8; | |
| 303 // Select Classes. | |
| 304 if (pruning_threshold_ < 1) { | |
| 305 pruning_threshold_ = 1; | |
| 306 } | |
| 307 num_classes_ = 0; | |
| 308 for (int class_id = 0; class_id < max_classes_; class_id++) { | |
| 309 if (norm_count_[class_id] >= pruning_threshold_ || class_id == keep_this) { | |
| 310 ++num_classes_; | |
| 311 sort_index_[num_classes_] = class_id; | |
| 312 sort_key_[num_classes_] = norm_count_[class_id]; | |
| 313 } | |
| 314 } | |
| 315 | |
| 316 // Sort Classes using Heapsort Algorithm. | |
| 317 if (num_classes_ > 1) { | |
| 318 HeapSort(num_classes_, sort_key_, sort_index_); | |
| 319 } | |
| 320 } | |
| 321 | |
| 322 /** Prints debug info on the class pruner matches for the pruned classes only. | |
| 323 */ | |
| 324 void DebugMatch(const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, | |
| 325 const INT_FEATURE_STRUCT *features) const { | |
| 326 int num_pruners = int_templates->NumClassPruners; | |
| 327 int max_num_classes = int_templates->NumClasses; | |
| 328 for (int f = 0; f < num_features_; ++f) { | |
| 329 const INT_FEATURE_STRUCT *feature = &features[f]; | |
| 330 tprintf("F=%3d(%d,%d,%d),", f, feature->X, feature->Y, feature->Theta); | |
| 331 // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS. | |
| 332 int x = feature->X * NUM_CP_BUCKETS >> 8; | |
| 333 int y = feature->Y * NUM_CP_BUCKETS >> 8; | |
| 334 int theta = feature->Theta * NUM_CP_BUCKETS >> 8; | |
| 335 int class_id = 0; | |
| 336 for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) { | |
| 337 // Look up quantized feature in a 3-D array, an array of weights for | |
| 338 // each class. | |
| 339 const uint32_t *pruner_word_ptr = int_templates->ClassPruners[pruner_set]->p[x][y][theta]; | |
| 340 for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) { | |
| 341 uint32_t pruner_word = *pruner_word_ptr++; | |
| 342 for (int word_class = 0; word_class < 16 && class_id < max_num_classes; | |
| 343 ++word_class, ++class_id) { | |
| 344 if (norm_count_[class_id] >= pruning_threshold_) { | |
| 345 tprintf(" %s=%d,", classify.ClassIDToDebugStr(int_templates, class_id, 0).c_str(), | |
| 346 pruner_word & CLASS_PRUNER_CLASS_MASK); | |
| 347 } | |
| 348 pruner_word >>= NUM_BITS_PER_CLASS; | |
| 349 } | |
| 350 } | |
| 351 tprintf("\n"); | |
| 352 } | |
| 353 } | |
| 354 } | |
| 355 | |
| 356 /** Prints a summary of the pruner result. */ | |
| 357 void SummarizeResult(const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, | |
| 358 const uint16_t *expected_num_features, int norm_multiplier, | |
| 359 const uint8_t *normalization_factors) const { | |
| 360 tprintf("CP:%d classes, %d features:\n", num_classes_, num_features_); | |
| 361 for (int i = 0; i < num_classes_; ++i) { | |
| 362 int class_id = sort_index_[num_classes_ - i]; | |
| 363 std::string class_string = classify.ClassIDToDebugStr(int_templates, class_id, 0); | |
| 364 tprintf( | |
| 365 "%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\n", class_string.c_str(), | |
| 366 class_count_[class_id], expected_num_features[class_id], | |
| 367 (norm_multiplier * normalization_factors[class_id]) >> 8, sort_key_[num_classes_ - i], | |
| 368 100.0 - 100.0 * sort_key_[num_classes_ - i] / (CLASS_PRUNER_CLASS_MASK * num_features_)); | |
| 369 } | |
| 370 } | |
| 371 | |
| 372 /// Copies the pruned, sorted classes into the output results and returns | |
| 373 /// the number of classes. | |
| 374 int SetupResults(std::vector<CP_RESULT_STRUCT> *results) const { | |
| 375 results->clear(); | |
| 376 results->resize(num_classes_); | |
| 377 for (int c = 0; c < num_classes_; ++c) { | |
| 378 (*results)[c].Class = sort_index_[num_classes_ - c]; | |
| 379 (*results)[c].Rating = | |
| 380 1.0f - sort_key_[num_classes_ - c] / | |
| 381 (static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_); | |
| 382 } | |
| 383 return num_classes_; | |
| 384 } | |
| 385 | |
| 386 private: | |
| 387 /** Array[rounded_classes_] of initial counts for each class. */ | |
| 388 int *class_count_; | |
| 389 /// Array[rounded_classes_] of modified counts for each class after | |
| 390 /// normalizing for expected number of features, disabled classes, fragments, | |
| 391 /// and xheights. | |
| 392 int *norm_count_; | |
| 393 /** Array[rounded_classes_ +1] of pruned counts that gets sorted */ | |
| 394 int *sort_key_; | |
| 395 /** Array[rounded_classes_ +1] of classes corresponding to sort_key_. */ | |
| 396 int *sort_index_; | |
| 397 /** Number of classes in this class pruner. */ | |
| 398 int max_classes_; | |
| 399 /** Rounded up number of classes used for array sizes. */ | |
| 400 int rounded_classes_; | |
| 401 /** Threshold count applied to prune classes. */ | |
| 402 int pruning_threshold_; | |
| 403 /** The number of features used to compute the scores. */ | |
| 404 int num_features_; | |
| 405 /** Final number of pruned classes. */ | |
| 406 int num_classes_; | |
| 407 }; | |
| 408 | |
| 409 /*---------------------------------------------------------------------------- | |
| 410 Public Code | |
| 411 ----------------------------------------------------------------------------*/ | |
| 412 /** | |
| 413 * Runs the class pruner from int_templates on the given features, returning | |
| 414 * the number of classes output in results. | |
| 415 * @param int_templates Class pruner tables | |
| 416 * @param num_features Number of features in blob | |
| 417 * @param features Array of features | |
| 418 * @param normalization_factors Array of fudge factors from blob | |
| 419 * normalization process (by CLASS_INDEX) | |
| 420 * @param expected_num_features Array of expected number of features | |
| 421 * for each class (by CLASS_INDEX) | |
| 422 * @param results Sorted Array of pruned classes. Must be an | |
| 423 * array of size at least | |
| 424 * int_templates->NumClasses. | |
| 425 * @param keep_this | |
| 426 */ | |
| 427 int Classify::PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, | |
| 428 int keep_this, const INT_FEATURE_STRUCT *features, | |
| 429 const uint8_t *normalization_factors, | |
| 430 const uint16_t *expected_num_features, | |
| 431 std::vector<CP_RESULT_STRUCT> *results) { | |
| 432 ClassPruner pruner(int_templates->NumClasses); | |
| 433 // Compute initial match scores for all classes. | |
| 434 pruner.ComputeScores(int_templates, num_features, features); | |
| 435 // Adjust match scores for number of expected features. | |
| 436 pruner.AdjustForExpectedNumFeatures(expected_num_features, classify_cp_cutoff_strength); | |
| 437 // Apply disabled classes in unicharset - only works without a shape_table. | |
| 438 if (shape_table_ == nullptr) { | |
| 439 pruner.DisableDisabledClasses(unicharset); | |
| 440 } | |
| 441 // If fragments are disabled, remove them, also only without a shape table. | |
| 442 if (disable_character_fragments && shape_table_ == nullptr) { | |
| 443 pruner.DisableFragments(unicharset); | |
| 444 } | |
| 445 | |
| 446 // If we have good x-heights, apply the given normalization factors. | |
| 447 if (normalization_factors != nullptr) { | |
| 448 pruner.NormalizeForXheight(classify_class_pruner_multiplier, normalization_factors); | |
| 449 } else { | |
| 450 pruner.NoNormalization(); | |
| 451 } | |
| 452 // Do the actual pruning and sort the short-list. | |
| 453 pruner.PruneAndSort(classify_class_pruner_threshold, keep_this, shape_table_ == nullptr, | |
| 454 unicharset); | |
| 455 | |
| 456 if (classify_debug_level > 2) { | |
| 457 pruner.DebugMatch(*this, int_templates, features); | |
| 458 } | |
| 459 if (classify_debug_level > 1) { | |
| 460 pruner.SummarizeResult(*this, int_templates, expected_num_features, | |
| 461 classify_class_pruner_multiplier, normalization_factors); | |
| 462 } | |
| 463 // Convert to the expected output format. | |
| 464 return pruner.SetupResults(results); | |
| 465 } | |
| 466 | |
| 467 /** | |
| 468 * IntegerMatcher returns the best configuration and rating | |
| 469 * for a single class. The class matched against is determined | |
| 470 * by the uniqueness of the ClassTemplate parameter. The | |
| 471 * best rating and its associated configuration are returned. | |
| 472 * | |
| 473 * Globals: | |
| 474 * - local_matcher_multiplier_ Normalization factor multiplier | |
| 475 * param ClassTemplate Prototypes & tables for a class | |
| 476 * param NumFeatures Number of features in blob | |
| 477 * param Features Array of features | |
| 478 * param NormalizationFactor Fudge factor from blob normalization process | |
| 479 * param Result Class rating & configuration: (0.0 -> 1.0), 0=bad, 1=good | |
| 480 * param Debug Debugger flag: 1=debugger on | |
| 481 */ | |
| 482 void IntegerMatcher::Match(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, | |
| 483 int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, | |
| 484 UnicharRating *Result, int AdaptFeatureThreshold, int Debug, | |
| 485 bool SeparateDebugWindows) { | |
| 486 auto *tables = new ScratchEvidence(); | |
| 487 int Feature; | |
| 488 | |
| 489 if (MatchDebuggingOn(Debug)) { | |
| 490 tprintf("Integer Matcher -------------------------------------------\n"); | |
| 491 } | |
| 492 | |
| 493 tables->Clear(ClassTemplate); | |
| 494 Result->feature_misses = 0; | |
| 495 | |
| 496 for (Feature = 0; Feature < NumFeatures; Feature++) { | |
| 497 int csum = UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, Feature, | |
| 498 &Features[Feature], tables, Debug); | |
| 499 // Count features that were missed over all configs. | |
| 500 if (csum == 0) { | |
| 501 ++Result->feature_misses; | |
| 502 } | |
| 503 } | |
| 504 | |
| 505 #ifndef GRAPHICS_DISABLED | |
| 506 if (PrintProtoMatchesOn(Debug) || PrintMatchSummaryOn(Debug)) { | |
| 507 DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables, NumFeatures, Debug); | |
| 508 } | |
| 509 | |
| 510 if (DisplayProtoMatchesOn(Debug)) { | |
| 511 DisplayProtoDebugInfo(ClassTemplate, ConfigMask, *tables, SeparateDebugWindows); | |
| 512 } | |
| 513 | |
| 514 if (DisplayFeatureMatchesOn(Debug)) { | |
| 515 DisplayFeatureDebugInfo(ClassTemplate, ProtoMask, ConfigMask, NumFeatures, Features, | |
| 516 AdaptFeatureThreshold, Debug, SeparateDebugWindows); | |
| 517 } | |
| 518 #endif | |
| 519 | |
| 520 tables->UpdateSumOfProtoEvidences(ClassTemplate, ConfigMask); | |
| 521 tables->NormalizeSums(ClassTemplate, NumFeatures); | |
| 522 | |
| 523 FindBestMatch(ClassTemplate, *tables, Result); | |
| 524 | |
| 525 #ifndef GRAPHICS_DISABLED | |
| 526 if (PrintMatchSummaryOn(Debug)) { | |
| 527 Result->Print(); | |
| 528 } | |
| 529 | |
| 530 if (MatchDebuggingOn(Debug)) { | |
| 531 tprintf("Match Complete --------------------------------------------\n"); | |
| 532 } | |
| 533 #endif | |
| 534 | |
| 535 delete tables; | |
| 536 } | |
| 537 | |
| 538 /** | |
| 539 * FindGoodProtos finds all protos whose normalized proto-evidence | |
| 540 * exceed AdaptProtoThreshold. The list is ordered by increasing | |
| 541 * proto id number. | |
| 542 * | |
| 543 * Globals: | |
| 544 * - local_matcher_multiplier_ Normalization factor multiplier | |
| 545 * param ClassTemplate Prototypes & tables for a class | |
| 546 * param ProtoMask AND Mask for proto word | |
| 547 * param ConfigMask AND Mask for config word | |
| 548 * param NumFeatures Number of features in blob | |
| 549 * param Features Array of features | |
| 550 * param ProtoArray Array of good protos | |
| 551 * param AdaptProtoThreshold Threshold for good protos | |
| 552 * param Debug Debugger flag: 1=debugger on | |
| 553 * @return Number of good protos in ProtoArray. | |
| 554 */ | |
| 555 int IntegerMatcher::FindGoodProtos(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, | |
| 556 BIT_VECTOR ConfigMask, int16_t NumFeatures, | |
| 557 INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, | |
| 558 int AdaptProtoThreshold, int Debug) { | |
| 559 auto *tables = new ScratchEvidence(); | |
| 560 int NumGoodProtos = 0; | |
| 561 | |
| 562 /* DEBUG opening heading */ | |
| 563 if (MatchDebuggingOn(Debug)) { | |
| 564 tprintf("Find Good Protos -------------------------------------------\n"); | |
| 565 } | |
| 566 | |
| 567 tables->Clear(ClassTemplate); | |
| 568 | |
| 569 for (int Feature = 0; Feature < NumFeatures; Feature++) { | |
| 570 UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, Feature, &(Features[Feature]), | |
| 571 tables, Debug); | |
| 572 } | |
| 573 | |
| 574 #ifndef GRAPHICS_DISABLED | |
| 575 if (PrintProtoMatchesOn(Debug) || PrintMatchSummaryOn(Debug)) { | |
| 576 DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables, NumFeatures, Debug); | |
| 577 } | |
| 578 #endif | |
| 579 | |
| 580 /* Average Proto Evidences & Find Good Protos */ | |
| 581 for (int proto = 0; proto < ClassTemplate->NumProtos; proto++) { | |
| 582 /* Compute Average for Actual Proto */ | |
| 583 int Temp = 0; | |
| 584 for (uint8_t i = 0; i < MAX_PROTO_INDEX && i < ClassTemplate->ProtoLengths[proto]; i++) { | |
| 585 Temp += tables->proto_evidence_[proto][i]; | |
| 586 } | |
| 587 | |
| 588 Temp /= ClassTemplate->ProtoLengths[proto]; | |
| 589 | |
| 590 /* Find Good Protos */ | |
| 591 if (Temp >= AdaptProtoThreshold) { | |
| 592 *ProtoArray = proto; | |
| 593 ProtoArray++; | |
| 594 NumGoodProtos++; | |
| 595 } | |
| 596 } | |
| 597 | |
| 598 if (MatchDebuggingOn(Debug)) { | |
| 599 tprintf("Match Complete --------------------------------------------\n"); | |
| 600 } | |
| 601 delete tables; | |
| 602 | |
| 603 return NumGoodProtos; | |
| 604 } | |
| 605 | |
| 606 /** | |
| 607 * FindBadFeatures finds all features with maximum feature-evidence < | |
| 608 * AdaptFeatureThresh. The list is ordered by increasing feature number. | |
| 609 * @param ClassTemplate Prototypes & tables for a class | |
| 610 * @param ProtoMask AND Mask for proto word | |
| 611 * @param ConfigMask AND Mask for config word | |
| 612 * @param NumFeatures Number of features in blob | |
| 613 * @param Features Array of features | |
| 614 * @param FeatureArray Array of bad features | |
| 615 * @param AdaptFeatureThreshold Threshold for bad features | |
| 616 * @param Debug Debugger flag: 1=debugger on | |
| 617 * @return Number of bad features in FeatureArray. | |
| 618 */ | |
| 619 int IntegerMatcher::FindBadFeatures(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, | |
| 620 BIT_VECTOR ConfigMask, int16_t NumFeatures, | |
| 621 INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, | |
| 622 int AdaptFeatureThreshold, int Debug) { | |
| 623 auto *tables = new ScratchEvidence(); | |
| 624 int NumBadFeatures = 0; | |
| 625 | |
| 626 /* DEBUG opening heading */ | |
| 627 if (MatchDebuggingOn(Debug)) { | |
| 628 tprintf("Find Bad Features -------------------------------------------\n"); | |
| 629 } | |
| 630 | |
| 631 tables->Clear(ClassTemplate); | |
| 632 | |
| 633 for (int Feature = 0; Feature < NumFeatures; Feature++) { | |
| 634 UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, Feature, &Features[Feature], | |
| 635 tables, Debug); | |
| 636 | |
| 637 /* Find Best Evidence for Current Feature */ | |
| 638 int best = 0; | |
| 639 assert(ClassTemplate->NumConfigs < MAX_NUM_CONFIGS); | |
| 640 for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++) { | |
| 641 if (tables->feature_evidence_[i] > best) { | |
| 642 best = tables->feature_evidence_[i]; | |
| 643 } | |
| 644 } | |
| 645 | |
| 646 /* Find Bad Features */ | |
| 647 if (best < AdaptFeatureThreshold) { | |
| 648 *FeatureArray = Feature; | |
| 649 FeatureArray++; | |
| 650 NumBadFeatures++; | |
| 651 } | |
| 652 } | |
| 653 | |
| 654 #ifndef GRAPHICS_DISABLED | |
| 655 if (PrintProtoMatchesOn(Debug) || PrintMatchSummaryOn(Debug)) { | |
| 656 DebugFeatureProtoError(ClassTemplate, ProtoMask, ConfigMask, *tables, NumFeatures, Debug); | |
| 657 } | |
| 658 #endif | |
| 659 | |
| 660 if (MatchDebuggingOn(Debug)) { | |
| 661 tprintf("Match Complete --------------------------------------------\n"); | |
| 662 } | |
| 663 | |
| 664 delete tables; | |
| 665 return NumBadFeatures; | |
| 666 } | |
| 667 | |
| 668 IntegerMatcher::IntegerMatcher(tesseract::IntParam *classify_debug_level) | |
| 669 : classify_debug_level_(classify_debug_level) { | |
| 670 /* Initialize table for evidence to similarity lookup */ | |
| 671 for (int i = 0; i < SE_TABLE_SIZE; i++) { | |
| 672 uint32_t IntSimilarity = i << (27 - SE_TABLE_BITS); | |
| 673 double Similarity = (static_cast<double>(IntSimilarity)) / 65536.0 / 65536.0; | |
| 674 double evidence = Similarity / kSimilarityCenter; | |
| 675 evidence = 255.0 / (evidence * evidence + 1.0); | |
| 676 | |
| 677 if (kSEExponentialMultiplier > 0.0) { | |
| 678 double scale = | |
| 679 1.0 - std::exp(-kSEExponentialMultiplier) * | |
| 680 exp(kSEExponentialMultiplier * (static_cast<double>(i) / SE_TABLE_SIZE)); | |
| 681 evidence *= ClipToRange(scale, 0.0, 1.0); | |
| 682 } | |
| 683 | |
| 684 similarity_evidence_table_[i] = static_cast<uint8_t>(evidence + 0.5); | |
| 685 } | |
| 686 | |
| 687 /* Initialize evidence computation variables */ | |
| 688 evidence_table_mask_ = ((1 << kEvidenceTableBits) - 1) << (9 - kEvidenceTableBits); | |
| 689 mult_trunc_shift_bits_ = (14 - kIntEvidenceTruncBits); | |
| 690 table_trunc_shift_bits_ = (27 - SE_TABLE_BITS - (mult_trunc_shift_bits_ << 1)); | |
| 691 evidence_mult_mask_ = ((1 << kIntEvidenceTruncBits) - 1); | |
| 692 } | |
| 693 | |
| 694 /*---------------------------------------------------------------------------- | |
| 695 Private Code | |
| 696 ----------------------------------------------------------------------------*/ | |
| 697 void ScratchEvidence::Clear(const INT_CLASS_STRUCT *class_template) { | |
| 698 memset(sum_feature_evidence_, 0, class_template->NumConfigs * sizeof(sum_feature_evidence_[0])); | |
| 699 memset(proto_evidence_, 0, class_template->NumProtos * sizeof(proto_evidence_[0])); | |
| 700 } | |
| 701 | |
| 702 void ScratchEvidence::ClearFeatureEvidence(const INT_CLASS_STRUCT *class_template) { | |
| 703 memset(feature_evidence_, 0, class_template->NumConfigs * sizeof(feature_evidence_[0])); | |
| 704 } | |
| 705 | |
| 706 /** | |
| 707 * Print debugging information for Configurations | |
| 708 */ | |
| 709 static void IMDebugConfiguration(int FeatureNum, uint16_t ActualProtoNum, uint8_t Evidence, | |
| 710 uint32_t ConfigWord) { | |
| 711 tprintf("F = %3d, P = %3d, E = %3d, Configs = ", FeatureNum, static_cast<int>(ActualProtoNum), | |
| 712 static_cast<int>(Evidence)); | |
| 713 while (ConfigWord) { | |
| 714 if (ConfigWord & 1) { | |
| 715 tprintf("1"); | |
| 716 } else { | |
| 717 tprintf("0"); | |
| 718 } | |
| 719 ConfigWord >>= 1; | |
| 720 } | |
| 721 tprintf("\n"); | |
| 722 } | |
| 723 | |
| 724 /** | |
| 725 * Print debugging information for Configurations | |
| 726 */ | |
| 727 static void IMDebugConfigurationSum(int FeatureNum, uint8_t *FeatureEvidence, int32_t ConfigCount) { | |
| 728 tprintf("F=%3d, C=", FeatureNum); | |
| 729 for (int ConfigNum = 0; ConfigNum < ConfigCount; ConfigNum++) { | |
| 730 tprintf("%4d", FeatureEvidence[ConfigNum]); | |
| 731 } | |
| 732 tprintf("\n"); | |
| 733 } | |
| 734 | |
/**
 * For the given feature: prune protos, compute evidence,
 * update Feature Evidence, Proto Evidence, and Sum of Feature
 * Evidence tables.
 * @param ClassTemplate Prototypes & tables for a class
 * @param ProtoMask Bit vector selecting which protos to consider
 * @param ConfigMask Bit vector selecting which configs to consider
 * @param FeatureNum Current feature number (for DEBUG only)
 * @param Feature Pointer to a feature struct
 * @param tables Evidence tables
 * @param Debug Debugger flag: 1=debugger on
 * @return sum of feature evidence tables
 */
int IntegerMatcher::UpdateTablesForFeature(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask,
                                           BIT_VECTOR ConfigMask, int FeatureNum,
                                           const INT_FEATURE_STRUCT *Feature,
                                           ScratchEvidence *tables, int Debug) {
  uint32_t ConfigWord;
  uint32_t ProtoWord;
  uint32_t ProtoNum;
  uint32_t ActualProtoNum;
  uint8_t proto_byte;
  int32_t proto_word_offset;
  int32_t proto_offset;
  PROTO_SET_STRUCT *ProtoSet;
  uint32_t *ProtoPrunerPtr;
  INT_PROTO_STRUCT *Proto;
  int ProtoSetIndex;
  uint8_t Evidence;
  uint32_t XFeatureAddress;
  uint32_t YFeatureAddress;
  uint32_t ThetaFeatureAddress;

  tables->ClearFeatureEvidence(ClassTemplate);

  /* Precompute Feature Address offset for Proto Pruning. The pruner is
     indexed by quantized X, Y and Theta buckets (feature coordinate >> 2),
     two 32-bit words per bucket, with the Y and Theta sections following
     the X section at NUM_PP_BUCKETS-word strides. */
  XFeatureAddress = ((Feature->X >> 2) << 1);
  YFeatureAddress = (NUM_PP_BUCKETS << 1) + ((Feature->Y >> 2) << 1);
  ThetaFeatureAddress = (NUM_PP_BUCKETS << 2) + ((Feature->Theta >> 2) << 1);

  for (ProtoSetIndex = 0, ActualProtoNum = 0; ProtoSetIndex < ClassTemplate->NumProtoSets;
       ProtoSetIndex++) {
    ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];
    ProtoPrunerPtr = reinterpret_cast<uint32_t *>((*ProtoSet).ProtoPruner);
    // Each pruner word covers half a proto set; ProtoMask advances in step.
    for (ProtoNum = 0; ProtoNum < PROTOS_PER_PROTO_SET; ProtoNum += (PROTOS_PER_PROTO_SET >> 1),
        ActualProtoNum += (PROTOS_PER_PROTO_SET >> 1), ProtoMask++, ProtoPrunerPtr++) {
      /* Prune Protos of current Proto Set: a proto survives only if its bit
         is set in the X, Y and Theta pruner words for this feature AND in
         the caller-supplied ProtoMask. */
      ProtoWord = *(ProtoPrunerPtr + XFeatureAddress);
      ProtoWord &= *(ProtoPrunerPtr + YFeatureAddress);
      ProtoWord &= *(ProtoPrunerPtr + ThetaFeatureAddress);
      ProtoWord &= *ProtoMask;

      if (ProtoWord != 0) {
        // Enumerate the set bits of ProtoWord one byte at a time;
        // offset_table/next_table walk the set bits inside a byte.
        proto_byte = ProtoWord & 0xff;
        ProtoWord >>= 8;
        proto_word_offset = 0;
        while (ProtoWord != 0 || proto_byte != 0) {
          while (proto_byte == 0) {
            proto_byte = ProtoWord & 0xff;
            ProtoWord >>= 8;
            proto_word_offset += 8;
          }
          proto_offset = offset_table[proto_byte] + proto_word_offset;
          proto_byte = next_table[proto_byte];
          Proto = &(ProtoSet->Protos[ProtoNum + proto_offset]);
          ConfigWord = Proto->Configs[0];
          // A3: the proto's line equation (A*x - B*y + C form) evaluated at
          // the feature position; M3: scaled angular difference. Both act as
          // distance-like measures for the similarity lookup below.
          int32_t A3 = (((Proto->A * (Feature->X - 128)) * 2) - (Proto->B * (Feature->Y - 128)) +
                        (Proto->C * 512));
          int32_t M3 = ((static_cast<int8_t>(Feature->Theta - Proto->Angle)) * kIntThetaFudge) * 2;

          // Cheap absolute value via one's complement (off by one from -x,
          // which is acceptable at this precision).
          if (A3 < 0) {
            A3 = ~A3;
          }
          if (M3 < 0) {
            M3 = ~M3;
          }
          // Truncate and clamp both distances to the multiplier range.
          A3 >>= mult_trunc_shift_bits_;
          M3 >>= mult_trunc_shift_bits_;
          if (static_cast<uint32_t>(A3) > evidence_mult_mask_) {
            A3 = evidence_mult_mask_;
          }
          if (static_cast<uint32_t>(M3) > evidence_mult_mask_) {
            M3 = evidence_mult_mask_;
          }

          // Squared distance indexes the similarity-to-evidence table;
          // anything beyond the table range yields zero evidence.
          uint32_t A4 = (A3 * A3) + (M3 * M3);
          A4 >>= table_trunc_shift_bits_;
          if (A4 > evidence_table_mask_) {
            Evidence = 0;
          } else {
            Evidence = similarity_evidence_table_[A4];
          }

          if (PrintFeatureMatchesOn(Debug)) {
            IMDebugConfiguration(FeatureNum, ActualProtoNum + proto_offset, Evidence, ConfigWord);
          }

          ConfigWord &= *ConfigMask;

          // Propagate the evidence to every selected config this proto
          // belongs to, keeping the maximum per config.
          uint8_t feature_evidence_index = 0;
          uint8_t config_byte = 0;
          while (ConfigWord != 0 || config_byte != 0) {
            while (config_byte == 0) {
              config_byte = ConfigWord & 0xff;
              ConfigWord >>= 8;
              feature_evidence_index += 8;
            }
            const uint8_t config_offset = offset_table[config_byte] + feature_evidence_index - 8;
            config_byte = next_table[config_byte];
            if (Evidence > tables->feature_evidence_[config_offset]) {
              tables->feature_evidence_[config_offset] = Evidence;
            }
          }

          uint8_t ProtoIndex = ClassTemplate->ProtoLengths[ActualProtoNum + proto_offset];
          if (ProtoIndex > MAX_PROTO_INDEX) {
            // Avoid buffer overflow.
            // TODO: A better fix is still open.
            ProtoIndex = MAX_PROTO_INDEX;
          }
          // Insert Evidence into this proto's evidence list, keeping the
          // list sorted in descending order (insertion-sort style bubble).
          uint8_t *UINT8Pointer = &(tables->proto_evidence_[ActualProtoNum + proto_offset][0]);
          for (; Evidence > 0 && ProtoIndex > 0; ProtoIndex--, UINT8Pointer++) {
            if (Evidence > *UINT8Pointer) {
              uint8_t Temp = *UINT8Pointer;
              *UINT8Pointer = Evidence;
              Evidence = Temp;
            }
          }
        }
      }
    }
  }

  if (PrintFeatureMatchesOn(Debug)) {
    IMDebugConfigurationSum(FeatureNum, tables->feature_evidence_, ClassTemplate->NumConfigs);
  }

  // Fold this feature's per-config evidence into the running sums and
  // return the total over all configs.
  int *IntPointer = tables->sum_feature_evidence_;
  uint8_t *UINT8Pointer = tables->feature_evidence_;
  int SumOverConfigs = 0;
  for (int ConfigNum = ClassTemplate->NumConfigs; ConfigNum > 0; ConfigNum--) {
    int evidence = *UINT8Pointer++;
    SumOverConfigs += evidence;
    *IntPointer++ += evidence;
  }
  return SumOverConfigs;
}
| 880 | |
/**
 * Print a debug summary of the accumulated match state: the active config
 * and proto masks, per-configuration feature error, per-proto evidence,
 * and the per-configuration proto error and lengths.
 */
#ifndef GRAPHICS_DISABLED
void IntegerMatcher::DebugFeatureProtoError(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask,
                                            BIT_VECTOR ConfigMask, const ScratchEvidence &tables,
                                            int16_t NumFeatures, int Debug) {
  float ProtoConfigs[MAX_NUM_CONFIGS];
  int ConfigNum;
  uint32_t ConfigWord;
  int ProtoSetIndex;
  uint16_t ProtoNum;
  uint8_t ProtoWordNum;
  PROTO_SET_STRUCT *ProtoSet;

  if (PrintMatchSummaryOn(Debug)) {
    tprintf("Configuration Mask:\n");
    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
      tprintf("%1d", (((*ConfigMask) >> ConfigNum) & 1));
    }
    tprintf("\n");

    // Feature error = 100 * (1 - mean evidence / 256) over all features.
    tprintf("Feature Error for Configurations:\n");
    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
      tprintf(" %5.1f", 100.0 * (1.0 - static_cast<float>(tables.sum_feature_evidence_[ConfigNum]) /
                                           NumFeatures / 256.0));
    }
    tprintf("\n\n\n");
  }

  if (PrintMatchSummaryOn(Debug)) {
    tprintf("Proto Mask:\n");
    // Two mask words per proto set; ProtoMask is advanced in place.
    for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) {
      for (ProtoWordNum = 0; ProtoWordNum < 2; ProtoWordNum++, ProtoMask++) {
        uint16_t ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);
        for (ProtoNum = 0; ((ProtoNum < (PROTOS_PER_PROTO_SET >> 1)) &&
                            (ActualProtoNum < ClassTemplate->NumProtos));
             ProtoNum++, ActualProtoNum++) {
          tprintf("%1d", (((*ProtoMask) >> ProtoNum) & 1));
        }
        tprintf("\n");
      }
    }
    tprintf("\n");
  }

  for (int i = 0; i < ClassTemplate->NumConfigs; i++) {
    ProtoConfigs[i] = 0;
  }

  if (PrintProtoMatchesOn(Debug)) {
    tprintf("Proto Evidence:\n");
    for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) {
      ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex];
      uint16_t ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET);
      for (ProtoNum = 0;
           ((ProtoNum < PROTOS_PER_PROTO_SET) && (ActualProtoNum < ClassTemplate->NumProtos));
           ProtoNum++, ActualProtoNum++) {
        tprintf("P %3d =", ActualProtoNum);
        // Sum this proto's evidence entries and print each one.
        int temp = 0;
        for (uint8_t j = 0; j < ClassTemplate->ProtoLengths[ActualProtoNum]; j++) {
          uint8_t data = tables.proto_evidence_[ActualProtoNum][j];
          tprintf(" %d", data);
          temp += data;
        }

        tprintf(" = %6.4f%%\n", temp / 256.0 / ClassTemplate->ProtoLengths[ActualProtoNum]);

        // Credit the proto's total evidence to each config it belongs to.
        ConfigWord = ProtoSet->Protos[ProtoNum].Configs[0];
        ConfigNum = 0;
        while (ConfigWord) {
          tprintf("%5d", ConfigWord & 1 ? temp : 0);
          if (ConfigWord & 1) {
            ProtoConfigs[ConfigNum] += temp;
          }
          ConfigNum++;
          ConfigWord >>= 1;
        }
        tprintf("\n");
      }
    }
  }

  if (PrintMatchSummaryOn(Debug)) {
    // Proto error = 100 * (1 - proto evidence / (config length * 256)).
    tprintf("Proto Error for Configurations:\n");
    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
      tprintf(" %5.1f", 100.0 * (1.0 - ProtoConfigs[ConfigNum] /
                                           ClassTemplate->ConfigLengths[ConfigNum] / 256.0));
    }
    tprintf("\n\n");
  }

  if (PrintProtoMatchesOn(Debug)) {
    tprintf("Proto Sum for Configurations:\n");
    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
      tprintf(" %4.1f", ProtoConfigs[ConfigNum] / 256.0);
    }
    tprintf("\n\n");

    tprintf("Proto Length for Configurations:\n");
    for (ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
      tprintf(" %4.1f", static_cast<float>(ClassTemplate->ConfigLengths[ConfigNum]));
    }
    tprintf("\n\n");
  }
}
| 987 | |
| 988 void IntegerMatcher::DisplayProtoDebugInfo(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ConfigMask, | |
| 989 const ScratchEvidence &tables, | |
| 990 bool SeparateDebugWindows) { | |
| 991 uint16_t ProtoNum; | |
| 992 PROTO_SET_STRUCT *ProtoSet; | |
| 993 int ProtoSetIndex; | |
| 994 | |
| 995 InitIntMatchWindowIfReqd(); | |
| 996 if (SeparateDebugWindows) { | |
| 997 InitFeatureDisplayWindowIfReqd(); | |
| 998 InitProtoDisplayWindowIfReqd(); | |
| 999 } | |
| 1000 | |
| 1001 for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) { | |
| 1002 ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex]; | |
| 1003 uint16_t ActualProtoNum = ProtoSetIndex * PROTOS_PER_PROTO_SET; | |
| 1004 for (ProtoNum = 0; | |
| 1005 ((ProtoNum < PROTOS_PER_PROTO_SET) && (ActualProtoNum < ClassTemplate->NumProtos)); | |
| 1006 ProtoNum++, ActualProtoNum++) { | |
| 1007 /* Compute Average for Actual Proto */ | |
| 1008 int temp = 0; | |
| 1009 for (uint8_t i = 0; i < ClassTemplate->ProtoLengths[ActualProtoNum]; i++) { | |
| 1010 temp += tables.proto_evidence_[ActualProtoNum][i]; | |
| 1011 } | |
| 1012 | |
| 1013 temp /= ClassTemplate->ProtoLengths[ActualProtoNum]; | |
| 1014 | |
| 1015 if ((ProtoSet->Protos[ProtoNum]).Configs[0] & (*ConfigMask)) { | |
| 1016 DisplayIntProto(ClassTemplate, ActualProtoNum, temp / 255.0); | |
| 1017 } | |
| 1018 } | |
| 1019 } | |
| 1020 } | |
| 1021 | |
| 1022 void IntegerMatcher::DisplayFeatureDebugInfo(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, | |
| 1023 BIT_VECTOR ConfigMask, int16_t NumFeatures, | |
| 1024 const INT_FEATURE_STRUCT *Features, | |
| 1025 int AdaptFeatureThreshold, int Debug, | |
| 1026 bool SeparateDebugWindows) { | |
| 1027 auto *tables = new ScratchEvidence(); | |
| 1028 | |
| 1029 tables->Clear(ClassTemplate); | |
| 1030 | |
| 1031 InitIntMatchWindowIfReqd(); | |
| 1032 if (SeparateDebugWindows) { | |
| 1033 InitFeatureDisplayWindowIfReqd(); | |
| 1034 InitProtoDisplayWindowIfReqd(); | |
| 1035 } | |
| 1036 | |
| 1037 for (int Feature = 0; Feature < NumFeatures; Feature++) { | |
| 1038 UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, Feature, &Features[Feature], | |
| 1039 tables, 0); | |
| 1040 | |
| 1041 /* Find Best Evidence for Current Feature */ | |
| 1042 int best = 0; | |
| 1043 assert(ClassTemplate->NumConfigs < MAX_NUM_CONFIGS); | |
| 1044 for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++) { | |
| 1045 if (tables->feature_evidence_[i] > best) { | |
| 1046 best = tables->feature_evidence_[i]; | |
| 1047 } | |
| 1048 } | |
| 1049 | |
| 1050 /* Update display for current feature */ | |
| 1051 if (ClipMatchEvidenceOn(Debug)) { | |
| 1052 if (best < AdaptFeatureThreshold) { | |
| 1053 DisplayIntFeature(&Features[Feature], 0.0); | |
| 1054 } else { | |
| 1055 DisplayIntFeature(&Features[Feature], 1.0); | |
| 1056 } | |
| 1057 } else { | |
| 1058 DisplayIntFeature(&Features[Feature], best / 255.0); | |
| 1059 } | |
| 1060 } | |
| 1061 | |
| 1062 delete tables; | |
| 1063 } | |
| 1064 #endif | |
| 1065 | |
| 1066 /** | |
| 1067 * Add sum of Proto Evidences into Sum Of Feature Evidence Array | |
| 1068 */ | |
| 1069 void ScratchEvidence::UpdateSumOfProtoEvidences(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ConfigMask) { | |
| 1070 int *IntPointer; | |
| 1071 uint32_t ConfigWord; | |
| 1072 int ProtoSetIndex; | |
| 1073 uint16_t ProtoNum; | |
| 1074 PROTO_SET_STRUCT *ProtoSet; | |
| 1075 int NumProtos; | |
| 1076 | |
| 1077 NumProtos = ClassTemplate->NumProtos; | |
| 1078 | |
| 1079 for (ProtoSetIndex = 0; ProtoSetIndex < ClassTemplate->NumProtoSets; ProtoSetIndex++) { | |
| 1080 ProtoSet = ClassTemplate->ProtoSets[ProtoSetIndex]; | |
| 1081 uint16_t ActualProtoNum = (ProtoSetIndex * PROTOS_PER_PROTO_SET); | |
| 1082 for (ProtoNum = 0; ((ProtoNum < PROTOS_PER_PROTO_SET) && (ActualProtoNum < NumProtos)); | |
| 1083 ProtoNum++, ActualProtoNum++) { | |
| 1084 int temp = 0; | |
| 1085 for (uint8_t i = 0; i < MAX_PROTO_INDEX && i < ClassTemplate->ProtoLengths[ActualProtoNum]; | |
| 1086 i++) { | |
| 1087 temp += proto_evidence_[ActualProtoNum][i]; | |
| 1088 } | |
| 1089 | |
| 1090 ConfigWord = ProtoSet->Protos[ProtoNum].Configs[0]; | |
| 1091 ConfigWord &= *ConfigMask; | |
| 1092 IntPointer = sum_feature_evidence_; | |
| 1093 while (ConfigWord) { | |
| 1094 if (ConfigWord & 1) { | |
| 1095 *IntPointer += temp; | |
| 1096 } | |
| 1097 IntPointer++; | |
| 1098 ConfigWord >>= 1; | |
| 1099 } | |
| 1100 } | |
| 1101 } | |
| 1102 } | |
| 1103 | |
| 1104 /** | |
| 1105 * Normalize Sum of Proto and Feature Evidence by dividing by the sum of | |
| 1106 * the Feature Lengths and the Proto Lengths for each configuration. | |
| 1107 */ | |
| 1108 void ScratchEvidence::NormalizeSums(INT_CLASS_STRUCT *ClassTemplate, int16_t NumFeatures) { | |
| 1109 // ClassTemplate->NumConfigs can become larger than MAX_NUM_CONFIGS. | |
| 1110 for (int i = 0; i < MAX_NUM_CONFIGS && i < ClassTemplate->NumConfigs; i++) { | |
| 1111 sum_feature_evidence_[i] = | |
| 1112 (sum_feature_evidence_[i] << 8) / (NumFeatures + ClassTemplate->ConfigLengths[i]); | |
| 1113 } | |
| 1114 } | |
| 1115 | |
| 1116 /** | |
| 1117 * Find the best match for the current class and update the Result | |
| 1118 * with the configuration and match rating. | |
| 1119 * @return The best normalized sum of evidences | |
| 1120 */ | |
| 1121 int IntegerMatcher::FindBestMatch(INT_CLASS_STRUCT *class_template, const ScratchEvidence &tables, | |
| 1122 UnicharRating *result) { | |
| 1123 int best_match = 0; | |
| 1124 result->config = 0; | |
| 1125 result->fonts.clear(); | |
| 1126 result->fonts.reserve(class_template->NumConfigs); | |
| 1127 | |
| 1128 // Find best match. | |
| 1129 // ClassTemplate->NumConfigs can become larger than MAX_NUM_CONFIGS. | |
| 1130 for (int c = 0; c < MAX_NUM_CONFIGS && c < class_template->NumConfigs; ++c) { | |
| 1131 int rating = tables.sum_feature_evidence_[c]; | |
| 1132 if (*classify_debug_level_ > 2) { | |
| 1133 tprintf("Config %d, rating=%d\n", c, rating); | |
| 1134 } | |
| 1135 if (rating > best_match) { | |
| 1136 result->config = c; | |
| 1137 best_match = rating; | |
| 1138 } | |
| 1139 result->fonts.emplace_back(c, rating); | |
| 1140 } | |
| 1141 | |
| 1142 // Compute confidence on a Probability scale. | |
| 1143 result->rating = best_match / 65536.0f; | |
| 1144 | |
| 1145 return best_match; | |
| 1146 } | |
| 1147 | |
| 1148 /** | |
| 1149 * Applies the CN normalization factor to the given rating and returns | |
| 1150 * the modified rating. | |
| 1151 */ | |
| 1152 float IntegerMatcher::ApplyCNCorrection(float rating, int blob_length, int normalization_factor, | |
| 1153 int matcher_multiplier) { | |
| 1154 int divisor = blob_length + matcher_multiplier; | |
| 1155 return divisor == 0 | |
| 1156 ? 1.0f | |
| 1157 : (rating * blob_length + matcher_multiplier * normalization_factor / 256.0f) / | |
| 1158 divisor; | |
| 1159 } | |
| 1160 | |
| 1161 } // namespace tesseract |
