Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/common/trainingsampleset.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 // | |
| 4 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 // you may not use this file except in compliance with the License. | |
| 6 // You may obtain a copy of the License at | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // Unless required by applicable law or agreed to in writing, software | |
| 9 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 11 // See the License for the specific language governing permissions and | |
| 12 // limitations under the License. | |
| 13 // | |
| 14 /////////////////////////////////////////////////////////////////////// | |
| 15 | |
| 16 #ifdef HAVE_CONFIG_H | |
| 17 # include "config_auto.h" | |
| 18 #endif | |
| 19 | |
| 20 #include <algorithm> | |
| 21 | |
| 22 #include <allheaders.h> | |
| 23 #include "boxread.h" | |
| 24 #include "fontinfo.h" | |
| 25 //#include "helpers.h" | |
| 26 #include "indexmapbidi.h" | |
| 27 #include "intfeaturedist.h" | |
| 28 #include "intfeaturemap.h" | |
| 29 #include "intfeaturespace.h" | |
| 30 #include "shapetable.h" | |
| 31 #include "tesserrstream.h" // for tesserr | |
| 32 #include "trainingsample.h" | |
| 33 #include "trainingsampleset.h" | |
| 34 #include "unicity_table.h" | |
| 35 | |
| 36 namespace tesseract { | |
| 37 | |
| 38 const int kTestChar = -1; // 37; | |
| 39 // Max number of distances to compute the squared way | |
| 40 const int kSquareLimit = 25; | |
| 41 // Prime numbers for subsampling distances. | |
| 42 const int kPrime1 = 17; | |
| 43 const int kPrime2 = 13; | |
| 44 | |
| 45 TrainingSampleSet::FontClassInfo::FontClassInfo() | |
| 46 : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {} | |
| 47 | |
| 48 // Writes to the given file. Returns false in case of error. | |
| 49 bool TrainingSampleSet::FontClassInfo::Serialize(FILE *fp) const { | |
| 50 if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) { | |
| 51 return false; | |
| 52 } | |
| 53 if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) { | |
| 54 return false; | |
| 55 } | |
| 56 if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) { | |
| 57 return false; | |
| 58 } | |
| 59 if (!::tesseract::Serialize(fp, samples)) { | |
| 60 return false; | |
| 61 } | |
| 62 return true; | |
| 63 } | |
| 64 // Reads from the given file. Returns false in case of error. | |
| 65 // If swap is true, assumes a big/little-endian swap is needed. | |
| 66 bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE *fp) { | |
| 67 if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) { | |
| 68 return false; | |
| 69 } | |
| 70 if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) { | |
| 71 return false; | |
| 72 } | |
| 73 if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) { | |
| 74 return false; | |
| 75 } | |
| 76 if (!::tesseract::DeSerialize(swap, fp, samples)) { | |
| 77 return false; | |
| 78 } | |
| 79 if (swap) { | |
| 80 ReverseN(&num_raw_samples, sizeof(num_raw_samples)); | |
| 81 ReverseN(&canonical_sample, sizeof(canonical_sample)); | |
| 82 ReverseN(&canonical_dist, sizeof(canonical_dist)); | |
| 83 } | |
| 84 return true; | |
| 85 } | |
| 86 | |
| 87 TrainingSampleSet::TrainingSampleSet(const FontInfoTable &font_table) | |
| 88 : num_raw_samples_(0) | |
| 89 , unicharset_size_(0) | |
| 90 , font_class_array_(nullptr) | |
| 91 , fontinfo_table_(font_table) {} | |
| 92 | |
| 93 TrainingSampleSet::~TrainingSampleSet() { | |
| 94 for (auto sample : samples_) { | |
| 95 delete sample; | |
| 96 } | |
| 97 delete font_class_array_; | |
| 98 } | |
| 99 | |
| 100 // Writes to the given file. Returns false in case of error. | |
| 101 bool TrainingSampleSet::Serialize(FILE *fp) const { | |
| 102 if (!tesseract::Serialize(fp, samples_)) { | |
| 103 return false; | |
| 104 } | |
| 105 if (!unicharset_.save_to_file(fp)) { | |
| 106 return false; | |
| 107 } | |
| 108 if (!font_id_map_.Serialize(fp)) { | |
| 109 return false; | |
| 110 } | |
| 111 int8_t not_null = font_class_array_ != nullptr; | |
| 112 if (fwrite(¬_null, sizeof(not_null), 1, fp) != 1) { | |
| 113 return false; | |
| 114 } | |
| 115 if (not_null) { | |
| 116 if (!font_class_array_->SerializeClasses(fp)) { | |
| 117 return false; | |
| 118 } | |
| 119 } | |
| 120 return true; | |
| 121 } | |
| 122 | |
| 123 // Reads from the given file. Returns false in case of error. | |
| 124 // If swap is true, assumes a big/little-endian swap is needed. | |
| 125 bool TrainingSampleSet::DeSerialize(bool swap, FILE *fp) { | |
| 126 if (!tesseract::DeSerialize(swap, fp, samples_)) { | |
| 127 return false; | |
| 128 } | |
| 129 num_raw_samples_ = samples_.size(); | |
| 130 if (!unicharset_.load_from_file(fp)) { | |
| 131 return false; | |
| 132 } | |
| 133 if (!font_id_map_.DeSerialize(swap, fp)) { | |
| 134 return false; | |
| 135 } | |
| 136 delete font_class_array_; | |
| 137 font_class_array_ = nullptr; | |
| 138 int8_t not_null; | |
| 139 if (fread(¬_null, sizeof(not_null), 1, fp) != 1) { | |
| 140 return false; | |
| 141 } | |
| 142 if (not_null) { | |
| 143 FontClassInfo empty; | |
| 144 font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(1, 1, empty); | |
| 145 if (!font_class_array_->DeSerializeClasses(swap, fp)) { | |
| 146 return false; | |
| 147 } | |
| 148 } | |
| 149 unicharset_size_ = unicharset_.size(); | |
| 150 return true; | |
| 151 } | |
| 152 | |
| 153 // Load an initial unicharset, or set one up if the file cannot be read. | |
| 154 void TrainingSampleSet::LoadUnicharset(const char *filename) { | |
| 155 if (!unicharset_.load_from_file(filename)) { | |
| 156 tprintf( | |
| 157 "Failed to load unicharset from file %s\n" | |
| 158 "Building unicharset from scratch...\n", | |
| 159 filename); | |
| 160 unicharset_.clear(); | |
| 161 // Add special characters as they were removed by the clear. | |
| 162 UNICHARSET empty; | |
| 163 unicharset_.AppendOtherUnicharset(empty); | |
| 164 } | |
| 165 unicharset_size_ = unicharset_.size(); | |
| 166 } | |
| 167 | |
| 168 // Adds a character sample to this sample set. | |
| 169 // If the unichar is not already in the local unicharset, it is added. | |
| 170 // Returns the unichar_id of the added sample, from the local unicharset. | |
| 171 int TrainingSampleSet::AddSample(const char *unichar, TrainingSample *sample) { | |
| 172 if (!unicharset_.contains_unichar(unichar)) { | |
| 173 unicharset_.unichar_insert(unichar); | |
| 174 if (unicharset_.size() > MAX_NUM_CLASSES) { | |
| 175 tprintf( | |
| 176 "Error: Size of unicharset in TrainingSampleSet::AddSample is " | |
| 177 "greater than MAX_NUM_CLASSES\n"); | |
| 178 return -1; | |
| 179 } | |
| 180 } | |
| 181 UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar); | |
| 182 AddSample(char_id, sample); | |
| 183 return char_id; | |
| 184 } | |
| 185 | |
| 186 // Adds a character sample to this sample set with the given unichar_id, | |
| 187 // which must correspond to the local unicharset (in this). | |
| 188 void TrainingSampleSet::AddSample(int unichar_id, TrainingSample *sample) { | |
| 189 sample->set_class_id(unichar_id); | |
| 190 samples_.push_back(sample); | |
| 191 num_raw_samples_ = samples_.size(); | |
| 192 unicharset_size_ = unicharset_.size(); | |
| 193 } | |
| 194 | |
| 195 // Returns the number of samples for the given font,class pair. | |
| 196 // If randomize is true, returns the number of samples accessible | |
| 197 // with randomizing on. (Increases the number of samples if small.) | |
| 198 // OrganizeByFontAndClass must have been already called. | |
| 199 int TrainingSampleSet::NumClassSamples(int font_id, int class_id, bool randomize) const { | |
| 200 ASSERT_HOST(font_class_array_ != nullptr); | |
| 201 if (font_id < 0 || class_id < 0 || font_id >= font_id_map_.SparseSize() || | |
| 202 class_id >= unicharset_size_) { | |
| 203 // There are no samples because the font or class doesn't exist. | |
| 204 return 0; | |
| 205 } | |
| 206 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 207 if (font_index < 0) { | |
| 208 return 0; // The font has no samples. | |
| 209 } | |
| 210 if (randomize) { | |
| 211 return (*font_class_array_)(font_index, class_id).samples.size(); | |
| 212 } else { | |
| 213 return (*font_class_array_)(font_index, class_id).num_raw_samples; | |
| 214 } | |
| 215 } | |
| 216 | |
| 217 // Gets a sample by its index. | |
| 218 const TrainingSample *TrainingSampleSet::GetSample(int index) const { | |
| 219 return samples_[index]; | |
| 220 } | |
| 221 | |
| 222 // Gets a sample by its font, class, index. | |
| 223 // OrganizeByFontAndClass must have been already called. | |
| 224 const TrainingSample *TrainingSampleSet::GetSample(int font_id, int class_id, int index) const { | |
| 225 ASSERT_HOST(font_class_array_ != nullptr); | |
| 226 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 227 if (font_index < 0) { | |
| 228 return nullptr; | |
| 229 } | |
| 230 int sample_index = (*font_class_array_)(font_index, class_id).samples[index]; | |
| 231 return samples_[sample_index]; | |
| 232 } | |
| 233 | |
| 234 // Get a sample by its font, class, index. Does not randomize. | |
| 235 // OrganizeByFontAndClass must have been already called. | |
| 236 TrainingSample *TrainingSampleSet::MutableSample(int font_id, int class_id, int index) { | |
| 237 ASSERT_HOST(font_class_array_ != nullptr); | |
| 238 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 239 if (font_index < 0) { | |
| 240 return nullptr; | |
| 241 } | |
| 242 int sample_index = (*font_class_array_)(font_index, class_id).samples[index]; | |
| 243 return samples_[sample_index]; | |
| 244 } | |
| 245 | |
| 246 // Returns a string debug representation of the given sample: | |
| 247 // font, unichar_str, bounding box, page. | |
| 248 std::string TrainingSampleSet::SampleToString(const TrainingSample &sample) const { | |
| 249 std::string boxfile_str; | |
| 250 MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()), sample.bounding_box(), | |
| 251 sample.page_num(), boxfile_str); | |
| 252 return std::string(fontinfo_table_.at(sample.font_id()).name) + " " + boxfile_str; | |
| 253 } | |
| 254 | |
| 255 // Gets the combined set of features used by all the samples of the given | |
| 256 // font/class combination. | |
| 257 const BitVector &TrainingSampleSet::GetCloudFeatures(int font_id, int class_id) const { | |
| 258 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 259 ASSERT_HOST(font_index >= 0); | |
| 260 return (*font_class_array_)(font_index, class_id).cloud_features; | |
| 261 } | |
| 262 // Gets the indexed features of the canonical sample of the given | |
| 263 // font/class combination. | |
| 264 const std::vector<int> &TrainingSampleSet::GetCanonicalFeatures(int font_id, int class_id) const { | |
| 265 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 266 ASSERT_HOST(font_index >= 0); | |
| 267 return (*font_class_array_)(font_index, class_id).canonical_features; | |
| 268 } | |
| 269 | |
| 270 // Returns the distance between the given UniCharAndFonts pair. | |
| 271 // If matched_fonts, only matching fonts, are considered, unless that yields | |
| 272 // the empty set. | |
| 273 // OrganizeByFontAndClass must have been already called. | |
| 274 float TrainingSampleSet::UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, | |
| 275 bool matched_fonts, const IntFeatureMap &feature_map) { | |
| 276 int num_fonts1 = uf1.font_ids.size(); | |
| 277 int c1 = uf1.unichar_id; | |
| 278 int num_fonts2 = uf2.font_ids.size(); | |
| 279 int c2 = uf2.unichar_id; | |
| 280 double dist_sum = 0.0; | |
| 281 int dist_count = 0; | |
| 282 const bool debug = false; | |
| 283 if (matched_fonts) { | |
| 284 // Compute distances only where fonts match. | |
| 285 for (int i = 0; i < num_fonts1; ++i) { | |
| 286 int f1 = uf1.font_ids[i]; | |
| 287 for (int j = 0; j < num_fonts2; ++j) { | |
| 288 int f2 = uf2.font_ids[j]; | |
| 289 if (f1 == f2) { | |
| 290 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); | |
| 291 ++dist_count; | |
| 292 } | |
| 293 } | |
| 294 } | |
| 295 } else if (num_fonts1 * num_fonts2 <= kSquareLimit) { | |
| 296 // Small enough sets to compute all the distances. | |
| 297 for (int i = 0; i < num_fonts1; ++i) { | |
| 298 int f1 = uf1.font_ids[i]; | |
| 299 for (int j = 0; j < num_fonts2; ++j) { | |
| 300 int f2 = uf2.font_ids[j]; | |
| 301 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); | |
| 302 if (debug) { | |
| 303 tprintf("Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2, | |
| 304 ClusterDistance(f1, c1, f2, c2, feature_map)); | |
| 305 } | |
| 306 ++dist_count; | |
| 307 } | |
| 308 } | |
| 309 } else { | |
| 310 // Subsample distances, using the largest set once, and stepping through | |
| 311 // the smaller set so as to ensure that all the pairs are different. | |
| 312 int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2; | |
| 313 int index = 0; | |
| 314 int num_samples = std::max(num_fonts1, num_fonts2); | |
| 315 for (int i = 0; i < num_samples; ++i, index += increment) { | |
| 316 int f1 = uf1.font_ids[i % num_fonts1]; | |
| 317 int f2 = uf2.font_ids[index % num_fonts2]; | |
| 318 if (debug) { | |
| 319 tprintf("Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2, | |
| 320 ClusterDistance(f1, c1, f2, c2, feature_map)); | |
| 321 } | |
| 322 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); | |
| 323 ++dist_count; | |
| 324 } | |
| 325 } | |
| 326 if (dist_count == 0) { | |
| 327 if (matched_fonts) { | |
| 328 return UnicharDistance(uf1, uf2, false, feature_map); | |
| 329 } | |
| 330 return 0.0f; | |
| 331 } | |
| 332 return dist_sum / dist_count; | |
| 333 } | |
| 334 | |
| 335 // Returns the distance between the given pair of font/class pairs. | |
| 336 // Finds in cache or computes and caches. | |
| 337 // OrganizeByFontAndClass must have been already called. | |
| 338 float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, | |
| 339 const IntFeatureMap &feature_map) { | |
| 340 ASSERT_HOST(font_class_array_ != nullptr); | |
| 341 int font_index1 = font_id_map_.SparseToCompact(font_id1); | |
| 342 int font_index2 = font_id_map_.SparseToCompact(font_id2); | |
| 343 if (font_index1 < 0 || font_index2 < 0) { | |
| 344 return 0.0f; | |
| 345 } | |
| 346 FontClassInfo &fc_info = (*font_class_array_)(font_index1, class_id1); | |
| 347 if (font_id1 == font_id2) { | |
| 348 // Special case cache for speed. | |
| 349 if (fc_info.unichar_distance_cache.empty()) { | |
| 350 fc_info.unichar_distance_cache.resize(unicharset_size_, -1.0f); | |
| 351 } | |
| 352 if (fc_info.unichar_distance_cache[class_id2] < 0) { | |
| 353 // Distance has to be calculated. | |
| 354 float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map); | |
| 355 fc_info.unichar_distance_cache[class_id2] = result; | |
| 356 // Copy to the symmetric cache entry. | |
| 357 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2); | |
| 358 if (fc_info2.unichar_distance_cache.empty()) { | |
| 359 fc_info2.unichar_distance_cache.resize(unicharset_size_, -1.0f); | |
| 360 } | |
| 361 fc_info2.unichar_distance_cache[class_id1] = result; | |
| 362 } | |
| 363 return fc_info.unichar_distance_cache[class_id2]; | |
| 364 } else if (class_id1 == class_id2) { | |
| 365 // Another special-case cache for equal class-id. | |
| 366 if (fc_info.font_distance_cache.empty()) { | |
| 367 fc_info.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f); | |
| 368 } | |
| 369 if (fc_info.font_distance_cache[font_index2] < 0) { | |
| 370 // Distance has to be calculated. | |
| 371 float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map); | |
| 372 fc_info.font_distance_cache[font_index2] = result; | |
| 373 // Copy to the symmetric cache entry. | |
| 374 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2); | |
| 375 if (fc_info2.font_distance_cache.empty()) { | |
| 376 fc_info2.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f); | |
| 377 } | |
| 378 fc_info2.font_distance_cache[font_index1] = result; | |
| 379 } | |
| 380 return fc_info.font_distance_cache[font_index2]; | |
| 381 } | |
| 382 // Both font and class are different. Linear search for class_id2/font_id2 | |
| 383 // in what is a hopefully short list of distances. | |
| 384 size_t cache_index = 0; | |
| 385 while (cache_index < fc_info.distance_cache.size() && | |
| 386 (fc_info.distance_cache[cache_index].unichar_id != class_id2 || | |
| 387 fc_info.distance_cache[cache_index].font_id != font_id2)) { | |
| 388 ++cache_index; | |
| 389 } | |
| 390 if (cache_index == fc_info.distance_cache.size()) { | |
| 391 // Distance has to be calculated. | |
| 392 float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map); | |
| 393 FontClassDistance fc_dist = {class_id2, font_id2, result}; | |
| 394 fc_info.distance_cache.push_back(fc_dist); | |
| 395 // Copy to the symmetric cache entry. We know it isn't there already, as | |
| 396 // we always copy to the symmetric entry. | |
| 397 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2); | |
| 398 fc_dist.unichar_id = class_id1; | |
| 399 fc_dist.font_id = font_id1; | |
| 400 fc_info2.distance_cache.push_back(fc_dist); | |
| 401 } | |
| 402 return fc_info.distance_cache[cache_index].distance; | |
| 403 } | |
| 404 | |
| 405 // Computes the distance between the given pair of font/class pairs. | |
| 406 float TrainingSampleSet::ComputeClusterDistance(int font_id1, int class_id1, int font_id2, | |
| 407 int class_id2, | |
| 408 const IntFeatureMap &feature_map) const { | |
| 409 int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2, feature_map, false); | |
| 410 dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1, feature_map, false); | |
| 411 int denominator = GetCanonicalFeatures(font_id1, class_id1).size(); | |
| 412 denominator += GetCanonicalFeatures(font_id2, class_id2).size(); | |
| 413 return static_cast<float>(dist) / denominator; | |
| 414 } | |
| 415 | |
| 416 // Helper to add a feature and its near neighbors to the good_features. | |
| 417 // levels indicates how many times to compute the offset features of what is | |
| 418 // already there. This is done by iteration rather than recursion. | |
| 419 static void AddNearFeatures(const IntFeatureMap &feature_map, int f, int levels, | |
| 420 std::vector<int> *good_features) { | |
| 421 int prev_num_features = 0; | |
| 422 good_features->push_back(f); | |
| 423 int num_features = 1; | |
| 424 for (int level = 0; level < levels; ++level) { | |
| 425 for (int i = prev_num_features; i < num_features; ++i) { | |
| 426 int feature = (*good_features)[i]; | |
| 427 for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) { | |
| 428 if (dir == 0) { | |
| 429 continue; | |
| 430 } | |
| 431 int f1 = feature_map.OffsetFeature(feature, dir); | |
| 432 if (f1 >= 0) { | |
| 433 good_features->push_back(f1); | |
| 434 } | |
| 435 } | |
| 436 } | |
| 437 prev_num_features = num_features; | |
| 438 num_features = good_features->size(); | |
| 439 } | |
| 440 } | |
| 441 | |
| 442 // Returns the number of canonical features of font/class 2 for which | |
| 443 // neither the feature nor any of its near neighbors occurs in the cloud | |
| 444 // of font/class 1. Each such feature is a reliable separation between | |
| 445 // the classes, ASSUMING that the canonical sample is sufficiently | |
| 446 // representative that every sample has a feature near that particular | |
| 447 // feature. To check that this is so on the fly would be prohibitively | |
| 448 // expensive, but it might be possible to pre-qualify the canonical features | |
| 449 // to include only those for which this assumption is true. | |
| 450 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called | |
| 451 // first, or the results will be nonsense. | |
| 452 int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, | |
| 453 const IntFeatureMap &feature_map, bool thorough) const { | |
| 454 int result = 0; | |
| 455 const TrainingSample *sample2 = GetCanonicalSample(font_id2, class_id2); | |
| 456 if (sample2 == nullptr) { | |
| 457 return 0; // There are no canonical features. | |
| 458 } | |
| 459 const std::vector<int> &canonical2 = GetCanonicalFeatures(font_id2, class_id2); | |
| 460 const BitVector &cloud1 = GetCloudFeatures(font_id1, class_id1); | |
| 461 if (cloud1.empty()) { | |
| 462 return canonical2.size(); // There are no cloud features. | |
| 463 } | |
| 464 | |
| 465 // Find a canonical2 feature that is not in cloud1. | |
| 466 for (int feature : canonical2) { | |
| 467 if (cloud1[feature]) { | |
| 468 continue; | |
| 469 } | |
| 470 // Gather the near neighbours of f. | |
| 471 std::vector<int> good_features; | |
| 472 AddNearFeatures(feature_map, feature, 1, &good_features); | |
| 473 // Check that none of the good_features are in the cloud. | |
| 474 bool found = false; | |
| 475 for (auto good_f : good_features) { | |
| 476 if (cloud1[good_f]) { | |
| 477 found = true; | |
| 478 break; | |
| 479 } | |
| 480 } | |
| 481 if (found) { | |
| 482 continue; // Found one in the cloud. | |
| 483 } | |
| 484 ++result; | |
| 485 } | |
| 486 return result; | |
| 487 } | |
| 488 | |
| 489 // Returns the total index of the requested sample. | |
| 490 // OrganizeByFontAndClass must have been already called. | |
| 491 int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id, int index) const { | |
| 492 ASSERT_HOST(font_class_array_ != nullptr); | |
| 493 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 494 if (font_index < 0) { | |
| 495 return -1; | |
| 496 } | |
| 497 return (*font_class_array_)(font_index, class_id).samples[index]; | |
| 498 } | |
| 499 | |
| 500 // Gets the canonical sample for the given font, class pair. | |
| 501 // ComputeCanonicalSamples must have been called first. | |
| 502 const TrainingSample *TrainingSampleSet::GetCanonicalSample(int font_id, int class_id) const { | |
| 503 ASSERT_HOST(font_class_array_ != nullptr); | |
| 504 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 505 if (font_index < 0) { | |
| 506 return nullptr; | |
| 507 } | |
| 508 const int sample_index = (*font_class_array_)(font_index, class_id).canonical_sample; | |
| 509 return sample_index >= 0 ? samples_[sample_index] : nullptr; | |
| 510 } | |
| 511 | |
| 512 // Gets the max distance for the given canonical sample. | |
| 513 // ComputeCanonicalSamples must have been called first. | |
| 514 float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const { | |
| 515 ASSERT_HOST(font_class_array_ != nullptr); | |
| 516 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 517 if (font_index < 0) { | |
| 518 return 0.0f; | |
| 519 } | |
| 520 if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0) { | |
| 521 return (*font_class_array_)(font_index, class_id).canonical_dist; | |
| 522 } else { | |
| 523 return 0.0f; | |
| 524 } | |
| 525 } | |
| 526 | |
| 527 // Generates indexed features for all samples with the supplied feature_space. | |
| 528 void TrainingSampleSet::IndexFeatures(const IntFeatureSpace &feature_space) { | |
| 529 for (auto &sample : samples_) { | |
| 530 sample->IndexFeatures(feature_space); | |
| 531 } | |
| 532 } | |
| 533 | |
| 534 // Marks the given sample index for deletion. | |
| 535 // Deletion is actually completed by DeleteDeadSamples. | |
| 536 void TrainingSampleSet::KillSample(TrainingSample *sample) { | |
| 537 sample->set_sample_index(-1); | |
| 538 } | |
| 539 | |
| 540 // Deletes all samples with zero features marked by KillSample. | |
| 541 void TrainingSampleSet::DeleteDeadSamples() { | |
| 542 using namespace std::placeholders; // for _1 | |
| 543 for (auto &&it = samples_.begin(); it < samples_.end();) { | |
| 544 if (*it == nullptr || (*it)->class_id() < 0) { | |
| 545 samples_.erase(it); | |
| 546 delete *it; | |
| 547 } else { | |
| 548 ++it; | |
| 549 } | |
| 550 } | |
| 551 num_raw_samples_ = samples_.size(); | |
| 552 // Samples must be re-organized now we have deleted a few. | |
| 553 } | |
| 554 | |
| 555 // Construct an array to access the samples by font,class pair. | |
| 556 void TrainingSampleSet::OrganizeByFontAndClass() { | |
| 557 // Font indexes are sparse, so we used a map to compact them, so we can | |
| 558 // have an efficient 2-d array of fonts and character classes. | |
| 559 SetupFontIdMap(); | |
| 560 int compact_font_size = font_id_map_.CompactSize(); | |
| 561 // Get a 2-d array of generic vectors. | |
| 562 delete font_class_array_; | |
| 563 FontClassInfo empty; | |
| 564 font_class_array_ = | |
| 565 new GENERIC_2D_ARRAY<FontClassInfo>(compact_font_size, unicharset_size_, empty); | |
| 566 for (size_t s = 0; s < samples_.size(); ++s) { | |
| 567 int font_id = samples_[s]->font_id(); | |
| 568 int class_id = samples_[s]->class_id(); | |
| 569 if (font_id < 0 || font_id >= font_id_map_.SparseSize()) { | |
| 570 tesserr << "Font id = " << font_id << '/' << font_id_map_.SparseSize() | |
| 571 << ", class id = " << class_id << '/' << unicharset_size_ | |
| 572 << " on sample " << s << '\n'; | |
| 573 } | |
| 574 ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize()); | |
| 575 ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_); | |
| 576 int font_index = font_id_map_.SparseToCompact(font_id); | |
| 577 (*font_class_array_)(font_index, class_id).samples.push_back(s); | |
| 578 } | |
| 579 // Set the num_raw_samples member of the FontClassInfo, to set the boundary | |
| 580 // between the raw samples and the replicated ones. | |
| 581 for (int f = 0; f < compact_font_size; ++f) { | |
| 582 for (int c = 0; c < unicharset_size_; ++c) { | |
| 583 (*font_class_array_)(f, c).num_raw_samples = (*font_class_array_)(f, c).samples.size(); | |
| 584 } | |
| 585 } | |
| 586 // This is the global number of samples and also marks the boundary between | |
| 587 // real and replicated samples. | |
| 588 num_raw_samples_ = samples_.size(); | |
| 589 } | |
| 590 | |
| 591 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact | |
| 592 // index for the font_class_array_. | |
| 593 void TrainingSampleSet::SetupFontIdMap() { | |
| 594 // Number of samples for each font_id. | |
| 595 std::vector<int> font_counts; | |
| 596 for (auto &sample : samples_) { | |
| 597 const int font_id = sample->font_id(); | |
| 598 while (font_id >= font_counts.size()) { | |
| 599 font_counts.push_back(0); | |
| 600 } | |
| 601 ++font_counts[font_id]; | |
| 602 } | |
| 603 font_id_map_.Init(font_counts.size(), false); | |
| 604 for (size_t f = 0; f < font_counts.size(); ++f) { | |
| 605 font_id_map_.SetMap(f, font_counts[f] > 0); | |
| 606 } | |
| 607 font_id_map_.Setup(); | |
| 608 } | |
| 609 | |
| 610 // Finds the sample for each font, class pair that has least maximum | |
| 611 // distance to all the other samples of the same font, class. | |
| 612 // OrganizeByFontAndClass must have been already called. | |
| 613 void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap &map, bool debug) { | |
| 614 ASSERT_HOST(font_class_array_ != nullptr); | |
| 615 IntFeatureDist f_table; | |
| 616 if (debug) { | |
| 617 tprintf("feature table size %d\n", map.sparse_size()); | |
| 618 } | |
| 619 f_table.Init(&map); | |
| 620 int worst_s1 = 0; | |
| 621 int worst_s2 = 0; | |
| 622 double global_worst_dist = 0.0; | |
| 623 // Compute distances independently for each font and char index. | |
| 624 int font_size = font_id_map_.CompactSize(); | |
| 625 for (int font_index = 0; font_index < font_size; ++font_index) { | |
| 626 int font_id = font_id_map_.CompactToSparse(font_index); | |
| 627 for (int c = 0; c < unicharset_size_; ++c) { | |
| 628 int samples_found = 0; | |
| 629 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c); | |
| 630 if (fcinfo.samples.empty() || (kTestChar >= 0 && c != kTestChar)) { | |
| 631 fcinfo.canonical_sample = -1; | |
| 632 fcinfo.canonical_dist = 0.0f; | |
| 633 if (debug) { | |
| 634 tprintf("Skipping class %d\n", c); | |
| 635 } | |
| 636 continue; | |
| 637 } | |
| 638 // The canonical sample will be the one with the min_max_dist, which | |
| 639 // is the sample with the lowest maximum distance to all other samples. | |
| 640 double min_max_dist = 2.0; | |
| 641 // We keep track of the farthest apart pair (max_s1, max_s2) which | |
| 642 // are max_max_dist apart, so we can see how bad the variability is. | |
| 643 double max_max_dist = 0.0; | |
| 644 int max_s1 = 0; | |
| 645 int max_s2 = 0; | |
| 646 fcinfo.canonical_sample = fcinfo.samples[0]; | |
| 647 fcinfo.canonical_dist = 0.0f; | |
| 648 for (auto s1 : fcinfo.samples) { | |
| 649 const std::vector<int> &features1 = samples_[s1]->indexed_features(); | |
| 650 f_table.Set(features1, features1.size(), true); | |
| 651 double max_dist = 0.0; | |
| 652 // Run the full squared-order search for similar samples. It is still | |
| 653 // reasonably fast because f_table.FeatureDistance is fast, but we | |
| 654 // may have to reconsider if we start playing with too many samples | |
| 655 // of a single char/font. | |
| 656 for (int s2 : fcinfo.samples) { | |
| 657 if (samples_[s2]->class_id() != c || samples_[s2]->font_id() != font_id || s2 == s1) { | |
| 658 continue; | |
| 659 } | |
| 660 std::vector<int> features2 = samples_[s2]->indexed_features(); | |
| 661 double dist = f_table.FeatureDistance(features2); | |
| 662 if (dist > max_dist) { | |
| 663 max_dist = dist; | |
| 664 if (dist > max_max_dist) { | |
| 665 max_max_dist = dist; | |
| 666 max_s1 = s1; | |
| 667 max_s2 = s2; | |
| 668 } | |
| 669 } | |
| 670 } | |
| 671 // Using Set(..., false) is far faster than re initializing, due to | |
| 672 // the sparseness of the feature space. | |
| 673 f_table.Set(features1, features1.size(), false); | |
| 674 samples_[s1]->set_max_dist(max_dist); | |
| 675 ++samples_found; | |
| 676 if (max_dist < min_max_dist) { | |
| 677 fcinfo.canonical_sample = s1; | |
| 678 fcinfo.canonical_dist = max_dist; | |
| 679 } | |
| 680 UpdateRange(max_dist, &min_max_dist, &max_max_dist); | |
| 681 } | |
| 682 if (max_max_dist > global_worst_dist) { | |
| 683 // Keep a record of the worst pair over all characters/fonts too. | |
| 684 global_worst_dist = max_max_dist; | |
| 685 worst_s1 = max_s1; | |
| 686 worst_s2 = max_s2; | |
| 687 } | |
| 688 if (debug) { | |
| 689 tprintf( | |
| 690 "Found %d samples of class %d=%s, font %d, " | |
| 691 "dist range [%g, %g], worst pair= %s, %s\n", | |
| 692 samples_found, c, unicharset_.debug_str(c).c_str(), font_index, min_max_dist, | |
| 693 max_max_dist, SampleToString(*samples_[max_s1]).c_str(), | |
| 694 SampleToString(*samples_[max_s2]).c_str()); | |
| 695 } | |
| 696 } | |
| 697 } | |
| 698 if (debug) { | |
| 699 tprintf("Global worst dist = %g, between sample %d and %d\n", global_worst_dist, worst_s1, | |
| 700 worst_s2); | |
| 701 } | |
| 702 } | |
| 703 | |
| 704 // Replicates the samples to a minimum frequency defined by | |
| 705 // 2 * kSampleRandomSize, or for larger counts duplicates all samples. | |
| 706 // After replication, the replicated samples are perturbed slightly, but | |
| 707 // in a predictable and repeatable way. | |
| 708 // Use after OrganizeByFontAndClass(). | |
| 709 void TrainingSampleSet::ReplicateAndRandomizeSamples() { | |
| 710 ASSERT_HOST(font_class_array_ != nullptr); | |
| 711 int font_size = font_id_map_.CompactSize(); | |
| 712 for (int font_index = 0; font_index < font_size; ++font_index) { | |
| 713 for (int c = 0; c < unicharset_size_; ++c) { | |
| 714 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c); | |
| 715 int sample_count = fcinfo.samples.size(); | |
| 716 int min_samples = 2 * std::max(kSampleRandomSize, sample_count); | |
| 717 if (sample_count > 0 && sample_count < min_samples) { | |
| 718 int base_count = sample_count; | |
| 719 for (int base_index = 0; sample_count < min_samples; ++sample_count) { | |
| 720 int src_index = fcinfo.samples[base_index++]; | |
| 721 if (base_index >= base_count) { | |
| 722 base_index = 0; | |
| 723 } | |
| 724 TrainingSample *sample = | |
| 725 samples_[src_index]->RandomizedCopy(sample_count % kSampleRandomSize); | |
| 726 int sample_index = samples_.size(); | |
| 727 sample->set_sample_index(sample_index); | |
| 728 samples_.push_back(sample); | |
| 729 fcinfo.samples.push_back(sample_index); | |
| 730 } | |
| 731 } | |
| 732 } | |
| 733 } | |
| 734 } | |
| 735 | |
| 736 // Caches the indexed features of the canonical samples. | |
| 737 // ComputeCanonicalSamples must have been already called. | |
| 738 // TODO(rays) see note on ReliablySeparable and try restricting the | |
| 739 // canonical features to those that truly represent all samples. | |
| 740 void TrainingSampleSet::ComputeCanonicalFeatures() { | |
| 741 ASSERT_HOST(font_class_array_ != nullptr); | |
| 742 const int font_size = font_id_map_.CompactSize(); | |
| 743 for (int font_index = 0; font_index < font_size; ++font_index) { | |
| 744 const int font_id = font_id_map_.CompactToSparse(font_index); | |
| 745 for (int c = 0; c < unicharset_size_; ++c) { | |
| 746 int num_samples = NumClassSamples(font_id, c, false); | |
| 747 if (num_samples == 0) { | |
| 748 continue; | |
| 749 } | |
| 750 const TrainingSample *sample = GetCanonicalSample(font_id, c); | |
| 751 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c); | |
| 752 fcinfo.canonical_features = sample->indexed_features(); | |
| 753 } | |
| 754 } | |
| 755 } | |
| 756 | |
| 757 // Computes the combined set of features used by all the samples of each | |
| 758 // font/class combination. Use after ReplicateAndRandomizeSamples. | |
| 759 void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) { | |
| 760 ASSERT_HOST(font_class_array_ != nullptr); | |
| 761 int font_size = font_id_map_.CompactSize(); | |
| 762 for (int font_index = 0; font_index < font_size; ++font_index) { | |
| 763 int font_id = font_id_map_.CompactToSparse(font_index); | |
| 764 for (int c = 0; c < unicharset_size_; ++c) { | |
| 765 int num_samples = NumClassSamples(font_id, c, false); | |
| 766 if (num_samples == 0) { | |
| 767 continue; | |
| 768 } | |
| 769 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c); | |
| 770 fcinfo.cloud_features.Init(feature_space_size); | |
| 771 for (int s = 0; s < num_samples; ++s) { | |
| 772 const TrainingSample *sample = GetSample(font_id, c, s); | |
| 773 const std::vector<int> &sample_features = sample->indexed_features(); | |
| 774 for (int sample_feature : sample_features) { | |
| 775 fcinfo.cloud_features.SetBit(sample_feature); | |
| 776 } | |
| 777 } | |
| 778 } | |
| 779 } | |
| 780 } | |
| 781 | |
| 782 // Adds all fonts of the given class to the shape. | |
| 783 void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape *shape) const { | |
| 784 for (int f = 0; f < font_id_map_.CompactSize(); ++f) { | |
| 785 const int font_id = font_id_map_.CompactToSparse(f); | |
| 786 shape->AddToShape(class_id, font_id); | |
| 787 } | |
| 788 } | |
| 789 | |
| 790 #ifndef GRAPHICS_DISABLED | |
| 791 | |
| 792 // Display the samples with the given indexed feature that also match | |
| 793 // the given shape. | |
| 794 void TrainingSampleSet::DisplaySamplesWithFeature(int f_index, const Shape &shape, | |
| 795 const IntFeatureSpace &space, | |
| 796 ScrollView::Color color, | |
| 797 ScrollView *window) const { | |
| 798 for (int s = 0; s < num_raw_samples(); ++s) { | |
| 799 const TrainingSample *sample = GetSample(s); | |
| 800 if (shape.ContainsUnichar(sample->class_id())) { | |
| 801 std::vector<int> indexed_features; | |
| 802 space.IndexAndSortFeatures(sample->features(), sample->num_features(), &indexed_features); | |
| 803 for (int indexed_feature : indexed_features) { | |
| 804 if (indexed_feature == f_index) { | |
| 805 sample->DisplayFeatures(color, window); | |
| 806 } | |
| 807 } | |
| 808 } | |
| 809 } | |
| 810 } | |
| 811 | |
| 812 #endif // !GRAPHICS_DISABLED | |
| 813 | |
| 814 } // namespace tesseract. |
