comparison mupdf-source/thirdparty/tesseract/src/training/common/trainingsampleset.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14 ///////////////////////////////////////////////////////////////////////
15
16 #ifdef HAVE_CONFIG_H
17 # include "config_auto.h"
18 #endif
19
20 #include <algorithm>
21
22 #include <allheaders.h>
23 #include "boxread.h"
24 #include "fontinfo.h"
25 //#include "helpers.h"
26 #include "indexmapbidi.h"
27 #include "intfeaturedist.h"
28 #include "intfeaturemap.h"
29 #include "intfeaturespace.h"
30 #include "shapetable.h"
31 #include "tesserrstream.h" // for tesserr
32 #include "trainingsample.h"
33 #include "trainingsampleset.h"
34 #include "unicity_table.h"
35
36 namespace tesseract {
37
38 const int kTestChar = -1; // 37;
39 // Max number of distances to compute the squared way
40 const int kSquareLimit = 25;
41 // Prime numbers for subsampling distances.
42 const int kPrime1 = 17;
43 const int kPrime2 = 13;
44
45 TrainingSampleSet::FontClassInfo::FontClassInfo()
46 : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {}
47
48 // Writes to the given file. Returns false in case of error.
49 bool TrainingSampleSet::FontClassInfo::Serialize(FILE *fp) const {
50 if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) {
51 return false;
52 }
53 if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) {
54 return false;
55 }
56 if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) {
57 return false;
58 }
59 if (!::tesseract::Serialize(fp, samples)) {
60 return false;
61 }
62 return true;
63 }
64 // Reads from the given file. Returns false in case of error.
65 // If swap is true, assumes a big/little-endian swap is needed.
66 bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE *fp) {
67 if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) {
68 return false;
69 }
70 if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) {
71 return false;
72 }
73 if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) {
74 return false;
75 }
76 if (!::tesseract::DeSerialize(swap, fp, samples)) {
77 return false;
78 }
79 if (swap) {
80 ReverseN(&num_raw_samples, sizeof(num_raw_samples));
81 ReverseN(&canonical_sample, sizeof(canonical_sample));
82 ReverseN(&canonical_dist, sizeof(canonical_dist));
83 }
84 return true;
85 }
86
87 TrainingSampleSet::TrainingSampleSet(const FontInfoTable &font_table)
88 : num_raw_samples_(0)
89 , unicharset_size_(0)
90 , font_class_array_(nullptr)
91 , fontinfo_table_(font_table) {}
92
93 TrainingSampleSet::~TrainingSampleSet() {
94 for (auto sample : samples_) {
95 delete sample;
96 }
97 delete font_class_array_;
98 }
99
100 // Writes to the given file. Returns false in case of error.
101 bool TrainingSampleSet::Serialize(FILE *fp) const {
102 if (!tesseract::Serialize(fp, samples_)) {
103 return false;
104 }
105 if (!unicharset_.save_to_file(fp)) {
106 return false;
107 }
108 if (!font_id_map_.Serialize(fp)) {
109 return false;
110 }
111 int8_t not_null = font_class_array_ != nullptr;
112 if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) {
113 return false;
114 }
115 if (not_null) {
116 if (!font_class_array_->SerializeClasses(fp)) {
117 return false;
118 }
119 }
120 return true;
121 }
122
123 // Reads from the given file. Returns false in case of error.
124 // If swap is true, assumes a big/little-endian swap is needed.
125 bool TrainingSampleSet::DeSerialize(bool swap, FILE *fp) {
126 if (!tesseract::DeSerialize(swap, fp, samples_)) {
127 return false;
128 }
129 num_raw_samples_ = samples_.size();
130 if (!unicharset_.load_from_file(fp)) {
131 return false;
132 }
133 if (!font_id_map_.DeSerialize(swap, fp)) {
134 return false;
135 }
136 delete font_class_array_;
137 font_class_array_ = nullptr;
138 int8_t not_null;
139 if (fread(&not_null, sizeof(not_null), 1, fp) != 1) {
140 return false;
141 }
142 if (not_null) {
143 FontClassInfo empty;
144 font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(1, 1, empty);
145 if (!font_class_array_->DeSerializeClasses(swap, fp)) {
146 return false;
147 }
148 }
149 unicharset_size_ = unicharset_.size();
150 return true;
151 }
152
153 // Load an initial unicharset, or set one up if the file cannot be read.
154 void TrainingSampleSet::LoadUnicharset(const char *filename) {
155 if (!unicharset_.load_from_file(filename)) {
156 tprintf(
157 "Failed to load unicharset from file %s\n"
158 "Building unicharset from scratch...\n",
159 filename);
160 unicharset_.clear();
161 // Add special characters as they were removed by the clear.
162 UNICHARSET empty;
163 unicharset_.AppendOtherUnicharset(empty);
164 }
165 unicharset_size_ = unicharset_.size();
166 }
167
168 // Adds a character sample to this sample set.
169 // If the unichar is not already in the local unicharset, it is added.
170 // Returns the unichar_id of the added sample, from the local unicharset.
171 int TrainingSampleSet::AddSample(const char *unichar, TrainingSample *sample) {
172 if (!unicharset_.contains_unichar(unichar)) {
173 unicharset_.unichar_insert(unichar);
174 if (unicharset_.size() > MAX_NUM_CLASSES) {
175 tprintf(
176 "Error: Size of unicharset in TrainingSampleSet::AddSample is "
177 "greater than MAX_NUM_CLASSES\n");
178 return -1;
179 }
180 }
181 UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
182 AddSample(char_id, sample);
183 return char_id;
184 }
185
186 // Adds a character sample to this sample set with the given unichar_id,
187 // which must correspond to the local unicharset (in this).
188 void TrainingSampleSet::AddSample(int unichar_id, TrainingSample *sample) {
189 sample->set_class_id(unichar_id);
190 samples_.push_back(sample);
191 num_raw_samples_ = samples_.size();
192 unicharset_size_ = unicharset_.size();
193 }
194
195 // Returns the number of samples for the given font,class pair.
196 // If randomize is true, returns the number of samples accessible
197 // with randomizing on. (Increases the number of samples if small.)
198 // OrganizeByFontAndClass must have been already called.
199 int TrainingSampleSet::NumClassSamples(int font_id, int class_id, bool randomize) const {
200 ASSERT_HOST(font_class_array_ != nullptr);
201 if (font_id < 0 || class_id < 0 || font_id >= font_id_map_.SparseSize() ||
202 class_id >= unicharset_size_) {
203 // There are no samples because the font or class doesn't exist.
204 return 0;
205 }
206 int font_index = font_id_map_.SparseToCompact(font_id);
207 if (font_index < 0) {
208 return 0; // The font has no samples.
209 }
210 if (randomize) {
211 return (*font_class_array_)(font_index, class_id).samples.size();
212 } else {
213 return (*font_class_array_)(font_index, class_id).num_raw_samples;
214 }
215 }
216
217 // Gets a sample by its index.
218 const TrainingSample *TrainingSampleSet::GetSample(int index) const {
219 return samples_[index];
220 }
221
222 // Gets a sample by its font, class, index.
223 // OrganizeByFontAndClass must have been already called.
224 const TrainingSample *TrainingSampleSet::GetSample(int font_id, int class_id, int index) const {
225 ASSERT_HOST(font_class_array_ != nullptr);
226 int font_index = font_id_map_.SparseToCompact(font_id);
227 if (font_index < 0) {
228 return nullptr;
229 }
230 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
231 return samples_[sample_index];
232 }
233
234 // Get a sample by its font, class, index. Does not randomize.
235 // OrganizeByFontAndClass must have been already called.
236 TrainingSample *TrainingSampleSet::MutableSample(int font_id, int class_id, int index) {
237 ASSERT_HOST(font_class_array_ != nullptr);
238 int font_index = font_id_map_.SparseToCompact(font_id);
239 if (font_index < 0) {
240 return nullptr;
241 }
242 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
243 return samples_[sample_index];
244 }
245
246 // Returns a string debug representation of the given sample:
247 // font, unichar_str, bounding box, page.
248 std::string TrainingSampleSet::SampleToString(const TrainingSample &sample) const {
249 std::string boxfile_str;
250 MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()), sample.bounding_box(),
251 sample.page_num(), boxfile_str);
252 return std::string(fontinfo_table_.at(sample.font_id()).name) + " " + boxfile_str;
253 }
254
255 // Gets the combined set of features used by all the samples of the given
256 // font/class combination.
257 const BitVector &TrainingSampleSet::GetCloudFeatures(int font_id, int class_id) const {
258 int font_index = font_id_map_.SparseToCompact(font_id);
259 ASSERT_HOST(font_index >= 0);
260 return (*font_class_array_)(font_index, class_id).cloud_features;
261 }
262 // Gets the indexed features of the canonical sample of the given
263 // font/class combination.
264 const std::vector<int> &TrainingSampleSet::GetCanonicalFeatures(int font_id, int class_id) const {
265 int font_index = font_id_map_.SparseToCompact(font_id);
266 ASSERT_HOST(font_index >= 0);
267 return (*font_class_array_)(font_index, class_id).canonical_features;
268 }
269
270 // Returns the distance between the given UniCharAndFonts pair.
271 // If matched_fonts, only matching fonts, are considered, unless that yields
272 // the empty set.
273 // OrganizeByFontAndClass must have been already called.
274 float TrainingSampleSet::UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2,
275 bool matched_fonts, const IntFeatureMap &feature_map) {
276 int num_fonts1 = uf1.font_ids.size();
277 int c1 = uf1.unichar_id;
278 int num_fonts2 = uf2.font_ids.size();
279 int c2 = uf2.unichar_id;
280 double dist_sum = 0.0;
281 int dist_count = 0;
282 const bool debug = false;
283 if (matched_fonts) {
284 // Compute distances only where fonts match.
285 for (int i = 0; i < num_fonts1; ++i) {
286 int f1 = uf1.font_ids[i];
287 for (int j = 0; j < num_fonts2; ++j) {
288 int f2 = uf2.font_ids[j];
289 if (f1 == f2) {
290 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
291 ++dist_count;
292 }
293 }
294 }
295 } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
296 // Small enough sets to compute all the distances.
297 for (int i = 0; i < num_fonts1; ++i) {
298 int f1 = uf1.font_ids[i];
299 for (int j = 0; j < num_fonts2; ++j) {
300 int f2 = uf2.font_ids[j];
301 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
302 if (debug) {
303 tprintf("Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2,
304 ClusterDistance(f1, c1, f2, c2, feature_map));
305 }
306 ++dist_count;
307 }
308 }
309 } else {
310 // Subsample distances, using the largest set once, and stepping through
311 // the smaller set so as to ensure that all the pairs are different.
312 int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
313 int index = 0;
314 int num_samples = std::max(num_fonts1, num_fonts2);
315 for (int i = 0; i < num_samples; ++i, index += increment) {
316 int f1 = uf1.font_ids[i % num_fonts1];
317 int f2 = uf2.font_ids[index % num_fonts2];
318 if (debug) {
319 tprintf("Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2,
320 ClusterDistance(f1, c1, f2, c2, feature_map));
321 }
322 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
323 ++dist_count;
324 }
325 }
326 if (dist_count == 0) {
327 if (matched_fonts) {
328 return UnicharDistance(uf1, uf2, false, feature_map);
329 }
330 return 0.0f;
331 }
332 return dist_sum / dist_count;
333 }
334
335 // Returns the distance between the given pair of font/class pairs.
336 // Finds in cache or computes and caches.
337 // OrganizeByFontAndClass must have been already called.
338 float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2,
339 const IntFeatureMap &feature_map) {
340 ASSERT_HOST(font_class_array_ != nullptr);
341 int font_index1 = font_id_map_.SparseToCompact(font_id1);
342 int font_index2 = font_id_map_.SparseToCompact(font_id2);
343 if (font_index1 < 0 || font_index2 < 0) {
344 return 0.0f;
345 }
346 FontClassInfo &fc_info = (*font_class_array_)(font_index1, class_id1);
347 if (font_id1 == font_id2) {
348 // Special case cache for speed.
349 if (fc_info.unichar_distance_cache.empty()) {
350 fc_info.unichar_distance_cache.resize(unicharset_size_, -1.0f);
351 }
352 if (fc_info.unichar_distance_cache[class_id2] < 0) {
353 // Distance has to be calculated.
354 float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);
355 fc_info.unichar_distance_cache[class_id2] = result;
356 // Copy to the symmetric cache entry.
357 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
358 if (fc_info2.unichar_distance_cache.empty()) {
359 fc_info2.unichar_distance_cache.resize(unicharset_size_, -1.0f);
360 }
361 fc_info2.unichar_distance_cache[class_id1] = result;
362 }
363 return fc_info.unichar_distance_cache[class_id2];
364 } else if (class_id1 == class_id2) {
365 // Another special-case cache for equal class-id.
366 if (fc_info.font_distance_cache.empty()) {
367 fc_info.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f);
368 }
369 if (fc_info.font_distance_cache[font_index2] < 0) {
370 // Distance has to be calculated.
371 float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);
372 fc_info.font_distance_cache[font_index2] = result;
373 // Copy to the symmetric cache entry.
374 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
375 if (fc_info2.font_distance_cache.empty()) {
376 fc_info2.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f);
377 }
378 fc_info2.font_distance_cache[font_index1] = result;
379 }
380 return fc_info.font_distance_cache[font_index2];
381 }
382 // Both font and class are different. Linear search for class_id2/font_id2
383 // in what is a hopefully short list of distances.
384 size_t cache_index = 0;
385 while (cache_index < fc_info.distance_cache.size() &&
386 (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
387 fc_info.distance_cache[cache_index].font_id != font_id2)) {
388 ++cache_index;
389 }
390 if (cache_index == fc_info.distance_cache.size()) {
391 // Distance has to be calculated.
392 float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);
393 FontClassDistance fc_dist = {class_id2, font_id2, result};
394 fc_info.distance_cache.push_back(fc_dist);
395 // Copy to the symmetric cache entry. We know it isn't there already, as
396 // we always copy to the symmetric entry.
397 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
398 fc_dist.unichar_id = class_id1;
399 fc_dist.font_id = font_id1;
400 fc_info2.distance_cache.push_back(fc_dist);
401 }
402 return fc_info.distance_cache[cache_index].distance;
403 }
404
405 // Computes the distance between the given pair of font/class pairs.
406 float TrainingSampleSet::ComputeClusterDistance(int font_id1, int class_id1, int font_id2,
407 int class_id2,
408 const IntFeatureMap &feature_map) const {
409 int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2, feature_map, false);
410 dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1, feature_map, false);
411 int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
412 denominator += GetCanonicalFeatures(font_id2, class_id2).size();
413 return static_cast<float>(dist) / denominator;
414 }
415
416 // Helper to add a feature and its near neighbors to the good_features.
417 // levels indicates how many times to compute the offset features of what is
418 // already there. This is done by iteration rather than recursion.
419 static void AddNearFeatures(const IntFeatureMap &feature_map, int f, int levels,
420 std::vector<int> *good_features) {
421 int prev_num_features = 0;
422 good_features->push_back(f);
423 int num_features = 1;
424 for (int level = 0; level < levels; ++level) {
425 for (int i = prev_num_features; i < num_features; ++i) {
426 int feature = (*good_features)[i];
427 for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
428 if (dir == 0) {
429 continue;
430 }
431 int f1 = feature_map.OffsetFeature(feature, dir);
432 if (f1 >= 0) {
433 good_features->push_back(f1);
434 }
435 }
436 }
437 prev_num_features = num_features;
438 num_features = good_features->size();
439 }
440 }
441
442 // Returns the number of canonical features of font/class 2 for which
443 // neither the feature nor any of its near neighbors occurs in the cloud
444 // of font/class 1. Each such feature is a reliable separation between
445 // the classes, ASSUMING that the canonical sample is sufficiently
446 // representative that every sample has a feature near that particular
447 // feature. To check that this is so on the fly would be prohibitively
448 // expensive, but it might be possible to pre-qualify the canonical features
449 // to include only those for which this assumption is true.
450 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
451 // first, or the results will be nonsense.
452 int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2,
453 const IntFeatureMap &feature_map, bool thorough) const {
454 int result = 0;
455 const TrainingSample *sample2 = GetCanonicalSample(font_id2, class_id2);
456 if (sample2 == nullptr) {
457 return 0; // There are no canonical features.
458 }
459 const std::vector<int> &canonical2 = GetCanonicalFeatures(font_id2, class_id2);
460 const BitVector &cloud1 = GetCloudFeatures(font_id1, class_id1);
461 if (cloud1.empty()) {
462 return canonical2.size(); // There are no cloud features.
463 }
464
465 // Find a canonical2 feature that is not in cloud1.
466 for (int feature : canonical2) {
467 if (cloud1[feature]) {
468 continue;
469 }
470 // Gather the near neighbours of f.
471 std::vector<int> good_features;
472 AddNearFeatures(feature_map, feature, 1, &good_features);
473 // Check that none of the good_features are in the cloud.
474 bool found = false;
475 for (auto good_f : good_features) {
476 if (cloud1[good_f]) {
477 found = true;
478 break;
479 }
480 }
481 if (found) {
482 continue; // Found one in the cloud.
483 }
484 ++result;
485 }
486 return result;
487 }
488
489 // Returns the total index of the requested sample.
490 // OrganizeByFontAndClass must have been already called.
491 int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id, int index) const {
492 ASSERT_HOST(font_class_array_ != nullptr);
493 int font_index = font_id_map_.SparseToCompact(font_id);
494 if (font_index < 0) {
495 return -1;
496 }
497 return (*font_class_array_)(font_index, class_id).samples[index];
498 }
499
500 // Gets the canonical sample for the given font, class pair.
501 // ComputeCanonicalSamples must have been called first.
502 const TrainingSample *TrainingSampleSet::GetCanonicalSample(int font_id, int class_id) const {
503 ASSERT_HOST(font_class_array_ != nullptr);
504 int font_index = font_id_map_.SparseToCompact(font_id);
505 if (font_index < 0) {
506 return nullptr;
507 }
508 const int sample_index = (*font_class_array_)(font_index, class_id).canonical_sample;
509 return sample_index >= 0 ? samples_[sample_index] : nullptr;
510 }
511
512 // Gets the max distance for the given canonical sample.
513 // ComputeCanonicalSamples must have been called first.
514 float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
515 ASSERT_HOST(font_class_array_ != nullptr);
516 int font_index = font_id_map_.SparseToCompact(font_id);
517 if (font_index < 0) {
518 return 0.0f;
519 }
520 if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0) {
521 return (*font_class_array_)(font_index, class_id).canonical_dist;
522 } else {
523 return 0.0f;
524 }
525 }
526
527 // Generates indexed features for all samples with the supplied feature_space.
528 void TrainingSampleSet::IndexFeatures(const IntFeatureSpace &feature_space) {
529 for (auto &sample : samples_) {
530 sample->IndexFeatures(feature_space);
531 }
532 }
533
534 // Marks the given sample index for deletion.
535 // Deletion is actually completed by DeleteDeadSamples.
536 void TrainingSampleSet::KillSample(TrainingSample *sample) {
537 sample->set_sample_index(-1);
538 }
539
540 // Deletes all samples with zero features marked by KillSample.
541 void TrainingSampleSet::DeleteDeadSamples() {
542 using namespace std::placeholders; // for _1
543 for (auto &&it = samples_.begin(); it < samples_.end();) {
544 if (*it == nullptr || (*it)->class_id() < 0) {
545 samples_.erase(it);
546 delete *it;
547 } else {
548 ++it;
549 }
550 }
551 num_raw_samples_ = samples_.size();
552 // Samples must be re-organized now we have deleted a few.
553 }
554
555 // Construct an array to access the samples by font,class pair.
556 void TrainingSampleSet::OrganizeByFontAndClass() {
557 // Font indexes are sparse, so we used a map to compact them, so we can
558 // have an efficient 2-d array of fonts and character classes.
559 SetupFontIdMap();
560 int compact_font_size = font_id_map_.CompactSize();
561 // Get a 2-d array of generic vectors.
562 delete font_class_array_;
563 FontClassInfo empty;
564 font_class_array_ =
565 new GENERIC_2D_ARRAY<FontClassInfo>(compact_font_size, unicharset_size_, empty);
566 for (size_t s = 0; s < samples_.size(); ++s) {
567 int font_id = samples_[s]->font_id();
568 int class_id = samples_[s]->class_id();
569 if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
570 tesserr << "Font id = " << font_id << '/' << font_id_map_.SparseSize()
571 << ", class id = " << class_id << '/' << unicharset_size_
572 << " on sample " << s << '\n';
573 }
574 ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
575 ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
576 int font_index = font_id_map_.SparseToCompact(font_id);
577 (*font_class_array_)(font_index, class_id).samples.push_back(s);
578 }
579 // Set the num_raw_samples member of the FontClassInfo, to set the boundary
580 // between the raw samples and the replicated ones.
581 for (int f = 0; f < compact_font_size; ++f) {
582 for (int c = 0; c < unicharset_size_; ++c) {
583 (*font_class_array_)(f, c).num_raw_samples = (*font_class_array_)(f, c).samples.size();
584 }
585 }
586 // This is the global number of samples and also marks the boundary between
587 // real and replicated samples.
588 num_raw_samples_ = samples_.size();
589 }
590
591 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
592 // index for the font_class_array_.
593 void TrainingSampleSet::SetupFontIdMap() {
594 // Number of samples for each font_id.
595 std::vector<int> font_counts;
596 for (auto &sample : samples_) {
597 const int font_id = sample->font_id();
598 while (font_id >= font_counts.size()) {
599 font_counts.push_back(0);
600 }
601 ++font_counts[font_id];
602 }
603 font_id_map_.Init(font_counts.size(), false);
604 for (size_t f = 0; f < font_counts.size(); ++f) {
605 font_id_map_.SetMap(f, font_counts[f] > 0);
606 }
607 font_id_map_.Setup();
608 }
609
610 // Finds the sample for each font, class pair that has least maximum
611 // distance to all the other samples of the same font, class.
612 // OrganizeByFontAndClass must have been already called.
613 void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap &map, bool debug) {
614 ASSERT_HOST(font_class_array_ != nullptr);
615 IntFeatureDist f_table;
616 if (debug) {
617 tprintf("feature table size %d\n", map.sparse_size());
618 }
619 f_table.Init(&map);
620 int worst_s1 = 0;
621 int worst_s2 = 0;
622 double global_worst_dist = 0.0;
623 // Compute distances independently for each font and char index.
624 int font_size = font_id_map_.CompactSize();
625 for (int font_index = 0; font_index < font_size; ++font_index) {
626 int font_id = font_id_map_.CompactToSparse(font_index);
627 for (int c = 0; c < unicharset_size_; ++c) {
628 int samples_found = 0;
629 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
630 if (fcinfo.samples.empty() || (kTestChar >= 0 && c != kTestChar)) {
631 fcinfo.canonical_sample = -1;
632 fcinfo.canonical_dist = 0.0f;
633 if (debug) {
634 tprintf("Skipping class %d\n", c);
635 }
636 continue;
637 }
638 // The canonical sample will be the one with the min_max_dist, which
639 // is the sample with the lowest maximum distance to all other samples.
640 double min_max_dist = 2.0;
641 // We keep track of the farthest apart pair (max_s1, max_s2) which
642 // are max_max_dist apart, so we can see how bad the variability is.
643 double max_max_dist = 0.0;
644 int max_s1 = 0;
645 int max_s2 = 0;
646 fcinfo.canonical_sample = fcinfo.samples[0];
647 fcinfo.canonical_dist = 0.0f;
648 for (auto s1 : fcinfo.samples) {
649 const std::vector<int> &features1 = samples_[s1]->indexed_features();
650 f_table.Set(features1, features1.size(), true);
651 double max_dist = 0.0;
652 // Run the full squared-order search for similar samples. It is still
653 // reasonably fast because f_table.FeatureDistance is fast, but we
654 // may have to reconsider if we start playing with too many samples
655 // of a single char/font.
656 for (int s2 : fcinfo.samples) {
657 if (samples_[s2]->class_id() != c || samples_[s2]->font_id() != font_id || s2 == s1) {
658 continue;
659 }
660 std::vector<int> features2 = samples_[s2]->indexed_features();
661 double dist = f_table.FeatureDistance(features2);
662 if (dist > max_dist) {
663 max_dist = dist;
664 if (dist > max_max_dist) {
665 max_max_dist = dist;
666 max_s1 = s1;
667 max_s2 = s2;
668 }
669 }
670 }
671 // Using Set(..., false) is far faster than re initializing, due to
672 // the sparseness of the feature space.
673 f_table.Set(features1, features1.size(), false);
674 samples_[s1]->set_max_dist(max_dist);
675 ++samples_found;
676 if (max_dist < min_max_dist) {
677 fcinfo.canonical_sample = s1;
678 fcinfo.canonical_dist = max_dist;
679 }
680 UpdateRange(max_dist, &min_max_dist, &max_max_dist);
681 }
682 if (max_max_dist > global_worst_dist) {
683 // Keep a record of the worst pair over all characters/fonts too.
684 global_worst_dist = max_max_dist;
685 worst_s1 = max_s1;
686 worst_s2 = max_s2;
687 }
688 if (debug) {
689 tprintf(
690 "Found %d samples of class %d=%s, font %d, "
691 "dist range [%g, %g], worst pair= %s, %s\n",
692 samples_found, c, unicharset_.debug_str(c).c_str(), font_index, min_max_dist,
693 max_max_dist, SampleToString(*samples_[max_s1]).c_str(),
694 SampleToString(*samples_[max_s2]).c_str());
695 }
696 }
697 }
698 if (debug) {
699 tprintf("Global worst dist = %g, between sample %d and %d\n", global_worst_dist, worst_s1,
700 worst_s2);
701 }
702 }
703
704 // Replicates the samples to a minimum frequency defined by
705 // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
706 // After replication, the replicated samples are perturbed slightly, but
707 // in a predictable and repeatable way.
708 // Use after OrganizeByFontAndClass().
709 void TrainingSampleSet::ReplicateAndRandomizeSamples() {
710 ASSERT_HOST(font_class_array_ != nullptr);
711 int font_size = font_id_map_.CompactSize();
712 for (int font_index = 0; font_index < font_size; ++font_index) {
713 for (int c = 0; c < unicharset_size_; ++c) {
714 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
715 int sample_count = fcinfo.samples.size();
716 int min_samples = 2 * std::max(kSampleRandomSize, sample_count);
717 if (sample_count > 0 && sample_count < min_samples) {
718 int base_count = sample_count;
719 for (int base_index = 0; sample_count < min_samples; ++sample_count) {
720 int src_index = fcinfo.samples[base_index++];
721 if (base_index >= base_count) {
722 base_index = 0;
723 }
724 TrainingSample *sample =
725 samples_[src_index]->RandomizedCopy(sample_count % kSampleRandomSize);
726 int sample_index = samples_.size();
727 sample->set_sample_index(sample_index);
728 samples_.push_back(sample);
729 fcinfo.samples.push_back(sample_index);
730 }
731 }
732 }
733 }
734 }
735
736 // Caches the indexed features of the canonical samples.
737 // ComputeCanonicalSamples must have been already called.
738 // TODO(rays) see note on ReliablySeparable and try restricting the
739 // canonical features to those that truly represent all samples.
740 void TrainingSampleSet::ComputeCanonicalFeatures() {
741 ASSERT_HOST(font_class_array_ != nullptr);
742 const int font_size = font_id_map_.CompactSize();
743 for (int font_index = 0; font_index < font_size; ++font_index) {
744 const int font_id = font_id_map_.CompactToSparse(font_index);
745 for (int c = 0; c < unicharset_size_; ++c) {
746 int num_samples = NumClassSamples(font_id, c, false);
747 if (num_samples == 0) {
748 continue;
749 }
750 const TrainingSample *sample = GetCanonicalSample(font_id, c);
751 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
752 fcinfo.canonical_features = sample->indexed_features();
753 }
754 }
755 }
756
757 // Computes the combined set of features used by all the samples of each
758 // font/class combination. Use after ReplicateAndRandomizeSamples.
759 void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
760 ASSERT_HOST(font_class_array_ != nullptr);
761 int font_size = font_id_map_.CompactSize();
762 for (int font_index = 0; font_index < font_size; ++font_index) {
763 int font_id = font_id_map_.CompactToSparse(font_index);
764 for (int c = 0; c < unicharset_size_; ++c) {
765 int num_samples = NumClassSamples(font_id, c, false);
766 if (num_samples == 0) {
767 continue;
768 }
769 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
770 fcinfo.cloud_features.Init(feature_space_size);
771 for (int s = 0; s < num_samples; ++s) {
772 const TrainingSample *sample = GetSample(font_id, c, s);
773 const std::vector<int> &sample_features = sample->indexed_features();
774 for (int sample_feature : sample_features) {
775 fcinfo.cloud_features.SetBit(sample_feature);
776 }
777 }
778 }
779 }
780 }
781
782 // Adds all fonts of the given class to the shape.
783 void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape *shape) const {
784 for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
785 const int font_id = font_id_map_.CompactToSparse(f);
786 shape->AddToShape(class_id, font_id);
787 }
788 }
789
790 #ifndef GRAPHICS_DISABLED
791
792 // Display the samples with the given indexed feature that also match
793 // the given shape.
794 void TrainingSampleSet::DisplaySamplesWithFeature(int f_index, const Shape &shape,
795 const IntFeatureSpace &space,
796 ScrollView::Color color,
797 ScrollView *window) const {
798 for (int s = 0; s < num_raw_samples(); ++s) {
799 const TrainingSample *sample = GetSample(s);
800 if (shape.ContainsUnichar(sample->class_id())) {
801 std::vector<int> indexed_features;
802 space.IndexAndSortFeatures(sample->features(), sample->num_features(), &indexed_features);
803 for (int indexed_feature : indexed_features) {
804 if (indexed_feature == f_index) {
805 sample->DisplayFeatures(color, window);
806 }
807 }
808 }
809 }
810 }
811
812 #endif // !GRAPHICS_DISABLED
813
814 } // namespace tesseract.