Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/common/trainingsampleset.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/common/trainingsampleset.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,814 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include <algorithm> + +#include <allheaders.h> +#include "boxread.h" +#include "fontinfo.h" +//#include "helpers.h" +#include "indexmapbidi.h" +#include "intfeaturedist.h" +#include "intfeaturemap.h" +#include "intfeaturespace.h" +#include "shapetable.h" +#include "tesserrstream.h" // for tesserr +#include "trainingsample.h" +#include "trainingsampleset.h" +#include "unicity_table.h" + +namespace tesseract { + +const int kTestChar = -1; // 37; +// Max number of distances to compute the squared way +const int kSquareLimit = 25; +// Prime numbers for subsampling distances. +const int kPrime1 = 17; +const int kPrime2 = 13; + +TrainingSampleSet::FontClassInfo::FontClassInfo() + : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {} + +// Writes to the given file. Returns false in case of error. +bool TrainingSampleSet::FontClassInfo::Serialize(FILE *fp) const { + if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) { + return false; + } + if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) { + return false; + } + if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) { + return false; + } + if (!::tesseract::Serialize(fp, samples)) { + return false; + } + return true; +} +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE *fp) { + if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) { + return false; + } + if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) { + return false; + } + if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) { + return false; + } + if (!::tesseract::DeSerialize(swap, fp, samples)) { + return false; + } + if (swap) { + ReverseN(&num_raw_samples, sizeof(num_raw_samples)); + ReverseN(&canonical_sample, sizeof(canonical_sample)); + ReverseN(&canonical_dist, sizeof(canonical_dist)); + } + return true; +} + +TrainingSampleSet::TrainingSampleSet(const FontInfoTable &font_table) + : num_raw_samples_(0) + , unicharset_size_(0) + , font_class_array_(nullptr) + , fontinfo_table_(font_table) {} + +TrainingSampleSet::~TrainingSampleSet() { + for (auto sample : samples_) { + delete sample; + } + delete font_class_array_; +} + +// Writes to the given file. Returns false in case of error. +bool TrainingSampleSet::Serialize(FILE *fp) const { + if (!tesseract::Serialize(fp, samples_)) { + return false; + } + if (!unicharset_.save_to_file(fp)) { + return false; + } + if (!font_id_map_.Serialize(fp)) { + return false; + } + int8_t not_null = font_class_array_ != nullptr; + if (fwrite(¬_null, sizeof(not_null), 1, fp) != 1) { + return false; + } + if (not_null) { + if (!font_class_array_->SerializeClasses(fp)) { + return false; + } + } + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool TrainingSampleSet::DeSerialize(bool swap, FILE *fp) { + if (!tesseract::DeSerialize(swap, fp, samples_)) { + return false; + } + num_raw_samples_ = samples_.size(); + if (!unicharset_.load_from_file(fp)) { + return false; + } + if (!font_id_map_.DeSerialize(swap, fp)) { + return false; + } + delete font_class_array_; + font_class_array_ = nullptr; + int8_t not_null; + if (fread(¬_null, sizeof(not_null), 1, fp) != 1) { + return false; + } + if (not_null) { + FontClassInfo empty; + font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(1, 1, empty); + if (!font_class_array_->DeSerializeClasses(swap, fp)) { + return false; + } + } + unicharset_size_ = unicharset_.size(); + return true; +} + +// Load an initial unicharset, or set one up if the file cannot be read. +void TrainingSampleSet::LoadUnicharset(const char *filename) { + if (!unicharset_.load_from_file(filename)) { + tprintf( + "Failed to load unicharset from file %s\n" + "Building unicharset from scratch...\n", + filename); + unicharset_.clear(); + // Add special characters as they were removed by the clear. + UNICHARSET empty; + unicharset_.AppendOtherUnicharset(empty); + } + unicharset_size_ = unicharset_.size(); +} + +// Adds a character sample to this sample set. +// If the unichar is not already in the local unicharset, it is added. +// Returns the unichar_id of the added sample, from the local unicharset. +int TrainingSampleSet::AddSample(const char *unichar, TrainingSample *sample) { + if (!unicharset_.contains_unichar(unichar)) { + unicharset_.unichar_insert(unichar); + if (unicharset_.size() > MAX_NUM_CLASSES) { + tprintf( + "Error: Size of unicharset in TrainingSampleSet::AddSample is " + "greater than MAX_NUM_CLASSES\n"); + return -1; + } + } + UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar); + AddSample(char_id, sample); + return char_id; +} + +// Adds a character sample to this sample set with the given unichar_id, +// which must correspond to the local unicharset (in this). +void TrainingSampleSet::AddSample(int unichar_id, TrainingSample *sample) { + sample->set_class_id(unichar_id); + samples_.push_back(sample); + num_raw_samples_ = samples_.size(); + unicharset_size_ = unicharset_.size(); +} + +// Returns the number of samples for the given font,class pair. +// If randomize is true, returns the number of samples accessible +// with randomizing on. (Increases the number of samples if small.) +// OrganizeByFontAndClass must have been already called. +int TrainingSampleSet::NumClassSamples(int font_id, int class_id, bool randomize) const { + ASSERT_HOST(font_class_array_ != nullptr); + if (font_id < 0 || class_id < 0 || font_id >= font_id_map_.SparseSize() || + class_id >= unicharset_size_) { + // There are no samples because the font or class doesn't exist. + return 0; + } + int font_index = font_id_map_.SparseToCompact(font_id); + if (font_index < 0) { + return 0; // The font has no samples. + } + if (randomize) { + return (*font_class_array_)(font_index, class_id).samples.size(); + } else { + return (*font_class_array_)(font_index, class_id).num_raw_samples; + } +} + +// Gets a sample by its index. +const TrainingSample *TrainingSampleSet::GetSample(int index) const { + return samples_[index]; +} + +// Gets a sample by its font, class, index. +// OrganizeByFontAndClass must have been already called. +const TrainingSample *TrainingSampleSet::GetSample(int font_id, int class_id, int index) const { + ASSERT_HOST(font_class_array_ != nullptr); + int font_index = font_id_map_.SparseToCompact(font_id); + if (font_index < 0) { + return nullptr; + } + int sample_index = (*font_class_array_)(font_index, class_id).samples[index]; + return samples_[sample_index]; +} + +// Get a sample by its font, class, index. Does not randomize. +// OrganizeByFontAndClass must have been already called. +TrainingSample *TrainingSampleSet::MutableSample(int font_id, int class_id, int index) { + ASSERT_HOST(font_class_array_ != nullptr); + int font_index = font_id_map_.SparseToCompact(font_id); + if (font_index < 0) { + return nullptr; + } + int sample_index = (*font_class_array_)(font_index, class_id).samples[index]; + return samples_[sample_index]; +} + +// Returns a string debug representation of the given sample: +// font, unichar_str, bounding box, page. +std::string TrainingSampleSet::SampleToString(const TrainingSample &sample) const { + std::string boxfile_str; + MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()), sample.bounding_box(), + sample.page_num(), boxfile_str); + return std::string(fontinfo_table_.at(sample.font_id()).name) + " " + boxfile_str; +} + +// Gets the combined set of features used by all the samples of the given +// font/class combination. +const BitVector &TrainingSampleSet::GetCloudFeatures(int font_id, int class_id) const { + int font_index = font_id_map_.SparseToCompact(font_id); + ASSERT_HOST(font_index >= 0); + return (*font_class_array_)(font_index, class_id).cloud_features; +} +// Gets the indexed features of the canonical sample of the given +// font/class combination. +const std::vector<int> &TrainingSampleSet::GetCanonicalFeatures(int font_id, int class_id) const { + int font_index = font_id_map_.SparseToCompact(font_id); + ASSERT_HOST(font_index >= 0); + return (*font_class_array_)(font_index, class_id).canonical_features; +} + +// Returns the distance between the given UniCharAndFonts pair. +// If matched_fonts, only matching fonts, are considered, unless that yields +// the empty set. +// OrganizeByFontAndClass must have been already called. +float TrainingSampleSet::UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, + bool matched_fonts, const IntFeatureMap &feature_map) { + int num_fonts1 = uf1.font_ids.size(); + int c1 = uf1.unichar_id; + int num_fonts2 = uf2.font_ids.size(); + int c2 = uf2.unichar_id; + double dist_sum = 0.0; + int dist_count = 0; + const bool debug = false; + if (matched_fonts) { + // Compute distances only where fonts match. + for (int i = 0; i < num_fonts1; ++i) { + int f1 = uf1.font_ids[i]; + for (int j = 0; j < num_fonts2; ++j) { + int f2 = uf2.font_ids[j]; + if (f1 == f2) { + dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); + ++dist_count; + } + } + } + } else if (num_fonts1 * num_fonts2 <= kSquareLimit) { + // Small enough sets to compute all the distances. + for (int i = 0; i < num_fonts1; ++i) { + int f1 = uf1.font_ids[i]; + for (int j = 0; j < num_fonts2; ++j) { + int f2 = uf2.font_ids[j]; + dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); + if (debug) { + tprintf("Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2, + ClusterDistance(f1, c1, f2, c2, feature_map)); + } + ++dist_count; + } + } + } else { + // Subsample distances, using the largest set once, and stepping through + // the smaller set so as to ensure that all the pairs are different. + int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2; + int index = 0; + int num_samples = std::max(num_fonts1, num_fonts2); + for (int i = 0; i < num_samples; ++i, index += increment) { + int f1 = uf1.font_ids[i % num_fonts1]; + int f2 = uf2.font_ids[index % num_fonts2]; + if (debug) { + tprintf("Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2, + ClusterDistance(f1, c1, f2, c2, feature_map)); + } + dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); + ++dist_count; + } + } + if (dist_count == 0) { + if (matched_fonts) { + return UnicharDistance(uf1, uf2, false, feature_map); + } + return 0.0f; + } + return dist_sum / dist_count; +} + +// Returns the distance between the given pair of font/class pairs. +// Finds in cache or computes and caches. +// OrganizeByFontAndClass must have been already called. +float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, + const IntFeatureMap &feature_map) { + ASSERT_HOST(font_class_array_ != nullptr); + int font_index1 = font_id_map_.SparseToCompact(font_id1); + int font_index2 = font_id_map_.SparseToCompact(font_id2); + if (font_index1 < 0 || font_index2 < 0) { + return 0.0f; + } + FontClassInfo &fc_info = (*font_class_array_)(font_index1, class_id1); + if (font_id1 == font_id2) { + // Special case cache for speed. + if (fc_info.unichar_distance_cache.empty()) { + fc_info.unichar_distance_cache.resize(unicharset_size_, -1.0f); + } + if (fc_info.unichar_distance_cache[class_id2] < 0) { + // Distance has to be calculated. + float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map); + fc_info.unichar_distance_cache[class_id2] = result; + // Copy to the symmetric cache entry. + FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2); + if (fc_info2.unichar_distance_cache.empty()) { + fc_info2.unichar_distance_cache.resize(unicharset_size_, -1.0f); + } + fc_info2.unichar_distance_cache[class_id1] = result; + } + return fc_info.unichar_distance_cache[class_id2]; + } else if (class_id1 == class_id2) { + // Another special-case cache for equal class-id. + if (fc_info.font_distance_cache.empty()) { + fc_info.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f); + } + if (fc_info.font_distance_cache[font_index2] < 0) { + // Distance has to be calculated. + float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map); + fc_info.font_distance_cache[font_index2] = result; + // Copy to the symmetric cache entry. + FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2); + if (fc_info2.font_distance_cache.empty()) { + fc_info2.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f); + } + fc_info2.font_distance_cache[font_index1] = result; + } + return fc_info.font_distance_cache[font_index2]; + } + // Both font and class are different. Linear search for class_id2/font_id2 + // in what is a hopefully short list of distances. + size_t cache_index = 0; + while (cache_index < fc_info.distance_cache.size() && + (fc_info.distance_cache[cache_index].unichar_id != class_id2 || + fc_info.distance_cache[cache_index].font_id != font_id2)) { + ++cache_index; + } + if (cache_index == fc_info.distance_cache.size()) { + // Distance has to be calculated. + float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map); + FontClassDistance fc_dist = {class_id2, font_id2, result}; + fc_info.distance_cache.push_back(fc_dist); + // Copy to the symmetric cache entry. We know it isn't there already, as + // we always copy to the symmetric entry. + FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2); + fc_dist.unichar_id = class_id1; + fc_dist.font_id = font_id1; + fc_info2.distance_cache.push_back(fc_dist); + } + return fc_info.distance_cache[cache_index].distance; +} + +// Computes the distance between the given pair of font/class pairs. +float TrainingSampleSet::ComputeClusterDistance(int font_id1, int class_id1, int font_id2, + int class_id2, + const IntFeatureMap &feature_map) const { + int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2, feature_map, false); + dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1, feature_map, false); + int denominator = GetCanonicalFeatures(font_id1, class_id1).size(); + denominator += GetCanonicalFeatures(font_id2, class_id2).size(); + return static_cast<float>(dist) / denominator; +} + +// Helper to add a feature and its near neighbors to the good_features. +// levels indicates how many times to compute the offset features of what is +// already there. This is done by iteration rather than recursion. +static void AddNearFeatures(const IntFeatureMap &feature_map, int f, int levels, + std::vector<int> *good_features) { + int prev_num_features = 0; + good_features->push_back(f); + int num_features = 1; + for (int level = 0; level < levels; ++level) { + for (int i = prev_num_features; i < num_features; ++i) { + int feature = (*good_features)[i]; + for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) { + if (dir == 0) { + continue; + } + int f1 = feature_map.OffsetFeature(feature, dir); + if (f1 >= 0) { + good_features->push_back(f1); + } + } + } + prev_num_features = num_features; + num_features = good_features->size(); + } +} + +// Returns the number of canonical features of font/class 2 for which +// neither the feature nor any of its near neighbors occurs in the cloud +// of font/class 1. Each such feature is a reliable separation between +// the classes, ASSUMING that the canonical sample is sufficiently +// representative that every sample has a feature near that particular +// feature. To check that this is so on the fly would be prohibitively +// expensive, but it might be possible to pre-qualify the canonical features +// to include only those for which this assumption is true. +// ComputeCanonicalFeatures and ComputeCloudFeatures must have been called +// first, or the results will be nonsense. +int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, + const IntFeatureMap &feature_map, bool thorough) const { + int result = 0; + const TrainingSample *sample2 = GetCanonicalSample(font_id2, class_id2); + if (sample2 == nullptr) { + return 0; // There are no canonical features. + } + const std::vector<int> &canonical2 = GetCanonicalFeatures(font_id2, class_id2); + const BitVector &cloud1 = GetCloudFeatures(font_id1, class_id1); + if (cloud1.empty()) { + return canonical2.size(); // There are no cloud features. + } + + // Find a canonical2 feature that is not in cloud1. + for (int feature : canonical2) { + if (cloud1[feature]) { + continue; + } + // Gather the near neighbours of f. + std::vector<int> good_features; + AddNearFeatures(feature_map, feature, 1, &good_features); + // Check that none of the good_features are in the cloud. + bool found = false; + for (auto good_f : good_features) { + if (cloud1[good_f]) { + found = true; + break; + } + } + if (found) { + continue; // Found one in the cloud. + } + ++result; + } + return result; +} + +// Returns the total index of the requested sample. +// OrganizeByFontAndClass must have been already called. +int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id, int index) const { + ASSERT_HOST(font_class_array_ != nullptr); + int font_index = font_id_map_.SparseToCompact(font_id); + if (font_index < 0) { + return -1; + } + return (*font_class_array_)(font_index, class_id).samples[index]; +} + +// Gets the canonical sample for the given font, class pair. +// ComputeCanonicalSamples must have been called first. +const TrainingSample *TrainingSampleSet::GetCanonicalSample(int font_id, int class_id) const { + ASSERT_HOST(font_class_array_ != nullptr); + int font_index = font_id_map_.SparseToCompact(font_id); + if (font_index < 0) { + return nullptr; + } + const int sample_index = (*font_class_array_)(font_index, class_id).canonical_sample; + return sample_index >= 0 ? samples_[sample_index] : nullptr; +} + +// Gets the max distance for the given canonical sample. +// ComputeCanonicalSamples must have been called first. +float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const { + ASSERT_HOST(font_class_array_ != nullptr); + int font_index = font_id_map_.SparseToCompact(font_id); + if (font_index < 0) { + return 0.0f; + } + if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0) { + return (*font_class_array_)(font_index, class_id).canonical_dist; + } else { + return 0.0f; + } +} + +// Generates indexed features for all samples with the supplied feature_space. +void TrainingSampleSet::IndexFeatures(const IntFeatureSpace &feature_space) { + for (auto &sample : samples_) { + sample->IndexFeatures(feature_space); + } +} + +// Marks the given sample index for deletion. +// Deletion is actually completed by DeleteDeadSamples. +void TrainingSampleSet::KillSample(TrainingSample *sample) { + sample->set_sample_index(-1); +} + +// Deletes all samples with zero features marked by KillSample. +void TrainingSampleSet::DeleteDeadSamples() { + using namespace std::placeholders; // for _1 + for (auto &&it = samples_.begin(); it < samples_.end();) { + if (*it == nullptr || (*it)->class_id() < 0) { + samples_.erase(it); + delete *it; + } else { + ++it; + } + } + num_raw_samples_ = samples_.size(); + // Samples must be re-organized now we have deleted a few. +} + +// Construct an array to access the samples by font,class pair. +void TrainingSampleSet::OrganizeByFontAndClass() { + // Font indexes are sparse, so we used a map to compact them, so we can + // have an efficient 2-d array of fonts and character classes. + SetupFontIdMap(); + int compact_font_size = font_id_map_.CompactSize(); + // Get a 2-d array of generic vectors. + delete font_class_array_; + FontClassInfo empty; + font_class_array_ = + new GENERIC_2D_ARRAY<FontClassInfo>(compact_font_size, unicharset_size_, empty); + for (size_t s = 0; s < samples_.size(); ++s) { + int font_id = samples_[s]->font_id(); + int class_id = samples_[s]->class_id(); + if (font_id < 0 || font_id >= font_id_map_.SparseSize()) { + tesserr << "Font id = " << font_id << '/' << font_id_map_.SparseSize() + << ", class id = " << class_id << '/' << unicharset_size_ + << " on sample " << s << '\n'; + } + ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize()); + ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_); + int font_index = font_id_map_.SparseToCompact(font_id); + (*font_class_array_)(font_index, class_id).samples.push_back(s); + } + // Set the num_raw_samples member of the FontClassInfo, to set the boundary + // between the raw samples and the replicated ones. + for (int f = 0; f < compact_font_size; ++f) { + for (int c = 0; c < unicharset_size_; ++c) { + (*font_class_array_)(f, c).num_raw_samples = (*font_class_array_)(f, c).samples.size(); + } + } + // This is the global number of samples and also marks the boundary between + // real and replicated samples. + num_raw_samples_ = samples_.size(); +} + +// Constructs the font_id_map_ which maps real font_ids (sparse) to a compact +// index for the font_class_array_. +void TrainingSampleSet::SetupFontIdMap() { + // Number of samples for each font_id. + std::vector<int> font_counts; + for (auto &sample : samples_) { + const int font_id = sample->font_id(); + while (font_id >= font_counts.size()) { + font_counts.push_back(0); + } + ++font_counts[font_id]; + } + font_id_map_.Init(font_counts.size(), false); + for (size_t f = 0; f < font_counts.size(); ++f) { + font_id_map_.SetMap(f, font_counts[f] > 0); + } + font_id_map_.Setup(); +} + +// Finds the sample for each font, class pair that has least maximum +// distance to all the other samples of the same font, class. +// OrganizeByFontAndClass must have been already called. +void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap &map, bool debug) { + ASSERT_HOST(font_class_array_ != nullptr); + IntFeatureDist f_table; + if (debug) { + tprintf("feature table size %d\n", map.sparse_size()); + } + f_table.Init(&map); + int worst_s1 = 0; + int worst_s2 = 0; + double global_worst_dist = 0.0; + // Compute distances independently for each font and char index. + int font_size = font_id_map_.CompactSize(); + for (int font_index = 0; font_index < font_size; ++font_index) { + int font_id = font_id_map_.CompactToSparse(font_index); + for (int c = 0; c < unicharset_size_; ++c) { + int samples_found = 0; + FontClassInfo &fcinfo = (*font_class_array_)(font_index, c); + if (fcinfo.samples.empty() || (kTestChar >= 0 && c != kTestChar)) { + fcinfo.canonical_sample = -1; + fcinfo.canonical_dist = 0.0f; + if (debug) { + tprintf("Skipping class %d\n", c); + } + continue; + } + // The canonical sample will be the one with the min_max_dist, which + // is the sample with the lowest maximum distance to all other samples. + double min_max_dist = 2.0; + // We keep track of the farthest apart pair (max_s1, max_s2) which + // are max_max_dist apart, so we can see how bad the variability is. + double max_max_dist = 0.0; + int max_s1 = 0; + int max_s2 = 0; + fcinfo.canonical_sample = fcinfo.samples[0]; + fcinfo.canonical_dist = 0.0f; + for (auto s1 : fcinfo.samples) { + const std::vector<int> &features1 = samples_[s1]->indexed_features(); + f_table.Set(features1, features1.size(), true); + double max_dist = 0.0; + // Run the full squared-order search for similar samples. It is still + // reasonably fast because f_table.FeatureDistance is fast, but we + // may have to reconsider if we start playing with too many samples + // of a single char/font. + for (int s2 : fcinfo.samples) { + if (samples_[s2]->class_id() != c || samples_[s2]->font_id() != font_id || s2 == s1) { + continue; + } + std::vector<int> features2 = samples_[s2]->indexed_features(); + double dist = f_table.FeatureDistance(features2); + if (dist > max_dist) { + max_dist = dist; + if (dist > max_max_dist) { + max_max_dist = dist; + max_s1 = s1; + max_s2 = s2; + } + } + } + // Using Set(..., false) is far faster than re initializing, due to + // the sparseness of the feature space. + f_table.Set(features1, features1.size(), false); + samples_[s1]->set_max_dist(max_dist); + ++samples_found; + if (max_dist < min_max_dist) { + fcinfo.canonical_sample = s1; + fcinfo.canonical_dist = max_dist; + } + UpdateRange(max_dist, &min_max_dist, &max_max_dist); + } + if (max_max_dist > global_worst_dist) { + // Keep a record of the worst pair over all characters/fonts too. + global_worst_dist = max_max_dist; + worst_s1 = max_s1; + worst_s2 = max_s2; + } + if (debug) { + tprintf( + "Found %d samples of class %d=%s, font %d, " + "dist range [%g, %g], worst pair= %s, %s\n", + samples_found, c, unicharset_.debug_str(c).c_str(), font_index, min_max_dist, + max_max_dist, SampleToString(*samples_[max_s1]).c_str(), + SampleToString(*samples_[max_s2]).c_str()); + } + } + } + if (debug) { + tprintf("Global worst dist = %g, between sample %d and %d\n", global_worst_dist, worst_s1, + worst_s2); + } +} + +// Replicates the samples to a minimum frequency defined by +// 2 * kSampleRandomSize, or for larger counts duplicates all samples. +// After replication, the replicated samples are perturbed slightly, but +// in a predictable and repeatable way. +// Use after OrganizeByFontAndClass(). +void TrainingSampleSet::ReplicateAndRandomizeSamples() { + ASSERT_HOST(font_class_array_ != nullptr); + int font_size = font_id_map_.CompactSize(); + for (int font_index = 0; font_index < font_size; ++font_index) { + for (int c = 0; c < unicharset_size_; ++c) { + FontClassInfo &fcinfo = (*font_class_array_)(font_index, c); + int sample_count = fcinfo.samples.size(); + int min_samples = 2 * std::max(kSampleRandomSize, sample_count); + if (sample_count > 0 && sample_count < min_samples) { + int base_count = sample_count; + for (int base_index = 0; sample_count < min_samples; ++sample_count) { + int src_index = fcinfo.samples[base_index++]; + if (base_index >= base_count) { + base_index = 0; + } + TrainingSample *sample = + samples_[src_index]->RandomizedCopy(sample_count % kSampleRandomSize); + int sample_index = samples_.size(); + sample->set_sample_index(sample_index); + samples_.push_back(sample); + fcinfo.samples.push_back(sample_index); + } + } + } + } +} + +// Caches the indexed features of the canonical samples. +// ComputeCanonicalSamples must have been already called. +// TODO(rays) see note on ReliablySeparable and try restricting the +// canonical features to those that truly represent all samples. +void TrainingSampleSet::ComputeCanonicalFeatures() { + ASSERT_HOST(font_class_array_ != nullptr); + const int font_size = font_id_map_.CompactSize(); + for (int font_index = 0; font_index < font_size; ++font_index) { + const int font_id = font_id_map_.CompactToSparse(font_index); + for (int c = 0; c < unicharset_size_; ++c) { + int num_samples = NumClassSamples(font_id, c, false); + if (num_samples == 0) { + continue; + } + const TrainingSample *sample = GetCanonicalSample(font_id, c); + FontClassInfo &fcinfo = (*font_class_array_)(font_index, c); + fcinfo.canonical_features = sample->indexed_features(); + } + } +} + +// Computes the combined set of features used by all the samples of each +// font/class combination. Use after ReplicateAndRandomizeSamples. +void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) { + ASSERT_HOST(font_class_array_ != nullptr); + int font_size = font_id_map_.CompactSize(); + for (int font_index = 0; font_index < font_size; ++font_index) { + int font_id = font_id_map_.CompactToSparse(font_index); + for (int c = 0; c < unicharset_size_; ++c) { + int num_samples = NumClassSamples(font_id, c, false); + if (num_samples == 0) { + continue; + } + FontClassInfo &fcinfo = (*font_class_array_)(font_index, c); + fcinfo.cloud_features.Init(feature_space_size); + for (int s = 0; s < num_samples; ++s) { + const TrainingSample *sample = GetSample(font_id, c, s); + const std::vector<int> &sample_features = sample->indexed_features(); + for (int sample_feature : sample_features) { + fcinfo.cloud_features.SetBit(sample_feature); + } + } + } + } +} + +// Adds all fonts of the given class to the shape. +void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape *shape) const { + for (int f = 0; f < font_id_map_.CompactSize(); ++f) { + const int font_id = font_id_map_.CompactToSparse(f); + shape->AddToShape(class_id, font_id); + } +} + +#ifndef GRAPHICS_DISABLED + +// Display the samples with the given indexed feature that also match +// the given shape. +void TrainingSampleSet::DisplaySamplesWithFeature(int f_index, const Shape &shape, + const IntFeatureSpace &space, + ScrollView::Color color, + ScrollView *window) const { + for (int s = 0; s < num_raw_samples(); ++s) { + const TrainingSample *sample = GetSample(s); + if (shape.ContainsUnichar(sample->class_id())) { + std::vector<int> indexed_features; + space.IndexAndSortFeatures(sample->features(), sample->num_features(), &indexed_features); + for (int indexed_feature : indexed_features) { + if (indexed_feature == f_index) { + sample->DisplayFeatures(color, window); + } + } + } + } +} + +#endif // !GRAPHICS_DISABLED + +} // namespace tesseract.
