Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/common/mastertrainer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/common/mastertrainer.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1044 @@ +/////////////////////////////////////////////////////////////////////// +// File: mastertrainer.cpp +// Description: Trainer to build the MasterClassifier. +// Author: Ray Smith +// +// (C) Copyright 2010, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include <allheaders.h> +#include <cmath> +#include <ctime> +#include "boxread.h" +#include "classify.h" +#include "errorcounter.h" +#include "featdefs.h" +#include "mastertrainer.h" +#include "sampleiterator.h" +#include "shapeclassifier.h" +#include "shapetable.h" +#ifndef GRAPHICS_DISABLED +# include "svmnode.h" +#endif + +#include "scanutils.h" + +namespace tesseract { + +// Constants controlling clustering. With a low kMinClusteredShapes and a high +// kMaxUnicharsPerCluster, then kFontMergeDistance is the only limiting factor. +// Min number of shapes in the output. +const int kMinClusteredShapes = 1; +// Max number of unichars in any individual cluster. +const int kMaxUnicharsPerCluster = 2000; +// Mean font distance below which to merge fonts and unichars. +const float kFontMergeDistance = 0.025; + +MasterTrainer::MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, + bool replicate_samples, int debug_level) + : norm_mode_(norm_mode), + samples_(fontinfo_table_), + junk_samples_(fontinfo_table_), + verify_samples_(fontinfo_table_), + charsetsize_(0), + enable_shape_analysis_(shape_analysis), + enable_replication_(replicate_samples), + fragments_(nullptr), + prev_unichar_id_(-1), + debug_level_(debug_level) {} + +MasterTrainer::~MasterTrainer() { + delete[] fragments_; + for (auto &page_image : page_images_) { + page_image.destroy(); + } +} + +// WARNING! Serialize/DeSerialize are only partial, providing +// enough data to get the samples back and display them. +// Writes to the given file. Returns false in case of error. +bool MasterTrainer::Serialize(FILE *fp) const { + uint32_t value = norm_mode_; + if (!tesseract::Serialize(fp, &value)) { + return false; + } + if (!unicharset_.save_to_file(fp)) { + return false; + } + if (!feature_space_.Serialize(fp)) { + return false; + } + if (!samples_.Serialize(fp)) { + return false; + } + if (!junk_samples_.Serialize(fp)) { + return false; + } + if (!verify_samples_.Serialize(fp)) { + return false; + } + if (!master_shapes_.Serialize(fp)) { + return false; + } + if (!flat_shapes_.Serialize(fp)) { + return false; + } + if (!fontinfo_table_.Serialize(fp)) { + return false; + } + if (!tesseract::Serialize(fp, xheights_)) { + return false; + } + return true; +} + +// Load an initial unicharset, or set one up if the file cannot be read. +void MasterTrainer::LoadUnicharset(const char *filename) { + if (!unicharset_.load_from_file(filename)) { + tprintf( + "Failed to load unicharset from file %s\n" + "Building unicharset for training from scratch...\n", + filename); + unicharset_.clear(); + UNICHARSET initialized; + // Add special characters, as they were removed by the clear, but the + // default constructor puts them in. + unicharset_.AppendOtherUnicharset(initialized); + } + charsetsize_ = unicharset_.size(); + delete[] fragments_; + fragments_ = new int[charsetsize_]; + memset(fragments_, 0, sizeof(*fragments_) * charsetsize_); + samples_.LoadUnicharset(filename); + junk_samples_.LoadUnicharset(filename); + verify_samples_.LoadUnicharset(filename); +} + +// Reads the samples and their features from the given .tr format file, +// adding them to the trainer with the font_id from the content of the file. +// See mftraining.cpp for a description of the file format. +// If verification, then these are verification samples, not training. +void MasterTrainer::ReadTrainingSamples(const char *page_name, + const FEATURE_DEFS_STRUCT &feature_defs, + bool verification) { + char buffer[2048]; + const int int_feature_type = + ShortNameToFeatureType(feature_defs, kIntFeatureType); + const int micro_feature_type = + ShortNameToFeatureType(feature_defs, kMicroFeatureType); + const int cn_feature_type = + ShortNameToFeatureType(feature_defs, kCNFeatureType); + const int geo_feature_type = + ShortNameToFeatureType(feature_defs, kGeoFeatureType); + + FILE *fp = fopen(page_name, "rb"); + if (fp == nullptr) { + tprintf("Failed to open tr file: %s\n", page_name); + return; + } + tr_filenames_.emplace_back(page_name); + while (fgets(buffer, sizeof(buffer), fp) != nullptr) { + if (buffer[0] == '\n') { + continue; + } + + char *space = strchr(buffer, ' '); + if (space == nullptr) { + tprintf("Bad format in tr file, reading fontname, unichar\n"); + continue; + } + *space++ = '\0'; + int font_id = GetFontInfoId(buffer); + if (font_id < 0) { + font_id = 0; + } + int page_number; + std::string unichar; + TBOX bounding_box; + if (!ParseBoxFileStr(space, &page_number, unichar, &bounding_box)) { + tprintf("Bad format in tr file, reading box coords\n"); + continue; + } + auto char_desc = ReadCharDescription(feature_defs, fp); + auto *sample = new TrainingSample; + sample->set_font_id(font_id); + sample->set_page_num(page_number + page_images_.size()); + sample->set_bounding_box(bounding_box); + sample->ExtractCharDesc(int_feature_type, micro_feature_type, + cn_feature_type, geo_feature_type, char_desc); + AddSample(verification, unichar.c_str(), sample); + delete char_desc; + } + charsetsize_ = unicharset_.size(); + fclose(fp); +} + +// Adds the given single sample to the trainer, setting the classid +// appropriately from the given unichar_str. +void MasterTrainer::AddSample(bool verification, const char *unichar, + TrainingSample *sample) { + if (verification) { + verify_samples_.AddSample(unichar, sample); + prev_unichar_id_ = -1; + } else if (unicharset_.contains_unichar(unichar)) { + if (prev_unichar_id_ >= 0) { + fragments_[prev_unichar_id_] = -1; + } + prev_unichar_id_ = samples_.AddSample(unichar, sample); + if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0) { + flat_shapes_.AddShape(prev_unichar_id_, sample->font_id()); + } + } else { + const int junk_id = junk_samples_.AddSample(unichar, sample); + if (prev_unichar_id_ >= 0) { + CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar); + if (frag != nullptr && frag->is_natural()) { + if (fragments_[prev_unichar_id_] == 0) { + fragments_[prev_unichar_id_] = junk_id; + } else if (fragments_[prev_unichar_id_] != junk_id) { + fragments_[prev_unichar_id_] = -1; + } + } + delete frag; + } + prev_unichar_id_ = -1; + } +} + +// Loads all pages from the given tif filename and append to page_images_. +// Must be called after ReadTrainingSamples, as the current number of images +// is used as an offset for page numbers in the samples. +void MasterTrainer::LoadPageImages(const char *filename) { + size_t offset = 0; + int page; + Image pix; + for (page = 0;; page++) { + pix = pixReadFromMultipageTiff(filename, &offset); + if (!pix) { + break; + } + page_images_.push_back(pix); + if (!offset) { + break; + } + } + tprintf("Loaded %d page images from %s\n", page, filename); +} + +// Cleans up the samples after initial load from the tr files, and prior to +// saving the MasterTrainer: +// Remaps fragmented chars if running shape analysis. +// Sets up the samples appropriately for class/fontwise access. +// Deletes outlier samples. +void MasterTrainer::PostLoadCleanup() { + if (debug_level_ > 0) { + tprintf("PostLoadCleanup...\n"); + } + if (enable_shape_analysis_) { + ReplaceFragmentedSamples(); + } + SampleIterator sample_it; + sample_it.Init(nullptr, nullptr, true, &verify_samples_); + sample_it.NormalizeSamples(); + verify_samples_.OrganizeByFontAndClass(); + + samples_.IndexFeatures(feature_space_); + // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness + // against current training. + // samples_.DeleteOutliers(feature_space_, debug_level_ > 0); + samples_.OrganizeByFontAndClass(); + if (debug_level_ > 0) { + tprintf("ComputeCanonicalSamples...\n"); + } + samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0); +} + +// Gets the samples ready for training. Use after both +// ReadTrainingSamples+PostLoadCleanup or DeSerialize. +// Re-indexes the features and computes canonical and cloud features. +void MasterTrainer::PreTrainingSetup() { + if (debug_level_ > 0) { + tprintf("PreTrainingSetup...\n"); + } + samples_.IndexFeatures(feature_space_); + samples_.ComputeCanonicalFeatures(); + if (debug_level_ > 0) { + tprintf("ComputeCloudFeatures...\n"); + } + samples_.ComputeCloudFeatures(feature_space_.Size()); +} + +// Sets up the master_shapes_ table, which tells which fonts should stay +// together until they get to a leaf node classifier. +void MasterTrainer::SetupMasterShapes() { + tprintf("Building master shape table\n"); + const int num_fonts = samples_.NumFonts(); + + ShapeTable char_shapes_begin_fragment(samples_.unicharset()); + ShapeTable char_shapes_end_fragment(samples_.unicharset()); + ShapeTable char_shapes(samples_.unicharset()); + for (int c = 0; c < samples_.charsetsize(); ++c) { + ShapeTable shapes(samples_.unicharset()); + for (int f = 0; f < num_fonts; ++f) { + if (samples_.NumClassSamples(f, c, true) > 0) { + shapes.AddShape(c, f); + } + } + ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes); + + const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c); + + if (fragment == nullptr) { + char_shapes.AppendMasterShapes(shapes, nullptr); + } else if (fragment->is_beginning()) { + char_shapes_begin_fragment.AppendMasterShapes(shapes, nullptr); + } else if (fragment->is_ending()) { + char_shapes_end_fragment.AppendMasterShapes(shapes, nullptr); + } else { + char_shapes.AppendMasterShapes(shapes, nullptr); + } + } + ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance, + &char_shapes_begin_fragment); + char_shapes.AppendMasterShapes(char_shapes_begin_fragment, nullptr); + ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance, + &char_shapes_end_fragment); + char_shapes.AppendMasterShapes(char_shapes_end_fragment, nullptr); + ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance, + &char_shapes); + master_shapes_.AppendMasterShapes(char_shapes, nullptr); + tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().c_str()); +} + +// Adds the junk_samples_ to the main samples_ set. Junk samples are initially +// fragments and n-grams (all incorrectly segmented characters). +// Various training functions may result in incorrectly segmented characters +// being added to the unicharset of the main samples, perhaps because they +// form a "radical" decomposition of some (Indic) grapheme, or because they +// just look the same as a real character (like rn/m) +// This function moves all the junk samples, to the main samples_ set, but +// desirable junk, being any sample for which the unichar already exists in +// the samples_ unicharset gets the unichar-ids re-indexed to match, but +// anything else gets re-marked as unichar_id 0 (space character) to identify +// it as junk to the error counter. +void MasterTrainer::IncludeJunk() { + // Get ids of fragments in junk_samples_ that replace the dead chars. + const UNICHARSET &junk_set = junk_samples_.unicharset(); + const UNICHARSET &sample_set = samples_.unicharset(); + int num_junks = junk_samples_.num_samples(); + tprintf("Moving %d junk samples to master sample set.\n", num_junks); + for (int s = 0; s < num_junks; ++s) { + TrainingSample *sample = junk_samples_.mutable_sample(s); + int junk_id = sample->class_id(); + const char *junk_utf8 = junk_set.id_to_unichar(junk_id); + int sample_id = sample_set.unichar_to_id(junk_utf8); + if (sample_id == INVALID_UNICHAR_ID) { + sample_id = 0; + } + sample->set_class_id(sample_id); + junk_samples_.extract_sample(s); + samples_.AddSample(sample_id, sample); + } + junk_samples_.DeleteDeadSamples(); + samples_.OrganizeByFontAndClass(); +} + +// Replicates the samples and perturbs them if the enable_replication_ flag +// is set. MUST be used after the last call to OrganizeByFontAndClass on +// the training samples, ie after IncludeJunk if it is going to be used, as +// OrganizeByFontAndClass will eat the replicated samples into the regular +// samples. +void MasterTrainer::ReplicateAndRandomizeSamplesIfRequired() { + if (enable_replication_) { + if (debug_level_ > 0) { + tprintf("ReplicateAndRandomize...\n"); + } + verify_samples_.ReplicateAndRandomizeSamples(); + samples_.ReplicateAndRandomizeSamples(); + samples_.IndexFeatures(feature_space_); + } +} + +// Loads the basic font properties file into fontinfo_table_. +// Returns false on failure. +bool MasterTrainer::LoadFontInfo(const char *filename) { + FILE *fp = fopen(filename, "rb"); + if (fp == nullptr) { + fprintf(stderr, "Failed to load font_properties from %s\n", filename); + return false; + } + int italic, bold, fixed, serif, fraktur; + while (!feof(fp)) { + FontInfo fontinfo; + char *font_name = new char[1024]; + fontinfo.name = font_name; + fontinfo.properties = 0; + fontinfo.universal_id = 0; + if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name, &italic, &bold, + &fixed, &serif, &fraktur) != 6) { + delete[] font_name; + continue; + } + fontinfo.properties = (italic << 0) + (bold << 1) + (fixed << 2) + + (serif << 3) + (fraktur << 4); + if (fontinfo_table_.get_index(fontinfo) < 0) { + // fontinfo not in table. + fontinfo_table_.push_back(fontinfo); + } else { + delete[] font_name; + } + } + fclose(fp); + return true; +} + +// Loads the xheight font properties file into xheights_. +// Returns false on failure. +bool MasterTrainer::LoadXHeights(const char *filename) { + tprintf("fontinfo table is of size %d\n", fontinfo_table_.size()); + xheights_.clear(); + xheights_.resize(fontinfo_table_.size(), -1); + if (filename == nullptr) { + return true; + } + FILE *f = fopen(filename, "rb"); + if (f == nullptr) { + fprintf(stderr, "Failed to load font xheights from %s\n", filename); + return false; + } + tprintf("Reading x-heights from %s ...\n", filename); + FontInfo fontinfo; + fontinfo.properties = 0; // Not used to lookup in the table. + fontinfo.universal_id = 0; + char buffer[1024]; + int xht; + int total_xheight = 0; + int xheight_count = 0; + while (!feof(f)) { + if (tfscanf(f, "%1023s %d\n", buffer, &xht) != 2) { + continue; + } + buffer[1023] = '\0'; + fontinfo.name = buffer; + auto fontinfo_id = fontinfo_table_.get_index(fontinfo); + if (fontinfo_id < 0) { + // fontinfo not in table. + continue; + } + xheights_[fontinfo_id] = xht; + total_xheight += xht; + ++xheight_count; + } + if (xheight_count == 0) { + fprintf(stderr, "No valid xheights in %s!\n", filename); + fclose(f); + return false; + } + int mean_xheight = DivRounded(total_xheight, xheight_count); + for (size_t i = 0; i < fontinfo_table_.size(); ++i) { + if (xheights_[i] < 0) { + xheights_[i] = mean_xheight; + } + } + fclose(f); + return true; +} // LoadXHeights + +// Reads spacing stats from filename and adds them to fontinfo_table. +bool MasterTrainer::AddSpacingInfo(const char *filename) { + FILE *fontinfo_file = fopen(filename, "rb"); + if (fontinfo_file == nullptr) { + return true; // We silently ignore missing files! + } + // Find the fontinfo_id. + int fontinfo_id = GetBestMatchingFontInfoId(filename); + if (fontinfo_id < 0) { + tprintf("No font found matching fontinfo filename %s\n", filename); + fclose(fontinfo_file); + return false; + } + tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id); + // TODO(rays) scale should probably be a double, but keep as an int for now + // to duplicate current behavior. + int scale = kBlnXHeight / xheights_[fontinfo_id]; + int num_unichars; + char uch[UNICHAR_LEN]; + char kerned_uch[UNICHAR_LEN]; + int x_gap, x_gap_before, x_gap_after, num_kerned; + ASSERT_HOST(tfscanf(fontinfo_file, "%d\n", &num_unichars) == 1); + FontInfo *fi = &fontinfo_table_.at(fontinfo_id); + fi->init_spacing(unicharset_.size()); + FontSpacingInfo *spacing = nullptr; + for (int l = 0; l < num_unichars; ++l) { + if (tfscanf(fontinfo_file, "%s %d %d %d", uch, &x_gap_before, &x_gap_after, + &num_kerned) != 4) { + tprintf("Bad format of font spacing file %s\n", filename); + fclose(fontinfo_file); + return false; + } + bool valid = unicharset_.contains_unichar(uch); + if (valid) { + spacing = new FontSpacingInfo(); + spacing->x_gap_before = static_cast<int16_t>(x_gap_before * scale); + spacing->x_gap_after = static_cast<int16_t>(x_gap_after * scale); + } + for (int k = 0; k < num_kerned; ++k) { + if (tfscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) { + tprintf("Bad format of font spacing file %s\n", filename); + fclose(fontinfo_file); + delete spacing; + return false; + } + if (!valid || !unicharset_.contains_unichar(kerned_uch)) { + continue; + } + spacing->kerned_unichar_ids.push_back( + unicharset_.unichar_to_id(kerned_uch)); + spacing->kerned_x_gaps.push_back(static_cast<int16_t>(x_gap * scale)); + } + if (valid) { + fi->add_spacing(unicharset_.unichar_to_id(uch), spacing); + } + } + fclose(fontinfo_file); + return true; +} + +// Returns the font id corresponding to the given font name. +// Returns -1 if the font cannot be found. +int MasterTrainer::GetFontInfoId(const char *font_name) { + FontInfo fontinfo; + // We are only borrowing the string, so it is OK to const cast it. + fontinfo.name = const_cast<char *>(font_name); + fontinfo.properties = 0; // Not used to lookup in the table + fontinfo.universal_id = 0; + return fontinfo_table_.get_index(fontinfo); +} +// Returns the font_id of the closest matching font name to the given +// filename. It is assumed that a substring of the filename will match +// one of the fonts. If more than one is matched, the longest is returned. +int MasterTrainer::GetBestMatchingFontInfoId(const char *filename) { + int fontinfo_id = -1; + int best_len = 0; + for (size_t f = 0; f < fontinfo_table_.size(); ++f) { + if (strstr(filename, fontinfo_table_.at(f).name) != nullptr) { + int len = strlen(fontinfo_table_.at(f).name); + // Use the longest matching length in case a substring of a font matched. + if (len > best_len) { + best_len = len; + fontinfo_id = f; + } + } + } + return fontinfo_id; +} + +// Sets up a flat shapetable with one shape per class/font combination. +void MasterTrainer::SetupFlatShapeTable(ShapeTable *shape_table) { + // To exactly mimic the results of the previous implementation, the shapes + // must be clustered in order the fonts arrived, and reverse order of the + // characters within each font. + // Get a list of the fonts in the order they appeared. + std::vector<int> active_fonts; + int num_shapes = flat_shapes_.NumShapes(); + for (int s = 0; s < num_shapes; ++s) { + int font = flat_shapes_.GetShape(s)[0].font_ids[0]; + unsigned f = 0; + for (f = 0; f < active_fonts.size(); ++f) { + if (active_fonts[f] == font) { + break; + } + } + if (f == active_fonts.size()) { + active_fonts.push_back(font); + } + } + // For each font in order, add all the shapes with that font in reverse order. + int num_fonts = active_fonts.size(); + for (int f = 0; f < num_fonts; ++f) { + for (int s = num_shapes - 1; s >= 0; --s) { + int font = flat_shapes_.GetShape(s)[0].font_ids[0]; + if (font == active_fonts[f]) { + shape_table->AddShape(flat_shapes_.GetShape(s)); + } + } + } +} + +// Sets up a Clusterer for mftraining on a single shape_id. +// Call FreeClusterer on the return value after use. +CLUSTERER *MasterTrainer::SetupForClustering( + const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, + int shape_id, int *num_samples) { + int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType); + int num_params = feature_defs.FeatureDesc[desc_index]->NumParams; + ASSERT_HOST(num_params == (int)MicroFeatureParameter::MFCount); + CLUSTERER *clusterer = MakeClusterer( + num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc); + + // We want to iterate over the samples of just the one shape. + IndexMapBiDi shape_map; + shape_map.Init(shape_table.NumShapes(), false); + shape_map.SetMap(shape_id, true); + shape_map.Setup(); + // Reverse the order of the samples to match the previous behavior. + std::vector<const TrainingSample *> sample_ptrs; + SampleIterator it; + it.Init(&shape_map, &shape_table, false, &samples_); + for (it.Begin(); !it.AtEnd(); it.Next()) { + sample_ptrs.push_back(&it.GetSample()); + } + uint32_t sample_id = 0; + for (int i = sample_ptrs.size() - 1; i >= 0; --i) { + const TrainingSample *sample = sample_ptrs[i]; + uint32_t num_features = sample->num_micro_features(); + for (uint32_t f = 0; f < num_features; ++f) { + MakeSample(clusterer, sample->micro_features()[f].data(), sample_id); + } + ++sample_id; + } + *num_samples = sample_id; + return clusterer; +} + +// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp +// to the given inttemp_file, and the corresponding pffmtable. +// The unicharset is the original encoding of graphemes, and shape_set should +// match the size of the shape_table, and may possibly be totally fake. +void MasterTrainer::WriteInttempAndPFFMTable(const UNICHARSET &unicharset, + const UNICHARSET &shape_set, + const ShapeTable &shape_table, + CLASS_STRUCT *float_classes, + const char *inttemp_file, + const char *pffmtable_file) { + auto *classify = new tesseract::Classify(); + // Move the fontinfo table to classify. + fontinfo_table_.MoveTo(&classify->get_fontinfo_table()); + INT_TEMPLATES_STRUCT *int_templates = + classify->CreateIntTemplates(float_classes, shape_set); + FILE *fp = fopen(inttemp_file, "wb"); + if (fp == nullptr) { + tprintf("Error, failed to open file \"%s\"\n", inttemp_file); + } else { + classify->WriteIntTemplates(fp, int_templates, shape_set); + fclose(fp); + } + // Now write pffmtable. This is complicated by the fact that the adaptive + // classifier still wants one indexed by unichar-id, but the static + // classifier needs one indexed by its shape class id. + // We put the shapetable_cutoffs in a vector, and compute the + // unicharset cutoffs along the way. + std::vector<uint16_t> shapetable_cutoffs; + std::vector<uint16_t> unichar_cutoffs(unicharset.size()); + /* then write out each class */ + for (unsigned i = 0; i < int_templates->NumClasses; ++i) { + INT_CLASS_STRUCT *Class = ClassForClassId(int_templates, i); + // Todo: Test with min instead of max + // int MaxLength = LengthForConfigId(Class, 0); + uint16_t max_length = 0; + for (int config_id = 0; config_id < Class->NumConfigs; config_id++) { + // Todo: Test with min instead of max + // if (LengthForConfigId (Class, config_id) < MaxLength) + uint16_t length = Class->ConfigLengths[config_id]; + if (length > max_length) { + max_length = Class->ConfigLengths[config_id]; + } + int shape_id = float_classes[i].font_set.at(config_id); + const Shape &shape = shape_table.GetShape(shape_id); + for (int c = 0; c < shape.size(); ++c) { + int unichar_id = shape[c].unichar_id; + if (length > unichar_cutoffs[unichar_id]) { + unichar_cutoffs[unichar_id] = length; + } + } + } + shapetable_cutoffs.push_back(max_length); + } + fp = fopen(pffmtable_file, "wb"); + if (fp == nullptr) { + tprintf("Error, failed to open file \"%s\"\n", pffmtable_file); + } else { + tesseract::Serialize(fp, shapetable_cutoffs); + for (size_t c = 0; c < unicharset.size(); ++c) { + const char *unichar = unicharset.id_to_unichar(c); + if (strcmp(unichar, " ") == 0) { + unichar = "NULL"; + } + fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]); + } + fclose(fp); + } + delete int_templates; + delete classify; +} + +// Generate debug output relating to the canonical distance between the +// two given UTF8 grapheme strings. +void MasterTrainer::DebugCanonical(const char *unichar_str1, + const char *unichar_str2) { + int class_id1 = unicharset_.unichar_to_id(unichar_str1); + int class_id2 = unicharset_.unichar_to_id(unichar_str2); + if (class_id2 == INVALID_UNICHAR_ID) { + class_id2 = class_id1; + } + if (class_id1 == INVALID_UNICHAR_ID) { + tprintf("No unicharset entry found for %s\n", unichar_str1); + return; + } else { + tprintf("Font ambiguities for unichar %d = %s and %d = %s\n", class_id1, + unichar_str1, class_id2, unichar_str2); + } + int num_fonts = samples_.NumFonts(); + const IntFeatureMap &feature_map = feature_map_; + // Iterate the fonts to get the similarity with other fonst of the same + // class. + tprintf(" "); + for (int f = 0; f < num_fonts; ++f) { + if (samples_.NumClassSamples(f, class_id2, false) == 0) { + continue; + } + tprintf("%6d", f); + } + tprintf("\n"); + for (int f1 = 0; f1 < num_fonts; ++f1) { + // Map the features of the canonical_sample. + if (samples_.NumClassSamples(f1, class_id1, false) == 0) { + continue; + } + tprintf("%4d ", f1); + for (int f2 = 0; f2 < num_fonts; ++f2) { + if (samples_.NumClassSamples(f2, class_id2, false) == 0) { + continue; + } + float dist = + samples_.ClusterDistance(f1, class_id1, f2, class_id2, feature_map); + tprintf(" %5.3f", dist); + } + tprintf("\n"); + } + // Build a fake ShapeTable containing all the sample types. + ShapeTable shapes(unicharset_); + for (int f = 0; f < num_fonts; ++f) { + if (samples_.NumClassSamples(f, class_id1, true) > 0) { + shapes.AddShape(class_id1, f); + } + if (class_id1 != class_id2 && + samples_.NumClassSamples(f, class_id2, true) > 0) { + shapes.AddShape(class_id2, f); + } + } +} + +#ifndef GRAPHICS_DISABLED +// Debugging for cloud/canonical features. +// Displays a Features window containing: +// If unichar_str2 is in the unicharset, and canonical_font is non-negative, +// displays the canonical features of the char/font combination in red. +// If unichar_str1 is in the unicharset, and cloud_font is non-negative, +// displays the cloud feature of the char/font combination in green. +// The canonical features are drawn first to show which ones have no +// matches in the cloud features. +// Until the features window is destroyed, each click in the features window +// will display the samples that have that feature in a separate window. +void MasterTrainer::DisplaySamples(const char *unichar_str1, int cloud_font, + const char *unichar_str2, + int canonical_font) { + const IntFeatureMap &feature_map = feature_map_; + const IntFeatureSpace &feature_space = feature_map.feature_space(); + ScrollView *f_window = CreateFeatureSpaceWindow("Features", 100, 500); + ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? baseline : character, + f_window); + int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2); + if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) { + const TrainingSample *sample = + samples_.GetCanonicalSample(canonical_font, class_id2); + for (uint32_t f = 0; f < sample->num_features(); ++f) { + RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED); + } + } + int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1); + if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) { + const BitVector &cloud = samples_.GetCloudFeatures(cloud_font, class_id1); + for (int f = 0; f < cloud.size(); ++f) { + if (cloud[f]) { + INT_FEATURE_STRUCT feature = feature_map.InverseIndexFeature(f); + RenderIntFeature(f_window, &feature, ScrollView::GREEN); + } + } + } + f_window->Update(); + ScrollView *s_window = CreateFeatureSpaceWindow("Samples", 100, 500); + SVEventType ev_type; + do { + // Wait until a click or popup event. + auto ev = f_window->AwaitEvent(SVET_ANY); + ev_type = ev->type; + if (ev_type == SVET_CLICK) { + int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y); + if (feature_index >= 0) { + // Iterate samples and display those with the feature. + Shape shape; + shape.AddToShape(class_id1, cloud_font); + s_window->Clear(); + samples_.DisplaySamplesWithFeature(feature_index, shape, feature_space, + ScrollView::GREEN, s_window); + s_window->Update(); + } + } + } while (ev_type != SVET_DESTROY); +} +#endif // !GRAPHICS_DISABLED + +void MasterTrainer::TestClassifierVOld(bool replicate_samples, + ShapeClassifier *test_classifier, + ShapeClassifier *old_classifier) { + SampleIterator sample_it; + sample_it.Init(nullptr, nullptr, replicate_samples, &samples_); + ErrorCounter::DebugNewErrors(test_classifier, old_classifier, + CT_UNICHAR_TOPN_ERR, fontinfo_table_, + page_images_, &sample_it); +} + +// Tests the given test_classifier on the internal samples. +// See TestClassifier for details. +void MasterTrainer::TestClassifierOnSamples(CountTypes error_mode, + int report_level, + bool replicate_samples, + ShapeClassifier *test_classifier, + std::string *report_string) { + TestClassifier(error_mode, report_level, replicate_samples, &samples_, + test_classifier, report_string); +} + +// Tests the given test_classifier on the given samples. +// error_mode indicates what counts as an error. +// report_levels: +// 0 = no output. +// 1 = bottom-line error rate. +// 2 = bottom-line error rate + time. +// 3 = font-level error rate + time. +// 4 = list of all errors + short classifier debug output on 16 errors. +// 5 = list of all errors + short classifier debug output on 25 errors. +// If replicate_samples is true, then the test is run on an extended test +// sample including replicated and systematically perturbed samples. +// If report_string is non-nullptr, a summary of the results for each font +// is appended to the report_string. +double MasterTrainer::TestClassifier(CountTypes error_mode, int report_level, + bool replicate_samples, + TrainingSampleSet *samples, + ShapeClassifier *test_classifier, + std::string *report_string) { + SampleIterator sample_it; + sample_it.Init(nullptr, nullptr, replicate_samples, samples); + if (report_level > 0) { + int num_samples = 0; + for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next()) { + ++num_samples; + } + tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n", + sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(), + test_classifier->GetShapeTable()->NumShapes(), num_samples); + tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-"); + } + double unichar_error = 0.0; + ErrorCounter::ComputeErrorRate(test_classifier, report_level, error_mode, + fontinfo_table_, page_images_, &sample_it, + &unichar_error, nullptr, report_string); + return unichar_error; +} + +// Returns the average (in some sense) distance between the two given +// shapes, which may contain multiple fonts and/or unichars. +float MasterTrainer::ShapeDistance(const ShapeTable &shapes, int s1, int s2) { + const IntFeatureMap &feature_map = feature_map_; + const Shape &shape1 = shapes.GetShape(s1); + const Shape &shape2 = shapes.GetShape(s2); + int num_chars1 = shape1.size(); + int num_chars2 = shape2.size(); + float dist_sum = 0.0f; + int dist_count = 0; + if (num_chars1 > 1 || num_chars2 > 1) { + // In the multi-char case try to optimize the calculation by computing + // distances between characters of matching font where possible. + for (int c1 = 0; c1 < num_chars1; ++c1) { + for (int c2 = 0; c2 < num_chars2; ++c2) { + dist_sum += + samples_.UnicharDistance(shape1[c1], shape2[c2], true, feature_map); + ++dist_count; + } + } + } else { + // In the single unichar case, there is little alternative, but to compute + // the squared-order distance between pairs of fonts. + dist_sum = + samples_.UnicharDistance(shape1[0], shape2[0], false, feature_map); + ++dist_count; + } + return dist_sum / dist_count; +} + +// Replaces samples that are always fragmented with the corresponding +// fragment samples. +void MasterTrainer::ReplaceFragmentedSamples() { + if (fragments_ == nullptr) { + return; + } + // Remove samples that are replaced by fragments. Each class that was + // always naturally fragmented should be replaced by its fragments. + int num_samples = samples_.num_samples(); + for (int s = 0; s < num_samples; ++s) { + TrainingSample *sample = samples_.mutable_sample(s); + if (fragments_[sample->class_id()] > 0) { + samples_.KillSample(sample); + } + } + samples_.DeleteDeadSamples(); + + // Get ids of fragments in junk_samples_ that replace the dead chars. + const UNICHARSET &frag_set = junk_samples_.unicharset(); +#if 0 + // TODO(rays) The original idea was to replace only graphemes that were + // always naturally fragmented, but that left a lot of the Indic graphemes + // out. Determine whether we can go back to that idea now that spacing + // is fixed in the training images, or whether this code is obsolete. + bool* good_junk = new bool[frag_set.size()]; + memset(good_junk, 0, sizeof(*good_junk) * frag_set.size()); + for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) { + int frag_ch = fragments_[dead_ch]; + if (frag_ch <= 0) continue; + const char* frag_utf8 = frag_set.id_to_unichar(frag_ch); + CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(frag_utf8); + // Mark the chars for all parts of the fragment as good in good_junk. + for (int part = 0; part < frag->get_total(); ++part) { + frag->set_pos(part); + int good_ch = frag_set.unichar_to_id(frag->to_string().c_str()); + if (good_ch != INVALID_UNICHAR_ID) + good_junk[good_ch] = true; // We want this one. + } + delete frag; + } +#endif + // For now just use all the junk that was from natural fragments. + // Get samples of fragments in junk_samples_ that replace the dead chars. + int num_junks = junk_samples_.num_samples(); + for (int s = 0; s < num_junks; ++s) { + TrainingSample *sample = junk_samples_.mutable_sample(s); + int junk_id = sample->class_id(); + const char *frag_utf8 = frag_set.id_to_unichar(junk_id); + CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(frag_utf8); + if (frag != nullptr && frag->is_natural()) { + junk_samples_.extract_sample(s); + samples_.AddSample(frag_set.id_to_unichar(junk_id), sample); + } + delete frag; + } + junk_samples_.DeleteDeadSamples(); + junk_samples_.OrganizeByFontAndClass(); + samples_.OrganizeByFontAndClass(); + unicharset_.clear(); + unicharset_.AppendOtherUnicharset(samples_.unicharset()); + // delete [] good_junk; + // Fragments_ no longer needed? + delete[] fragments_; + fragments_ = nullptr; +} + +// Runs a hierarchical agglomerative clustering to merge shapes in the given +// shape_table, while satisfying the given constraints: +// * End with at least min_shapes left in shape_table, +// * No shape shall have more than max_shape_unichars in it, +// * Don't merge shapes where the distance between them exceeds max_dist. +const float kInfiniteDist = 999.0f; +void MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars, + float max_dist, ShapeTable *shapes) { + int num_shapes = shapes->NumShapes(); + int max_merges = num_shapes - min_shapes; + // TODO: avoid new / delete. + auto *shape_dists = new std::vector<ShapeDist>[num_shapes]; + float min_dist = kInfiniteDist; + int min_s1 = 0; + int min_s2 = 0; + tprintf("Computing shape distances..."); + for (int s1 = 0; s1 < num_shapes; ++s1) { + for (int s2 = s1 + 1; s2 < num_shapes; ++s2) { + ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2)); + shape_dists[s1].push_back(dist); + if (dist.distance < min_dist) { + min_dist = dist.distance; + min_s1 = s1; + min_s2 = s2; + } + } + tprintf(" %d", s1); + } + tprintf("\n"); + int num_merged = 0; + while (num_merged < max_merges && min_dist < max_dist) { + tprintf("Distance = %f: ", min_dist); + int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2); + shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist; + if (num_unichars > max_shape_unichars) { + tprintf("Merge of %d and %d with %d would exceed max of %d unichars\n", + min_s1, min_s2, num_unichars, max_shape_unichars); + } else { + shapes->MergeShapes(min_s1, min_s2); + shape_dists[min_s2].clear(); + ++num_merged; + + for (int s = 0; s < min_s1; ++s) { + if (!shape_dists[s].empty()) { + shape_dists[s][min_s1 - s - 1].distance = + ShapeDistance(*shapes, s, min_s1); + shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist; + } + } + for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) { + if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist) { + shape_dists[min_s1][s2 - min_s1 - 1].distance = + ShapeDistance(*shapes, min_s1, s2); + } + } + for (int s = min_s1 + 1; s < min_s2; ++s) { + if (!shape_dists[s].empty()) { + shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist; + } + } + } + min_dist = kInfiniteDist; + for (int s1 = 0; s1 < num_shapes; ++s1) { + for (unsigned i = 0; i < shape_dists[s1].size(); ++i) { + if (shape_dists[s1][i].distance < min_dist) { + min_dist = shape_dists[s1][i].distance; + min_s1 = s1; + min_s2 = s1 + 1 + i; + } + } + } + } + tprintf("Stopped with %d merged, min dist %f\n", num_merged, min_dist); + delete[] shape_dists; + if (debug_level_ > 1) { + for (int s1 = 0; s1 < num_shapes; ++s1) { + if (shapes->MasterDestinationIndex(s1) == s1) { + tprintf("Master shape:%s\n", shapes->DebugStr(s1).c_str()); + } + } + } +} + +} // namespace tesseract.
