diff mupdf-source/thirdparty/tesseract/src/training/common/trainingsampleset.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/common/trainingsampleset.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,275 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H_
+#define TESSERACT_TRAINING_TRAININGSAMPLESET_H_
+
+#include "bitvector.h"
+#include "indexmapbidi.h"
+#include "matrix.h"
+#include "shapetable.h"
+#include "trainingsample.h"
+
+namespace tesseract {
+
+class UNICHARSET; // Also a by-value member below, so an include must supply the full type.
+struct FontInfo; // NOTE(review): not referenced anywhere else in this header.
+class FontInfoTable;
+class IntFeatureMap;
+class IntFeatureSpace;
+class TrainingSample; // NOTE(review): presumably already defined by trainingsample.h above.
+struct UnicharAndFonts;
+
+// Collection of TrainingSample used for training or testing a classifier.
+// Provides several useful methods to operate on the collection as a whole,
+// including outlier detection and deletion, providing access by font and
+// class, finding the canonical sample, finding the "cloud" features (OR of
+// all features in all samples), replication of samples, caching of distance
+// metrics.
+class TrainingSampleSet {
+public:
+  explicit TrainingSampleSet(const FontInfoTable &fontinfo_table); // NOTE(review): presumably bound to fontinfo_table_; confirm lifetime.
+  ~TrainingSampleSet();
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE *fp) const;
+  // Reads from the given file. Returns false in case of error.
+  // If swap is true, assumes a big/little-endian swap is needed.
+  bool DeSerialize(bool swap, FILE *fp);
+
+  // Accessors
+  int num_samples() const { // Number of slots in samples_, narrowed to int.
+    return samples_.size(); // Slots nulled by extract_sample are still counted.
+  }
+  int num_raw_samples() const { // Number of samples before replication/randomization.
+    return num_raw_samples_;
+  }
+  int NumFonts() const { // Size of the sparse (real font_id) side of font_id_map_.
+    return font_id_map_.SparseSize();
+  }
+  const UNICHARSET &unicharset() const { // Character set being trained for.
+    return unicharset_;
+  }
+  int charsetsize() const { // Character set size that the font/class 2-d array refers to.
+    return unicharset_size_;
+  }
+  const FontInfoTable &fontinfo_table() const { // The referenced (not serialized) font table.
+    return fontinfo_table_;
+  }
+
+  // Loads an initial unicharset, or sets one up if the file cannot be read.
+  void LoadUnicharset(const char *filename);
+
+  // Adds a character sample to this sample set.
+  // If the unichar is not already in the local unicharset, it is added.
+  // Returns the unichar_id of the added sample, from the local unicharset.
+  int AddSample(const char *unichar, TrainingSample *sample);
+  // Adds a character sample to this sample set with the given unichar_id,
+  // which must correspond to the local unicharset (in this).
+  void AddSample(int unichar_id, TrainingSample *sample);
+
+  // Returns the number of samples for the given font,class pair.
+  // If randomize is true, returns the number of samples accessible
+  // with randomizing on. (Increases the number of samples if small.)
+  // OrganizeByFontAndClass must have been already called.
+  int NumClassSamples(int font_id, int class_id, bool randomize) const;
+
+  // Gets a sample by its index.
+  const TrainingSample *GetSample(int index) const;
+
+  // Gets a sample by its font, class, index.
+  // OrganizeByFontAndClass must have been already called.
+  const TrainingSample *GetSample(int font_id, int class_id, int index) const;
+
+  // Gets a sample by its font, class, index. Does not randomize.
+  // OrganizeByFontAndClass must have been already called.
+  TrainingSample *MutableSample(int font_id, int class_id, int index);
+
+  // Returns a string debug representation of the given sample:
+  // font, unichar_str, bounding box, page.
+  std::string SampleToString(const TrainingSample &sample) const;
+
+  // Gets the combined set of features used by all the samples of the given
+  // font/class combination.
+  const BitVector &GetCloudFeatures(int font_id, int class_id) const;
+  // Gets the indexed features of the canonical sample of the given
+  // font/class combination.
+  const std::vector<int> &GetCanonicalFeatures(int font_id, int class_id) const;
+
+  // Returns the distance between the given pair of UnicharAndFonts.
+  // If matched_fonts is true, only matching fonts are considered, unless that
+  // yields the empty set.
+  // OrganizeByFontAndClass must have been already called.
+  float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts,
+                        const IntFeatureMap &feature_map);
+
+  // Returns the distance between the given pair of font/class pairs.
+  // Finds in cache or computes and caches.
+  // OrganizeByFontAndClass must have been already called.
+  float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2,
+                        const IntFeatureMap &feature_map);
+
+  // Computes the distance between the given pair of font/class pairs.
+  float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2,
+                               const IntFeatureMap &feature_map) const;
+
+  // Returns the number of canonical features of font/class 2 for which
+  // neither the feature nor any of its near neighbors occurs in the cloud
+  // of font/class 1. Each such feature is a reliable separation between
+  // the classes, ASSUMING that the canonical sample is sufficiently
+  // representative that every sample has a feature near that particular
+  // feature. To check that this is so on the fly would be prohibitively
+  // expensive, but it might be possible to pre-qualify the canonical features
+  // to include only those for which this assumption is true.
+  // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
+  // first, or the results will be nonsense.
+  int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2,
+                        const IntFeatureMap &feature_map, bool thorough) const;
+
+  // Returns the total index of the requested sample.
+  // OrganizeByFontAndClass must have been already called.
+  int GlobalSampleIndex(int font_id, int class_id, int index) const;
+
+  // Gets the canonical sample for the given font, class pair.
+  // ComputeCanonicalSamples must have been called first.
+  const TrainingSample *GetCanonicalSample(int font_id, int class_id) const;
+  // Gets the max distance for the given canonical sample.
+  // ComputeCanonicalSamples must have been called first.
+  float GetCanonicalDist(int font_id, int class_id) const;
+
+  // Returns a mutable pointer to the sample with the given index.
+  TrainingSample *mutable_sample(int index) {
+    return samples_[index]; // Unchecked access; nullptr if the slot was extracted.
+  }
+  // Gets ownership of the sample with the given index, removing it from this.
+  TrainingSample *extract_sample(int index) {
+    TrainingSample *sample = samples_[index]; // Unchecked access, as in mutable_sample.
+    samples_[index] = nullptr; // Slot is kept, so indices and num_samples() do not change.
+    return sample;
+  }
+
+  // Generates indexed features for all samples with the supplied feature_space.
+  void IndexFeatures(const IntFeatureSpace &feature_space);
+
+  // Marks the given sample for deletion.
+  // Deletion is actually completed by DeleteDeadSamples.
+  void KillSample(TrainingSample *sample);
+
+  // Deletes all samples with a negative sample index marked by KillSample.
+  // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
+  // must be called after as the samples have been renumbered.
+  void DeleteDeadSamples();
+
+  // Constructs an array to access the samples by font,class pair.
+  void OrganizeByFontAndClass();
+
+  // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
+  // index for the font_class_array_.
+  void SetupFontIdMap();
+
+  // Finds the sample for each font, class pair that has least maximum
+  // distance to all the other samples of the same font, class.
+  // OrganizeByFontAndClass must have been already called.
+  void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug);
+
+  // Replicates the samples to a minimum frequency defined by
+  // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
+  // After replication, the replicated samples are perturbed slightly, but
+  // in a predictable and repeatable way.
+  // Use after OrganizeByFontAndClass().
+  void ReplicateAndRandomizeSamples();
+
+  // Caches the indexed features of the canonical samples.
+  // ComputeCanonicalSamples must have been already called.
+  void ComputeCanonicalFeatures();
+  // Computes the combined set of features used by all the samples of each
+  // font/class combination. Use after ReplicateAndRandomizeSamples.
+  void ComputeCloudFeatures(int feature_space_size);
+
+  // Adds all fonts of the given class to the shape.
+  void AddAllFontsForClass(int class_id, Shape *shape) const;
+
+  // Displays the samples with the given indexed feature that also match
+  // the given shape.
+  void DisplaySamplesWithFeature(int f_index, const Shape &shape,
+                                 const IntFeatureSpace &feature_space, ScrollView::Color color,
+                                 ScrollView *window) const;
+
+private:
+  // Struct to store a triplet of unichar, font, distance in the distance cache.
+  struct FontClassDistance {
+    int unichar_id;
+    int font_id; // Real font id.
+    float distance;
+  };
+  // Simple struct to store information related to each font/class combination.
+  struct FontClassInfo {
+    FontClassInfo();
+
+    // Writes to the given file. Returns false in case of error.
+    bool Serialize(FILE *fp) const;
+    // Reads from the given file. Returns false in case of error.
+    // If swap is true, assumes a big/little-endian swap is needed.
+    bool DeSerialize(bool swap, FILE *fp);
+
+    // Number of raw samples.
+    int32_t num_raw_samples;
+    // Index of the canonical sample.
+    int32_t canonical_sample;
+    // Max distance of the canonical sample from any other.
+    float canonical_dist;
+    // Sample indices for the samples, including replicated.
+    std::vector<int32_t> samples;
+
+    // Non-serialized cache data.
+    // Indexed features of the canonical sample.
+    std::vector<int> canonical_features;
+    // The mapped features of all the samples.
+    BitVector cloud_features;
+
+    // Caches for ClusterDistance.
+    // Caches for other fonts but matching this unichar. -1 indicates not set.
+    // Indexed by compact font index from font_id_map_.
+    std::vector<float> font_distance_cache;
+    // Caches for other unichars but matching this font. -1 indicates not set.
+    std::vector<float> unichar_distance_cache;
+    // Cache for the rest (non-matching font and unichar).
+    // A cache of distances computed by ReliablySeparable.
+    std::vector<FontClassDistance> distance_cache;
+  };
+
+  std::vector<TrainingSample *> samples_; // Entries become nullptr after extract_sample.
+  // Number of samples before replication/randomization.
+  int num_raw_samples_;
+  // Character set we are training for.
+  UNICHARSET unicharset_;
+  // Character set size to which the 2-d arrays below refer.
+  int unicharset_size_;
+  // Map to allow the font_class_array_ below to be compact.
+  // The sparse space is the real font_id, used in samples_.
+  // The compact space is an index to font_class_array_.
+  IndexMapBiDi font_id_map_;
+  // A 2-d array of FontClassInfo holding information related to each
+  // (font_id, class_id) pair.
+  GENERIC_2D_ARRAY<FontClassInfo> *font_class_array_;
+
+  // Reference to the fontinfo_table_ in MasterTrainer. Provides names
+  // for font_ids in the samples. Not serialized!
+  const FontInfoTable &fontinfo_table_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_TRAINING_TRAININGSAMPLESET_H_