diff mupdf-source/thirdparty/tesseract/src/training/common/sampleiterator.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/common/sampleiterator.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,192 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
+#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
+
+namespace tesseract {
+
+class IndexMapBiDi;      // Forward declaration: Init() argument and charset_map() return type.
+class IntFeatureMap;     // Forward declaration: MapSampleFeatures() argument.
+class ShapeTable;        // Forward declaration: Init() argument and shape_table() return type.
+class TrainingSample;    // Forward declaration: returned by GetSample() and MutableSample().
+class TrainingSampleSet; // Forward declaration: Init() argument and sample_set() return type.
+struct UnicharAndFonts;  // Forward declaration: returned by the private GetShapeEntry() helper.
+
+// Iterator class to encapsulate the complex iteration involved in getting
+// all samples of all shapes needed for a classification problem.
+//
+// =====INPUTS TO Init FUNCTION=====
+// The charset_map defines a subset of the sample_set classes (when the
+// shape_table is nullptr), or of the shape_table classes (when it is not).
+//
+// The shape_table (if not nullptr) defines the mapping from shapes to
+// font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
+//
+// The sample_set holds the samples and provides indexed access to samples
+// of font_id/class_id pairs.
+//
+// If randomize is true, the samples are perturbed slightly, but the
+// perturbation is guaranteed to be the same for multiple identical
+// iterations.
+//
+// =====DIFFERENT COMBINATIONS OF INPUTS=====
+// nullptr shape_table:
+// Without a shape_table, everything works in UNICHAR_IDs.
+//
+// nullptr shape_table, nullptr charset_map:
+// Iterations simply run over the samples in the order the samples occur in the
+// input files.
+// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
+//
+// nullptr shape_table, non-nullptr charset_map:
+// When shape_table is nullptr, the charset_map indexes unichar_ids directly,
+// and an iteration returns all samples of all chars in the charset_map, which
+// is a subset of the full unicharset.
+// The iteration will be in groups of the same unichar_id, in the order
+// defined by the charset_map.
+// GetCompactClassID returns the charset_map index of a sample, and
+// GetSparseClassID returns the sample UNICHAR_ID.
+//
+// Non-nullptr shape_table:
+// With a shape_table, samples are grouped according to the shape_table, so
+// multiple UNICHAR_IDs and fonts may be grouped together, and everything
+// works in shape_ids.
+//
+// Non-nullptr shape_table, nullptr charset_map:
+// Iterations simply run over the samples in the order of shape_id.
+// GetCompactClassID and GetSparseClassID both return the shape_id.
+// (If you want the unichar_id or font_id, the sample still has them.)
+//
+// Non-nullptr shape_table, non-nullptr charset_map:
+// When shape_table is not nullptr, the charset_map indexes and subsets shapes
+// in the shape_table, and iterations will be in shape_table order, not
+// charset_map order.
+// GetCompactClassID returns the charset_map index of a shape, and
+// GetSparseClassID returns the shape_id.
+//
+// =====What is SampleIterator good for?=====
+// Inside a classifier training module, the SampleIterator has abstracted away
+// all the different modes above.
+// Use the following iteration to train your classifier:
+// for (it.Begin(); !it.AtEnd(); it.Next()) {
+//   const TrainingSample& sample = it.GetSample();
+//   int class_id = it.GetCompactClassID();
+// Your classifier may or may not be dealing with a shape_table, and may be
+// dealing with some subset of the character/shape set. It doesn't need to
+// know and shouldn't care. It is just learning shapes with compact class ids
+// in the range [0, it.CompactCharsetSize()).
+class SampleIterator {
+public:
+  SampleIterator();
+  ~SampleIterator(); // NOTE(review): no copy/move control is declared alongside this destructor — confirm copying is safe.
+
+  void Clear(); // Presumably resets the state set up by Init() — confirm against the implementation.
+
+  // Sets up the iterator. See the class comment for the arguments (charset_map and shape_table may be nullptr).
+  void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize,
+            TrainingSampleSet *sample_set);
+
+  // Iterator functions designed for use with a simple for loop:
+  // for (it.Begin(); !it.AtEnd(); it.Next()) {
+  //   const TrainingSample& sample = it.GetSample();
+  //   int class_id = it.GetCompactClassID();
+  //   ...
+  // }
+  void Begin();
+  bool AtEnd() const;
+  const TrainingSample &GetSample() const;
+  TrainingSample *MutableSample() const; // Non-const pointer to a sample, callable on a const iterator.
+  // Returns the total index (from the original set of samples) of the current
+  // sample.
+  int GlobalSampleIndex() const;
+  // Returns the index of the current sample in compact charset space, so
+  // in a 2-class problem between x and y, the returned indices will all be
+  // 0 or 1, and have nothing to do with the unichar_ids.
+  // If the charset_map_ is nullptr, then this is equal to GetSparseClassID().
+  int GetCompactClassID() const;
+  // Returns the index of the current sample in sparse charset space, so
+  // in a 2-class problem between x and y, the returned indices will all be
+  // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
+  // (with a shape_table_).
+  int GetSparseClassID() const;
+  // Moves on to the next indexable sample. If the end is reached, leaves
+  // the state such that AtEnd() is true.
+  void Next();
+
+  // Returns the size of the compact charset space.
+  int CompactCharsetSize() const;
+  // Returns the size of the sparse charset space.
+  int SparseCharsetSize() const;
+
+  const IndexMapBiDi &charset_map() const {
+    return *charset_map_; // Unchecked dereference: only valid when charset_map_ is not nullptr.
+  }
+  const ShapeTable *shape_table() const {
+    return shape_table_;
+  }
+  // Returns the sample set pointer held by this iterator.
+  const TrainingSampleSet *sample_set() const {
+    return sample_set_;
+  }
+
+  // A set of functions that do something to all the samples accessed by the
+  // iterator, as it is currently set up.
+
+  // Apply the supplied feature_space/feature_map transform to all samples
+  // accessed by this iterator.
+  void MapSampleFeatures(const IntFeatureMap &feature_map);
+
+  // Adjust the weights of all the samples to be uniform in the given charset.
+  // Returns the number of samples in the iterator.
+  int UniformSamples();
+
+  // Normalize the weights of all the samples defined by the iterator so they
+  // sum to 1. Returns the minimum assigned sample weight.
+  double NormalizeSamples();
+
+private:
+  // Helper that returns the current UnicharAndFonts shape entry.
+  const UnicharAndFonts *GetShapeEntry() const;
+
+  // Map to subset the actual charset space. May be nullptr (see class comment).
+  const IndexMapBiDi *charset_map_;
+  // Shape table to recombine character classes into shapes.
+  const ShapeTable *shape_table_;
+  // The samples to iterate over.
+  TrainingSampleSet *sample_set_;
+  // Flag to control randomizing the sample features.
+  bool randomize_;
+  // Shape table owned by this iterator, used to iterate character classes.
+  ShapeTable *owned_shape_table_;
+
+  // Top-level iteration. Shape index in sparse charset_map space.
+  int shape_index_;
+  int num_shapes_;
+  // Index to the character class within a shape.
+  int shape_char_index_;
+  int num_shape_chars_;
+  // Index to the font within a shape/class pair.
+  int shape_font_index_;
+  int num_shape_fonts_;
+  // The lowest level iteration. sample_index_/num_samples_ counts samples
+  // in the current shape/class/font combination.
+  int sample_index_;
+  int num_samples_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_