diff mupdf-source/thirdparty/tesseract/src/training/common/sampleiterator.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/common/sampleiterator.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,268 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "sampleiterator.h"
+
+#include "intfeaturemap.h"
+
+#include "indexmapbidi.h"
+#include "shapetable.h"
+#include "trainingsample.h"
+#include "trainingsampleset.h"
+
+namespace tesseract {
+
+// ================== SampleIterator Implementation =================
+
+// Builds an iterator that is not attached to any charset map, shape table
+// or sample set. Init() must supply those before samples can be fetched.
+SampleIterator::SampleIterator()
+    : charset_map_(nullptr),
+      shape_table_(nullptr),
+      sample_set_(nullptr),
+      randomize_(false),
+      owned_shape_table_(nullptr) {
+  // With zero shapes and no shape table, the Next() issued by Begin() moves
+  // shape_index_ to 0, so AtEnd() is true straight away.
+  num_shapes_ = 0;
+  Begin();
+}
+
+// Frees the shape table owned by this iterator, if any, via Clear().
+SampleIterator::~SampleIterator() {
+  Clear();
+}
+
+// Deletes the shape table owned by this iterator (if any) and forgets it.
+// Pointers that were merely handed to Init() are left untouched.
+void SampleIterator::Clear() {
+  // Detach first, then destroy; delete on a null pointer is a no-op.
+  auto *doomed_table = owned_shape_table_;
+  owned_shape_table_ = nullptr;
+  delete doomed_table;
+}
+
+// (Re)initializes the iterator and positions it on the first indexable
+// sample. See class comment for arguments.
+void SampleIterator::Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table,
+                          bool randomize, TrainingSampleSet *sample_set) {
+  // Drop any shape table left over from a previous Init().
+  Clear();
+  charset_map_ = charset_map;
+  shape_table_ = shape_table;
+  sample_set_ = sample_set;
+  randomize_ = randomize;
+  // A charset map without a shape table means the caller wants to iterate by
+  // class. That is done with a dummy shape table that this iterator owns.
+  const bool iterate_by_class = shape_table_ == nullptr && charset_map_ != nullptr;
+  if (iterate_by_class) {
+    const int font_count = sample_set_->NumFonts();
+    owned_shape_table_ = new ShapeTable(sample_set_->unicharset());
+    const int class_count = sample_set_->unicharset().size();
+    for (int unichar = 0; unichar < class_count; ++unichar) {
+      // Every character gets a shape (seeded with font 0), whether or not it
+      // has samples, so that the shape index stays equal to the unichar_id.
+      const int shape = owned_shape_table_->AddShape(unichar, 0);
+      // Remaining fonts are only added when they have samples for this class.
+      for (int font = 1; font < font_count; ++font) {
+        if (sample_set_->NumClassSamples(font, unichar, true) <= 0) {
+          continue;
+        }
+        owned_shape_table_->AddToShape(shape, unichar, font);
+      }
+    }
+    shape_table_ = owned_shape_table_;
+  }
+  if (shape_table_ == nullptr) {
+    // Plain iteration over the samples: the "shapes" are the samples.
+    num_shapes_ = randomize ? sample_set_->num_samples() : sample_set_->num_raw_samples();
+  } else {
+    num_shapes_ = shape_table_->NumShapes();
+  }
+  Begin();
+}
+
+// Rewinds the iterator to the first indexable sample. Together with AtEnd()
+// and Next() this supports a simple for loop:
+// for (it.Begin(); !it.AtEnd(); it.Next()) {
+//   const TrainingSample& sample = it.GetSample();
+// }
+void SampleIterator::Begin() {
+  // Park the top-level index just before the first shape.
+  shape_index_ = -1;
+  // All the counts are zero, so with a shape table the Next() below rolls
+  // over every nested level and lands in the first usable shape.
+  num_shape_chars_ = 0;
+  num_shape_fonts_ = 0;
+  num_samples_ = 0;
+  shape_char_index_ = 0;
+  shape_font_index_ = 0;
+  sample_index_ = 0;
+  // Step onto the first indexable sample.
+  Next();
+}
+
+// Returns true once the top-level index has moved past the last shape.
+bool SampleIterator::AtEnd() const {
+  const bool still_inside = shape_index_ < num_shapes_;
+  return !still_inside;
+}
+
+// Returns the sample at the current iterator position.
+const TrainingSample &SampleIterator::GetSample() const {
+  if (shape_table_ == nullptr) {
+    // No shape table: shape_index_ indexes the sample set directly.
+    return *sample_set_->GetSample(shape_index_);
+  }
+  // Otherwise look the sample up by the font/class of the current entry.
+  const UnicharAndFonts *entry = GetShapeEntry();
+  const int unichar = entry->unichar_id;
+  const int font = entry->font_ids[shape_font_index_];
+  return *sample_set_->GetSample(font, unichar, sample_index_);
+}
+
+// Returns a modifiable pointer to the sample at the current position.
+TrainingSample *SampleIterator::MutableSample() const {
+  if (shape_table_ == nullptr) {
+    // No shape table: shape_index_ indexes the sample set directly.
+    return sample_set_->mutable_sample(shape_index_);
+  }
+  // Otherwise look the sample up by the font/class of the current entry.
+  const UnicharAndFonts *entry = GetShapeEntry();
+  const int unichar = entry->unichar_id;
+  const int font = entry->font_ids[shape_font_index_];
+  return sample_set_->MutableSample(font, unichar, sample_index_);
+}
+
+// Returns the total index (from the original set of samples) of the
+// sample at the current position.
+int SampleIterator::GlobalSampleIndex() const {
+  if (shape_table_ == nullptr) {
+    // No shape table: the iteration index already is the global index.
+    return shape_index_;
+  }
+  // Otherwise the sample set translates font/class/index to a global index.
+  const UnicharAndFonts *entry = GetShapeEntry();
+  const int unichar = entry->unichar_id;
+  const int font = entry->font_ids[shape_font_index_];
+  return sample_set_->GlobalSampleIndex(font, unichar, sample_index_);
+}
+
+// Returns the index of the current sample in compact charset space. In a
+// 2-class problem between x and y the results are therefore all 0 or 1,
+// unrelated to the unichar_ids.
+// Without a charset_map_ the result is the same as GetSparseClassID().
+int SampleIterator::GetCompactClassID() const {
+  if (charset_map_ == nullptr) {
+    return GetSparseClassID();
+  }
+  return charset_map_->SparseToCompact(shape_index_);
+}
+// Returns the index of the current sample in sparse charset space. In a
+// 2-class problem between x and y the results are all x or y, where x and
+// y are shape_ids when there is a shape_table_ and the class_id of the
+// sample (unichar_ids) when there is none.
+int SampleIterator::GetSparseClassID() const {
+  if (shape_table_ == nullptr) {
+    return GetSample().class_id();
+  }
+  return shape_index_;
+}
+
+// Moves on to the next indexable sample. If the end is reached, leaves
+// the state such that AtEnd() is true.
+//
+// With a shape_table_ the position is a set of nested counters, advanced
+// innermost first: sample_index_ (sample within the current class/font),
+// shape_font_index_ (font within the current shape entry),
+// shape_char_index_ (entry within the current shape) and shape_index_
+// (shape). Shapes for which charset_map_->SparseToCompact() is negative and
+// class/font combinations with no samples are skipped.
+// Without a shape_table_ only shape_index_ is advanced.
+void SampleIterator::Next() {
+  if (shape_table_ != nullptr) {
+    // Next sample in this class/font combination.
+    ++sample_index_;
+    if (sample_index_ < num_samples_) {
+      return;
+    }
+    // Next font in this class in this shape.
+    sample_index_ = 0;
+    // Keep advancing until a class/font combination that has samples is
+    // found, or the shapes run out.
+    do {
+      ++shape_font_index_;
+      // num_shape_fonts_ still holds the font count recorded for the entry
+      // visited last (0 straight after Begin()); it is refreshed below.
+      if (shape_font_index_ >= num_shape_fonts_) {
+        // Next unichar in this shape.
+        shape_font_index_ = 0;
+        ++shape_char_index_;
+        if (shape_char_index_ >= num_shape_chars_) {
+          // Find the next shape that is mapped in the charset_map_.
+          shape_char_index_ = 0;
+          // With no charset_map_ this loop advances by exactly one shape.
+          do {
+            ++shape_index_;
+          } while (shape_index_ < num_shapes_ && charset_map_ != nullptr &&
+                   charset_map_->SparseToCompact(shape_index_) < 0);
+          if (shape_index_ >= num_shapes_) {
+            return; // The end.
+          }
+          num_shape_chars_ = shape_table_->GetShape(shape_index_).size();
+        }
+      }
+      // Load the font count and sample count for the entry now selected.
+      // NOTE(review): entry 0 and font 0 are read without a size check, so
+      // this assumes every shape has at least one entry and every entry at
+      // least one font - TODO confirm against ShapeTable.
+      const UnicharAndFonts *shape_entry = GetShapeEntry();
+      num_shape_fonts_ = shape_entry->font_ids.size();
+      int char_id = shape_entry->unichar_id;
+      int font_id = shape_entry->font_ids[shape_font_index_];
+      num_samples_ = sample_set_->NumClassSamples(font_id, char_id, randomize_);
+    } while (num_samples_ == 0);
+  } else {
+    // We are just iterating over the samples.
+    ++shape_index_;
+  }
+}
+
+// Returns the size of the compact charset space: the compact size of the
+// charset_map_ when there is one, otherwise the sparse charset size.
+int SampleIterator::CompactCharsetSize() const {
+  if (charset_map_ == nullptr) {
+    return SparseCharsetSize();
+  }
+  return charset_map_->CompactSize();
+}
+
+// Returns the size of the sparse charset space. The first available source
+// wins: the charset_map_, then the shape_table_, then the sample set.
+int SampleIterator::SparseCharsetSize() const {
+  if (charset_map_ != nullptr) {
+    return charset_map_->SparseSize();
+  }
+  if (shape_table_ != nullptr) {
+    return shape_table_->NumShapes();
+  }
+  return sample_set_->charsetsize();
+}
+
+// Fills the mapped_features_ of sample s from its features, using the
+// feature space and mapping of the provided feature_map, and marks the
+// sample as holding mapped (not indexed) features.
+static void MapFeatures(TrainingSample &s, const IntFeatureMap &feature_map) {
+  // Step 1: index and sort the raw features in the map's feature space.
+  std::vector<int> sorted_indices;
+  const auto &space = feature_map.feature_space();
+  space.IndexAndSortFeatures(s.features(), s.num_features(), &sorted_indices);
+  // Step 2: push the indexed features through the map.
+  feature_map.MapIndexedFeatures(sorted_indices, &s.mapped_features_);
+  s.features_are_mapped_ = true;
+  s.features_are_indexed_ = false;
+}
+
+// Applies the supplied feature_space/feature_map transform to every sample
+// reachable through this iterator. Leaves the iterator at its end.
+void SampleIterator::MapSampleFeatures(const IntFeatureMap &feature_map) {
+  Begin();
+  while (!AtEnd()) {
+    MapFeatures(*MutableSample(), feature_map);
+    Next();
+  }
+}
+
+// Makes the weights of all the samples in the given charset uniform: each
+// sample gets weight 1 and the weights are then normalized.
+// Returns the number of samples in the iterator.
+int SampleIterator::UniformSamples() {
+  int visited = 0;
+  Begin();
+  while (!AtEnd()) {
+    MutableSample()->set_weight(1.0);
+    ++visited;
+    Next();
+  }
+  // Rescale the now-equal weights so that they sum to 1.
+  NormalizeSamples();
+  return visited;
+}
+
+// Rescales the weights of all the samples in the charset_map so that they
+// sum to 1. Returns the minimum assigned sample weight, or 1.0 when the
+// total weight is not positive and nothing is changed.
+double SampleIterator::NormalizeSamples() {
+  // Pass 1: add up the current weights.
+  double weight_sum = 0.0;
+  for (Begin(); !AtEnd(); Next()) {
+    weight_sum += GetSample().weight();
+  }
+  double smallest = 1.0;
+  if (!(weight_sum > 0.0)) {
+    // Nothing to divide by; leave the weights alone.
+    return smallest;
+  }
+  // Pass 2: divide each weight by the sum, tracking the smallest result.
+  for (Begin(); !AtEnd(); Next()) {
+    TrainingSample *current = MutableSample();
+    const double scaled = current->weight() / weight_sum;
+    smallest = scaled < smallest ? scaled : smallest;
+    current->set_weight(scaled);
+  }
+  return smallest;
+}
+
+// Helper returns the UnicharAndFonts entry selected by shape_index_ and
+// shape_char_index_. Only valid while there is a shape_table_.
+const UnicharAndFonts *SampleIterator::GetShapeEntry() const {
+  return &shape_table_->GetShape(shape_index_)[shape_char_index_];
+}
+
+} // namespace tesseract.