diff mupdf-source/thirdparty/tesseract/src/training/common/mastertrainer.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/common/mastertrainer.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,294 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File:        mastertrainer.h
+// Description: Trainer to build the MasterClassifier.
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_TRAINING_MASTERTRAINER_H_
+#define TESSERACT_TRAINING_MASTERTRAINER_H_
+
+#include "export.h"
+
+#include "classify.h"
+#include "cluster.h"
+#include "elst.h"
+#include "errorcounter.h"
+#include "featdefs.h"
+#include "fontinfo.h"
+#include "indexmapbidi.h"
+#include "intfeaturemap.h"
+#include "intfeaturespace.h"
+#include "intfx.h"
+#include "intmatcher.h"
+#include "params.h"
+#include "shapetable.h"
+#include "trainingsample.h"
+#include "trainingsampleset.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+class ShapeClassifier;
+
+// Simple struct to hold the distance between two shapes during clustering.
+struct ShapeDist {
+  ShapeDist() : shape1(0), shape2(0), distance(0.0f) {}
+  ShapeDist(int s1, int s2, float dist) : shape1(s1), shape2(s2), distance(dist) {}
+
+  // Sort operator to sort in ascending order of distance.
+  bool operator<(const ShapeDist &other) const {
+    return distance < other.distance;
+  }
+
+  int shape1;
+  int shape2;
+  float distance;
+};
+
+// Class to encapsulate training processes that use the TrainingSampleSet.
+// Initially supports shape clustering and mftrainining.
+// Other important features of the MasterTrainer are conditioning the data
+// by outlier elimination, replication with perturbation, and serialization.
+class TESS_COMMON_TRAINING_API MasterTrainer {
+public:
+  MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples,
+                int debug_level);
+  ~MasterTrainer();
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE *fp) const;
+
+  // Loads an initial unicharset, or sets one up if the file cannot be read.
+  void LoadUnicharset(const char *filename);
+
+  // Sets the feature space definition.
+  void SetFeatureSpace(const IntFeatureSpace &fs) {
+    feature_space_ = fs;
+    feature_map_.Init(fs);
+  }
+
+  // Reads the samples and their features from the given file,
+  // adding them to the trainer with the font_id from the content of the file.
+  // If verification, then these are verification samples, not training.
+  void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs,
+                           bool verification);
+
+  // Adds the given single sample to the trainer, setting the classid
+  // appropriately from the given unichar_str.
+  void AddSample(bool verification, const char *unichar_str, TrainingSample *sample);
+
+  // Loads all pages from the given tif filename and append to page_images_.
+  // Must be called after ReadTrainingSamples, as the current number of images
+  // is used as an offset for page numbers in the samples.
+  void LoadPageImages(const char *filename);
+
+  // Cleans up the samples after initial load from the tr files, and prior to
+  // saving the MasterTrainer:
+  // Remaps fragmented chars if running shape analysis.
+  // Sets up the samples appropriately for class/fontwise access.
+  // Deletes outlier samples.
+  void PostLoadCleanup();
+
+  // Gets the samples ready for training. Use after both
+  // ReadTrainingSamples+PostLoadCleanup or DeSerialize.
+  // Re-indexes the features and computes canonical and cloud features.
+  void PreTrainingSetup();
+
+  // Sets up the master_shapes_ table, which tells which fonts should stay
+  // together until they get to a leaf node classifier.
+  void SetupMasterShapes();
+
+  // Adds the junk_samples_ to the main samples_ set. Junk samples are initially
+  // fragments and n-grams (all incorrectly segmented characters).
+  // Various training functions may result in incorrectly segmented characters
+  // being added to the unicharset of the main samples, perhaps because they
+  // form a "radical" decomposition of some (Indic) grapheme, or because they
+  // just look the same as a real character (like rn/m)
+  // This function moves all the junk samples, to the main samples_ set, but
+  // desirable junk, being any sample for which the unichar already exists in
+  // the samples_ unicharset gets the unichar-ids re-indexed to match, but
+  // anything else gets re-marked as unichar_id 0 (space character) to identify
+  // it as junk to the error counter.
+  void IncludeJunk();
+
+  // Replicates the samples and perturbs them if the enable_replication_ flag
+  // is set. MUST be used after the last call to OrganizeByFontAndClass on
+  // the training samples, ie after IncludeJunk if it is going to be used, as
+  // OrganizeByFontAndClass will eat the replicated samples into the regular
+  // samples.
+  void ReplicateAndRandomizeSamplesIfRequired();
+
+  // Loads the basic font properties file into fontinfo_table_.
+  // Returns false on failure.
+  bool LoadFontInfo(const char *filename);
+
+  // Loads the xheight font properties file into xheights_.
+  // Returns false on failure.
+  bool LoadXHeights(const char *filename);
+
+  // Reads spacing stats from filename and adds them to fontinfo_table.
+  // Returns false on failure.
+  bool AddSpacingInfo(const char *filename);
+
+  // Returns the font id corresponding to the given font name.
+  // Returns -1 if the font cannot be found.
+  int GetFontInfoId(const char *font_name);
+  // Returns the font_id of the closest matching font name to the given
+  // filename. It is assumed that a substring of the filename will match
+  // one of the fonts. If more than one is matched, the longest is returned.
+  int GetBestMatchingFontInfoId(const char *filename);
+
+  // Returns the filename of the tr file corresponding to the command-line
+  // argument with the given index.
+  const std::string &GetTRFileName(int index) const {
+    return tr_filenames_[index];
+  }
+
+  // Sets up a flat shapetable with one shape per class/font combination.
+  void SetupFlatShapeTable(ShapeTable *shape_table);
+
+  // Sets up a Clusterer for mftraining on a single shape_id.
+  // Call FreeClusterer on the return value after use.
+  CLUSTERER *SetupForClustering(const ShapeTable &shape_table,
+                                const FEATURE_DEFS_STRUCT &feature_defs, int shape_id,
+                                int *num_samples);
+
+  // Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
+  // to the given inttemp_file, and the corresponding pffmtable.
+  // The unicharset is the original encoding of graphemes, and shape_set should
+  // match the size of the shape_table, and may possibly be totally fake.
+  void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set,
+                                const ShapeTable &shape_table, CLASS_STRUCT *float_classes,
+                                const char *inttemp_file, const char *pffmtable_file);
+
+  const UNICHARSET &unicharset() const {
+    return samples_.unicharset();
+  }
+  TrainingSampleSet *GetSamples() {
+    return &samples_;
+  }
+  const ShapeTable &master_shapes() const {
+    return master_shapes_;
+  }
+
+  // Generates debug output relating to the canonical distance between the
+  // two given UTF8 grapheme strings.
+  void DebugCanonical(const char *unichar_str1, const char *unichar_str2);
+#ifndef GRAPHICS_DISABLED
+  // Debugging for cloud/canonical features.
+  // Displays a Features window containing:
+  // If unichar_str2 is in the unicharset, and canonical_font is non-negative,
+  // displays the canonical features of the char/font combination in red.
+  // If unichar_str1 is in the unicharset, and cloud_font is non-negative,
+  // displays the cloud feature of the char/font combination in green.
+  // The canonical features are drawn first to show which ones have no
+  // matches in the cloud features.
+  // Until the features window is destroyed, each click in the features window
+  // will display the samples that have that feature in a separate window.
+  void DisplaySamples(const char *unichar_str1, int cloud_font, const char *unichar_str2,
+                      int canonical_font);
+#endif // !GRAPHICS_DISABLED
+
+  void TestClassifierVOld(bool replicate_samples, ShapeClassifier *test_classifier,
+                          ShapeClassifier *old_classifier);
+
+  // Tests the given test_classifier on the internal samples.
+  // See TestClassifier for details.
+  void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples,
+                               ShapeClassifier *test_classifier, std::string *report_string);
+  // Tests the given test_classifier on the given samples
+  // error_mode indicates what counts as an error.
+  // report_levels:
+  // 0 = no output.
+  // 1 = bottom-line error rate.
+  // 2 = bottom-line error rate + time.
+  // 3 = font-level error rate + time.
+  // 4 = list of all errors + short classifier debug output on 16 errors.
+  // 5 = list of all errors + short classifier debug output on 25 errors.
+  // If replicate_samples is true, then the test is run on an extended test
+  // sample including replicated and systematically perturbed samples.
+  // If report_string is non-nullptr, a summary of the results for each font
+  // is appended to the report_string.
+  double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples,
+                        TrainingSampleSet *samples, ShapeClassifier *test_classifier,
+                        std::string *report_string);
+
+  // Returns the average (in some sense) distance between the two given
+  // shapes, which may contain multiple fonts and/or unichars.
+  // This function is public to facilitate testing.
+  float ShapeDistance(const ShapeTable &shapes, int s1, int s2);
+
+private:
+  // Replaces samples that are always fragmented with the corresponding
+  // fragment samples.
+  void ReplaceFragmentedSamples();
+
+  // Runs a hierarchical agglomerative clustering to merge shapes in the given
+  // shape_table, while satisfying the given constraints:
+  // * End with at least min_shapes left in shape_table,
+  // * No shape shall have more than max_shape_unichars in it,
+  // * Don't merge shapes where the distance between them exceeds max_dist.
+  void ClusterShapes(int min_shapes, int max_shape_unichars, float max_dist,
+                     ShapeTable *shape_table);
+
+private:
+  NormalizationMode norm_mode_;
+  // Character set we are training for.
+  UNICHARSET unicharset_;
+  // Original feature space. Subspace mapping is contained in feature_map_.
+  IntFeatureSpace feature_space_;
+  TrainingSampleSet samples_;
+  TrainingSampleSet junk_samples_;
+  TrainingSampleSet verify_samples_;
+  // Master shape table defines what fonts stay together until the leaves.
+  ShapeTable master_shapes_;
+  // Flat shape table has each unichar/font id pair in a separate shape.
+  ShapeTable flat_shapes_;
+  // Font metrics gathered from multiple files.
+  FontInfoTable fontinfo_table_;
+  // Array of xheights indexed by font ids in fontinfo_table_;
+  std::vector<int32_t> xheights_;
+
+  // Non-serialized data initialized by other means or used temporarily
+  // during loading of training samples.
+  // Number of different class labels in unicharset_.
+  int charsetsize_;
+  // Flag to indicate that we are running shape analysis and need fragments
+  // fixing.
+  bool enable_shape_analysis_;
+  // Flag to indicate that sample replication is required.
+  bool enable_replication_;
+  // Array of classids of fragments that replace the correctly segmented chars.
+  int *fragments_;
+  // Classid of previous correctly segmented sample that was added.
+  int prev_unichar_id_;
+  // Debug output control.
+  int debug_level_;
+  // Feature map used to construct reduced feature spaces for compact
+  // classifiers.
+  IntFeatureMap feature_map_;
+  // Vector of Pix pointers used for classifiers that need the image.
+  // Indexed by page_num_ in the samples.
+  // These images are owned by the trainer and need to be pixDestroyed.
+  std::vector<Image > page_images_;
+  // Vector of filenames of loaded tr files.
+  std::vector<std::string> tr_filenames_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_TRAINING_MASTERTRAINER_H_