diff mupdf-source/thirdparty/tesseract/src/training/common/commontraining.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/common/commontraining.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,186 @@
+// Copyright 2008 Google Inc. All Rights Reserved.
+// Author: scharron@google.com (Samuel Charron)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
+#define TESSERACT_TRAINING_COMMONTRAINING_H_
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "commandlineflags.h"
+#include "export.h"
+#include "tprintf.h"
+
+#include <tesseract/baseapi.h>
+
+#include <memory>
+
+namespace tesseract {
+
+TESS_COMMON_TRAINING_API
+void ParseArguments(int *argc, char ***argv);
+
+// Check whether the shared tesseract library is the right one.
+// This function must be inline because otherwise it would be part of
+// the shared library, so it could not compare the versions.
+static inline void CheckSharedLibraryVersion() {
+#ifdef HAVE_CONFIG_H
+  if (!!strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version())) {
+    tprintf(
+        "ERROR: shared library version mismatch (was %s, expected %s\n"
+        "Did you use a wrong shared tesseract library?\n",
+        TessBaseAPI::Version(), TESSERACT_VERSION_STR);
+    exit(1);
+  }
+#endif
+}
+
+} // namespace tesseract
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+#  include "cluster.h"
+#  include "featdefs.h"
+#  include "intproto.h"
+#  include "oldlist.h"
+
+namespace tesseract {
+
+class Classify;
+class MasterTrainer;
+class ShapeTable;
+
+//////////////////////////////////////////////////////////////////////////////
+// Globals ///////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+TESS_COMMON_TRAINING_API
+extern FEATURE_DEFS_STRUCT feature_defs;
+
+// Must be defined in the file that "implements" commonTraining facilities.
+TESS_COMMON_TRAINING_API
+extern CLUSTERCONFIG Config;
+
+//////////////////////////////////////////////////////////////////////////////
+// Structs ///////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+struct LABELEDLISTNODE {
+  /// This constructor allocates a new, empty labeled list and gives
+  /// it the specified label.
+  /// @param Label label for new list
+  LABELEDLISTNODE(const char *label) : Label(label) {
+  }
+  std::string Label;
+  int SampleCount = 0;
+  int font_sample_count = 0;
+  LIST List = nullptr;
+};
+using LABELEDLIST = LABELEDLISTNODE *;
+
+struct MERGE_CLASS_NODE {
+  MERGE_CLASS_NODE(const char * label) : Label(label), Class(NewClass(MAX_NUM_PROTOS, MAX_NUM_CONFIGS)) {
+  }
+  std::string Label;
+  int NumMerged[MAX_NUM_PROTOS];
+  tesseract::CLASS_TYPE Class;
+};
+using MERGE_CLASS = MERGE_CLASS_NODE *;
+
+//////////////////////////////////////////////////////////////////////////////
+// Functions /////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+// Helper loads shape table from the given file.
+ShapeTable *LoadShapeTable(const std::string &file_prefix);
+// Helper to write the shape_table.
+TESS_COMMON_TRAINING_API
+void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table);
+
+// Creates a MasterTraininer and loads the training data into it:
+// Initializes feature_defs and IntegerFX.
+// Loads the shape_table if shape_table != nullptr.
+// Loads initial unicharset from -U command-line option.
+// If FLAGS_input_trainer is set, loads the majority of data from there, else:
+//   Loads font info from -F option.
+//   Loads xheights from -X option.
+//   Loads samples from .tr files in remaining command-line args.
+//   Deletes outliers and computes canonical samples.
+//   If FLAGS_output_trainer is set, saves the trainer for future use.
+// Computes canonical and cloud features.
+// If shape_table is not nullptr, but failed to load, make a fake flat one,
+// as shape clustering was not run.
+TESS_COMMON_TRAINING_API
+std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
+                                                ShapeTable **shape_table, std::string &file_prefix);
+
+LABELEDLIST FindList(tesseract::LIST List, const std::string &Label);
+
+TESS_COMMON_TRAINING_API
+void ReadTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &feature_defs,
+                         const char *feature_name, int max_samples,
+                         tesseract::UNICHARSET *unicharset, FILE *file,
+                         tesseract::LIST *training_samples);
+
+void WriteTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory,
+                          tesseract::LIST CharList, const char *program_feature_type);
+
+TESS_COMMON_TRAINING_API
+void FreeTrainingSamples(tesseract::LIST CharList);
+
+TESS_COMMON_TRAINING_API
+void FreeLabeledList(LABELEDLIST LabeledList);
+
+TESS_COMMON_TRAINING_API
+void FreeLabeledClassList(tesseract::LIST ClassListList);
+
+TESS_COMMON_TRAINING_API
+tesseract::CLUSTERER *SetUpForClustering(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs,
+                                         LABELEDLIST CharSample, const char *program_feature_type);
+
+TESS_COMMON_TRAINING_API
+tesseract::LIST RemoveInsignificantProtos(tesseract::LIST ProtoList, bool KeepSigProtos,
+                                          bool KeepInsigProtos, int N);
+
+TESS_COMMON_TRAINING_API
+void CleanUpUnusedData(tesseract::LIST ProtoList);
+
+TESS_COMMON_TRAINING_API
+void MergeInsignificantProtos(tesseract::LIST ProtoList, const char *label,
+                              tesseract::CLUSTERER *Clusterer, tesseract::CLUSTERCONFIG *Config);
+
+TESS_COMMON_TRAINING_API
+MERGE_CLASS FindClass(tesseract::LIST List, const std::string &Label);
+
+TESS_COMMON_TRAINING_API
+tesseract::CLASS_STRUCT *SetUpForFloat2Int(const tesseract::UNICHARSET &unicharset,
+                                           tesseract::LIST LabeledClassList);
+
+void Normalize(float *Values);
+
+TESS_COMMON_TRAINING_API
+void FreeNormProtoList(tesseract::LIST CharList);
+
+TESS_COMMON_TRAINING_API
+void AddToNormProtosList(tesseract::LIST *NormProtoList, tesseract::LIST ProtoList, const std::string &CharName);
+
+TESS_COMMON_TRAINING_API
+int NumberOfProtos(tesseract::LIST ProtoList, bool CountSigProtos, bool CountInsigProtos);
+
+void allocNormProtos();
+
+} // namespace tesseract
+
+#endif // def DISABLED_LEGACY_ENGINE
+
+#endif // TESSERACT_TRAINING_COMMONTRAINING_H_