diff mupdf-source/thirdparty/tesseract/src/training/combine_tessdata.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/combine_tessdata.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,279 @@
+///////////////////////////////////////////////////////////////////////
+// File:        combine_tessdata.cpp
+// Description: Creates a unified traineddata file from several
+//              data files produced by the training process.
+// Author:      Daria Antonova
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "commontraining.h" // CheckSharedLibraryVersion
+#include "lstmrecognizer.h"
+#include "tessdatamanager.h"
+
+#include <cerrno>
+#include <iostream> // std::cout
+
+using namespace tesseract;
+
+// Lists the components of a traineddata file by calling tm.Directory().
+//
+// A non-null filename is loaded into tm first. A null filename means tm is
+// used in its current state (main() does this in -ld mode, after
+// list_network() has already loaded the file).
+// Returns EXIT_FAILURE if the file cannot be read, EXIT_SUCCESS otherwise.
+static int list_components(TessdataManager &tm, const char *filename) {
+  if (filename != nullptr) {
+    const bool loaded = tm.Init(filename);
+    if (!loaded) {
+      tprintf("Failed to read %s\n", filename);
+      return EXIT_FAILURE;
+    }
+  }
+  tm.Directory();
+  return EXIT_SUCCESS;
+}
+
+// Prints a summary of the LSTM network stored in a traineddata file.
+//
+// A non-null filename is loaded into tm first. A null filename means tm is
+// used in its current state (main() does this in -dl mode, after
+// list_components() has already loaded the file).
+// If tm has no TESSDATA_LSTM component nothing is printed and the call still
+// succeeds.
+// Returns EXIT_FAILURE if the file cannot be read or the LSTM component
+// cannot be deserialized, EXIT_SUCCESS otherwise.
+static int list_network(TessdataManager &tm, const char *filename) {
+  if (filename != nullptr && !tm.Init(filename)) {
+    tprintf("Failed to read %s\n", filename);
+    return EXIT_FAILURE;
+  }
+  tesseract::TFile fp;
+  if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
+    // No LSTM component: there is simply nothing to list.
+    return EXIT_SUCCESS;
+  }
+  tesseract::LSTMRecognizer recognizer;
+  if (!recognizer.DeSerialize(&tm, &fp)) {
+    // filename is null when the caller loaded tm itself; a null pointer must
+    // never be handed to %s, so substitute a readable placeholder.
+    tprintf("Failed to deserialize LSTM in %s!\n",
+            filename != nullptr ? filename : "(previously loaded traineddata)");
+    return EXIT_FAILURE;
+  }
+  std::cout << "LSTM: network=" << recognizer.GetNetwork()
+            << ", int_mode=" << recognizer.IsIntMode()
+            << ", recoding=" << recognizer.IsRecoding()
+            << ", iteration=" << recognizer.training_iteration()
+            << ", sample_iteration=" << recognizer.sample_iteration()
+            << ", null_char=" << recognizer.null_char()
+            << ", learning_rate=" << recognizer.learning_rate()
+            << ", momentum=" << recognizer.GetMomentum()
+            << ", adam_beta=" << recognizer.GetAdamBeta() << '\n';
+
+  std::cout << "Layer Learning Rates: ";
+  const auto layers = recognizer.EnumerateLayers();
+  // Emit the separator before every entry except the first, so no entry has
+  // to be compared against the last layer id.
+  const char *separator = "";
+  for (const auto &id : layers) {
+    const auto layer = recognizer.GetLayer(id);
+    std::cout << separator << id << "(" << layer->name() << ")"
+              << "=" << recognizer.GetLayerLearningRate(id);
+    separator = ", ";
+  }
+  std::cout << "\n";
+  return EXIT_SUCCESS;
+}
+
+// Main program to combine/extract/overwrite tessdata components
+// in [lang].traineddata files.
+//
+// To combine all the individual tessdata components (unicharset, DAWGs,
+// classifier templates, ambiguities, language configs) located at, say,
+// /home/$USER/temp/eng.* run:
+//
+//   combine_tessdata /home/$USER/temp/eng.
+//
+// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
+//
+// Specify option -e if you would like to extract individual components
+// from a combined traineddata file. For example, to extract language config
+// file and the unicharset from tessdata/eng.traineddata run:
+//
+//   combine_tessdata -e tessdata/eng.traineddata
+//   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
+//
+// The desired config file and unicharset will be written to
+// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
+//
+// Specify option -o to overwrite individual components of the given
+// [lang].traineddata file. For example, to overwrite language config
+// and unichar ambiguities files in tessdata/eng.traineddata use:
+//
+//   combine_tessdata -o tessdata/eng.traineddata
+//   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
+//
+// As a result, tessdata/eng.traineddata will contain the new language config
+// and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
+//
+// Note: the file names of the files to extract to and to overwrite from should
+// have the appropriate file suffixes (extensions) indicating their tessdata
+// component type (.unicharset for the unicharset, .unicharambigs for unichar
+// ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
+//
+// Specify option -u to unpack all the components to the specified path:
+//
+//   combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
+//
+// This will create /home/$USER/temp/eng.* files with individual tessdata
+// components from tessdata/eng.traineddata.
+//
+// Entry point: selects the operating mode from the command line.
+//
+//   prefix                    combine the component files <prefix>.* into
+//                             <prefix>.traineddata
+//   -e file components...     extract the named components from file
+//   -u file prefix            unpack every component to <prefix>.*
+//   -o file components...     overwrite components of file
+//   -c file                   convert the LSTM component to int and save it
+//   -d / -l / -dl / -ld file  list the component directory and/or the
+//                             LSTM network information
+//   -v / --version            print the library version
+//
+// Any other command line prints the usage text and returns EXIT_FAILURE.
+// The combine, -e, -u, -o and -c modes finish by calling tm.Directory() and
+// return EXIT_SUCCESS; every reported failure returns EXIT_FAILURE.
+int main(int argc, char **argv) {
+  tesseract::CheckSharedLibraryVersion();
+
+  // Returns path_prefix with a '.' appended unless it already ends in one.
+  // Going through std::string keeps an empty argument from being indexed
+  // one character before its start.
+  const auto with_trailing_dot = [](const char *path_prefix) {
+    std::string result = path_prefix;
+    if (result.empty() || result.back() != '.') {
+      result += '.';
+    }
+    return result;
+  };
+
+  tesseract::TessdataManager tm;
+  if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
+    printf("%s\n", tesseract::TessBaseAPI::Version());
+    return EXIT_SUCCESS;
+  } else if (argc == 2) {
+    printf("Combining tessdata files\n");
+    const std::string lang = with_trailing_dot(argv[1]);
+    std::string output_file = lang;
+    output_file += kTrainedDataSuffix;
+    if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {
+      printf("Error combining tessdata files into %s\n", output_file.c_str());
+      // A failed combine must not be reported as success to the caller.
+      return EXIT_FAILURE;
+    }
+    printf("Output %s created successfully.\n", output_file.c_str());
+  } else if (argc >= 4 &&
+             (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) {
+    // Initialize TessdataManager with the data in the given traineddata file.
+    if (!tm.Init(argv[2])) {
+      tprintf("Failed to read %s\n", argv[2]);
+      return EXIT_FAILURE;
+    }
+    printf("Extracting tessdata components from %s\n", argv[2]);
+    if (strcmp(argv[1], "-e") == 0) {
+      for (int i = 3; i < argc; ++i) {
+        // errno tells a missing component (errno stays 0) apart from an
+        // I/O failure while writing the output file.
+        errno = 0;
+        if (tm.ExtractToFile(argv[i])) {
+          printf("Wrote %s\n", argv[i]);
+        } else if (errno == 0) {
+          printf(
+              "Not extracting %s, since this component"
+              " is not present\n",
+              argv[i]);
+          return EXIT_FAILURE;
+        } else {
+          printf("Error, could not extract %s: %s\n", argv[i], strerror(errno));
+          return EXIT_FAILURE;
+        }
+      }
+    } else { // extract all the components
+      // The output prefix is the same for every component, so build it once.
+      const std::string prefix = with_trailing_dot(argv[3]);
+      for (int i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
+        const std::string filename =
+            prefix + tesseract::kTessdataFileSuffixes[i];
+        errno = 0;
+        if (tm.ExtractToFile(filename.c_str())) {
+          printf("Wrote %s\n", filename.c_str());
+        } else if (errno != 0) {
+          // Components that are simply absent (errno == 0) are skipped
+          // silently; only real write errors abort the unpacking.
+          printf("Error, could not extract %s: %s\n", filename.c_str(),
+                 strerror(errno));
+          return EXIT_FAILURE;
+        }
+      }
+    }
+  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
+    // Rename the current traineddata file to a temporary name.
+    const char *new_traineddata_filename = argv[2];
+    std::string traineddata_filename = new_traineddata_filename;
+    traineddata_filename += ".__tmp__";
+    if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {
+      tprintf("Failed to create a temporary file %s\n",
+              traineddata_filename.c_str());
+      return EXIT_FAILURE;
+    }
+
+    // Initialize TessdataManager with the data in the given traineddata file.
+    if (!tm.Init(traineddata_filename.c_str())) {
+      tprintf("Failed to read %s\n", traineddata_filename.c_str());
+      // Nothing has been written yet, so move the original file back to
+      // where the user expects it.
+      if (rename(traineddata_filename.c_str(), new_traineddata_filename) != 0) {
+        tprintf("Failed to restore %s from %s\n", new_traineddata_filename,
+                traineddata_filename.c_str());
+      }
+      return EXIT_FAILURE;
+    }
+
+    // Write the updated traineddata file.
+    if (!tm.OverwriteComponents(new_traineddata_filename, argv + 3,
+                                argc - 3)) {
+      tprintf("Failed to overwrite components of %s, original data is in %s\n",
+              new_traineddata_filename, traineddata_filename.c_str());
+      return EXIT_FAILURE;
+    }
+  } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
+    if (!tm.Init(argv[2])) {
+      tprintf("Failed to read %s\n", argv[2]);
+      return EXIT_FAILURE;
+    }
+    tesseract::TFile fp;
+    if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
+      tprintf("No LSTM Component found in %s!\n", argv[2]);
+      return EXIT_FAILURE;
+    }
+    tesseract::LSTMRecognizer recognizer;
+    if (!recognizer.DeSerialize(&tm, &fp)) {
+      tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
+      return EXIT_FAILURE;
+    }
+    recognizer.ConvertToInt();
+    // Serialize the converted network into memory, then swap it in for the
+    // LSTM entry before saving the file.
+    std::vector<char> lstm_data;
+    fp.OpenWrite(&lstm_data);
+    ASSERT_HOST(recognizer.Serialize(&tm, &fp));
+    // data() is well defined even if the vector is empty; &v[0] is not.
+    tm.OverwriteEntry(tesseract::TESSDATA_LSTM, lstm_data.data(),
+                      lstm_data.size());
+    if (!tm.SaveFile(argv[2], nullptr)) {
+      tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
+      return EXIT_FAILURE;
+    }
+  } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
+    return list_components(tm, argv[2]);
+  } else if (argc == 3 && strcmp(argv[1], "-l") == 0) {
+    return list_network(tm, argv[2]);
+  } else if (argc == 3 && strcmp(argv[1], "-dl") == 0) {
+    // The first call loads the file; the second reuses tm (nullptr filename).
+    int result = list_components(tm, argv[2]);
+    if (result == EXIT_SUCCESS) {
+      result = list_network(tm, nullptr);
+    }
+    return result;
+  } else if (argc == 3 && strcmp(argv[1], "-ld") == 0) {
+    int result = list_network(tm, argv[2]);
+    if (result == EXIT_SUCCESS) {
+      result = list_components(tm, nullptr);
+    }
+    return result;
+  } else {
+    printf(
+        "Usage for combining tessdata components:\n"
+        "  %s language_data_path_prefix\n"
+        "  (e.g. %s tessdata/eng.)\n\n",
+        argv[0], argv[0]);
+    printf(
+        "Usage for extracting tessdata components:\n"
+        "  %s -e traineddata_file [output_component_file...]\n"
+        "  (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
+        argv[0], argv[0]);
+    printf(
+        "Usage for overwriting tessdata components:\n"
+        "  %s -o traineddata_file [input_component_file...]\n"
+        "  (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
+        argv[0], argv[0]);
+    printf(
+        "Usage for unpacking all tessdata components:\n"
+        "  %s -u traineddata_file output_path_prefix\n"
+        "  (e.g. %s -u eng.traineddata tmp/eng.)\n\n",
+        argv[0], argv[0]);
+    printf(
+        "Usage for listing the network information\n"
+        "  %s -l traineddata_file\n"
+        "  (e.g. %s -l eng.traineddata)\n\n",
+        argv[0], argv[0]);
+    printf(
+        "Usage for listing directory of components:\n"
+        "  %s -d traineddata_file\n\n",
+        argv[0]);
+    // Terminated with a blank line so the note does not run into the next
+    // usage section.
+    printf(
+        "NOTE: Above two flags may be combined as -dl or -ld to get both "
+        "outputs\n\n");
+    printf(
+        "Usage for compacting LSTM component to int:\n"
+        "  %s -c traineddata_file\n",
+        argv[0]);
+    return EXIT_FAILURE;
+  }
+  tm.Directory();
+  return EXIT_SUCCESS;
+}