diff mupdf-source/thirdparty/tesseract/src/ccutil/tessdatamanager.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccutil/tessdatamanager.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,255 @@
+///////////////////////////////////////////////////////////////////////
+// File:        tessdatamanager.h
+// Description: Functions to handle loading/combining tesseract data files.
+// Author:      Daria Antonova
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
+#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
+
+#include <tesseract/baseapi.h> // FileReader
+#include <string>              // std::string
+#include <vector>              // std::vector
+#include "serialis.h"          // FileWriter
+
+static const char kTrainedDataSuffix[] = "traineddata";
+
+// When adding new tessdata types and file suffixes, please make sure to
+// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
+static const char kLangConfigFileSuffix[] = "config";
+static const char kUnicharsetFileSuffix[] = "unicharset";
+static const char kAmbigsFileSuffix[] = "unicharambigs";
+static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
+static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
+static const char kNormProtoFileSuffix[] = "normproto";
+static const char kPuncDawgFileSuffix[] = "punc-dawg";
+static const char kSystemDawgFileSuffix[] = "word-dawg";
+static const char kNumberDawgFileSuffix[] = "number-dawg";
+static const char kFreqDawgFileSuffix[] = "freq-dawg";
+static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
+static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
+static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
+static const char kShapeTableFileSuffix[] = "shapetable";
+static const char kBigramDawgFileSuffix[] = "bigram-dawg";
+static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
+static const char kParamsModelFileSuffix[] = "params-model";
+static const char kLSTMModelFileSuffix[] = "lstm";
+static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
+static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
+static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
+static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
+static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";
+static const char kVersionFileSuffix[] = "version";
+
+namespace tesseract {
+
+enum TessdataType {
+  TESSDATA_LANG_CONFIG,        // 0
+  TESSDATA_UNICHARSET,         // 1
+  TESSDATA_AMBIGS,             // 2
+  TESSDATA_INTTEMP,            // 3
+  TESSDATA_PFFMTABLE,          // 4
+  TESSDATA_NORMPROTO,          // 5
+  TESSDATA_PUNC_DAWG,          // 6
+  TESSDATA_SYSTEM_DAWG,        // 7
+  TESSDATA_NUMBER_DAWG,        // 8
+  TESSDATA_FREQ_DAWG,          // 9
+  TESSDATA_FIXED_LENGTH_DAWGS, // 10  // deprecated
+  TESSDATA_CUBE_UNICHARSET,    // 11  // deprecated
+  TESSDATA_CUBE_SYSTEM_DAWG,   // 12  // deprecated
+  TESSDATA_SHAPE_TABLE,        // 13
+  TESSDATA_BIGRAM_DAWG,        // 14
+  TESSDATA_UNAMBIG_DAWG,       // 15
+  TESSDATA_PARAMS_MODEL,       // 16
+  TESSDATA_LSTM,               // 17
+  TESSDATA_LSTM_PUNC_DAWG,     // 18
+  TESSDATA_LSTM_SYSTEM_DAWG,   // 19
+  TESSDATA_LSTM_NUMBER_DAWG,   // 20
+  TESSDATA_LSTM_UNICHARSET,    // 21
+  TESSDATA_LSTM_RECODER,       // 22
+  TESSDATA_VERSION,            // 23
+
+  TESSDATA_NUM_ENTRIES
+};
+
+/**
+ * kTessdataFileSuffixes[i] indicates the file suffix for
+ * tessdata of type i (from TessdataType enum).
+ */
+static const char *const kTessdataFileSuffixes[] = {
+    kLangConfigFileSuffix,       // 0
+    kUnicharsetFileSuffix,       // 1
+    kAmbigsFileSuffix,           // 2
+    kBuiltInTemplatesFileSuffix, // 3
+    kBuiltInCutoffsFileSuffix,   // 4
+    kNormProtoFileSuffix,        // 5
+    kPuncDawgFileSuffix,         // 6
+    kSystemDawgFileSuffix,       // 7
+    kNumberDawgFileSuffix,       // 8
+    kFreqDawgFileSuffix,         // 9
+    kFixedLengthDawgsFileSuffix, // 10  // deprecated
+    kCubeUnicharsetFileSuffix,   // 11  // deprecated
+    kCubeSystemDawgFileSuffix,   // 12  // deprecated
+    kShapeTableFileSuffix,       // 13
+    kBigramDawgFileSuffix,       // 14
+    kUnambigDawgFileSuffix,      // 15
+    kParamsModelFileSuffix,      // 16
+    kLSTMModelFileSuffix,        // 17
+    kLSTMPuncDawgFileSuffix,     // 18
+    kLSTMSystemDawgFileSuffix,   // 19
+    kLSTMNumberDawgFileSuffix,   // 20
+    kLSTMUnicharsetFileSuffix,   // 21
+    kLSTMRecoderFileSuffix,      // 22
+    kVersionFileSuffix,          // 23
+};
+
+/**
+ * TessdataType could be updated to contain more entries, however
+ * we do not expect that number to be astronomically high.
+ * In order to automatically detect endianness TessdataManager will
+ * flip the bits if actual_tessdata_num_entries_ is larger than
+ * kMaxNumTessdataEntries.
+ */
+static const int kMaxNumTessdataEntries = 1000;
+
+class TESS_API TessdataManager {
+public:
+  TessdataManager();
+  explicit TessdataManager(FileReader reader);
+
+  ~TessdataManager() = default;
+
+  bool swap() const {
+    return swap_;
+  }
+  bool is_loaded() const {
+    return is_loaded_;
+  }
+
+  // Lazily loads from the given filename. Won't actually read the file
+  // until it needs it.
+  void LoadFileLater(const char *data_file_name);
+  /**
+   * Opens and reads the given data file right now.
+   * @return true on success.
+   */
+  bool Init(const char *data_file_name);
+  // Loads from the given memory buffer as if a file, remembering name as some
+  // arbitrary source id for caching.
+  bool LoadMemBuffer(const char *name, const char *data, int size);
+  // Overwrites a single entry of the given type.
+  void OverwriteEntry(TessdataType type, const char *data, int size);
+
+  // Saves to the given filename.
+  bool SaveFile(const char *filename, FileWriter writer) const;
+  // Serializes to the given vector.
+  void Serialize(std::vector<char> *data) const;
+  // Resets to the initial state, keeping the reader.
+  void Clear();
+
+  // Prints a directory of contents.
+  void Directory() const;
+
+  // Returns true if the component requested is present.
+  bool IsComponentAvailable(TessdataType type) const {
+    return !entries_[type].empty();
+  }
+  // Opens the given TFile pointer to the given component type.
+  // Returns false in case of failure.
+  bool GetComponent(TessdataType type, TFile *fp);
+  // As non-const version except it can't load the component if not already
+  // loaded.
+  bool GetComponent(TessdataType type, TFile *fp) const;
+
+  // Returns the current version string.
+  std::string VersionString() const;
+  // Sets the version string to the given v_str.
+  void SetVersionString(const std::string &v_str);
+
+  // Returns true if the base Tesseract components are present.
+  bool IsBaseAvailable() const {
+    return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();
+  }
+
+  // Returns true if the LSTM components are present.
+  bool IsLSTMAvailable() const {
+    return !entries_[TESSDATA_LSTM].empty();
+  }
+
+  // Return the name of the underlying data file.
+  const std::string &GetDataFileName() const {
+    return data_file_name_;
+  }
+
+  /**
+   * Reads all the standard tesseract config and data files for a language
+   * at the given path and bundles them up into one binary data file.
+   * Returns true if the combined traineddata file was successfully written.
+   */
+  bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename);
+
+  /**
+   * Gets the individual components from the data_file_ with which the class was
+   * initialized. Overwrites the components specified by component_filenames.
+   * Writes the updated traineddata file to new_traineddata_filename.
+   */
+  bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames,
+                           int num_new_components);
+
+  /**
+   * Extracts tessdata component implied by the name of the input file from
+   * the combined traineddata loaded into TessdataManager.
+   * Writes the extracted component to the file indicated by the file name.
+   * E.g. if the filename given is somepath/somelang.unicharset, unicharset
+   * will be extracted from the data loaded into the TessdataManager and will
+   * be written to somepath/somelang.unicharset.
+   * @return true if the component was successfully extracted, false if the
+   * component was not present in the traineddata loaded into TessdataManager.
+   */
+  bool ExtractToFile(const char *filename);
+
+private:
+  // Use libarchive.
+  bool LoadArchiveFile(const char *filename);
+
+  /**
+   * Fills type with TessdataType of the tessdata component represented by the
+   * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
+   * @return true if the tessdata component type could be determined
+   * from the given file name.
+   */
+  static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type);
+
+  /**
+   * Tries to determine tessdata component file suffix from filename,
+   * returns true on success.
+   */
+  static bool TessdataTypeFromFileName(const char *filename, TessdataType *type);
+
+  // Name of file it came from.
+  std::string data_file_name_;
+  // Function to load the file when we need it.
+  FileReader reader_;
+  // True if the file has been loaded.
+  bool is_loaded_;
+  // True if the bytes need swapping.
+  bool swap_;
+  // Contents of each element of the traineddata file.
+  std::vector<char> entries_[TESSDATA_NUM_ENTRIES];
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_