diff mupdf-source/thirdparty/tesseract/src/ccutil/tessdatamanager.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccutil/tessdatamanager.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,340 @@
+///////////////////////////////////////////////////////////////////////
+// File:        tessdatamanager.cpp
+// Description: Functions to handle loading/combining tesseract data files.
+// Author:      Daria Antonova
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "tessdatamanager.h"
+
+#include <cstdio>
+#include <string>
+
+#if defined(HAVE_LIBARCHIVE)
+#  include <archive.h>
+#  include <archive_entry.h>
+#endif
+
+#include <tesseract/version.h>
+#include "errcode.h"
+#include "helpers.h"
+#include "params.h"
+#include "serialis.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
+  SetVersionString(TESSERACT_VERSION_STR);
+}
+
+TessdataManager::TessdataManager(FileReader reader)
+    : reader_(reader), is_loaded_(false), swap_(false) {
+  SetVersionString(TESSERACT_VERSION_STR);
+}
+
+// Registers data_file_name for a deferred load. All currently held entries
+// are discarded and the loaded flag is reset; the file itself is only read
+// when the non-const GetComponent() finds that nothing is loaded.
+void TessdataManager::LoadFileLater(const char *data_file_name) {
+  Clear();
+  data_file_name_.assign(data_file_name);
+}
+
+#if defined(HAVE_LIBARCHIVE)
+// Reads every recognised component out of an archive via libarchive.
+//
+// Each archive member whose path name maps to a TessdataType (through
+// TessdataTypeFromFileName) and whose reported size is positive is read into
+// the matching slot of entries_. Members that cannot be mapped are skipped.
+//
+// Returns true when at least one component was read completely; is_loaded_
+// is set in that case. Returns false if the archive reader could not be
+// created, the file could not be opened, or no component was read.
+bool TessdataManager::LoadArchiveFile(const char *filename) {
+  archive *a = archive_read_new();
+  if (a == nullptr) {
+    return false;
+  }
+  bool result = false;
+  archive_read_support_filter_all(a);
+  archive_read_support_format_all(a);
+  if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
+    archive_entry *ae;
+    while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
+      const char *component = archive_entry_pathname(ae);
+      if (component == nullptr) {
+        continue;
+      }
+      TessdataType type;
+      if (!TessdataTypeFromFileName(component, &type)) {
+        continue;
+      }
+      const int64_t size = archive_entry_size(ae);
+      if (size <= 0) {
+        continue;
+      }
+      entries_[type].resize(size);
+      if (archive_read_data(a, &entries_[type][0], size) == size) {
+        is_loaded_ = true;
+      } else {
+        // A short or failed read would otherwise leave a truncated,
+        // zero-padded buffer behind that later looks like a valid component.
+        entries_[type].clear();
+      }
+    }
+    result = is_loaded_;
+  }
+  // The reader handle is released on every path once it has been created.
+  archive_read_free(a);
+  return result;
+}
+#endif
+
+// Loads the traineddata file data_file_name into this manager.
+//
+// Without a custom reader, and when built with HAVE_LIBARCHIVE, the file is
+// first tried as an archive; if that yields nothing the raw file contents
+// are read with LoadDataFromFile. With a custom reader the contents are
+// obtained from the reader instead. The raw contents are then parsed by
+// LoadMemBuffer.
+//
+// Returns true on success, false if reading or parsing failed.
+bool TessdataManager::Init(const char *data_file_name) {
+  std::vector<char> data;
+  if (reader_ == nullptr) {
+#if defined(HAVE_LIBARCHIVE)
+    if (LoadArchiveFile(data_file_name)) {
+      return true;
+    }
+#endif
+    if (!LoadDataFromFile(data_file_name, &data)) {
+      return false;
+    }
+  } else if (!(*reader_)(data_file_name, &data)) {
+    return false;
+  }
+  // data() is well defined even if the reader succeeded with an empty
+  // buffer, whereas &data[0] on an empty vector is undefined behaviour.
+  return LoadMemBuffer(data_file_name, data.data(), data.size());
+}
+
+// Loads from the given memory buffer as if a file.
+//
+// The buffer is parsed as: a 32-bit entry count, a table of that many 64-bit
+// offsets (negative means "entry absent"), then the entry payloads, which
+// are read sequentially. The size of an entry is the distance to the next
+// table slot that is not -1, or to the end of the buffer for the last one.
+// An entry count above kMaxNumTessdataEntries is taken to mean that the
+// buffer has the opposite byte order, and swapping is enabled.
+//
+// name: stored as the data file name.
+// data, size: the buffer to parse.
+// Returns true on success. Returns false if the header cannot be read, the
+// entry count is out of range, the offset table is inconsistent with the
+// buffer size, or an entry cannot be read in full.
+bool TessdataManager::LoadMemBuffer(const char *name, const char *data, int size) {
+  // TODO: This method supports only the proprietary file format.
+  Clear();
+  data_file_name_ = name;
+  TFile fp;
+  fp.Open(data, size);
+  uint32_t num_entries;
+  if (!fp.DeSerialize(&num_entries)) {
+    return false;
+  }
+  swap_ = num_entries > kMaxNumTessdataEntries;
+  fp.set_swap(swap_);
+  if (swap_) {
+    ReverseN(&num_entries, sizeof(num_entries));
+  }
+  if (num_entries > kMaxNumTessdataEntries) {
+    return false;
+  }
+  // TODO: optimize (no init required).
+  std::vector<int64_t> offset_table(num_entries);
+  // data() instead of &offset_table[0]: the table is empty when the count
+  // is zero, and subscripting an empty vector is undefined behaviour.
+  if (!fp.DeSerialize(offset_table.data(), num_entries)) {
+    return false;
+  }
+  for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
+    const int64_t entry_begin = offset_table[i];
+    if (entry_begin < 0) {
+      continue;  // Entry not present in this file.
+    }
+    // Find where this entry ends: the next slot that is not -1, or the end
+    // of the buffer if there is none.
+    unsigned j = i + 1;
+    while (j < num_entries && offset_table[j] == -1) {
+      ++j;
+    }
+    const int64_t entry_end = (j < num_entries) ? offset_table[j] : size;
+    // The offsets come straight from the file. Reject tables that would
+    // give a negative size (which resize() would turn into an enormous
+    // allocation) or a size the buffer cannot possibly hold. Checking the
+    // ordering first also keeps the subtraction free of signed overflow.
+    if (entry_end < entry_begin || entry_end - entry_begin > size) {
+      return false;
+    }
+    const int64_t entry_size = entry_end - entry_begin;
+    entries_[i].resize(entry_size);
+    // Nothing to read for a zero-length entry; skipping the call also
+    // avoids subscripting the empty vector.
+    if (entry_size > 0 && !fp.DeSerialize(&entries_[i][0], entry_size)) {
+      return false;
+    }
+  }
+  if (entries_[TESSDATA_VERSION].empty()) {
+    SetVersionString("Pre-4.0.0");
+  }
+  is_loaded_ = true;
+  return true;
+}
+
+// Overwrites a single entry of the given type with a copy of the size bytes
+// at data, and marks the manager as loaded. A size of zero leaves the entry
+// empty.
+void TessdataManager::OverwriteEntry(TessdataType type, const char *data, int size) {
+  is_loaded_ = true;
+  entries_[type].resize(size);
+  // Subscripting the vector is undefined when it is empty, so only copy
+  // when there is something to copy.
+  if (size > 0) {
+    memcpy(entries_[type].data(), data, size);
+  }
+}
+
+// Serializes the loaded entries and writes them to filename.
+// If writer is null the data is written with SaveDataToFile, otherwise the
+// writer callback is invoked. Returns whatever the chosen writer returns.
+// Asserts that data has been loaded.
+bool TessdataManager::SaveFile(const char *filename, FileWriter writer) const {
+  // TODO: This method supports only the proprietary file format.
+  ASSERT_HOST(is_loaded_);
+  std::vector<char> serialized;
+  Serialize(&serialized);
+  if (writer != nullptr) {
+    return (*writer)(serialized, filename);
+  }
+  return SaveDataToFile(serialized, filename);
+}
+
+// Serializes all entries into *data.
+// The output consists of a 32-bit entry count (TESSDATA_NUM_ENTRIES), a
+// table of 64-bit offsets with -1 for empty entries, and then the non-empty
+// entries in index order. Asserts that data has been loaded.
+void TessdataManager::Serialize(std::vector<char> *data) const {
+  // TODO: This method supports only the proprietary file format.
+  ASSERT_HOST(is_loaded_);
+  // First pass: assign an offset to each present entry and find the total.
+  int64_t offset_table[TESSDATA_NUM_ENTRIES];
+  int64_t next_offset = sizeof(int32_t) + sizeof(offset_table);
+  for (unsigned idx = 0; idx < TESSDATA_NUM_ENTRIES; ++idx) {
+    const auto &entry = entries_[idx];
+    if (!entry.empty()) {
+      offset_table[idx] = next_offset;
+      next_offset += entry.size();
+    } else {
+      offset_table[idx] = -1;
+    }
+  }
+  data->resize(next_offset, 0);
+  // Second pass: write the header, the table and then the payloads.
+  TFile fp;
+  fp.OpenWrite(data);
+  int32_t num_entries = TESSDATA_NUM_ENTRIES;
+  fp.Serialize(&num_entries);
+  fp.Serialize(&offset_table[0], countof(offset_table));
+  for (const auto &entry : entries_) {
+    if (entry.empty()) {
+      continue;
+    }
+    fp.Serialize(&entry[0], entry.size());
+  }
+}
+
+// Empties every entry and marks the manager as not loaded. The reader and
+// the stored data file name are left untouched.
+void TessdataManager::Clear() {
+  is_loaded_ = false;
+  for (auto &component : entries_) {
+    component.clear();
+  }
+}
+
+// Prints a directory of contents to stdout: the version string, then for
+// each non-empty entry its index, file suffix, size and offset. The offsets
+// are those Serialize() would assign to the entries.
+void TessdataManager::Directory() const {
+  printf("Version:%s\n", VersionString().c_str());
+  // Serialize() writes a 32-bit entry count ahead of the offset table, so
+  // the first payload starts after both of them. Leaving the count out
+  // makes every reported offset four bytes too small.
+  auto offset = sizeof(int32_t) + TESSDATA_NUM_ENTRIES * sizeof(int64_t);
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+    if (!entries_[i].empty()) {
+      printf("%u:%s:size=%zu, offset=%zu\n", i, kTessdataFileSuffixes[i], entries_[i].size(),
+              offset);
+      offset += entries_[i].size();
+    }
+  }
+}
+
+// Points *fp at the component of the given type, loading the data file
+// first (via Init on the stored file name) if nothing is loaded yet.
+// Returns false if loading fails or the component is absent.
+bool TessdataManager::GetComponent(TessdataType type, TFile *fp) {
+  if (!is_loaded_) {
+    if (!Init(data_file_name_.c_str())) {
+      return false;
+    }
+  }
+  // Delegate to the const overload for the actual lookup.
+  return static_cast<const TessdataManager *>(this)->GetComponent(type, fp);
+}
+
+// Const counterpart of GetComponent: it never loads anything and asserts
+// that data is already loaded. On success *fp is opened on the component's
+// bytes with the manager's swap setting applied. Returns false if the
+// component is empty.
+bool TessdataManager::GetComponent(TessdataType type, TFile *fp) const {
+  ASSERT_HOST(is_loaded_);
+  const auto &component = entries_[type];
+  if (!component.empty()) {
+    fp->Open(&component[0], component.size());
+    fp->set_swap(swap_);
+    return true;
+  }
+  return false;
+}
+
+// Returns the current version string, i.e. the bytes of the version entry.
+// Returns an empty string when the version entry is empty.
+std::string TessdataManager::VersionString() const {
+  const auto &version = entries_[TESSDATA_VERSION];
+  // Built from the iterator range so that an empty entry is handled
+  // without subscripting an empty vector.
+  return std::string(version.begin(), version.end());
+}
+
+// Sets the version string to the given v_str by replacing the contents of
+// the version entry with its bytes. An empty v_str empties the entry.
+void TessdataManager::SetVersionString(const std::string &v_str) {
+  // assign() copes with an empty source, unlike memcpy into &entry[0].
+  entries_[TESSDATA_VERSION].assign(v_str.begin(), v_str.end());
+}
+
+// Builds a combined traineddata file from individual component files.
+// For every known suffix, the file language_data_path_prefix + suffix is
+// loaded into the matching entry if it can be opened; files that cannot be
+// opened are skipped. Fails if an openable file cannot be loaded or if
+// neither the base nor the LSTM components are available. On success the
+// result is written to output_filename and the result of SaveFile returned.
+bool TessdataManager::CombineDataFiles(const char *language_data_path_prefix,
+                                       const char *output_filename) {
+  // Load individual tessdata components from files.
+  for (auto filesuffix : kTessdataFileSuffixes) {
+    TessdataType type;
+    ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
+    const std::string filename = std::string(language_data_path_prefix) + filesuffix;
+    // Probe whether the file can be opened before trying to load it.
+    FILE *probe = fopen(filename.c_str(), "rb");
+    if (probe == nullptr) {
+      continue;
+    }
+    fclose(probe);
+    if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
+      tprintf("Load of file %s failed!\n", filename.c_str());
+      return false;
+    }
+  }
+  is_loaded_ = true;
+
+  // Make sure that the required components are present.
+  if (!(IsBaseAvailable() || IsLSTMAvailable())) {
+    tprintf(
+        "Error: traineddata file must contain at least (a unicharset file"
+        " and inttemp) OR an lstm file.\n");
+    return false;
+  }
+  // Write updated data to the output traineddata file.
+  return SaveFile(output_filename, nullptr);
+}
+
+// Replaces entries with the contents of the given component files and then
+// saves everything to new_traineddata_filename.
+// Files whose name does not map to a component type are skipped. Returns
+// false if a recognised file cannot be read, otherwise the result of
+// SaveFile.
+bool TessdataManager::OverwriteComponents(const char *new_traineddata_filename,
+                                          char **component_filenames, int num_new_components) {
+  // Open the files with the new components.
+  // TODO: This method supports only the proprietary file format.
+  for (int i = 0; i < num_new_components; ++i) {
+    const char *component_file = component_filenames[i];
+    TessdataType type;
+    if (!TessdataTypeFromFileName(component_file, &type)) {
+      continue;
+    }
+    if (!LoadDataFromFile(component_file, &entries_[type])) {
+      tprintf("Failed to read component file:%s\n", component_file);
+      return false;
+    }
+  }
+
+  // Write updated data to the output traineddata file.
+  return SaveFile(new_traineddata_filename, nullptr);
+}
+
+// Writes the component selected by the extension of filename to that file.
+// Asserts that the extension maps to a known component type. Returns false
+// if the component is empty, otherwise the result of SaveDataToFile.
+bool TessdataManager::ExtractToFile(const char *filename) {
+  TessdataType type = TESSDATA_NUM_ENTRIES;
+  ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
+  const auto &component = entries_[type];
+  return !component.empty() && SaveDataToFile(component, filename);
+}
+
+// Looks suffix up in kTessdataFileSuffixes (exact string match).
+// On a match, stores the matching index in *type and returns true.
+// Otherwise returns false and leaves *type untouched; builds without NDEBUG
+// additionally print a diagnostic.
+bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type) {
+  unsigned index = 0;
+  while (index < TESSDATA_NUM_ENTRIES && strcmp(kTessdataFileSuffixes[index], suffix) != 0) {
+    ++index;
+  }
+  if (index < TESSDATA_NUM_ENTRIES) {
+    *type = static_cast<TessdataType>(index);
+    return true;
+  }
+#if !defined(NDEBUG)
+  tprintf(
+      "TessdataManager can't determine which tessdata"
+      " component is represented by %s\n",
+      suffix);
+#endif
+  return false;
+}
+
+// Determines the component type from the extension of filename, i.e. the
+// text after its last '.'. Returns false if there is no '.' or nothing
+// follows it; otherwise returns the result of TessdataTypeFromFileSuffix.
+bool TessdataManager::TessdataTypeFromFileName(const char *filename, TessdataType *type) {
+  const char *last_dot = strrchr(filename, '.');
+  if (last_dot == nullptr) {
+    return false;
+  }
+  const char *suffix = last_dot + 1;
+  if (*suffix == '\0') {
+    return false;
+  }
+  return TessdataTypeFromFileSuffix(suffix, type);
+}
+
+} // namespace tesseract