Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/tessdatamanager.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tessdatamanager.cpp | |
| 3 // Description: Functions to handle loading/combining tesseract data files. | |
| 4 // Author: Daria Antonova | |
| 5 // | |
| 6 // (C) Copyright 2009, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifdef HAVE_CONFIG_H | |
| 20 # include "config_auto.h" | |
| 21 #endif | |
| 22 | |
| 23 #include "tessdatamanager.h" | |
| 24 | |
| 25 #include <cstdio> | |
| 26 #include <string> | |
| 27 | |
| 28 #if defined(HAVE_LIBARCHIVE) | |
| 29 # include <archive.h> | |
| 30 # include <archive_entry.h> | |
| 31 #endif | |
| 32 | |
| 33 #include <tesseract/version.h> | |
| 34 #include "errcode.h" | |
| 35 #include "helpers.h" | |
| 36 #include "params.h" | |
| 37 #include "serialis.h" | |
| 38 #include "tprintf.h" | |
| 39 | |
| 40 namespace tesseract { | |
| 41 | |
| 42 TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) { | |
| 43 SetVersionString(TESSERACT_VERSION_STR); | |
| 44 } | |
| 45 | |
| 46 TessdataManager::TessdataManager(FileReader reader) | |
| 47 : reader_(reader), is_loaded_(false), swap_(false) { | |
| 48 SetVersionString(TESSERACT_VERSION_STR); | |
| 49 } | |
| 50 | |
| 51 // Lazily loads from the given filename. Won't actually read the file | |
| 52 // until it needs it. | |
| 53 void TessdataManager::LoadFileLater(const char *data_file_name) { | |
| 54 Clear(); | |
| 55 data_file_name_ = data_file_name; | |
| 56 } | |
| 57 | |
| 58 #if defined(HAVE_LIBARCHIVE) | |
| 59 bool TessdataManager::LoadArchiveFile(const char *filename) { | |
| 60 bool result = false; | |
| 61 archive *a = archive_read_new(); | |
| 62 if (a != nullptr) { | |
| 63 archive_read_support_filter_all(a); | |
| 64 archive_read_support_format_all(a); | |
| 65 if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) { | |
| 66 archive_entry *ae; | |
| 67 while (archive_read_next_header(a, &ae) == ARCHIVE_OK) { | |
| 68 const char *component = archive_entry_pathname(ae); | |
| 69 if (component != nullptr) { | |
| 70 TessdataType type; | |
| 71 if (TessdataTypeFromFileName(component, &type)) { | |
| 72 int64_t size = archive_entry_size(ae); | |
| 73 if (size > 0) { | |
| 74 entries_[type].resize(size); | |
| 75 if (archive_read_data(a, &entries_[type][0], size) == size) { | |
| 76 is_loaded_ = true; | |
| 77 } | |
| 78 } | |
| 79 } | |
| 80 } | |
| 81 } | |
| 82 result = is_loaded_; | |
| 83 } | |
| 84 archive_read_free(a); | |
| 85 } | |
| 86 return result; | |
| 87 } | |
| 88 #endif | |
| 89 | |
| 90 bool TessdataManager::Init(const char *data_file_name) { | |
| 91 std::vector<char> data; | |
| 92 if (reader_ == nullptr) { | |
| 93 #if defined(HAVE_LIBARCHIVE) | |
| 94 if (LoadArchiveFile(data_file_name)) { | |
| 95 return true; | |
| 96 } | |
| 97 #endif | |
| 98 if (!LoadDataFromFile(data_file_name, &data)) { | |
| 99 return false; | |
| 100 } | |
| 101 } else { | |
| 102 if (!(*reader_)(data_file_name, &data)) { | |
| 103 return false; | |
| 104 } | |
| 105 } | |
| 106 return LoadMemBuffer(data_file_name, &data[0], data.size()); | |
| 107 } | |
| 108 | |
| 109 // Loads from the given memory buffer as if a file. | |
| 110 bool TessdataManager::LoadMemBuffer(const char *name, const char *data, int size) { | |
| 111 // TODO: This method supports only the proprietary file format. | |
| 112 Clear(); | |
| 113 data_file_name_ = name; | |
| 114 TFile fp; | |
| 115 fp.Open(data, size); | |
| 116 uint32_t num_entries; | |
| 117 if (!fp.DeSerialize(&num_entries)) { | |
| 118 return false; | |
| 119 } | |
| 120 swap_ = num_entries > kMaxNumTessdataEntries; | |
| 121 fp.set_swap(swap_); | |
| 122 if (swap_) { | |
| 123 ReverseN(&num_entries, sizeof(num_entries)); | |
| 124 } | |
| 125 if (num_entries > kMaxNumTessdataEntries) { | |
| 126 return false; | |
| 127 } | |
| 128 // TODO: optimize (no init required). | |
| 129 std::vector<int64_t> offset_table(num_entries); | |
| 130 if (!fp.DeSerialize(&offset_table[0], num_entries)) { | |
| 131 return false; | |
| 132 } | |
| 133 for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) { | |
| 134 if (offset_table[i] >= 0) { | |
| 135 int64_t entry_size = size - offset_table[i]; | |
| 136 unsigned j = i + 1; | |
| 137 while (j < num_entries && offset_table[j] == -1) { | |
| 138 ++j; | |
| 139 } | |
| 140 if (j < num_entries) { | |
| 141 entry_size = offset_table[j] - offset_table[i]; | |
| 142 } | |
| 143 entries_[i].resize(entry_size); | |
| 144 if (!fp.DeSerialize(&entries_[i][0], entry_size)) { | |
| 145 return false; | |
| 146 } | |
| 147 } | |
| 148 } | |
| 149 if (entries_[TESSDATA_VERSION].empty()) { | |
| 150 SetVersionString("Pre-4.0.0"); | |
| 151 } | |
| 152 is_loaded_ = true; | |
| 153 return true; | |
| 154 } | |
| 155 | |
| 156 // Overwrites a single entry of the given type. | |
| 157 void TessdataManager::OverwriteEntry(TessdataType type, const char *data, int size) { | |
| 158 is_loaded_ = true; | |
| 159 entries_[type].resize(size); | |
| 160 memcpy(&entries_[type][0], data, size); | |
| 161 } | |
| 162 | |
| 163 // Saves to the given filename. | |
| 164 bool TessdataManager::SaveFile(const char *filename, FileWriter writer) const { | |
| 165 // TODO: This method supports only the proprietary file format. | |
| 166 ASSERT_HOST(is_loaded_); | |
| 167 std::vector<char> data; | |
| 168 Serialize(&data); | |
| 169 if (writer == nullptr) { | |
| 170 return SaveDataToFile(data, filename); | |
| 171 } else { | |
| 172 return (*writer)(data, filename); | |
| 173 } | |
| 174 } | |
| 175 | |
| 176 // Serializes to the given vector. | |
| 177 void TessdataManager::Serialize(std::vector<char> *data) const { | |
| 178 // TODO: This method supports only the proprietary file format. | |
| 179 ASSERT_HOST(is_loaded_); | |
| 180 // Compute the offset_table and total size. | |
| 181 int64_t offset_table[TESSDATA_NUM_ENTRIES]; | |
| 182 int64_t offset = sizeof(int32_t) + sizeof(offset_table); | |
| 183 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { | |
| 184 if (entries_[i].empty()) { | |
| 185 offset_table[i] = -1; | |
| 186 } else { | |
| 187 offset_table[i] = offset; | |
| 188 offset += entries_[i].size(); | |
| 189 } | |
| 190 } | |
| 191 data->resize(offset, 0); | |
| 192 int32_t num_entries = TESSDATA_NUM_ENTRIES; | |
| 193 TFile fp; | |
| 194 fp.OpenWrite(data); | |
| 195 fp.Serialize(&num_entries); | |
| 196 fp.Serialize(&offset_table[0], countof(offset_table)); | |
| 197 for (const auto &entry : entries_) { | |
| 198 if (!entry.empty()) { | |
| 199 fp.Serialize(&entry[0], entry.size()); | |
| 200 } | |
| 201 } | |
| 202 } | |
| 203 | |
| 204 // Resets to the initial state, keeping the reader. | |
| 205 void TessdataManager::Clear() { | |
| 206 for (auto &entry : entries_) { | |
| 207 entry.clear(); | |
| 208 } | |
| 209 is_loaded_ = false; | |
| 210 } | |
| 211 | |
| 212 // Prints a directory of contents. | |
| 213 void TessdataManager::Directory() const { | |
| 214 printf("Version:%s\n", VersionString().c_str()); | |
| 215 auto offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t); | |
| 216 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { | |
| 217 if (!entries_[i].empty()) { | |
| 218 printf("%u:%s:size=%zu, offset=%zu\n", i, kTessdataFileSuffixes[i], entries_[i].size(), | |
| 219 offset); | |
| 220 offset += entries_[i].size(); | |
| 221 } | |
| 222 } | |
| 223 } | |
| 224 | |
| 225 // Opens the given TFile pointer to the given component type. | |
| 226 // Returns false in case of failure. | |
| 227 bool TessdataManager::GetComponent(TessdataType type, TFile *fp) { | |
| 228 if (!is_loaded_ && !Init(data_file_name_.c_str())) { | |
| 229 return false; | |
| 230 } | |
| 231 const TessdataManager *const_this = this; | |
| 232 return const_this->GetComponent(type, fp); | |
| 233 } | |
| 234 | |
| 235 // As non-const version except it can't load the component if not already | |
| 236 // loaded. | |
| 237 bool TessdataManager::GetComponent(TessdataType type, TFile *fp) const { | |
| 238 ASSERT_HOST(is_loaded_); | |
| 239 if (entries_[type].empty()) { | |
| 240 return false; | |
| 241 } | |
| 242 fp->Open(&entries_[type][0], entries_[type].size()); | |
| 243 fp->set_swap(swap_); | |
| 244 return true; | |
| 245 } | |
| 246 | |
| 247 // Returns the current version string. | |
| 248 std::string TessdataManager::VersionString() const { | |
| 249 return std::string(&entries_[TESSDATA_VERSION][0], entries_[TESSDATA_VERSION].size()); | |
| 250 } | |
| 251 | |
| 252 // Sets the version string to the given v_str. | |
| 253 void TessdataManager::SetVersionString(const std::string &v_str) { | |
| 254 entries_[TESSDATA_VERSION].resize(v_str.size()); | |
| 255 memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size()); | |
| 256 } | |
| 257 | |
| 258 bool TessdataManager::CombineDataFiles(const char *language_data_path_prefix, | |
| 259 const char *output_filename) { | |
| 260 // Load individual tessdata components from files. | |
| 261 for (auto filesuffix : kTessdataFileSuffixes) { | |
| 262 TessdataType type; | |
| 263 ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type)); | |
| 264 std::string filename = language_data_path_prefix; | |
| 265 filename += filesuffix; | |
| 266 FILE *fp = fopen(filename.c_str(), "rb"); | |
| 267 if (fp != nullptr) { | |
| 268 fclose(fp); | |
| 269 if (!LoadDataFromFile(filename.c_str(), &entries_[type])) { | |
| 270 tprintf("Load of file %s failed!\n", filename.c_str()); | |
| 271 return false; | |
| 272 } | |
| 273 } | |
| 274 } | |
| 275 is_loaded_ = true; | |
| 276 | |
| 277 // Make sure that the required components are present. | |
| 278 if (!IsBaseAvailable() && !IsLSTMAvailable()) { | |
| 279 tprintf( | |
| 280 "Error: traineddata file must contain at least (a unicharset file" | |
| 281 " and inttemp) OR an lstm file.\n"); | |
| 282 return false; | |
| 283 } | |
| 284 // Write updated data to the output traineddata file. | |
| 285 return SaveFile(output_filename, nullptr); | |
| 286 } | |
| 287 | |
| 288 bool TessdataManager::OverwriteComponents(const char *new_traineddata_filename, | |
| 289 char **component_filenames, int num_new_components) { | |
| 290 // Open the files with the new components. | |
| 291 // TODO: This method supports only the proprietary file format. | |
| 292 for (int i = 0; i < num_new_components; ++i) { | |
| 293 TessdataType type; | |
| 294 if (TessdataTypeFromFileName(component_filenames[i], &type)) { | |
| 295 if (!LoadDataFromFile(component_filenames[i], &entries_[type])) { | |
| 296 tprintf("Failed to read component file:%s\n", component_filenames[i]); | |
| 297 return false; | |
| 298 } | |
| 299 } | |
| 300 } | |
| 301 | |
| 302 // Write updated data to the output traineddata file. | |
| 303 return SaveFile(new_traineddata_filename, nullptr); | |
| 304 } | |
| 305 | |
| 306 bool TessdataManager::ExtractToFile(const char *filename) { | |
| 307 TessdataType type = TESSDATA_NUM_ENTRIES; | |
| 308 ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type)); | |
| 309 if (entries_[type].empty()) { | |
| 310 return false; | |
| 311 } | |
| 312 return SaveDataToFile(entries_[type], filename); | |
| 313 } | |
| 314 | |
| 315 bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type) { | |
| 316 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { | |
| 317 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { | |
| 318 *type = static_cast<TessdataType>(i); | |
| 319 return true; | |
| 320 } | |
| 321 } | |
| 322 #if !defined(NDEBUG) | |
| 323 tprintf( | |
| 324 "TessdataManager can't determine which tessdata" | |
| 325 " component is represented by %s\n", | |
| 326 suffix); | |
| 327 #endif | |
| 328 return false; | |
| 329 } | |
| 330 | |
| 331 bool TessdataManager::TessdataTypeFromFileName(const char *filename, TessdataType *type) { | |
| 332 // Get the file suffix (extension) | |
| 333 const char *suffix = strrchr(filename, '.'); | |
| 334 if (suffix == nullptr || *(++suffix) == '\0') { | |
| 335 return false; | |
| 336 } | |
| 337 return TessdataTypeFromFileSuffix(suffix, type); | |
| 338 } | |
| 339 | |
| 340 } // namespace tesseract |
