Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/tessdatamanager.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: tessdatamanager.h | |
| 3 // Description: Functions to handle loading/combining tesseract data files. | |
| 4 // Author: Daria Antonova | |
| 5 // | |
| 6 // (C) Copyright 2009, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_ | |
| 20 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_ | |
| 21 | |
| 22 #include <tesseract/baseapi.h> // FileReader | |
| 23 #include <string> // std::string | |
| 24 #include <vector> // std::vector | |
| 25 #include "serialis.h" // FileWriter | |
| 26 | |
// Extension of a combined traineddata file.
static constexpr char kTrainedDataSuffix[] = "traineddata";

// File suffixes of the individual tessdata components.
// Maintenance note: whenever a component type or suffix is added here, the
// TessdataType enum and the kTessdataFileSuffixes table must be extended in
// step so that all three stay aligned.

// Language configuration and character set.
static constexpr char kLangConfigFileSuffix[] = "config";
static constexpr char kUnicharsetFileSuffix[] = "unicharset";
static constexpr char kAmbigsFileSuffix[] = "unicharambigs";

// Built-in templates, cutoffs and normalization prototypes.
static constexpr char kBuiltInTemplatesFileSuffix[] = "inttemp";
static constexpr char kBuiltInCutoffsFileSuffix[] = "pffmtable";
static constexpr char kNormProtoFileSuffix[] = "normproto";

// Dawg files.
static constexpr char kPuncDawgFileSuffix[] = "punc-dawg";
static constexpr char kSystemDawgFileSuffix[] = "word-dawg";
static constexpr char kNumberDawgFileSuffix[] = "number-dawg";
static constexpr char kFreqDawgFileSuffix[] = "freq-dawg";

// Suffixes whose TessdataType entries are marked deprecated.
static constexpr char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
static constexpr char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
static constexpr char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";

// Shape table, further dawgs and the params model.
static constexpr char kShapeTableFileSuffix[] = "shapetable";
static constexpr char kBigramDawgFileSuffix[] = "bigram-dawg";
static constexpr char kUnambigDawgFileSuffix[] = "unambig-dawg";
static constexpr char kParamsModelFileSuffix[] = "params-model";

// LSTM components.
static constexpr char kLSTMModelFileSuffix[] = "lstm";
static constexpr char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
static constexpr char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
static constexpr char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
static constexpr char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
static constexpr char kLSTMRecoderFileSuffix[] = "lstm-recoder";

// Version component.
static constexpr char kVersionFileSuffix[] = "version";
| 55 | |
| 56 namespace tesseract { | |
| 57 | |
| 58 enum TessdataType { | |
| 59 TESSDATA_LANG_CONFIG, // 0 | |
| 60 TESSDATA_UNICHARSET, // 1 | |
| 61 TESSDATA_AMBIGS, // 2 | |
| 62 TESSDATA_INTTEMP, // 3 | |
| 63 TESSDATA_PFFMTABLE, // 4 | |
| 64 TESSDATA_NORMPROTO, // 5 | |
| 65 TESSDATA_PUNC_DAWG, // 6 | |
| 66 TESSDATA_SYSTEM_DAWG, // 7 | |
| 67 TESSDATA_NUMBER_DAWG, // 8 | |
| 68 TESSDATA_FREQ_DAWG, // 9 | |
| 69 TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated | |
| 70 TESSDATA_CUBE_UNICHARSET, // 11 // deprecated | |
| 71 TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated | |
| 72 TESSDATA_SHAPE_TABLE, // 13 | |
| 73 TESSDATA_BIGRAM_DAWG, // 14 | |
| 74 TESSDATA_UNAMBIG_DAWG, // 15 | |
| 75 TESSDATA_PARAMS_MODEL, // 16 | |
| 76 TESSDATA_LSTM, // 17 | |
| 77 TESSDATA_LSTM_PUNC_DAWG, // 18 | |
| 78 TESSDATA_LSTM_SYSTEM_DAWG, // 19 | |
| 79 TESSDATA_LSTM_NUMBER_DAWG, // 20 | |
| 80 TESSDATA_LSTM_UNICHARSET, // 21 | |
| 81 TESSDATA_LSTM_RECODER, // 22 | |
| 82 TESSDATA_VERSION, // 23 | |
| 83 | |
| 84 TESSDATA_NUM_ENTRIES | |
| 85 }; | |
| 86 | |
| 87 /** | |
| 88 * kTessdataFileSuffixes[i] indicates the file suffix for | |
| 89 * tessdata of type i (from TessdataType enum). | |
| 90 */ | |
| 91 static const char *const kTessdataFileSuffixes[] = { | |
| 92 kLangConfigFileSuffix, // 0 | |
| 93 kUnicharsetFileSuffix, // 1 | |
| 94 kAmbigsFileSuffix, // 2 | |
| 95 kBuiltInTemplatesFileSuffix, // 3 | |
| 96 kBuiltInCutoffsFileSuffix, // 4 | |
| 97 kNormProtoFileSuffix, // 5 | |
| 98 kPuncDawgFileSuffix, // 6 | |
| 99 kSystemDawgFileSuffix, // 7 | |
| 100 kNumberDawgFileSuffix, // 8 | |
| 101 kFreqDawgFileSuffix, // 9 | |
| 102 kFixedLengthDawgsFileSuffix, // 10 // deprecated | |
| 103 kCubeUnicharsetFileSuffix, // 11 // deprecated | |
| 104 kCubeSystemDawgFileSuffix, // 12 // deprecated | |
| 105 kShapeTableFileSuffix, // 13 | |
| 106 kBigramDawgFileSuffix, // 14 | |
| 107 kUnambigDawgFileSuffix, // 15 | |
| 108 kParamsModelFileSuffix, // 16 | |
| 109 kLSTMModelFileSuffix, // 17 | |
| 110 kLSTMPuncDawgFileSuffix, // 18 | |
| 111 kLSTMSystemDawgFileSuffix, // 19 | |
| 112 kLSTMNumberDawgFileSuffix, // 20 | |
| 113 kLSTMUnicharsetFileSuffix, // 21 | |
| 114 kLSTMRecoderFileSuffix, // 22 | |
| 115 kVersionFileSuffix, // 23 | |
| 116 }; | |
| 117 | |
/**
 * Sanity limit on the number of entries a traineddata file may declare.
 * TessdataType may grow over time, but it is never expected to come
 * anywhere near this value. TessdataManager relies on that to detect
 * endianness automatically: an entry count read from a file that exceeds
 * kMaxNumTessdataEntries is taken to mean the data has the opposite byte
 * order and needs swapping.
 */
static constexpr int kMaxNumTessdataEntries = 1000;
| 126 | |
| 127 class TESS_API TessdataManager { | |
| 128 public: | |
| 129 TessdataManager(); | |
| 130 explicit TessdataManager(FileReader reader); | |
| 131 | |
| 132 ~TessdataManager() = default; | |
| 133 | |
| 134 bool swap() const { | |
| 135 return swap_; | |
| 136 } | |
| 137 bool is_loaded() const { | |
| 138 return is_loaded_; | |
| 139 } | |
| 140 | |
| 141 // Lazily loads from the given filename. Won't actually read the file | |
| 142 // until it needs it. | |
| 143 void LoadFileLater(const char *data_file_name); | |
| 144 /** | |
| 145 * Opens and reads the given data file right now. | |
| 146 * @return true on success. | |
| 147 */ | |
| 148 bool Init(const char *data_file_name); | |
| 149 // Loads from the given memory buffer as if a file, remembering name as some | |
| 150 // arbitrary source id for caching. | |
| 151 bool LoadMemBuffer(const char *name, const char *data, int size); | |
| 152 // Overwrites a single entry of the given type. | |
| 153 void OverwriteEntry(TessdataType type, const char *data, int size); | |
| 154 | |
| 155 // Saves to the given filename. | |
| 156 bool SaveFile(const char *filename, FileWriter writer) const; | |
| 157 // Serializes to the given vector. | |
| 158 void Serialize(std::vector<char> *data) const; | |
| 159 // Resets to the initial state, keeping the reader. | |
| 160 void Clear(); | |
| 161 | |
| 162 // Prints a directory of contents. | |
| 163 void Directory() const; | |
| 164 | |
| 165 // Returns true if the component requested is present. | |
| 166 bool IsComponentAvailable(TessdataType type) const { | |
| 167 return !entries_[type].empty(); | |
| 168 } | |
| 169 // Opens the given TFile pointer to the given component type. | |
| 170 // Returns false in case of failure. | |
| 171 bool GetComponent(TessdataType type, TFile *fp); | |
| 172 // As non-const version except it can't load the component if not already | |
| 173 // loaded. | |
| 174 bool GetComponent(TessdataType type, TFile *fp) const; | |
| 175 | |
| 176 // Returns the current version string. | |
| 177 std::string VersionString() const; | |
| 178 // Sets the version string to the given v_str. | |
| 179 void SetVersionString(const std::string &v_str); | |
| 180 | |
| 181 // Returns true if the base Tesseract components are present. | |
| 182 bool IsBaseAvailable() const { | |
| 183 return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty(); | |
| 184 } | |
| 185 | |
| 186 // Returns true if the LSTM components are present. | |
| 187 bool IsLSTMAvailable() const { | |
| 188 return !entries_[TESSDATA_LSTM].empty(); | |
| 189 } | |
| 190 | |
| 191 // Return the name of the underlying data file. | |
| 192 const std::string &GetDataFileName() const { | |
| 193 return data_file_name_; | |
| 194 } | |
| 195 | |
| 196 /** | |
| 197 * Reads all the standard tesseract config and data files for a language | |
| 198 * at the given path and bundles them up into one binary data file. | |
| 199 * Returns true if the combined traineddata file was successfully written. | |
| 200 */ | |
| 201 bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename); | |
| 202 | |
| 203 /** | |
| 204 * Gets the individual components from the data_file_ with which the class was | |
| 205 * initialized. Overwrites the components specified by component_filenames. | |
| 206 * Writes the updated traineddata file to new_traineddata_filename. | |
| 207 */ | |
| 208 bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, | |
| 209 int num_new_components); | |
| 210 | |
| 211 /** | |
| 212 * Extracts tessdata component implied by the name of the input file from | |
| 213 * the combined traineddata loaded into TessdataManager. | |
| 214 * Writes the extracted component to the file indicated by the file name. | |
| 215 * E.g. if the filename given is somepath/somelang.unicharset, unicharset | |
| 216 * will be extracted from the data loaded into the TessdataManager and will | |
| 217 * be written to somepath/somelang.unicharset. | |
| 218 * @return true if the component was successfully extracted, false if the | |
| 219 * component was not present in the traineddata loaded into TessdataManager. | |
| 220 */ | |
| 221 bool ExtractToFile(const char *filename); | |
| 222 | |
| 223 private: | |
| 224 // Use libarchive. | |
| 225 bool LoadArchiveFile(const char *filename); | |
| 226 | |
| 227 /** | |
| 228 * Fills type with TessdataType of the tessdata component represented by the | |
| 229 * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. | |
| 230 * @return true if the tessdata component type could be determined | |
| 231 * from the given file name. | |
| 232 */ | |
| 233 static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type); | |
| 234 | |
| 235 /** | |
| 236 * Tries to determine tessdata component file suffix from filename, | |
| 237 * returns true on success. | |
| 238 */ | |
| 239 static bool TessdataTypeFromFileName(const char *filename, TessdataType *type); | |
| 240 | |
| 241 // Name of file it came from. | |
| 242 std::string data_file_name_; | |
| 243 // Function to load the file when we need it. | |
| 244 FileReader reader_; | |
| 245 // True if the file has been loaded. | |
| 246 bool is_loaded_; | |
| 247 // True if the bytes need swapping. | |
| 248 bool swap_; | |
| 249 // Contents of each element of the traineddata file. | |
| 250 std::vector<char> entries_[TESSDATA_NUM_ENTRIES]; | |
| 251 }; | |
| 252 | |
| 253 } // namespace tesseract | |
| 254 | |
| 255 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_ |
