comparison mupdf-source/thirdparty/tesseract/src/ccutil/tessdatamanager.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: tessdatamanager.h
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
20 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21
22 #include <tesseract/baseapi.h> // FileReader
23 #include <string> // std::string
24 #include <vector> // std::vector
25 #include "serialis.h" // FileWriter
26
// Suffix string "traineddata" (stored without a leading dot).
static constexpr char kTrainedDataSuffix[] = "traineddata";

// File suffixes of the individual tessdata components, declared in the same
// order as the TessdataType enumerators and collected in
// kTessdataFileSuffixes. When adding a new tessdata type, add its suffix
// here and update both the TessdataType enum and kTessdataFileSuffixes.
// NOTE(review): the previous wording of this comment also listed
// kTessdataFileIsText; no such table is defined in this header.
static constexpr char kLangConfigFileSuffix[] = "config";
static constexpr char kUnicharsetFileSuffix[] = "unicharset";
static constexpr char kAmbigsFileSuffix[] = "unicharambigs";
static constexpr char kBuiltInTemplatesFileSuffix[] = "inttemp";
static constexpr char kBuiltInCutoffsFileSuffix[] = "pffmtable";
static constexpr char kNormProtoFileSuffix[] = "normproto";
static constexpr char kPuncDawgFileSuffix[] = "punc-dawg";
static constexpr char kSystemDawgFileSuffix[] = "word-dawg";
static constexpr char kNumberDawgFileSuffix[] = "number-dawg";
static constexpr char kFreqDawgFileSuffix[] = "freq-dawg";
static constexpr char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
static constexpr char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
static constexpr char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
static constexpr char kShapeTableFileSuffix[] = "shapetable";
static constexpr char kBigramDawgFileSuffix[] = "bigram-dawg";
static constexpr char kUnambigDawgFileSuffix[] = "unambig-dawg";
static constexpr char kParamsModelFileSuffix[] = "params-model";
static constexpr char kLSTMModelFileSuffix[] = "lstm";
static constexpr char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
static constexpr char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
static constexpr char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
static constexpr char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
static constexpr char kLSTMRecoderFileSuffix[] = "lstm-recoder";
static constexpr char kVersionFileSuffix[] = "version";
55
56 namespace tesseract {
57
58 enum TessdataType {
59 TESSDATA_LANG_CONFIG, // 0
60 TESSDATA_UNICHARSET, // 1
61 TESSDATA_AMBIGS, // 2
62 TESSDATA_INTTEMP, // 3
63 TESSDATA_PFFMTABLE, // 4
64 TESSDATA_NORMPROTO, // 5
65 TESSDATA_PUNC_DAWG, // 6
66 TESSDATA_SYSTEM_DAWG, // 7
67 TESSDATA_NUMBER_DAWG, // 8
68 TESSDATA_FREQ_DAWG, // 9
69 TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
70 TESSDATA_CUBE_UNICHARSET, // 11 // deprecated
71 TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated
72 TESSDATA_SHAPE_TABLE, // 13
73 TESSDATA_BIGRAM_DAWG, // 14
74 TESSDATA_UNAMBIG_DAWG, // 15
75 TESSDATA_PARAMS_MODEL, // 16
76 TESSDATA_LSTM, // 17
77 TESSDATA_LSTM_PUNC_DAWG, // 18
78 TESSDATA_LSTM_SYSTEM_DAWG, // 19
79 TESSDATA_LSTM_NUMBER_DAWG, // 20
80 TESSDATA_LSTM_UNICHARSET, // 21
81 TESSDATA_LSTM_RECODER, // 22
82 TESSDATA_VERSION, // 23
83
84 TESSDATA_NUM_ENTRIES
85 };
86
87 /**
88 * kTessdataFileSuffixes[i] indicates the file suffix for
89 * tessdata of type i (from TessdataType enum).
90 */
91 static const char *const kTessdataFileSuffixes[] = {
92 kLangConfigFileSuffix, // 0
93 kUnicharsetFileSuffix, // 1
94 kAmbigsFileSuffix, // 2
95 kBuiltInTemplatesFileSuffix, // 3
96 kBuiltInCutoffsFileSuffix, // 4
97 kNormProtoFileSuffix, // 5
98 kPuncDawgFileSuffix, // 6
99 kSystemDawgFileSuffix, // 7
100 kNumberDawgFileSuffix, // 8
101 kFreqDawgFileSuffix, // 9
102 kFixedLengthDawgsFileSuffix, // 10 // deprecated
103 kCubeUnicharsetFileSuffix, // 11 // deprecated
104 kCubeSystemDawgFileSuffix, // 12 // deprecated
105 kShapeTableFileSuffix, // 13
106 kBigramDawgFileSuffix, // 14
107 kUnambigDawgFileSuffix, // 15
108 kParamsModelFileSuffix, // 16
109 kLSTMModelFileSuffix, // 17
110 kLSTMPuncDawgFileSuffix, // 18
111 kLSTMSystemDawgFileSuffix, // 19
112 kLSTMNumberDawgFileSuffix, // 20
113 kLSTMUnicharsetFileSuffix, // 21
114 kLSTMRecoderFileSuffix, // 22
115 kVersionFileSuffix, // 23
116 };
117
/**
 * Sanity limit on the number of tessdata entries.
 *
 * TessdataType may grow over time, but the number of entries is not expected
 * to become astronomically high. TessdataManager relies on that to detect
 * endianness automatically: when the entry count it sees is larger than
 * kMaxNumTessdataEntries, it flips the data (presumably a byte swap, see
 * TessdataManager::swap()).
 */
static constexpr int kMaxNumTessdataEntries = 1000;
126
127 class TESS_API TessdataManager {
128 public:
129 TessdataManager();
130 explicit TessdataManager(FileReader reader);
131
132 ~TessdataManager() = default;
133
134 bool swap() const {
135 return swap_;
136 }
137 bool is_loaded() const {
138 return is_loaded_;
139 }
140
141 // Lazily loads from the given filename. Won't actually read the file
142 // until it needs it.
143 void LoadFileLater(const char *data_file_name);
144 /**
145 * Opens and reads the given data file right now.
146 * @return true on success.
147 */
148 bool Init(const char *data_file_name);
149 // Loads from the given memory buffer as if a file, remembering name as some
150 // arbitrary source id for caching.
151 bool LoadMemBuffer(const char *name, const char *data, int size);
152 // Overwrites a single entry of the given type.
153 void OverwriteEntry(TessdataType type, const char *data, int size);
154
155 // Saves to the given filename.
156 bool SaveFile(const char *filename, FileWriter writer) const;
157 // Serializes to the given vector.
158 void Serialize(std::vector<char> *data) const;
159 // Resets to the initial state, keeping the reader.
160 void Clear();
161
162 // Prints a directory of contents.
163 void Directory() const;
164
165 // Returns true if the component requested is present.
166 bool IsComponentAvailable(TessdataType type) const {
167 return !entries_[type].empty();
168 }
169 // Opens the given TFile pointer to the given component type.
170 // Returns false in case of failure.
171 bool GetComponent(TessdataType type, TFile *fp);
172 // As non-const version except it can't load the component if not already
173 // loaded.
174 bool GetComponent(TessdataType type, TFile *fp) const;
175
176 // Returns the current version string.
177 std::string VersionString() const;
178 // Sets the version string to the given v_str.
179 void SetVersionString(const std::string &v_str);
180
181 // Returns true if the base Tesseract components are present.
182 bool IsBaseAvailable() const {
183 return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();
184 }
185
186 // Returns true if the LSTM components are present.
187 bool IsLSTMAvailable() const {
188 return !entries_[TESSDATA_LSTM].empty();
189 }
190
191 // Return the name of the underlying data file.
192 const std::string &GetDataFileName() const {
193 return data_file_name_;
194 }
195
196 /**
197 * Reads all the standard tesseract config and data files for a language
198 * at the given path and bundles them up into one binary data file.
199 * Returns true if the combined traineddata file was successfully written.
200 */
201 bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename);
202
203 /**
204 * Gets the individual components from the data_file_ with which the class was
205 * initialized. Overwrites the components specified by component_filenames.
206 * Writes the updated traineddata file to new_traineddata_filename.
207 */
208 bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames,
209 int num_new_components);
210
211 /**
212 * Extracts tessdata component implied by the name of the input file from
213 * the combined traineddata loaded into TessdataManager.
214 * Writes the extracted component to the file indicated by the file name.
215 * E.g. if the filename given is somepath/somelang.unicharset, unicharset
216 * will be extracted from the data loaded into the TessdataManager and will
217 * be written to somepath/somelang.unicharset.
218 * @return true if the component was successfully extracted, false if the
219 * component was not present in the traineddata loaded into TessdataManager.
220 */
221 bool ExtractToFile(const char *filename);
222
223 private:
224 // Use libarchive.
225 bool LoadArchiveFile(const char *filename);
226
227 /**
228 * Fills type with TessdataType of the tessdata component represented by the
229 * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
230 * @return true if the tessdata component type could be determined
231 * from the given file name.
232 */
233 static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type);
234
235 /**
236 * Tries to determine tessdata component file suffix from filename,
237 * returns true on success.
238 */
239 static bool TessdataTypeFromFileName(const char *filename, TessdataType *type);
240
241 // Name of file it came from.
242 std::string data_file_name_;
243 // Function to load the file when we need it.
244 FileReader reader_;
245 // True if the file has been loaded.
246 bool is_loaded_;
247 // True if the bytes need swapping.
248 bool swap_;
249 // Contents of each element of the traineddata file.
250 std::vector<char> entries_[TESSDATA_NUM_ENTRIES];
251 };
252
253 } // namespace tesseract
254
255 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_