Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/combine_tessdata.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: combine_tessdata.cpp | |
| 3 // Description: Creates a unified traineddata file from several | |
| 4 // data files produced by the training process. | |
| 5 // Author: Daria Antonova | |
| 6 // | |
| 7 // (C) Copyright 2009, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 /////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 #include "commontraining.h" // CheckSharedLibraryVersion | |
| 21 #include "lstmrecognizer.h" | |
| 22 #include "tessdatamanager.h" | |
| 23 | |
| 24 #include <cerrno> | |
| 25 #include <iostream> // std::cout | |
| 26 | |
| 27 using namespace tesseract; | |
| 28 | |
| 29 static int list_components(TessdataManager &tm, const char *filename) { | |
| 30 // Initialize TessdataManager with the data in the given traineddata file. | |
| 31 if (filename != nullptr && !tm.Init(filename)) { | |
| 32 tprintf("Failed to read %s\n", filename); | |
| 33 return EXIT_FAILURE; | |
| 34 } | |
| 35 tm.Directory(); | |
| 36 return EXIT_SUCCESS; | |
| 37 } | |
| 38 | |
| 39 static int list_network(TessdataManager &tm, const char *filename) { | |
| 40 if (filename != nullptr && !tm.Init(filename)) { | |
| 41 tprintf("Failed to read %s\n", filename); | |
| 42 return EXIT_FAILURE; | |
| 43 } | |
| 44 tesseract::TFile fp; | |
| 45 if (tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) { | |
| 46 tesseract::LSTMRecognizer recognizer; | |
| 47 if (!recognizer.DeSerialize(&tm, &fp)) { | |
| 48 tprintf("Failed to deserialize LSTM in %s!\n", filename); | |
| 49 return EXIT_FAILURE; | |
| 50 } | |
| 51 std::cout << "LSTM: network=" << recognizer.GetNetwork() | |
| 52 << ", int_mode=" << recognizer.IsIntMode() | |
| 53 << ", recoding=" << recognizer.IsRecoding() | |
| 54 << ", iteration=" << recognizer.training_iteration() | |
| 55 << ", sample_iteration=" << recognizer.sample_iteration() | |
| 56 << ", null_char=" << recognizer.null_char() | |
| 57 << ", learning_rate=" << recognizer.learning_rate() | |
| 58 << ", momentum=" << recognizer.GetMomentum() | |
| 59 << ", adam_beta=" << recognizer.GetAdamBeta() << '\n'; | |
| 60 | |
| 61 std::cout << "Layer Learning Rates: "; | |
| 62 auto layers = recognizer.EnumerateLayers(); | |
| 63 for (const auto &id : layers) { | |
| 64 auto layer = recognizer.GetLayer(id); | |
| 65 std::cout << id << "(" << layer->name() << ")" | |
| 66 << "=" << recognizer.GetLayerLearningRate(id) | |
| 67 << (layers[layers.size() - 1] != id ? ", " : ""); | |
| 68 } | |
| 69 std::cout << "\n"; | |
| 70 } | |
| 71 return EXIT_SUCCESS; | |
| 72 } | |
| 73 | |
| 74 // Main program to combine/extract/overwrite tessdata components | |
| 75 // in [lang].traineddata files. | |
| 76 // | |
| 77 // To combine all the individual tessdata components (unicharset, DAWGs, | |
| 78 // classifier templates, ambiguities, language configs) located at, say, | |
| 79 // /home/$USER/temp/eng.* run: | |
| 80 // | |
| 81 // combine_tessdata /home/$USER/temp/eng. | |
| 82 // | |
| 83 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata | |
| 84 // | |
| 85 // Specify option -e if you would like to extract individual components | |
| 86 // from a combined traineddata file. For example, to extract language config | |
| 87 // file and the unicharset from tessdata/eng.traineddata run: | |
| 88 // | |
| 89 // combine_tessdata -e tessdata/eng.traineddata | |
| 90 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset | |
| 91 // | |
| 92 // The desired config file and unicharset will be written to | |
| 93 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset | |
| 94 // | |
| 95 // Specify option -o to overwrite individual components of the given | |
| 96 // [lang].traineddata file. For example, to overwrite language config | |
| 97 // and unichar ambiguities files in tessdata/eng.traineddata use: | |
| 98 // | |
| 99 // combine_tessdata -o tessdata/eng.traineddata | |
| 100 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs | |
| 101 // | |
| 102 // As a result, tessdata/eng.traineddata will contain the new language config | |
| 103 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc. | |
| 104 // | |
| 105 // Note: the file names of the files to extract to and to overwrite from should | |
| 106 // have the appropriate file suffixes (extensions) indicating their tessdata | |
| 107 // component type (.unicharset for the unicharset, .unicharambigs for unichar | |
| 108 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h. | |
| 109 // | |
| 110 // Specify option -u to unpack all the components to the specified path: | |
| 111 // | |
| 112 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng. | |
| 113 // | |
| 114 // This will create /home/$USER/temp/eng.* files with individual tessdata | |
| 115 // components from tessdata/eng.traineddata. | |
| 116 // | |
| 117 int main(int argc, char **argv) { | |
| 118 tesseract::CheckSharedLibraryVersion(); | |
| 119 | |
| 120 int i; | |
| 121 tesseract::TessdataManager tm; | |
| 122 if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) { | |
| 123 printf("%s\n", tesseract::TessBaseAPI::Version()); | |
| 124 return EXIT_SUCCESS; | |
| 125 } else if (argc == 2) { | |
| 126 printf("Combining tessdata files\n"); | |
| 127 std::string lang = argv[1]; | |
| 128 char *last = &argv[1][strlen(argv[1]) - 1]; | |
| 129 if (*last != '.') { | |
| 130 lang += '.'; | |
| 131 } | |
| 132 std::string output_file = lang; | |
| 133 output_file += kTrainedDataSuffix; | |
| 134 if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) { | |
| 135 printf("Error combining tessdata files into %s\n", output_file.c_str()); | |
| 136 } else { | |
| 137 printf("Output %s created successfully.\n", output_file.c_str()); | |
| 138 } | |
| 139 } else if (argc >= 4 && | |
| 140 (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) { | |
| 141 // Initialize TessdataManager with the data in the given traineddata file. | |
| 142 if (!tm.Init(argv[2])) { | |
| 143 tprintf("Failed to read %s\n", argv[2]); | |
| 144 return EXIT_FAILURE; | |
| 145 } | |
| 146 printf("Extracting tessdata components from %s\n", argv[2]); | |
| 147 if (strcmp(argv[1], "-e") == 0) { | |
| 148 for (i = 3; i < argc; ++i) { | |
| 149 errno = 0; | |
| 150 if (tm.ExtractToFile(argv[i])) { | |
| 151 printf("Wrote %s\n", argv[i]); | |
| 152 } else if (errno == 0) { | |
| 153 printf( | |
| 154 "Not extracting %s, since this component" | |
| 155 " is not present\n", | |
| 156 argv[i]); | |
| 157 return EXIT_FAILURE; | |
| 158 } else { | |
| 159 printf("Error, could not extract %s: %s\n", argv[i], strerror(errno)); | |
| 160 return EXIT_FAILURE; | |
| 161 } | |
| 162 } | |
| 163 } else { // extract all the components | |
| 164 for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) { | |
| 165 std::string filename = argv[3]; | |
| 166 char *last = &argv[3][strlen(argv[3]) - 1]; | |
| 167 if (*last != '.') { | |
| 168 filename += '.'; | |
| 169 } | |
| 170 filename += tesseract::kTessdataFileSuffixes[i]; | |
| 171 errno = 0; | |
| 172 if (tm.ExtractToFile(filename.c_str())) { | |
| 173 printf("Wrote %s\n", filename.c_str()); | |
| 174 } else if (errno != 0) { | |
| 175 printf("Error, could not extract %s: %s\n", filename.c_str(), | |
| 176 strerror(errno)); | |
| 177 return EXIT_FAILURE; | |
| 178 } | |
| 179 } | |
| 180 } | |
| 181 } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) { | |
| 182 // Rename the current traineddata file to a temporary name. | |
| 183 const char *new_traineddata_filename = argv[2]; | |
| 184 std::string traineddata_filename = new_traineddata_filename; | |
| 185 traineddata_filename += ".__tmp__"; | |
| 186 if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) { | |
| 187 tprintf("Failed to create a temporary file %s\n", | |
| 188 traineddata_filename.c_str()); | |
| 189 return EXIT_FAILURE; | |
| 190 } | |
| 191 | |
| 192 // Initialize TessdataManager with the data in the given traineddata file. | |
| 193 tm.Init(traineddata_filename.c_str()); | |
| 194 | |
| 195 // Write the updated traineddata file. | |
| 196 tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3); | |
| 197 } else if (argc == 3 && strcmp(argv[1], "-c") == 0) { | |
| 198 if (!tm.Init(argv[2])) { | |
| 199 tprintf("Failed to read %s\n", argv[2]); | |
| 200 return EXIT_FAILURE; | |
| 201 } | |
| 202 tesseract::TFile fp; | |
| 203 if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) { | |
| 204 tprintf("No LSTM Component found in %s!\n", argv[2]); | |
| 205 return EXIT_FAILURE; | |
| 206 } | |
| 207 tesseract::LSTMRecognizer recognizer; | |
| 208 if (!recognizer.DeSerialize(&tm, &fp)) { | |
| 209 tprintf("Failed to deserialize LSTM in %s!\n", argv[2]); | |
| 210 return EXIT_FAILURE; | |
| 211 } | |
| 212 recognizer.ConvertToInt(); | |
| 213 std::vector<char> lstm_data; | |
| 214 fp.OpenWrite(&lstm_data); | |
| 215 ASSERT_HOST(recognizer.Serialize(&tm, &fp)); | |
| 216 tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0], | |
| 217 lstm_data.size()); | |
| 218 if (!tm.SaveFile(argv[2], nullptr)) { | |
| 219 tprintf("Failed to write modified traineddata:%s!\n", argv[2]); | |
| 220 return EXIT_FAILURE; | |
| 221 } | |
| 222 } else if (argc == 3 && strcmp(argv[1], "-d") == 0) { | |
| 223 return list_components(tm, argv[2]); | |
| 224 } else if (argc == 3 && strcmp(argv[1], "-l") == 0) { | |
| 225 return list_network(tm, argv[2]); | |
| 226 } else if (argc == 3 && strcmp(argv[1], "-dl") == 0) { | |
| 227 int result = list_components(tm, argv[2]); | |
| 228 if (result == EXIT_SUCCESS) { | |
| 229 result = list_network(tm, nullptr); | |
| 230 } | |
| 231 return result; | |
| 232 } else if (argc == 3 && strcmp(argv[1], "-ld") == 0) { | |
| 233 int result = list_network(tm, argv[2]); | |
| 234 if (result == EXIT_SUCCESS) { | |
| 235 result = list_components(tm, nullptr); | |
| 236 } | |
| 237 return result; | |
| 238 } else { | |
| 239 printf( | |
| 240 "Usage for combining tessdata components:\n" | |
| 241 " %s language_data_path_prefix\n" | |
| 242 " (e.g. %s tessdata/eng.)\n\n", | |
| 243 argv[0], argv[0]); | |
| 244 printf( | |
| 245 "Usage for extracting tessdata components:\n" | |
| 246 " %s -e traineddata_file [output_component_file...]\n" | |
| 247 " (e.g. %s -e eng.traineddata eng.unicharset)\n\n", | |
| 248 argv[0], argv[0]); | |
| 249 printf( | |
| 250 "Usage for overwriting tessdata components:\n" | |
| 251 " %s -o traineddata_file [input_component_file...]\n" | |
| 252 " (e.g. %s -o eng.traineddata eng.unicharset)\n\n", | |
| 253 argv[0], argv[0]); | |
| 254 printf( | |
| 255 "Usage for unpacking all tessdata components:\n" | |
| 256 " %s -u traineddata_file output_path_prefix\n" | |
| 257 " (e.g. %s -u eng.traineddata tmp/eng.)\n\n", | |
| 258 argv[0], argv[0]); | |
| 259 printf( | |
| 260 "Usage for listing the network information\n" | |
| 261 " %s -l traineddata_file\n" | |
| 262 " (e.g. %s -l eng.traineddata)\n\n", | |
| 263 argv[0], argv[0]); | |
| 264 printf( | |
| 265 "Usage for listing directory of components:\n" | |
| 266 " %s -d traineddata_file\n\n", | |
| 267 argv[0]); | |
| 268 printf( | |
| 269 "NOTE: Above two flags may combined as -dl or -ld to get both outputs" | |
| 270 ); | |
| 271 printf( | |
| 272 "Usage for compacting LSTM component to int:\n" | |
| 273 " %s -c traineddata_file\n", | |
| 274 argv[0]); | |
| 275 return EXIT_FAILURE; | |
| 276 } | |
| 277 tm.Directory(); | |
| 278 return EXIT_SUCCESS; | |
| 279 } |
