Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/lang_model_helpers.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2017 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 // Purpose: Collection of convenience functions to simplify creation of the | |
| 4 // unicharset, recoder, and dawgs for an LSTM model. | |
| 5 | |
| 6 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 // you may not use this file except in compliance with the License. | |
| 8 // You may obtain a copy of the License at | |
| 9 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 // Unless required by applicable law or agreed to in writing, software | |
| 11 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 // See the License for the specific language governing permissions and | |
| 14 // limitations under the License. | |
| 15 | |
| 16 #include "lang_model_helpers.h" | |
| 17 | |
| 18 #include "dawg.h" | |
| 19 #include "fileio.h" | |
| 20 #include "tessdatamanager.h" | |
| 21 #include "trie.h" | |
| 22 #include "unicharcompress.h" | |
| 23 | |
| 24 #include <cstdlib> | |
| 25 | |
| 26 #include <sys/stat.h> | |
| 27 #include <sys/types.h> | |
| 28 | |
| 29 #if defined(_WIN32) | |
| 30 # include <direct.h> | |
| 31 #endif | |
| 32 | |
| 33 namespace tesseract { | |
| 34 | |
| 35 // Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data | |
| 36 // to the file, using writer if not null, otherwise, a default writer. | |
| 37 // Default writer will overwrite any existing file, but a supplied writer | |
| 38 // can do its own thing. If lang is empty, returns true but does nothing. | |
| 39 // NOTE that suffix should contain any required . for the filename. | |
| 40 bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix, | |
| 41 const std::vector<char> &data, FileWriter writer) { | |
| 42 if (lang.empty()) { | |
| 43 return true; | |
| 44 } | |
| 45 std::string dirname = output_dir + "/" + lang; | |
| 46 // Attempt to make the directory, but ignore errors, as it may not be a | |
| 47 // standard filesystem, and the writer will complain if not successful. | |
| 48 #if defined(_WIN32) | |
| 49 _mkdir(dirname.c_str()); | |
| 50 #else | |
| 51 mkdir(dirname.c_str(), S_IRWXU | S_IRWXG); | |
| 52 #endif | |
| 53 std::string filename = dirname + "/" + lang + suffix; | |
| 54 if (writer == nullptr) { | |
| 55 return SaveDataToFile(data, filename.c_str()); | |
| 56 } else { | |
| 57 return (*writer)(data, filename.c_str()); | |
| 58 } | |
| 59 } | |
| 60 | |
| 61 // Helper reads a file with optional reader and returns a string. | |
| 62 // On failure emits a warning message and returns an empty string. | |
| 63 std::string ReadFile(const std::string &filename, FileReader reader) { | |
| 64 if (filename.empty()) { | |
| 65 return std::string(); | |
| 66 } | |
| 67 std::vector<char> data; | |
| 68 bool read_result; | |
| 69 if (reader == nullptr) { | |
| 70 read_result = LoadDataFromFile(filename.c_str(), &data); | |
| 71 } else { | |
| 72 read_result = (*reader)(filename.c_str(), &data); | |
| 73 } | |
| 74 if (read_result) { | |
| 75 return std::string(&data[0], data.size()); | |
| 76 } | |
| 77 tprintf("Failed to read data from: %s\n", filename.c_str()); | |
| 78 return std::string(); | |
| 79 } | |
| 80 | |
| 81 // Helper writes the unicharset to file and to the traineddata. | |
| 82 bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir, | |
| 83 const std::string &lang, FileWriter writer, TessdataManager *traineddata) { | |
| 84 std::vector<char> unicharset_data; | |
| 85 TFile fp; | |
| 86 fp.OpenWrite(&unicharset_data); | |
| 87 if (!unicharset.save_to_file(&fp)) { | |
| 88 return false; | |
| 89 } | |
| 90 traineddata->OverwriteEntry(TESSDATA_LSTM_UNICHARSET, &unicharset_data[0], | |
| 91 unicharset_data.size()); | |
| 92 return WriteFile(output_dir, lang, ".unicharset", unicharset_data, writer); | |
| 93 } | |
| 94 | |
| 95 // Helper creates the recoder and writes it to the traineddata, and a human- | |
| 96 // readable form to file. | |
| 97 bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir, | |
| 98 const std::string &lang, FileWriter writer, std::string *radical_table_data, | |
| 99 TessdataManager *traineddata) { | |
| 100 UnicharCompress recoder; | |
| 101 // Where the unicharset is carefully setup already to contain a good | |
| 102 // compact encoding, use a pass-through recoder that does nothing. | |
| 103 // For scripts that have a large number of unicodes (Han, Hangul) we want | |
| 104 // to use the recoder to compress the symbol space by re-encoding each | |
| 105 // unicode as multiple codes from a smaller 'alphabet' that are related to the | |
| 106 // shapes in the character. Hangul Jamo is a perfect example of this. | |
| 107 // See the Hangul Syllables section, sub-section "Equivalence" in: | |
| 108 // http://www.unicode.org/versions/Unicode10.0.0/ch18.pdf | |
| 109 if (pass_through) { | |
| 110 recoder.SetupPassThrough(unicharset); | |
| 111 } else { | |
| 112 int null_char = unicharset.has_special_codes() ? UNICHAR_BROKEN : unicharset.size(); | |
| 113 tprintf("Null char=%d\n", null_char); | |
| 114 if (!recoder.ComputeEncoding(unicharset, null_char, radical_table_data)) { | |
| 115 tprintf("Creation of encoded unicharset failed!!\n"); | |
| 116 return false; | |
| 117 } | |
| 118 } | |
| 119 TFile fp; | |
| 120 std::vector<char> recoder_data; | |
| 121 fp.OpenWrite(&recoder_data); | |
| 122 if (!recoder.Serialize(&fp)) { | |
| 123 return false; | |
| 124 } | |
| 125 traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0], recoder_data.size()); | |
| 126 std::string encoding = recoder.GetEncodingAsString(unicharset); | |
| 127 recoder_data.resize(encoding.length(), 0); | |
| 128 memcpy(&recoder_data[0], &encoding[0], encoding.length()); | |
| 129 std::string suffix; | |
| 130 suffix += ".charset_size=" + std::to_string(recoder.code_range()); | |
| 131 suffix += ".txt"; | |
| 132 return WriteFile(output_dir, lang, suffix, recoder_data, writer); | |
| 133 } | |
| 134 | |
| 135 // Helper builds a dawg from the given words, using the unicharset as coding, | |
| 136 // and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata. | |
| 137 static bool WriteDawg(const std::vector<std::string> &words, const UNICHARSET &unicharset, | |
| 138 Trie::RTLReversePolicy reverse_policy, TessdataType file_type, | |
| 139 TessdataManager *traineddata) { | |
| 140 // The first 3 arguments are not used in this case. | |
| 141 Trie trie(DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, unicharset.size(), 0); | |
| 142 trie.add_word_list(words, unicharset, reverse_policy); | |
| 143 tprintf("Reducing Trie to SquishedDawg\n"); | |
| 144 std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg()); | |
| 145 if (dawg == nullptr || dawg->NumEdges() == 0) { | |
| 146 return false; | |
| 147 } | |
| 148 TFile fp; | |
| 149 std::vector<char> dawg_data; | |
| 150 fp.OpenWrite(&dawg_data); | |
| 151 if (!dawg->write_squished_dawg(&fp)) { | |
| 152 return false; | |
| 153 } | |
| 154 traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size()); | |
| 155 return true; | |
| 156 } | |
| 157 | |
| 158 // Builds and writes the dawgs, given a set of words, punctuation | |
| 159 // patterns, number patterns, to the traineddata. Encoding uses the given | |
| 160 // unicharset, and the punc dawgs is reversed if lang_is_rtl. | |
| 161 static bool WriteDawgs(const std::vector<std::string> &words, const std::vector<std::string> &puncs, | |
| 162 const std::vector<std::string> &numbers, bool lang_is_rtl, | |
| 163 const UNICHARSET &unicharset, TessdataManager *traineddata) { | |
| 164 if (puncs.empty()) { | |
| 165 tprintf("Must have non-empty puncs list to use language models!!\n"); | |
| 166 return false; | |
| 167 } | |
| 168 // For each of the dawg types, make the dawg, and write to traineddata. | |
| 169 // Dawgs are reversed as follows: | |
| 170 // Words: According to the word content. | |
| 171 // Puncs: According to lang_is_rtl. | |
| 172 // Numbers: Never. | |
| 173 // System dawg (main wordlist). | |
| 174 if (!words.empty() && !WriteDawg(words, unicharset, Trie::RRP_REVERSE_IF_HAS_RTL, | |
| 175 TESSDATA_LSTM_SYSTEM_DAWG, traineddata)) { | |
| 176 return false; | |
| 177 } | |
| 178 // punc/punc-dawg. | |
| 179 Trie::RTLReversePolicy reverse_policy = | |
| 180 lang_is_rtl ? Trie::RRP_FORCE_REVERSE : Trie::RRP_DO_NO_REVERSE; | |
| 181 if (!WriteDawg(puncs, unicharset, reverse_policy, TESSDATA_LSTM_PUNC_DAWG, traineddata)) { | |
| 182 return false; | |
| 183 } | |
| 184 // numbers/number-dawg. | |
| 185 if (!numbers.empty() && !WriteDawg(numbers, unicharset, Trie::RRP_DO_NO_REVERSE, | |
| 186 TESSDATA_LSTM_NUMBER_DAWG, traineddata)) { | |
| 187 return false; | |
| 188 } | |
| 189 return true; | |
| 190 } | |
| 191 | |
| 192 // The main function for combine_lang_model.cpp. | |
| 193 // Returns EXIT_SUCCESS or EXIT_FAILURE for error. | |
| 194 int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, | |
| 195 const std::string &version_str, const std::string &output_dir, | |
| 196 const std::string &lang, bool pass_through_recoder, | |
| 197 const std::vector<std::string> &words, const std::vector<std::string> &puncs, | |
| 198 const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader, | |
| 199 FileWriter writer) { | |
| 200 // Build the traineddata file. | |
| 201 TessdataManager traineddata; | |
| 202 if (!version_str.empty()) { | |
| 203 traineddata.SetVersionString(traineddata.VersionString() + ":" + version_str); | |
| 204 } | |
| 205 // Unicharset and recoder. | |
| 206 if (!WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) { | |
| 207 tprintf("Error writing unicharset!!\n"); | |
| 208 return EXIT_FAILURE; | |
| 209 } else { | |
| 210 tprintf("Config file is optional, continuing...\n"); | |
| 211 } | |
| 212 // If there is a config file, read it and add to traineddata. | |
| 213 std::string config_filename = script_dir + "/" + lang + "/" + lang + ".config"; | |
| 214 std::string config_file = ReadFile(config_filename, reader); | |
| 215 if (config_file.length() > 0) { | |
| 216 traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0], config_file.length()); | |
| 217 } | |
| 218 std::string radical_filename = script_dir + "/radical-stroke.txt"; | |
| 219 std::string radical_data = ReadFile(radical_filename, reader); | |
| 220 if (radical_data.empty()) { | |
| 221 tprintf("Error reading radical code table %s\n", radical_filename.c_str()); | |
| 222 return EXIT_FAILURE; | |
| 223 } | |
| 224 if (!WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer, &radical_data, | |
| 225 &traineddata)) { | |
| 226 tprintf("Error writing recoder!!\n"); | |
| 227 } | |
| 228 if (!words.empty() || !puncs.empty() || !numbers.empty()) { | |
| 229 if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset, &traineddata)) { | |
| 230 tprintf("Error during conversion of wordlists to DAWGs!!\n"); | |
| 231 return EXIT_FAILURE; | |
| 232 } | |
| 233 } | |
| 234 | |
| 235 // Traineddata file. | |
| 236 std::vector<char> traineddata_data; | |
| 237 traineddata.Serialize(&traineddata_data); | |
| 238 if (!WriteFile(output_dir, lang, ".traineddata", traineddata_data, writer)) { | |
| 239 tprintf("Error writing output traineddata file!!\n"); | |
| 240 return EXIT_FAILURE; | |
| 241 } | |
| 242 tprintf("Created %s/%s/%s.traineddata", output_dir.c_str(), lang.c_str(), lang.c_str()); | |
| 243 return EXIT_SUCCESS; | |
| 244 } | |
| 245 | |
| 246 } // namespace tesseract |
