Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/unicharset/lang_model_helpers.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/unicharset/lang_model_helpers.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,246 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +// Purpose: Collection of convenience functions to simplify creation of the +// unicharset, recoder, and dawgs for an LSTM model. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lang_model_helpers.h" + +#include "dawg.h" +#include "fileio.h" +#include "tessdatamanager.h" +#include "trie.h" +#include "unicharcompress.h" + +#include <cstdlib> + +#include <sys/stat.h> +#include <sys/types.h> + +#if defined(_WIN32) +# include <direct.h> +#endif + +namespace tesseract { + +// Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data +// to the file, using writer if not null, otherwise, a default writer. +// Default writer will overwrite any existing file, but a supplied writer +// can do its own thing. If lang is empty, returns true but does nothing. +// NOTE that suffix should contain any required . for the filename. +bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix, + const std::vector<char> &data, FileWriter writer) { + if (lang.empty()) { + return true; + } + std::string dirname = output_dir + "/" + lang; + // Attempt to make the directory, but ignore errors, as it may not be a + // standard filesystem, and the writer will complain if not successful. +#if defined(_WIN32) + _mkdir(dirname.c_str()); +#else + mkdir(dirname.c_str(), S_IRWXU | S_IRWXG); +#endif + std::string filename = dirname + "/" + lang + suffix; + if (writer == nullptr) { + return SaveDataToFile(data, filename.c_str()); + } else { + return (*writer)(data, filename.c_str()); + } +} + +// Helper reads a file with optional reader and returns a string. +// On failure emits a warning message and returns an empty string. +std::string ReadFile(const std::string &filename, FileReader reader) { + if (filename.empty()) { + return std::string(); + } + std::vector<char> data; + bool read_result; + if (reader == nullptr) { + read_result = LoadDataFromFile(filename.c_str(), &data); + } else { + read_result = (*reader)(filename.c_str(), &data); + } + if (read_result) { + return std::string(&data[0], data.size()); + } + tprintf("Failed to read data from: %s\n", filename.c_str()); + return std::string(); +} + +// Helper writes the unicharset to file and to the traineddata. +bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir, + const std::string &lang, FileWriter writer, TessdataManager *traineddata) { + std::vector<char> unicharset_data; + TFile fp; + fp.OpenWrite(&unicharset_data); + if (!unicharset.save_to_file(&fp)) { + return false; + } + traineddata->OverwriteEntry(TESSDATA_LSTM_UNICHARSET, &unicharset_data[0], + unicharset_data.size()); + return WriteFile(output_dir, lang, ".unicharset", unicharset_data, writer); +} + +// Helper creates the recoder and writes it to the traineddata, and a human- +// readable form to file. +bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir, + const std::string &lang, FileWriter writer, std::string *radical_table_data, + TessdataManager *traineddata) { + UnicharCompress recoder; + // Where the unicharset is carefully setup already to contain a good + // compact encoding, use a pass-through recoder that does nothing. + // For scripts that have a large number of unicodes (Han, Hangul) we want + // to use the recoder to compress the symbol space by re-encoding each + // unicode as multiple codes from a smaller 'alphabet' that are related to the + // shapes in the character. Hangul Jamo is a perfect example of this. + // See the Hangul Syllables section, sub-section "Equivalence" in: + // http://www.unicode.org/versions/Unicode10.0.0/ch18.pdf + if (pass_through) { + recoder.SetupPassThrough(unicharset); + } else { + int null_char = unicharset.has_special_codes() ? UNICHAR_BROKEN : unicharset.size(); + tprintf("Null char=%d\n", null_char); + if (!recoder.ComputeEncoding(unicharset, null_char, radical_table_data)) { + tprintf("Creation of encoded unicharset failed!!\n"); + return false; + } + } + TFile fp; + std::vector<char> recoder_data; + fp.OpenWrite(&recoder_data); + if (!recoder.Serialize(&fp)) { + return false; + } + traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0], recoder_data.size()); + std::string encoding = recoder.GetEncodingAsString(unicharset); + recoder_data.resize(encoding.length(), 0); + memcpy(&recoder_data[0], &encoding[0], encoding.length()); + std::string suffix; + suffix += ".charset_size=" + std::to_string(recoder.code_range()); + suffix += ".txt"; + return WriteFile(output_dir, lang, suffix, recoder_data, writer); +} + +// Helper builds a dawg from the given words, using the unicharset as coding, +// and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata. +static bool WriteDawg(const std::vector<std::string> &words, const UNICHARSET &unicharset, + Trie::RTLReversePolicy reverse_policy, TessdataType file_type, + TessdataManager *traineddata) { + // The first 3 arguments are not used in this case. + Trie trie(DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, unicharset.size(), 0); + trie.add_word_list(words, unicharset, reverse_policy); + tprintf("Reducing Trie to SquishedDawg\n"); + std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg()); + if (dawg == nullptr || dawg->NumEdges() == 0) { + return false; + } + TFile fp; + std::vector<char> dawg_data; + fp.OpenWrite(&dawg_data); + if (!dawg->write_squished_dawg(&fp)) { + return false; + } + traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size()); + return true; +} + +// Builds and writes the dawgs, given a set of words, punctuation +// patterns, number patterns, to the traineddata. Encoding uses the given +// unicharset, and the punc dawgs is reversed if lang_is_rtl. +static bool WriteDawgs(const std::vector<std::string> &words, const std::vector<std::string> &puncs, + const std::vector<std::string> &numbers, bool lang_is_rtl, + const UNICHARSET &unicharset, TessdataManager *traineddata) { + if (puncs.empty()) { + tprintf("Must have non-empty puncs list to use language models!!\n"); + return false; + } + // For each of the dawg types, make the dawg, and write to traineddata. + // Dawgs are reversed as follows: + // Words: According to the word content. + // Puncs: According to lang_is_rtl. + // Numbers: Never. + // System dawg (main wordlist). + if (!words.empty() && !WriteDawg(words, unicharset, Trie::RRP_REVERSE_IF_HAS_RTL, + TESSDATA_LSTM_SYSTEM_DAWG, traineddata)) { + return false; + } + // punc/punc-dawg. + Trie::RTLReversePolicy reverse_policy = + lang_is_rtl ? Trie::RRP_FORCE_REVERSE : Trie::RRP_DO_NO_REVERSE; + if (!WriteDawg(puncs, unicharset, reverse_policy, TESSDATA_LSTM_PUNC_DAWG, traineddata)) { + return false; + } + // numbers/number-dawg. + if (!numbers.empty() && !WriteDawg(numbers, unicharset, Trie::RRP_DO_NO_REVERSE, + TESSDATA_LSTM_NUMBER_DAWG, traineddata)) { + return false; + } + return true; +} + +// The main function for combine_lang_model.cpp. +// Returns EXIT_SUCCESS or EXIT_FAILURE for error. +int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, + const std::string &version_str, const std::string &output_dir, + const std::string &lang, bool pass_through_recoder, + const std::vector<std::string> &words, const std::vector<std::string> &puncs, + const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader, + FileWriter writer) { + // Build the traineddata file. + TessdataManager traineddata; + if (!version_str.empty()) { + traineddata.SetVersionString(traineddata.VersionString() + ":" + version_str); + } + // Unicharset and recoder. + if (!WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) { + tprintf("Error writing unicharset!!\n"); + return EXIT_FAILURE; + } else { + tprintf("Config file is optional, continuing...\n"); + } + // If there is a config file, read it and add to traineddata. + std::string config_filename = script_dir + "/" + lang + "/" + lang + ".config"; + std::string config_file = ReadFile(config_filename, reader); + if (config_file.length() > 0) { + traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0], config_file.length()); + } + std::string radical_filename = script_dir + "/radical-stroke.txt"; + std::string radical_data = ReadFile(radical_filename, reader); + if (radical_data.empty()) { + tprintf("Error reading radical code table %s\n", radical_filename.c_str()); + return EXIT_FAILURE; + } + if (!WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer, &radical_data, + &traineddata)) { + tprintf("Error writing recoder!!\n"); + } + if (!words.empty() || !puncs.empty() || !numbers.empty()) { + if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset, &traineddata)) { + tprintf("Error during conversion of wordlists to DAWGs!!\n"); + return EXIT_FAILURE; + } + } + + // Traineddata file. + std::vector<char> traineddata_data; + traineddata.Serialize(&traineddata_data); + if (!WriteFile(output_dir, lang, ".traineddata", traineddata_data, writer)) { + tprintf("Error writing output traineddata file!!\n"); + return EXIT_FAILURE; + } + tprintf("Created %s/%s/%s.traineddata", output_dir.c_str(), lang.c_str(), lang.c_str()); + return EXIT_SUCCESS; +} + +} // namespace tesseract
