Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccmain/tessedit.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccmain/tessedit.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,464 @@ +/********************************************************************** + * File: tessedit.cpp (Formerly tessedit.c) + * Description: (Previously) Main program for merge of tess and editor. + * Now just code to load the language model and various + * engine-specific data files. + * Author: Ray Smith + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "control.h" +#include "matchdefs.h" +#include "pageres.h" +#include "params.h" +#include "stopper.h" +#include "tesseractclass.h" +#include "tessvars.h" +#include "tprintf.h" +#ifndef DISABLED_LEGACY_ENGINE +# include "chop.h" +# include "intmatcher.h" +# include "reject.h" +#endif +#include "lstmrecognizer.h" + +namespace tesseract { + +// Read a "config" file containing a set of variable, value pairs. +// Searches the standard places: tessdata/configs, tessdata/tessconfigs +// and also accepts a relative or absolute path name. +void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) { + std::string path = datadir; + path += "configs/"; + path += filename; + FILE *fp; + if ((fp = fopen(path.c_str(), "rb")) != nullptr) { + fclose(fp); + } else { + path = datadir; + path += "tessconfigs/"; + path += filename; + if ((fp = fopen(path.c_str(), "rb")) != nullptr) { + fclose(fp); + } else { + path = filename; + } + } + ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params()); +} + +// Returns false if a unicharset file for the specified language was not found +// or was invalid. +// This function initializes TessdataManager. After TessdataManager is +// no longer needed, TessdataManager::End() should be called. +// +// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless +// it is OEM_DEFAULT, in which case the value of the variable will be obtained +// from the language-specific config file (stored in [lang].traineddata), from +// the config files specified on the command line or left as the default +// OEM_TESSERACT_ONLY if none of the configs specify this variable. +bool Tesseract::init_tesseract_lang_data(const std::string &arg0, + const std::string &language, OcrEngineMode oem, + char **configs, int configs_size, + const std::vector<std::string> *vars_vec, + const std::vector<std::string> *vars_values, + bool set_only_non_debug_params, TessdataManager *mgr) { + // Set the language data path prefix + lang = !language.empty() ? language : "eng"; + language_data_path_prefix = datadir; + language_data_path_prefix += lang; + language_data_path_prefix += "."; + + // Initialize TessdataManager. + std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix; + if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) { + tprintf("Error opening data file %s\n", tessdata_path.c_str()); + tprintf( + "Please make sure the TESSDATA_PREFIX environment variable is set" + " to your \"tessdata\" directory.\n"); + return false; + } +#ifdef DISABLED_LEGACY_ENGINE + tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY); +#else + if (oem == OEM_DEFAULT) { + // Set the engine mode from availability, which can then be overridden by + // the config file when we read it below. + if (!mgr->IsLSTMAvailable()) { + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); + } else if (!mgr->IsBaseAvailable()) { + tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY); + } else { + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED); + } + } +#endif // ndef DISABLED_LEGACY_ENGINE + + // If a language specific config file (lang.config) exists, load it in. + TFile fp; + if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) { + ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params()); + } + + SetParamConstraint set_params_constraint = + set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE; + // Load tesseract variables from config files. This is done after loading + // language-specific variables from [lang].traineddata file, so that custom + // config files can override values in [lang].traineddata file. + for (int i = 0; i < configs_size; ++i) { + read_config_file(configs[i], set_params_constraint); + } + + // Set params specified in vars_vec (done after setting params from config + // files, so that params in vars_vec can override those from files). + if (vars_vec != nullptr && vars_values != nullptr) { + for (unsigned i = 0; i < vars_vec->size(); ++i) { + if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(), + set_params_constraint, this->params())) { + tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str()); + } + } + } + + if (!tessedit_write_params_to_file.empty()) { + FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb"); + if (params_file != nullptr) { + ParamUtils::PrintParams(params_file, this->params()); + fclose(params_file); + } else { + tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str()); + } + } + +#ifndef DISABLED_LEGACY_ENGINE + // Determine which ocr engine(s) should be loaded and used for recognition. + if (oem != OEM_DEFAULT) { + tessedit_ocr_engine_mode.set_value(oem); + } +#endif + + // If we are only loading the config file (and so not planning on doing any + // recognition) then there's nothing else do here. + if (tessedit_init_config_only) { + return true; + } + +// The various OcrEngineMode settings (see tesseract/publictypes.h) determine +// which engine-specific data files need to be loaded. If LSTM_ONLY is +// requested, the base Tesseract files are *Not* required. +#ifdef DISABLED_LEGACY_ENGINE + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { +#else + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY || + tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { +#endif // ndef DISABLED_LEGACY_ENGINE + if (mgr->IsComponentAvailable(TESSDATA_LSTM)) { + lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str()); + ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr)); + } else { + tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n"); + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); + } + } + + // Load the unicharset + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { + // Avoid requiring a unicharset when we aren't running base tesseract. + unicharset.CopyFrom(lstm_recognizer_->GetUnicharset()); + } +#ifndef DISABLED_LEGACY_ENGINE + else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) { + tprintf( + "Error: Tesseract (legacy) engine requested, but components are " + "not present in %s!!\n", + tessdata_path.c_str()); + return false; + } +#endif // ndef DISABLED_LEGACY_ENGINE + if (unicharset.size() > MAX_NUM_CLASSES) { + tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); + return false; + } + right_to_left_ = unicharset.major_right_to_left(); + +#ifndef DISABLED_LEGACY_ENGINE + + // Setup initial unichar ambigs table and read universal ambigs. + UNICHARSET encoder_unicharset; + encoder_unicharset.CopyFrom(unicharset); + unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption); + unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset); + + if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) { + unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level, + use_ambigs_for_adaption, &unicharset); + } + + // Init ParamsModel. + // Load pass1 and pass2 weights (for now these two sets are the same, but in + // the future separate sets of weights can be generated). + for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { + language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p)); + if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) { + if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) { + return false; + } + } + } +#endif // ndef DISABLED_LEGACY_ENGINE + + return true; +} + +// Helper returns true if the given string is in the vector of strings. +static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) { + for (const auto &i : str_list) { + if (i == str) { + return true; + } + } + return false; +} + +// Parse a string of the form [~]<lang>[+[~]<lang>]*. +// Langs with no prefix get appended to to_load, provided they +// are not in there already. +// Langs with ~ prefix get appended to not_to_load, provided they are not in +// there already. +void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load, + std::vector<std::string> *not_to_load) { + std::string remains(lang_str); + // Look whether the model file uses a prefix which must be applied to + // included model files as well. + std::string prefix; + size_t found = lang.find_last_of('/'); + if (found != std::string::npos) { + // A prefix was found. + prefix = lang.substr(0, found + 1); + } + while (!remains.empty()) { + // Find the start of the lang code and which vector to add to. + const char *start = remains.c_str(); + while (*start == '+') { + ++start; + } + std::vector<std::string> *target = to_load; + if (*start == '~') { + target = not_to_load; + ++start; + } + // Find the index of the end of the lang code in string start. + int end = strlen(start); + const char *plus = strchr(start, '+'); + if (plus != nullptr && plus - start < end) { + end = plus - start; + } + std::string lang_code(start); + lang_code.resize(end); + std::string next(start + end); + remains = std::move(next); + lang_code = prefix + lang_code; + // Check whether lang_code is already in the target vector and add. + if (!IsStrInList(lang_code, *target)) { + target->push_back(lang_code); + } + } +} + +// Initialize for potentially a set of languages defined by the language +// string and recursively any additional languages required by any language +// traineddata file (via tessedit_load_sublangs in its config) that is loaded. +// See init_tesseract_internal for args. +int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase, + const std::string &language, OcrEngineMode oem, char **configs, + int configs_size, const std::vector<std::string> *vars_vec, + const std::vector<std::string> *vars_values, + bool set_only_non_debug_params, TessdataManager *mgr) { + std::vector<std::string> langs_to_load; + std::vector<std::string> langs_not_to_load; + ParseLanguageString(language, &langs_to_load, &langs_not_to_load); + + for (auto *lang : sub_langs_) { + delete lang; + } + + // Set the basename, compute the data directory. + main_setup(arg0, textbase); + + sub_langs_.clear(); + // Find the first loadable lang and load into this. + // Add any languages that this language requires + bool loaded_primary = false; + // Load the rest into sub_langs_. + // WARNING: A range based for loop does not work here because langs_to_load + // might be changed in the loop when a new submodel is found. + for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { + auto &lang_to_load = langs_to_load[lang_index]; + if (!IsStrInList(lang_to_load, langs_not_to_load)) { + const char *lang_str = lang_to_load.c_str(); + Tesseract *tess_to_init; + if (!loaded_primary) { + tess_to_init = this; + } else { + tess_to_init = new Tesseract; + tess_to_init->main_setup(arg0, textbase); + } + + int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs, + configs_size, vars_vec, vars_values, + set_only_non_debug_params, mgr); + // Forget that language, but keep any reader we were given. + mgr->Clear(); + + if (!loaded_primary) { + if (result < 0) { + tprintf("Failed loading language '%s'\n", lang_str); + } else { + ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load, + &langs_not_to_load); + loaded_primary = true; + } + } else { + if (result < 0) { + tprintf("Failed loading language '%s'\n", lang_str); + delete tess_to_init; + } else { + sub_langs_.push_back(tess_to_init); + // Add any languages that this language requires + ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load, + &langs_not_to_load); + } + } + } + } + if (!loaded_primary && !langs_to_load.empty()) { + tprintf("Tesseract couldn't load any languages!\n"); + return -1; // Couldn't load any language! + } +#ifndef DISABLED_LEGACY_ENGINE + if (!sub_langs_.empty()) { + // In multilingual mode word ratings have to be directly comparable, + // so use the same language model weights for all languages: + // use the primary language's params model if + // tessedit_use_primary_params_model is set, + // otherwise use default language model weights. + if (tessedit_use_primary_params_model) { + for (auto &sub_lang : sub_langs_) { + sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel()); + } + tprintf("Using params model of the primary language\n"); + } else { + this->language_model_->getParamsModel().Clear(); + for (auto &sub_lang : sub_langs_) { + sub_lang->language_model_->getParamsModel().Clear(); + } + } + } + + SetupUniversalFontIds(); +#endif // ndef DISABLED_LEGACY_ENGINE + return 0; +} + +// Common initialization for a single language. +// arg0 is the datapath for the tessdata directory, which could be the +// path of the tessdata directory with no trailing /, or (if tessdata +// lives in the same directory as the executable, the path of the executable, +// hence the name arg0. +// textbase is an optional output file basename (used only for training) +// language is the language code to load. +// oem controls which engine(s) will operate on the image +// configs (argv) is an array of config filenames to load variables from. +// May be nullptr. +// configs_size (argc) is the number of elements in configs. +// vars_vec is an optional vector of variables to set. +// vars_values is an optional corresponding vector of values for the variables +// in vars_vec. +// If set_only_non_debug_params is true, only params that do not contain +// "debug" in the name will be set. +int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase, + const std::string &language, OcrEngineMode oem, + char **configs, int configs_size, + const std::vector<std::string> *vars_vec, + const std::vector<std::string> *vars_values, + bool set_only_non_debug_params, TessdataManager *mgr) { + if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec, + vars_values, set_only_non_debug_params, mgr)) { + return -1; + } + if (tessedit_init_config_only) { + return 0; + } + // If only LSTM will be used, skip loading Tesseract classifier's + // pre-trained templates and dictionary. + bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY; + program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr); + return 0; // Normal exit +} + +#ifndef DISABLED_LEGACY_ENGINE + +// Helper builds the all_fonts table by adding new fonts from new_fonts. +static void CollectFonts(const UnicityTable<FontInfo> &new_fonts, + UnicityTable<FontInfo> *all_fonts) { + for (int i = 0; i < new_fonts.size(); ++i) { + // UnicityTable uniques as we go. + all_fonts->push_back(new_fonts.at(i)); + } +} + +// Helper assigns an id to lang_fonts using the index in all_fonts table. +static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) { + for (int i = 0; i < lang_fonts->size(); ++i) { + auto index = all_fonts.get_index(lang_fonts->at(i)); + lang_fonts->at(i).universal_id = index; + } +} + +// Set the universal_id member of each font to be unique among all +// instances of the same font loaded. +void Tesseract::SetupUniversalFontIds() { + // Note that we can get away with bitwise copying FontInfo in + // all_fonts, as it is a temporary structure and we avoid setting the + // delete callback. + UnicityTable<FontInfo> all_fonts; + + // Create the universal ID table. + CollectFonts(get_fontinfo_table(), &all_fonts); + for (auto &sub_lang : sub_langs_) { + CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts); + } + // Assign ids from the table to each font table. + AssignIds(all_fonts, &get_fontinfo_table()); + for (auto &sub_lang : sub_langs_) { + AssignIds(all_fonts, &sub_lang->get_fontinfo_table()); + } + font_table_size_ = all_fonts.size(); +} + +#endif // ndef DISABLED_LEGACY_ENGINE + +void Tesseract::end_tesseract() { + end_recog(); +} + +/* Define command type identifiers */ + +enum CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT }; +} // namespace tesseract
