Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/lang_model_helpers.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2017 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 // Purpose: Collection of convenience functions to simplify creation of the | |
| 4 // unicharset, recoder, and dawgs for an LSTM model. | |
| 5 | |
| 6 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 // you may not use this file except in compliance with the License. | |
| 8 // You may obtain a copy of the License at | |
| 9 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 // Unless required by applicable law or agreed to in writing, software | |
| 11 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 // See the License for the specific language governing permissions and | |
| 14 // limitations under the License. | |
| 15 #ifndef TESSERACT_TRAINING_LANG_MODEL_HELPERS_H_ | |
| 16 #define TESSERACT_TRAINING_LANG_MODEL_HELPERS_H_ | |
| 17 | |
| 18 #include "export.h" | |
| 19 | |
| 20 #include "serialis.h" | |
| 21 #include "tessdatamanager.h" | |
| 22 #include "unicharset.h" | |
| 23 | |
| 24 #include <string> | |
| 25 | |
| 26 namespace tesseract { | |
| 27 | |
| 28 // Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data | |
| 29 // to the file, using writer if not null, otherwise, a default writer. | |
| 30 // Default writer will overwrite any existing file, but a supplied writer | |
| 31 // can do its own thing. If lang is empty, returns true but does nothing. | |
| 32 // NOTE that suffix should contain any required '.' for the filename. | |
| 33 TESS_UNICHARSET_TRAINING_API | |
| 34 bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix, | |
| 35 const std::vector<char> &data, FileWriter writer); | |
| 36 // Helper reads a file with optional reader and returns a string. | |
| 37 // On failure emits a warning message and returns an empty string. | |
| 38 TESS_UNICHARSET_TRAINING_API | |
| 39 std::string ReadFile(const std::string &filename, FileReader reader = nullptr); | |
| 40 | |
| 41 // Helper writes the unicharset to file and to the traineddata. | |
| 42 bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir, | |
| 43 const std::string &lang, FileWriter writer, TessdataManager *traineddata); | |
| 44 // Helper creates the recoder from the unicharset and writes it to the | |
| 45 // traineddata, with a human-readable form to file at: | |
| 46 // <output_dir>/<lang>/<lang>.charset_size=<num> for some num being the size | |
| 47 // of the re-encoded character set. The charset_size file is written using | |
| 48 // writer if not null, or using a default file writer otherwise, overwriting | |
| 49 // any existing content. | |
| 50 // If pass_through is true, then the recoder will be a no-op, passing the | |
| 51 // unicharset codes through unchanged. Otherwise, the recoder will "compress" | |
| 52 // the unicharset by encoding Hangul in Jamos, decomposing multi-unicode | |
| 53 // symbols into sequences of unicodes, and encoding Han using the data in the | |
| 54 // radical_table_data, which must be the content of the file: | |
| 55 // langdata/radical-stroke.txt. | |
| 56 bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir, | |
| 57 const std::string &lang, FileWriter writer, std::string *radical_table_data, | |
| 58 TessdataManager *traineddata); | |
| 59 | |
| 60 // The main function for combine_lang_model.cpp. | |
| 61 // Returns EXIT_SUCCESS or EXIT_FAILURE for error. | |
| 62 // unicharset: can be a hand-created file with incomplete fields. Its basic | |
| 63 // and script properties will be set before it is used. | |
| 64 // script_dir: should point to the langdata (github repo) directory. | |
| 65 // version_str: arbitrary version label. | |
| 66 // Output files will be written to <output_dir>/<lang>/<lang>.* | |
| 67 // If pass_through_recoder is true, the unicharset will be used unchanged as | |
| 68 // labels in the classifier, otherwise, the unicharset will be "compressed" to | |
| 69 // make the recognition task simpler and faster. | |
| 70 // The words/puncs/numbers lists may be all empty. If any are non-empty then | |
| 71 // puncs must be non-empty. | |
| 72 // lang_is_rtl indicates that the language is generally written from right | |
| 73 // to left (eg Arabic/Hebrew). | |
| 74 TESS_UNICHARSET_TRAINING_API | |
| 75 int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, | |
| 76 const std::string &version_str, const std::string &output_dir, | |
| 77 const std::string &lang, bool pass_through_recoder, | |
| 78 const std::vector<std::string> &words, const std::vector<std::string> &puncs, | |
| 79 const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader, | |
| 80 FileWriter writer); | |
| 81 | |
| 82 } // namespace tesseract | |
| 83 | |
| 84 #endif // TESSERACT_TRAINING_LANG_MODEL_HELPERS_H_ |
