Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/training/unicharset_extractor.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line source
/////////////////////////////////////////////////////////////////////// // File: unicharset_extractor.cpp // Description: Unicode character/ligature set extractor. // Author: Thomas Kielbus // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// // Given a list of box files or text files on the command line, this program // normalizes the text according to command-line options and generates // a unicharset. #include <cstdlib> #include <filesystem> #include "boxread.h" #include "commandlineflags.h" #include "commontraining.h" // CheckSharedLibraryVersion #include "lang_model_helpers.h" #include "normstrngs.h" #include "unicharset.h" #include "unicharset_training_utils.h" using namespace tesseract; static STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path"); static INT_PARAM_FLAG(norm_mode, 1, "Normalization mode: 1=Combine graphemes, " "2=Split graphemes, 3=Pure unicode"); namespace tesseract { // Helper normalizes and segments the given strings according to norm_mode, and // adds the segmented parts to unicharset. static void AddStringsToUnicharset(const std::vector<std::string> &strings, int norm_mode, UNICHARSET *unicharset) { for (const auto &string : strings) { std::vector<std::string> normalized; if (NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, static_cast<GraphemeNormMode>(norm_mode), /*report_errors*/ true, string.c_str(), &normalized)) { for (const std::string &normed : normalized) { // normed is a UTF-8 encoded string if (normed.empty() || IsUTF8Whitespace(normed.c_str())) { continue; } unicharset->unichar_insert(normed.c_str()); } } else { tprintf("Normalization failed for string '%s'\n", string.c_str()); } } } static int Main(int argc, char **argv) { UNICHARSET unicharset; // Load input files for (int arg = 1; arg < argc; ++arg) { std::filesystem::path filePath = argv[arg]; std::string file_data = tesseract::ReadFile(argv[arg]); if (file_data.empty()) { continue; } std::vector<std::string> texts; if (filePath.extension() == ".box") { tprintf("Extracting unicharset from box file %s\n", argv[arg]); bool res = ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0], /*continue_on_failure*/ false, /*boxes*/ nullptr, &texts, /*box_texts*/ nullptr, /*pages*/ nullptr); if (!res) { tprintf("Cannot read box data from '%s'\n", argv[arg]); return EXIT_FAILURE; } } else { tprintf("Extracting unicharset from plain text file %s\n", argv[arg]); texts.clear(); texts = split(file_data, '\n'); } AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset); } SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false, &unicharset); // Write unicharset file. if (unicharset.save_to_file(FLAGS_output_unicharset.c_str())) { tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str()); } else { tprintf("Cannot save unicharset file %s\n", FLAGS_output_unicharset.c_str()); return EXIT_FAILURE; } return EXIT_SUCCESS; } } // namespace tesseract int main(int argc, char **argv) { tesseract::CheckSharedLibraryVersion(); if (argc > 1) { tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); } if (argc < 2) { tprintf( "Usage: %s [--output_unicharset filename] [--norm_mode mode]" " box_or_text_file [...]\n", argv[0]); tprintf("Where mode means:\n"); tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n"); tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n"); tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n"); tprintf("Reads box or plain text files to extract the unicharset.\n"); return EXIT_FAILURE; } return tesseract::Main(argc, argv); }
