Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/training/unicharset_extractor.cpp

comparison mupdf-source/thirdparty/tesseract/src/training/unicharset_extractor.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+///////////////////////////////////////////////////////////////////////
+// File:        unicharset_extractor.cpp
+// Description: Unicode character/ligature set extractor.
+// Author:      Thomas Kielbus
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+// Given a list of box files or text files on the command line, this program
+// normalizes the text according to command-line options and generates
+// a unicharset.
+#include <cstdlib>
+#include <filesystem>
+#include "boxread.h"
+#include "commandlineflags.h"
+#include "commontraining.h" // CheckSharedLibraryVersion
+#include "lang_model_helpers.h"
+#include "normstrngs.h"
+#include "unicharset.h"
+#include "unicharset_training_utils.h"
+using namespace tesseract;
+static STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path"); // Destination passed to UNICHARSET::save_to_file in Main; defaults to "unicharset".
+static INT_PARAM_FLAG(norm_mode, 1,
+"Normalization mode: 1=Combine graphemes, "
+"2=Split graphemes, 3=Pure unicode"); // Value is cast to GraphemeNormMode in AddStringsToUnicharset; defaults to 1.
+namespace tesseract {
+// Feeds every entry of `strings` through NormalizeCleanAndSegmentUTF8 (NFC,
+// no OCR normalization, grapheme handling selected by `norm_mode`) and
+// inserts each resulting segment into `unicharset`.
+//
+// Segments that are empty or that IsUTF8Whitespace flags are skipped. A
+// string for which normalization fails is reported through tprintf and
+// contributes nothing; processing continues with the next string.
+static void AddStringsToUnicharset(const std::vector<std::string> &strings, int norm_mode,
+                                   UNICHARSET *unicharset) {
+  // The integer flag value is reinterpreted as the grapheme mode enum.
+  const auto grapheme_mode = static_cast<GraphemeNormMode>(norm_mode);
+  for (const auto &text : strings) {
+    std::vector<std::string> segments;
+    const bool normalized_ok =
+        NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, grapheme_mode,
+                                     /*report_errors*/ true, text.c_str(), &segments);
+    if (!normalized_ok) {
+      tprintf("Normalization failed for string '%s'\n", text.c_str());
+      continue;
+    }
+    for (const std::string &segment : segments) {
+      // Each segment is a UTF-8 encoded string.
+      const bool skip = segment.empty() || IsUTF8Whitespace(segment.c_str());
+      if (!skip) {
+        unicharset->unichar_insert(segment.c_str());
+      }
+    }
+  }
+}
+// Builds a unicharset from the input files named in argv[1..argc-1] and
+// writes it to the path given by FLAGS_output_unicharset.
+//
+// Files whose extension is ".box" are parsed with ReadMemBoxes; every other
+// file is split into lines on '\n'. Files for which ReadFile returns no data
+// are skipped. Returns EXIT_FAILURE if a box file cannot be parsed or the
+// unicharset cannot be saved, EXIT_SUCCESS otherwise.
+static int Main(int argc, char **argv) {
+  UNICHARSET unicharset;
+  // Load input files
+  for (int arg = 1; arg < argc; ++arg) {
+    char *input_name = argv[arg];
+    std::filesystem::path filePath = input_name;
+    std::string contents = tesseract::ReadFile(input_name);
+    if (contents.empty()) {
+      continue;
+    }
+    std::vector<std::string> texts;
+    if (filePath.extension() != ".box") {
+      tprintf("Extracting unicharset from plain text file %s\n", input_name);
+      texts.clear();
+      texts = split(contents, '\n');
+    } else {
+      tprintf("Extracting unicharset from box file %s\n", input_name);
+      // Only the box texts are collected; boxes, box_texts and pages are not requested.
+      const bool parsed = ReadMemBoxes(-1, /*skip_blanks*/ true, &contents[0],
+                                       /*continue_on_failure*/ false, /*boxes*/ nullptr, &texts,
+                                       /*box_texts*/ nullptr, /*pages*/ nullptr);
+      if (!parsed) {
+        tprintf("Cannot read box data from '%s'\n", input_name);
+        return EXIT_FAILURE;
+      }
+    }
+    AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
+  }
+  SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false, &unicharset);
+  // Write unicharset file.
+  const char *output_path = FLAGS_output_unicharset.c_str();
+  if (!unicharset.save_to_file(output_path)) {
+    tprintf("Cannot save unicharset file %s\n", FLAGS_output_unicharset.c_str());
+    return EXIT_FAILURE;
+  }
+  tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
+  return EXIT_SUCCESS;
+}
+} // namespace tesseract
+// Program entry point: checks the shared library version, parses the
+// command-line flags when any arguments were given, and then either runs
+// tesseract::Main on the remaining arguments or prints the usage text and
+// returns EXIT_FAILURE when fewer than two arguments remain.
+int main(int argc, char **argv) {
+  tesseract::CheckSharedLibraryVersion();
+  const bool has_arguments = argc > 1;
+  if (has_arguments) {
+    // argc/argv are passed by address, so the parser may modify them.
+    tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
+  }
+  if (argc >= 2) {
+    return tesseract::Main(argc, argv);
+  }
+  tprintf(
+      "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
+      " box_or_text_file [...]\n",
+      argv[0]);
+  tprintf("Where mode means:\n");
+  tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
+  tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
+  tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
+  tprintf("Reads box or plain text files to extract the unicharset.\n");
+  return EXIT_FAILURE;
+}

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/training/unicharset_extractor.cpp @ 2:b50eed0cc0ef upstream