Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/ambiguous_words.cpp @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/ambiguous_words.cpp Mon Sep 15 11:44:09 2025 +0200 @@ -0,0 +1,82 @@ +/////////////////////////////////////////////////////////////////////// +// File: ambiguous_words.cpp +// Description: A program that takes a text file with a list of words as +// input (one per line) and outputs a file with the words +// that were found in the dictionary followed by the words +// that are ambiguous to them. +// Author: Rika Antonova +// +// (C) Copyright 2011, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// +// + +#include "commontraining.h" // CheckSharedLibraryVersion +#include "dict.h" +#include "tesseractclass.h" + +#include <tesseract/baseapi.h> +#include "helpers.h" + +int main(int argc, char **argv) { + tesseract::CheckSharedLibraryVersion(); + + // Parse input arguments. + if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) { + printf("%s\n", tesseract::TessBaseAPI::Version()); + return EXIT_SUCCESS; + } else if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) { + printf( + "Usage: %s -v | --version | %s [-l lang] tessdata_dir wordlist_file" + " output_ambiguous_wordlist_file\n", + argv[0], argv[0]); + return EXIT_FAILURE; + } + int argv_offset = 0; + std::string lang; + if (argc == 6) { + lang = argv[2]; + argv_offset = 2; + } else { + lang = "eng"; + } + const char *tessdata_dir = argv[++argv_offset]; + const char *input_file_str = argv[++argv_offset]; + const char *output_file_str = argv[++argv_offset]; + + // Initialize Tesseract. + tesseract::TessBaseAPI api; + std::vector<std::string> vars_vec; + std::vector<std::string> vars_values; + vars_vec.emplace_back("output_ambig_words_file"); + vars_values.emplace_back(output_file_str); + api.Init(tessdata_dir, lang.c_str(), tesseract::OEM_TESSERACT_ONLY, nullptr, 0, &vars_vec, + &vars_values, false); + tesseract::Dict &dict = api.tesseract()->getDict(); + FILE *input_file = fopen(input_file_str, "rb"); + if (input_file == nullptr) { + tesseract::tprintf("Failed to open input wordlist file %s\n", input_file_str); + return EXIT_FAILURE; + } + char str[CHARS_PER_LINE]; + + // Read word list and call Dict::NoDangerousAmbig() for each word + // to record ambiguities in the output file. + while (fgets(str, CHARS_PER_LINE, input_file) != nullptr) { + tesseract::chomp_string(str); // remove newline + tesseract::WERD_CHOICE word(str, dict.getUnicharset()); + dict.NoDangerousAmbig(&word, nullptr, false, nullptr); + } + // Clean up. + fclose(input_file); + return EXIT_SUCCESS; +}
