Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/tesseract.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: tesseract.cpp | |
| 3 * Description: Main program for merge of tess and editor. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 // Include automatically generated configuration file if running autoconf | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include <cerrno> // for errno | |
| 25 #if defined(__USE_GNU) | |
| 26 # include <cfenv> // for feenableexcept | |
| 27 #endif | |
| 28 #include <climits> // for INT_MIN, INT_MAX | |
| 29 #include <cstdlib> // for std::getenv | |
| 30 #include <iostream> | |
| 31 #include <map> // for std::map | |
| 32 #include <memory> // std::unique_ptr | |
| 33 | |
| 34 #include <allheaders.h> | |
| 35 #include <tesseract/baseapi.h> | |
| 36 #include "dict.h" | |
| 37 #include <tesseract/renderer.h> | |
| 38 #include "simddetect.h" | |
| 39 #include "tesseractclass.h" // for AnyTessLang | |
| 40 #include "tprintf.h" // for tprintf | |
| 41 | |
| 42 #ifdef _OPENMP | |
| 43 # include <omp.h> | |
| 44 #endif | |
| 45 | |
| 46 #if defined(HAVE_LIBARCHIVE) | |
| 47 # include <archive.h> | |
| 48 #endif | |
| 49 #if defined(HAVE_LIBCURL) | |
| 50 # include <curl/curl.h> | |
| 51 #endif | |
| 52 | |
| 53 #if defined(_WIN32) | |
| 54 # include <fcntl.h> | |
| 55 # include <io.h> | |
| 56 # if defined(HAVE_TIFFIO_H) | |
| 57 | |
| 58 # include <tiffio.h> | |
| 59 | |
// libtiff error callback: writes "<module>: <formatted message>." to stderr.
// The "<module>: " prefix is omitted when module is null.
static void Win32ErrorHandler(const char *module, const char *fmt, va_list ap) {
  if (module) {
    fprintf(stderr, "%s: ", module);
  }
  vfprintf(stderr, fmt, ap);
  fputs(".\n", stderr);
}
| 67 | |
// libtiff warning callback: writes "<module>: Warning, <formatted message>."
// to stderr. The "<module>: " prefix is omitted when module is null.
static void Win32WarningHandler(const char *module, const char *fmt, va_list ap) {
  if (module) {
    fprintf(stderr, "%s: ", module);
  }
  fputs("Warning, ", stderr);
  vfprintf(stderr, fmt, ap);
  fputs(".\n", stderr);
}
| 76 | |
| 77 # endif /* HAVE_TIFFIO_H */ | |
| 78 | |
| 79 class AutoWin32ConsoleOutputCP { | |
| 80 public: | |
| 81 explicit AutoWin32ConsoleOutputCP(UINT codeCP) : | |
| 82 oldCP_(GetConsoleOutputCP()) { | |
| 83 SetConsoleOutputCP(codeCP); | |
| 84 } | |
| 85 ~AutoWin32ConsoleOutputCP() { | |
| 86 SetConsoleOutputCP(oldCP_); | |
| 87 } | |
| 88 | |
| 89 private: | |
| 90 UINT oldCP_; | |
| 91 }; | |
| 92 | |
| 93 static AutoWin32ConsoleOutputCP autoWin32ConsoleOutputCP(CP_UTF8); | |
| 94 | |
| 95 #endif // _WIN32 | |
| 96 | |
| 97 using namespace tesseract; | |
| 98 | |
| 99 static void PrintVersionInfo() { | |
| 100 char *versionStrP; | |
| 101 | |
| 102 printf("tesseract %s\n", tesseract::TessBaseAPI::Version()); | |
| 103 | |
| 104 versionStrP = getLeptonicaVersion(); | |
| 105 printf(" %s\n", versionStrP); | |
| 106 lept_free(versionStrP); | |
| 107 | |
| 108 versionStrP = getImagelibVersions(); | |
| 109 printf(" %s\n", versionStrP); | |
| 110 lept_free(versionStrP); | |
| 111 | |
| 112 #if defined(HAVE_NEON) || defined(__aarch64__) | |
| 113 if (tesseract::SIMDDetect::IsNEONAvailable()) | |
| 114 printf(" Found NEON\n"); | |
| 115 #elif defined(HAVE_RVV) | |
| 116 if (tesseract::SIMDDetect::IsRVVAvailable()) | |
| 117 printf(" Found RVV\n"); | |
| 118 #else | |
| 119 if (tesseract::SIMDDetect::IsAVX512BWAvailable()) { | |
| 120 printf(" Found AVX512BW\n"); | |
| 121 } | |
| 122 if (tesseract::SIMDDetect::IsAVX512FAvailable()) { | |
| 123 printf(" Found AVX512F\n"); | |
| 124 } | |
| 125 if (tesseract::SIMDDetect::IsAVX512VNNIAvailable()) { | |
| 126 printf(" Found AVX512VNNI\n"); | |
| 127 } | |
| 128 if (tesseract::SIMDDetect::IsAVX2Available()) { | |
| 129 printf(" Found AVX2\n"); | |
| 130 } | |
| 131 if (tesseract::SIMDDetect::IsAVXAvailable()) { | |
| 132 printf(" Found AVX\n"); | |
| 133 } | |
| 134 if (tesseract::SIMDDetect::IsFMAAvailable()) { | |
| 135 printf(" Found FMA\n"); | |
| 136 } | |
| 137 if (tesseract::SIMDDetect::IsSSEAvailable()) { | |
| 138 printf(" Found SSE4.1\n"); | |
| 139 } | |
| 140 #endif | |
| 141 #ifdef _OPENMP | |
| 142 printf(" Found OpenMP %d\n", _OPENMP); | |
| 143 #endif | |
| 144 #if defined(HAVE_LIBARCHIVE) | |
| 145 # if ARCHIVE_VERSION_NUMBER >= 3002000 | |
| 146 printf(" Found %s\n", archive_version_details()); | |
| 147 # else | |
| 148 printf(" Found %s\n", archive_version_string()); | |
| 149 # endif // ARCHIVE_VERSION_NUMBER | |
| 150 #endif // HAVE_LIBARCHIVE | |
| 151 #if defined(HAVE_LIBCURL) | |
| 152 printf(" Found %s\n", curl_version()); | |
| 153 #endif | |
| 154 } | |
| 155 | |
// Writes the table of page segmentation modes (numeric and symbolic names)
// to stdout. When the legacy engine is compiled out, a note that the OSD
// modes are disabled is appended.
static void PrintHelpForPSM() {
  static const char kPsmHelp[] =
      "Page segmentation modes (PSM):\n"
      " 0|osd_only Orientation and script detection (OSD) only.\n"
      " 1|auto_osd Automatic page segmentation with OSD.\n"
      " 2|auto_only Automatic page segmentation, but no OSD, or OCR. (not implemented)\n"
      " 3|auto Fully automatic page segmentation, but no OSD. (Default)\n"
      " 4|single_column Assume a single column of text of variable sizes.\n"
      " 5|single_block_vert_text Assume a single uniform block of vertically aligned text.\n"
      " 6|single_block Assume a single uniform block of text.\n"
      " 7|single_line Treat the image as a single text line.\n"
      " 8|single_word Treat the image as a single word.\n"
      " 9|circle_word Treat the image as a single word in a circle.\n"
      " 10|single_char Treat the image as a single character.\n"
      " 11|sparse_text Sparse text. Find as much text as possible in no particular order.\n"
      " 12|sparse_text_osd Sparse text with OSD.\n"
      " 13|raw_line Raw line. Treat the image as a single text line,\n"
      " bypassing hacks that are Tesseract-specific.\n";
  fputs(kPsmHelp, stdout);

#ifdef DISABLED_LEGACY_ENGINE
  fputs("\nNOTE: The OSD modes are currently disabled.\n", stdout);
#endif
}
| 182 | |
| 183 #ifndef DISABLED_LEGACY_ENGINE | |
// Writes the table of OCR engine modes (numeric and symbolic names) to stdout.
static void PrintHelpForOEM() {
  static const char kOemHelp[] =
      "OCR Engine modes (OEM):\n"
      " 0|tesseract_only Legacy engine only.\n"
      " 1|lstm_only Neural nets LSTM engine only.\n"
      " 2|tesseract_lstm_combined Legacy + LSTM engines.\n"
      " 3|default Default, based on what is available.\n";
  fputs(kOemHelp, stdout);
}
| 193 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 194 | |
| 195 static void PrintHelpExtra(const char *program) { | |
| 196 printf( | |
| 197 "Usage:\n" | |
| 198 " %s --help | --help-extra | --help-psm | " | |
| 199 #ifndef DISABLED_LEGACY_ENGINE | |
| 200 "--help-oem | " | |
| 201 #endif | |
| 202 "--version\n" | |
| 203 " %s --list-langs [--tessdata-dir PATH]\n" | |
| 204 #ifndef DISABLED_LEGACY_ENGINE | |
| 205 " %s --print-fonts-table [options...] [configfile...]\n" | |
| 206 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 207 " %s --print-parameters [options...] [configfile...]\n" | |
| 208 " %s imagename|imagelist|stdin outputbase|stdout [options...] " | |
| 209 "[configfile...]\n" | |
| 210 "\n" | |
| 211 "OCR options:\n" | |
| 212 " --tessdata-dir PATH Specify the location of tessdata path.\n" | |
| 213 " --user-words PATH Specify the location of user words file.\n" | |
| 214 " --user-patterns PATH Specify the location of user patterns file.\n" | |
| 215 " --dpi VALUE Specify DPI for input image.\n" | |
| 216 " --loglevel LEVEL Specify logging level. LEVEL can be\n" | |
| 217 " ALL, TRACE, DEBUG, INFO, WARN, ERROR, FATAL or OFF.\n" | |
| 218 " -l LANG[+LANG] Specify language(s) used for OCR.\n" | |
| 219 " -c VAR=VALUE Set value for config variables.\n" | |
| 220 " Multiple -c arguments are allowed.\n" | |
| 221 " --psm PSM|NUM Specify page segmentation mode.\n" | |
| 222 #ifndef DISABLED_LEGACY_ENGINE | |
| 223 " --oem OEM|NUM Specify OCR Engine mode.\n" | |
| 224 #endif | |
| 225 "NOTE: These options must occur before any configfile.\n" | |
| 226 "\n", | |
| 227 program, program, program, program | |
| 228 #ifndef DISABLED_LEGACY_ENGINE | |
| 229 , program | |
| 230 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 231 ); | |
| 232 | |
| 233 PrintHelpForPSM(); | |
| 234 #ifndef DISABLED_LEGACY_ENGINE | |
| 235 printf("\n"); | |
| 236 PrintHelpForOEM(); | |
| 237 #endif | |
| 238 | |
| 239 printf( | |
| 240 "\n" | |
| 241 "Single options:\n" | |
| 242 " -h, --help Show minimal help message.\n" | |
| 243 " --help-extra Show extra help for advanced users.\n" | |
| 244 " --help-psm Show page segmentation modes.\n" | |
| 245 #ifndef DISABLED_LEGACY_ENGINE | |
| 246 " --help-oem Show OCR Engine modes.\n" | |
| 247 #endif | |
| 248 " -v, --version Show version information.\n" | |
| 249 " --list-langs List available languages for tesseract engine.\n" | |
| 250 #ifndef DISABLED_LEGACY_ENGINE | |
| 251 " --print-fonts-table Print tesseract fonts table.\n" | |
| 252 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 253 " --print-parameters Print tesseract parameters.\n"); | |
| 254 } | |
| 255 | |
// Prints the minimal help text to stdout; program is the executable name
// substituted into the three usage lines.
static void PrintHelpMessage(const char *program) {
  static const char kHelpFormat[] =
      "Usage:\n"
      " %s --help | --help-extra | --version\n"
      " %s --list-langs\n"
      " %s imagename outputbase [options...] [configfile...]\n"
      "\n"
      "OCR options:\n"
      " -l LANG[+LANG] Specify language(s) used for OCR.\n"
      "NOTE: These options must occur before any configfile.\n"
      "\n"
      "Single options:\n"
      " --help Show this help message.\n"
      " --help-extra Show extra help for advanced users.\n"
      " --version Show version information.\n"
      " --list-langs List available languages for tesseract engine.\n";
  printf(kHelpFormat, program, program, program);
}
| 275 | |
| 276 static bool SetVariablesFromCLArgs(tesseract::TessBaseAPI &api, int argc, char **argv) { | |
| 277 bool success = true; | |
| 278 char opt1[256], opt2[255]; | |
| 279 for (int i = 0; i < argc; i++) { | |
| 280 if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) { | |
| 281 strncpy(opt1, argv[i + 1], 255); | |
| 282 opt1[255] = '\0'; | |
| 283 char *p = strchr(opt1, '='); | |
| 284 if (!p) { | |
| 285 fprintf(stderr, "Missing = in configvar assignment\n"); | |
| 286 success = false; | |
| 287 break; | |
| 288 } | |
| 289 *p = 0; | |
| 290 strncpy(opt2, strchr(argv[i + 1], '=') + 1, sizeof(opt2) - 1); | |
| 291 opt2[254] = 0; | |
| 292 ++i; | |
| 293 | |
| 294 if (!api.SetVariable(opt1, opt2)) { | |
| 295 fprintf(stderr, "Could not set option: %s=%s\n", opt1, opt2); | |
| 296 } | |
| 297 } | |
| 298 } | |
| 299 return success; | |
| 300 } | |
| 301 | |
| 302 static void PrintLangsList(tesseract::TessBaseAPI &api) { | |
| 303 std::vector<std::string> languages; | |
| 304 api.GetAvailableLanguagesAsVector(&languages); | |
| 305 printf("List of available languages in \"%s\" (%zu):\n", | |
| 306 api.GetDatapath(), languages.size()); | |
| 307 for (const auto &language : languages) { | |
| 308 printf("%s\n", language.c_str()); | |
| 309 } | |
| 310 api.End(); | |
| 311 } | |
| 312 | |
| 313 /** | |
| 314 * We have 2 possible sources of pagesegmode: a config file and | |
| 315 * the command line. For backwards compatibility reasons, the | |
| 316 * default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the | |
| 317 * default for this program is tesseract::PSM_AUTO. We will let | |
| 318 * the config file take priority, so the command-line default | |
| 319 * can take priority over the tesseract default, so we use the | |
| 320 * value from the command line only if the retrieved mode | |
| 321 * is still tesseract::PSM_SINGLE_BLOCK, indicating no change | |
| 322 * in any config file. Therefore the only way to force | |
| 323 * tesseract::PSM_SINGLE_BLOCK is from the command line. | |
| 324 * It would be simpler if we could set the value before Init, | |
| 325 * but that doesn't work. | |
| 326 */ | |
| 327 static void FixPageSegMode(tesseract::TessBaseAPI &api, tesseract::PageSegMode pagesegmode) { | |
| 328 if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) { | |
| 329 api.SetPageSegMode(pagesegmode); | |
| 330 } | |
| 331 } | |
| 332 | |
// Validates a numeric mode value parsed from the command line.
//
// arg   - the parsed value (negative when the string was not recognised).
// mode  - option name used in the error message, e.g. "PSM" or "OEM".
// count - number of valid values; valid values are 0 .. count - 1.
//
// Returns true when arg is in range. Otherwise an error message is written
// to stderr (like every other command-line error of this program, so it can
// never end up in OCR output sent to stdout) and false is returned.
static bool checkArgValues(int arg, const char *mode, int count) {
  if (arg < 0 || arg >= count) {
    fprintf(stderr,
            "Invalid %s value, please enter a symbolic %s value or a number between 0-%d\n",
            mode, mode, count - 1);
    return false;
  }
  return true;
}
| 340 | |
// Convert a symbolic or numeric string to an OEM value.
// Returns the engine mode number (0-3), or -1 if arg is not a known name or
// number. The lookup table is built once instead of on every call.
static int stringToOEM(const std::string &arg) {
  static const std::map<std::string, int> oem_map = {
      {"0", 0},
      {"1", 1},
      {"2", 2},
      {"3", 3},
      {"tesseract_only", 0},
      {"lstm_only", 1},
      {"tesseract_lstm_combined", 2},
      {"default", 3},
  };
  const auto it = oem_map.find(arg);
  return it == oem_map.end() ? -1 : it->second;
}
| 356 | |
// Convert a symbolic or numeric string to a PSM value.
// Returns the page segmentation mode number (0-13), or -1 if arg is not a
// known name or number. The lookup table is built once instead of on every
// call.
static int stringToPSM(const std::string &arg) {
  static const std::map<std::string, int> psm_map = {
      {"0", 0},
      {"1", 1},
      {"2", 2},
      {"3", 3},
      {"4", 4},
      {"5", 5},
      {"6", 6},
      {"7", 7},
      {"8", 8},
      {"9", 9},
      {"10", 10},
      {"11", 11},
      {"12", 12},
      {"13", 13},
      {"osd_only", 0},
      {"auto_osd", 1},
      {"auto_only", 2},
      {"auto", 3},
      {"single_column", 4},
      {"single_block_vert_text", 5},
      {"single_block", 6},
      {"single_line", 7},
      {"single_word", 8},
      {"circle_word", 9},
      {"single_char", 10},
      {"sparse_text", 11},
      {"sparse_text_osd", 12},
      {"raw_line", 13},
  };
  const auto it = psm_map.find(arg);
  return it == psm_map.end() ? -1 : it->second;
}
| 391 | |
| 392 // NOTE: arg_i is used here to avoid ugly *i so many times in this function | |
| 393 static bool ParseArgs(int argc, char **argv, const char **lang, const char **image, | |
| 394 const char **outputbase, const char **datapath, l_int32 *dpi, | |
| 395 bool *list_langs, bool *print_parameters, bool *print_fonts_table, | |
| 396 std::vector<std::string> *vars_vec, std::vector<std::string> *vars_values, | |
| 397 l_int32 *arg_i, tesseract::PageSegMode *pagesegmode, | |
| 398 tesseract::OcrEngineMode *enginemode) { | |
| 399 bool noocr = false; | |
| 400 int i; | |
| 401 for (i = 1; i < argc && (*outputbase == nullptr || argv[i][0] == '-'); i++) { | |
| 402 if (*image != nullptr && *outputbase == nullptr) { | |
| 403 // outputbase follows image, don't allow options at that position. | |
| 404 *outputbase = argv[i]; | |
| 405 } else if ((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) { | |
| 406 PrintHelpMessage(argv[0]); | |
| 407 noocr = true; | |
| 408 } else if (strcmp(argv[i], "--help-extra") == 0) { | |
| 409 PrintHelpExtra(argv[0]); | |
| 410 noocr = true; | |
| 411 } else if ((strcmp(argv[i], "--help-psm") == 0)) { | |
| 412 PrintHelpForPSM(); | |
| 413 noocr = true; | |
| 414 #ifndef DISABLED_LEGACY_ENGINE | |
| 415 } else if ((strcmp(argv[i], "--help-oem") == 0)) { | |
| 416 PrintHelpForOEM(); | |
| 417 noocr = true; | |
| 418 #endif | |
| 419 } else if ((strcmp(argv[i], "-v") == 0) || (strcmp(argv[i], "--version") == 0)) { | |
| 420 PrintVersionInfo(); | |
| 421 noocr = true; | |
| 422 } else if (strcmp(argv[i], "-l") == 0 && i + 1 < argc) { | |
| 423 *lang = argv[i + 1]; | |
| 424 ++i; | |
| 425 } else if (strcmp(argv[i], "--tessdata-dir") == 0 && i + 1 < argc) { | |
| 426 *datapath = argv[i + 1]; | |
| 427 ++i; | |
| 428 } else if (strcmp(argv[i], "--dpi") == 0 && i + 1 < argc) { | |
| 429 *dpi = atoi(argv[i + 1]); | |
| 430 ++i; | |
| 431 } else if (strcmp(argv[i], "--loglevel") == 0 && i + 1 < argc) { | |
| 432 // Allow the log levels which are used by log4cxx. | |
| 433 const std::string loglevel_string = argv[++i]; | |
| 434 static const std::map<const std::string, int> loglevels { | |
| 435 {"ALL", INT_MIN}, | |
| 436 {"TRACE", 5000}, | |
| 437 {"DEBUG", 10000}, | |
| 438 {"INFO", 20000}, | |
| 439 {"WARN", 30000}, | |
| 440 {"ERROR", 40000}, | |
| 441 {"FATAL", 50000}, | |
| 442 {"OFF", INT_MAX}, | |
| 443 }; | |
| 444 try { | |
| 445 auto loglevel = loglevels.at(loglevel_string); | |
| 446 log_level = loglevel; | |
| 447 } catch (const std::out_of_range &e) { | |
| 448 // TODO: Allow numeric argument? | |
| 449 tprintf("Error, unsupported --loglevel %s\n", loglevel_string.c_str()); | |
| 450 return false; | |
| 451 } | |
| 452 } else if (strcmp(argv[i], "--user-words") == 0 && i + 1 < argc) { | |
| 453 vars_vec->push_back("user_words_file"); | |
| 454 vars_values->push_back(argv[i + 1]); | |
| 455 ++i; | |
| 456 } else if (strcmp(argv[i], "--user-patterns") == 0 && i + 1 < argc) { | |
| 457 vars_vec->push_back("user_patterns_file"); | |
| 458 vars_values->push_back(argv[i + 1]); | |
| 459 ++i; | |
| 460 } else if (strcmp(argv[i], "--list-langs") == 0) { | |
| 461 noocr = true; | |
| 462 *list_langs = true; | |
| 463 } else if (strcmp(argv[i], "--psm") == 0 && i + 1 < argc) { | |
| 464 int psm = stringToPSM(argv[i + 1]); | |
| 465 if (!checkArgValues(psm, "PSM", tesseract::PSM_COUNT)) { | |
| 466 return false; | |
| 467 } | |
| 468 *pagesegmode = static_cast<tesseract::PageSegMode>(psm); | |
| 469 ++i; | |
| 470 } else if (strcmp(argv[i], "--oem") == 0 && i + 1 < argc) { | |
| 471 #ifndef DISABLED_LEGACY_ENGINE | |
| 472 int oem = stringToOEM(argv[i + 1]); | |
| 473 if (!checkArgValues(oem, "OEM", tesseract::OEM_COUNT)) { | |
| 474 return false; | |
| 475 } | |
| 476 *enginemode = static_cast<tesseract::OcrEngineMode>(oem); | |
| 477 #endif | |
| 478 ++i; | |
| 479 } else if (strcmp(argv[i], "--print-parameters") == 0) { | |
| 480 noocr = true; | |
| 481 *print_parameters = true; | |
| 482 #ifndef DISABLED_LEGACY_ENGINE | |
| 483 } else if (strcmp(argv[i], "--print-fonts-table") == 0) { | |
| 484 noocr = true; | |
| 485 *print_fonts_table = true; | |
| 486 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 487 } else if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) { | |
| 488 // handled properly after api init | |
| 489 ++i; | |
| 490 } else if (*image == nullptr) { | |
| 491 *image = argv[i]; | |
| 492 } else { | |
| 493 // Unexpected argument. | |
| 494 fprintf(stderr, "Error, unknown command line argument '%s'\n", argv[i]); | |
| 495 return false; | |
| 496 } | |
| 497 } | |
| 498 | |
| 499 *arg_i = i; | |
| 500 | |
| 501 if (*pagesegmode == tesseract::PSM_OSD_ONLY) { | |
| 502 // OSD = orientation and script detection. | |
| 503 if (*lang != nullptr && strcmp(*lang, "osd")) { | |
| 504 // If the user explicitly specifies a language (other than osd) | |
| 505 // or a script, only orientation can be detected. | |
| 506 fprintf(stderr, "Warning, detects only orientation with -l %s\n", *lang); | |
| 507 } else { | |
| 508 // That mode requires osd.traineddata to detect orientation and script. | |
| 509 *lang = "osd"; | |
| 510 } | |
| 511 } | |
| 512 | |
| 513 if (*outputbase == nullptr && noocr == false) { | |
| 514 PrintHelpMessage(argv[0]); | |
| 515 return false; | |
| 516 } | |
| 517 | |
| 518 return true; | |
| 519 } | |
| 520 | |
| 521 static void PreloadRenderers(tesseract::TessBaseAPI &api, | |
| 522 std::vector<std::unique_ptr<TessResultRenderer>> &renderers, | |
| 523 tesseract::PageSegMode pagesegmode, const char *outputbase) { | |
| 524 if (pagesegmode == tesseract::PSM_OSD_ONLY) { | |
| 525 #ifndef DISABLED_LEGACY_ENGINE | |
| 526 renderers.push_back(std::make_unique<tesseract::TessOsdRenderer>(outputbase)); | |
| 527 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 528 } else { | |
| 529 bool error = false; | |
| 530 bool b; | |
| 531 api.GetBoolVariable("tessedit_create_hocr", &b); | |
| 532 if (b) { | |
| 533 bool font_info; | |
| 534 api.GetBoolVariable("hocr_font_info", &font_info); | |
| 535 auto renderer = std::make_unique<tesseract::TessHOcrRenderer>(outputbase, font_info); | |
| 536 if (renderer->happy()) { | |
| 537 renderers.push_back(std::move(renderer)); | |
| 538 } else { | |
| 539 tprintf("Error, could not create hOCR output file: %s\n", strerror(errno)); | |
| 540 error = true; | |
| 541 } | |
| 542 } | |
| 543 | |
| 544 api.GetBoolVariable("tessedit_create_alto", &b); | |
| 545 if (b) { | |
| 546 auto renderer = std::make_unique<tesseract::TessAltoRenderer>(outputbase); | |
| 547 if (renderer->happy()) { | |
| 548 renderers.push_back(std::move(renderer)); | |
| 549 } else { | |
| 550 tprintf("Error, could not create ALTO output file: %s\n", strerror(errno)); | |
| 551 error = true; | |
| 552 } | |
| 553 } | |
| 554 | |
| 555 api.GetBoolVariable("tessedit_create_page_xml", &b); | |
| 556 if (b) { | |
| 557 auto renderer = std::make_unique<tesseract::TessPAGERenderer>(outputbase); | |
| 558 if (renderer->happy()) { | |
| 559 renderers.push_back(std::move(renderer)); | |
| 560 } else { | |
| 561 tprintf("Error, could not create PAGE output file: %s\n", strerror(errno)); | |
| 562 error = true; | |
| 563 } | |
| 564 } | |
| 565 | |
| 566 api.GetBoolVariable("tessedit_create_tsv", &b); | |
| 567 if (b) { | |
| 568 bool font_info; | |
| 569 api.GetBoolVariable("hocr_font_info", &font_info); | |
| 570 auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, font_info); | |
| 571 if (renderer->happy()) { | |
| 572 renderers.push_back(std::move(renderer)); | |
| 573 } else { | |
| 574 tprintf("Error, could not create TSV output file: %s\n", strerror(errno)); | |
| 575 error = true; | |
| 576 } | |
| 577 } | |
| 578 | |
| 579 api.GetBoolVariable("tessedit_create_pdf", &b); | |
| 580 if (b) { | |
| 581 #ifdef WIN32 | |
| 582 if (_setmode(_fileno(stdout), _O_BINARY) == -1) | |
| 583 tprintf("ERROR: cin to binary: %s", strerror(errno)); | |
| 584 #endif // WIN32 | |
| 585 bool textonly; | |
| 586 api.GetBoolVariable("textonly_pdf", &textonly); | |
| 587 auto renderer = std::make_unique<tesseract::TessPDFRenderer>(outputbase, api.GetDatapath(), textonly); | |
| 588 if (renderer->happy()) { | |
| 589 renderers.push_back(std::move(renderer)); | |
| 590 } else { | |
| 591 tprintf("Error, could not create PDF output file: %s\n", strerror(errno)); | |
| 592 error = true; | |
| 593 } | |
| 594 } | |
| 595 | |
| 596 api.GetBoolVariable("tessedit_write_unlv", &b); | |
| 597 if (b) { | |
| 598 api.SetVariable("unlv_tilde_crunching", "true"); | |
| 599 auto renderer = std::make_unique<tesseract::TessUnlvRenderer>(outputbase); | |
| 600 if (renderer->happy()) { | |
| 601 renderers.push_back(std::move(renderer)); | |
| 602 } else { | |
| 603 tprintf("Error, could not create UNLV output file: %s\n", strerror(errno)); | |
| 604 error = true; | |
| 605 } | |
| 606 } | |
| 607 | |
| 608 api.GetBoolVariable("tessedit_create_lstmbox", &b); | |
| 609 if (b) { | |
| 610 auto renderer = std::make_unique<tesseract::TessLSTMBoxRenderer>(outputbase); | |
| 611 if (renderer->happy()) { | |
| 612 renderers.push_back(std::move(renderer)); | |
| 613 } else { | |
| 614 tprintf("Error, could not create LSTM BOX output file: %s\n", strerror(errno)); | |
| 615 error = true; | |
| 616 } | |
| 617 } | |
| 618 | |
| 619 api.GetBoolVariable("tessedit_create_boxfile", &b); | |
| 620 if (b) { | |
| 621 auto renderer = std::make_unique<tesseract::TessBoxTextRenderer>(outputbase); | |
| 622 if (renderer->happy()) { | |
| 623 renderers.push_back(std::move(renderer)); | |
| 624 } else { | |
| 625 tprintf("Error, could not create BOX output file: %s\n", strerror(errno)); | |
| 626 error = true; | |
| 627 } | |
| 628 } | |
| 629 | |
| 630 api.GetBoolVariable("tessedit_create_wordstrbox", &b); | |
| 631 if (b) { | |
| 632 auto renderer = std::make_unique<tesseract::TessWordStrBoxRenderer>(outputbase); | |
| 633 if (renderer->happy()) { | |
| 634 renderers.push_back(std::move(renderer)); | |
| 635 } else { | |
| 636 tprintf("Error, could not create WordStr BOX output file: %s\n", strerror(errno)); | |
| 637 error = true; | |
| 638 } | |
| 639 } | |
| 640 | |
| 641 api.GetBoolVariable("tessedit_create_txt", &b); | |
| 642 if (b || (!error && renderers.empty())) { | |
| 643 // Create text output if no other output was requested | |
| 644 // even if text output was not explicitly requested unless | |
| 645 // there was an error. | |
| 646 auto renderer = std::make_unique<tesseract::TessTextRenderer>(outputbase); | |
| 647 if (renderer->happy()) { | |
| 648 renderers.push_back(std::move(renderer)); | |
| 649 } else { | |
| 650 tprintf("Error, could not create TXT output file: %s\n", strerror(errno)); | |
| 651 } | |
| 652 } | |
| 653 } | |
| 654 | |
| 655 // Null-out the renderers that are | |
| 656 // added to the root, and leave the root in the vector. | |
| 657 for (size_t r = 1; r < renderers.size(); ++r) { | |
| 658 renderers[0]->insert(renderers[r].get()); | |
| 659 renderers[r].release(); // at the moment insert() is owning | |
| 660 } | |
| 661 } | |
| 662 | |
| 663 /********************************************************************** | |
| 664 * main() | |
| 665 * | |
| 666 **********************************************************************/ | |
| 667 | |
| 668 int main(int argc, char **argv) { | |
| 669 #if defined(__USE_GNU) && defined(HAVE_FEENABLEEXCEPT) | |
| 670 // Raise SIGFPE. | |
| 671 # if defined(__clang__) | |
| 672 // clang creates code which causes some FP exceptions, so don't enable those. | |
| 673 feenableexcept(FE_DIVBYZERO); | |
| 674 # else | |
| 675 feenableexcept(FE_DIVBYZERO | FE_OVERFLOW | FE_INVALID); | |
| 676 # endif | |
| 677 #endif | |
| 678 const char *lang = nullptr; | |
| 679 const char *image = nullptr; | |
| 680 const char *outputbase = nullptr; | |
| 681 const char *datapath = nullptr; | |
| 682 bool list_langs = false; | |
| 683 bool print_parameters = false; | |
| 684 bool print_fonts_table = false; | |
| 685 l_int32 dpi = 0; | |
| 686 int arg_i = 1; | |
| 687 tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO; | |
| 688 #ifdef DISABLED_LEGACY_ENGINE | |
| 689 auto enginemode = tesseract::OEM_LSTM_ONLY; | |
| 690 #else | |
| 691 tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT; | |
| 692 #endif | |
| 693 std::vector<std::string> vars_vec; | |
| 694 std::vector<std::string> vars_values; | |
| 695 | |
| 696 if (std::getenv("LEPT_MSG_SEVERITY")) { | |
| 697 // Get Leptonica message level from environment variable. | |
| 698 setMsgSeverity(L_SEVERITY_EXTERNAL); | |
| 699 } else { | |
| 700 // Disable debugging and informational messages from Leptonica. | |
| 701 setMsgSeverity(L_SEVERITY_ERROR); | |
| 702 } | |
| 703 | |
| 704 #if defined(HAVE_TIFFIO_H) && defined(_WIN32) | |
| 705 /* Show libtiff errors and warnings on console (not in GUI). */ | |
| 706 TIFFSetErrorHandler(Win32ErrorHandler); | |
| 707 TIFFSetWarningHandler(Win32WarningHandler); | |
| 708 #endif // HAVE_TIFFIO_H && _WIN32 | |
| 709 | |
| 710 if (!ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &dpi, &list_langs, | |
| 711 &print_parameters, &print_fonts_table, &vars_vec, &vars_values, &arg_i, | |
| 712 &pagesegmode, &enginemode)) { | |
| 713 return EXIT_FAILURE; | |
| 714 } | |
| 715 | |
| 716 bool in_recognition_mode = !list_langs && !print_parameters && !print_fonts_table; | |
| 717 | |
| 718 if (lang == nullptr && in_recognition_mode) { | |
| 719 // Set default language model if none was given and a model file is needed. | |
| 720 lang = "eng"; | |
| 721 } | |
| 722 | |
| 723 if (image == nullptr && in_recognition_mode) { | |
| 724 return EXIT_SUCCESS; | |
| 725 } | |
| 726 | |
| 727 // Call GlobalDawgCache here to create the global DawgCache object before | |
| 728 // the TessBaseAPI object. This fixes the order of destructor calls: | |
| 729 // first TessBaseAPI must be destructed, DawgCache must be the last object. | |
| 730 tesseract::Dict::GlobalDawgCache(); | |
| 731 | |
| 732 TessBaseAPI api; | |
| 733 | |
| 734 api.SetOutputName(outputbase); | |
| 735 | |
| 736 const int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]), argc - arg_i, | |
| 737 &vars_vec, &vars_values, false); | |
| 738 | |
| 739 if (!SetVariablesFromCLArgs(api, argc, argv)) { | |
| 740 return EXIT_FAILURE; | |
| 741 } | |
| 742 | |
| 743 // SIMD settings might be overridden by config variable. | |
| 744 tesseract::SIMDDetect::Update(); | |
| 745 | |
| 746 if (list_langs) { | |
| 747 PrintLangsList(api); | |
| 748 return EXIT_SUCCESS; | |
| 749 } | |
| 750 | |
| 751 if (init_failed) { | |
| 752 fprintf(stderr, "Could not initialize tesseract.\n"); | |
| 753 return EXIT_FAILURE; | |
| 754 } | |
| 755 | |
| 756 if (print_parameters) { | |
| 757 FILE *fout = stdout; | |
| 758 fprintf(stdout, "Tesseract parameters:\n"); | |
| 759 api.PrintVariables(fout); | |
| 760 api.End(); | |
| 761 return EXIT_SUCCESS; | |
| 762 } | |
| 763 | |
| 764 #ifndef DISABLED_LEGACY_ENGINE | |
| 765 if (print_fonts_table) { | |
| 766 FILE *fout = stdout; | |
| 767 fprintf(stdout, "Tesseract fonts table:\n"); | |
| 768 api.PrintFontsTable(fout); | |
| 769 api.End(); | |
| 770 return EXIT_SUCCESS; | |
| 771 } | |
| 772 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 773 | |
| 774 FixPageSegMode(api, pagesegmode); | |
| 775 | |
| 776 if (dpi) { | |
| 777 auto dpi_string = std::to_string(dpi); | |
| 778 api.SetVariable("user_defined_dpi", dpi_string.c_str()); | |
| 779 } | |
| 780 | |
| 781 int ret_val = EXIT_SUCCESS; | |
| 782 | |
| 783 if (pagesegmode == tesseract::PSM_AUTO_ONLY) { | |
| 784 Pix *pixs = pixRead(image); | |
| 785 if (!pixs) { | |
| 786 fprintf(stderr, "Leptonica can't process input file: %s\n", image); | |
| 787 return 2; | |
| 788 } | |
| 789 | |
| 790 api.SetImage(pixs); | |
| 791 | |
| 792 tesseract::Orientation orientation; | |
| 793 tesseract::WritingDirection direction; | |
| 794 tesseract::TextlineOrder order; | |
| 795 float deskew_angle; | |
| 796 | |
| 797 const std::unique_ptr<const tesseract::PageIterator> it(api.AnalyseLayout()); | |
| 798 if (it) { | |
| 799 // TODO: Implement output of page segmentation, see documentation | |
| 800 // ("Automatic page segmentation, but no OSD, or OCR"). | |
| 801 it->Orientation(&orientation, &direction, &order, &deskew_angle); | |
| 802 tprintf( | |
| 803 "Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n" | |
| 804 "Deskew angle: %.4f\n", | |
| 805 orientation, direction, order, deskew_angle); | |
| 806 } else { | |
| 807 ret_val = EXIT_FAILURE; | |
| 808 } | |
| 809 | |
| 810 pixDestroy(&pixs); | |
| 811 return ret_val; | |
| 812 } | |
| 813 | |
| 814 // Set in_training_mode to true when using one of these configs: | |
| 815 // ambigs.train, box.train, box.train.stderr, linebox, rebox, lstm.train. | |
| 816 // In this mode no other OCR result files are written. | |
| 817 bool b = false; | |
| 818 bool in_training_mode = (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) || | |
| 819 (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) || | |
| 820 (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b) || | |
| 821 (api.GetBoolVariable("tessedit_train_line_recognizer", &b) && b); | |
| 822 | |
| 823 if (api.GetPageSegMode() == tesseract::PSM_OSD_ONLY) { | |
| 824 if (!api.tesseract()->AnyTessLang()) { | |
| 825 fprintf(stderr, "Error, OSD requires a model for the legacy engine\n"); | |
| 826 return EXIT_FAILURE; | |
| 827 } | |
| 828 } | |
| 829 #ifdef DISABLED_LEGACY_ENGINE | |
| 830 auto cur_psm = api.GetPageSegMode(); | |
| 831 auto osd_warning = std::string(""); | |
| 832 if (cur_psm == tesseract::PSM_OSD_ONLY) { | |
| 833 const char *disabled_osd_msg = | |
| 834 "\nERROR: The page segmentation mode 0 (OSD Only) is currently " | |
| 835 "disabled.\n\n"; | |
| 836 fprintf(stderr, "%s", disabled_osd_msg); | |
| 837 return EXIT_FAILURE; | |
| 838 } else if (cur_psm == tesseract::PSM_AUTO_OSD) { | |
| 839 api.SetPageSegMode(tesseract::PSM_AUTO); | |
| 840 osd_warning += | |
| 841 "\nWarning: The page segmentation mode 1 (Auto+OSD) is currently " | |
| 842 "disabled. " | |
| 843 "Using PSM 3 (Auto) instead.\n\n"; | |
| 844 } else if (cur_psm == tesseract::PSM_SPARSE_TEXT_OSD) { | |
| 845 api.SetPageSegMode(tesseract::PSM_SPARSE_TEXT); | |
| 846 osd_warning += | |
| 847 "\nWarning: The page segmentation mode 12 (Sparse text + OSD) is " | |
| 848 "currently disabled. " | |
| 849 "Using PSM 11 (Sparse text) instead.\n\n"; | |
| 850 } | |
| 851 #endif // def DISABLED_LEGACY_ENGINE | |
| 852 | |
| 853 std::vector<std::unique_ptr<TessResultRenderer>> renderers; | |
| 854 | |
| 855 if (in_training_mode) { | |
| 856 renderers.push_back(nullptr); | |
| 857 } else if (outputbase != nullptr) { | |
| 858 PreloadRenderers(api, renderers, pagesegmode, outputbase); | |
| 859 } | |
| 860 | |
| 861 if (!renderers.empty()) { | |
| 862 #ifdef DISABLED_LEGACY_ENGINE | |
| 863 if (!osd_warning.empty()) { | |
| 864 fprintf(stderr, "%s", osd_warning.c_str()); | |
| 865 } | |
| 866 #endif | |
| 867 bool succeed = api.ProcessPages(image, nullptr, 0, renderers[0].get()); | |
| 868 if (!succeed) { | |
| 869 fprintf(stderr, "Error during processing.\n"); | |
| 870 ret_val = EXIT_FAILURE; | |
| 871 } | |
| 872 } | |
| 873 | |
| 874 return ret_val; | |
| 875 } |
