Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/api/baseapi.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: baseapi.cpp | |
| 3 * Description: Simple API for calling tesseract. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 2006, Google Inc. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #define _USE_MATH_DEFINES // for M_PI | |
| 20 | |
| 21 // Include automatically generated configuration file if running autoconf. | |
| 22 #ifdef HAVE_CONFIG_H | |
| 23 # include "config_auto.h" | |
| 24 #endif | |
| 25 | |
| 26 #include "boxword.h" // for BoxWord | |
| 27 #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST | |
| 28 #include "dawg_cache.h" // for DawgCache | |
| 29 #include "dict.h" // for Dict | |
| 30 #include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH | |
| 31 #include "environ.h" // for l_uint8 | |
| 32 #ifndef DISABLED_LEGACY_ENGINE | |
| 33 #include "equationdetect.h" // for EquationDetect, destructor of equ_detect_ | |
| 34 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 35 #include "errcode.h" // for ASSERT_HOST | |
| 36 #include "helpers.h" // for IntCastRounded, chomp_string, copy_string | |
| 37 #include "host.h" // for MAX_PATH | |
| 38 #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ... | |
| 39 #ifndef DISABLED_LEGACY_ENGINE | |
| 40 # include "intfx.h" // for INT_FX_RESULT_STRUCT | |
| 41 #endif | |
| 42 #include "mutableiterator.h" // for MutableIterator | |
| 43 #include "normalis.h" // for kBlnBaselineOffset, kBlnXHeight | |
| 44 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE... | |
| 45 #include "paragraphs.h" // for DetectParagraphs | |
| 46 #include "params.h" // for BoolParam, IntParam, DoubleParam, Stri... | |
| 47 #include "pdblock.h" // for PDBLK | |
| 48 #include "points.h" // for FCOORD | |
| 49 #include "polyblk.h" // for POLY_BLOCK | |
| 50 #include "rect.h" // for TBOX | |
| 51 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST | |
| 52 #include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix | |
| 53 #include "tesseractclass.h" // for Tesseract | |
| 54 #include "tprintf.h" // for tprintf | |
| 55 #include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP | |
| 56 #include "thresholder.h" // for ImageThresholder | |
| 57 | |
| 58 #include <tesseract/baseapi.h> | |
| 59 #include <tesseract/ocrclass.h> // for ETEXT_DESC | |
| 60 #include <tesseract/osdetect.h> // for OSResults, OSBestResult, OrientationId... | |
| 61 #include <tesseract/renderer.h> // for TessResultRenderer | |
| 62 #include <tesseract/resultiterator.h> // for ResultIterator | |
| 63 | |
| 64 #include <cmath> // for round, M_PI | |
| 65 #include <cstdint> // for int32_t | |
| 66 #include <cstring> // for strcmp, strcpy | |
| 67 #include <filesystem> // for std::filesystem | |
| 68 #include <fstream> // for size_t | |
| 69 #include <iostream> // for std::cin | |
| 70 #include <locale> // for std::locale::classic | |
| 71 #include <memory> // for std::unique_ptr | |
| 72 #include <set> // for std::pair | |
| 73 #include <sstream> // for std::stringstream | |
| 74 #include <vector> // for std::vector | |
| 75 | |
| 76 #include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box... | |
| 77 #ifdef HAVE_LIBCURL | |
| 78 # include <curl/curl.h> | |
| 79 #endif | |
| 80 | |
| 81 #ifdef __linux__ | |
| 82 # include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE | |
| 83 #endif | |
| 84 | |
| 85 #if defined(_WIN32) | |
| 86 # include <fcntl.h> // for _O_BINARY | |
| 87 # include <io.h> // for _setmode | |
| 88 #endif | |
| 89 | |
| 90 namespace tesseract { | |
| 91 | |
// File-scope parameters declared through the *_VAR macros; the last macro
// argument is the parameter's help text.
static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
#ifdef HAVE_LIBCURL
// These two parameters only exist when HAVE_LIBCURL is defined.
static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
static STRING_VAR(curl_cookiefile, "", "File with cookie data for curl");
#endif

/**
 * Minimum sensible image size to be worth running Tesseract.
 * TesseractRect compares both the width and the height against this value.
 */
const int kMinRectSize = 10;
/** Character returned when Tesseract couldn't recognize as anything. */
const char kTesseractReject = '~';
/**
 * Character used by UNLV error counter as a reject.
 * Note that this is the same character as kTesseractReject.
 */
const char kUNLVReject = '~';
/** Character used by UNLV as a suspect marker. */
const char kUNLVSuspect = '^';
/**
 * Temp file used for storing current parameters before applying retry values.
 */
static const char *kOldVarsFile = "failed_vars.txt";
| 111 | |
| 112 #ifndef DISABLED_LEGACY_ENGINE | |
/**
 * Filename used for input image file, from which to derive a name to search
 * for a possible UNLV zone file, if none is specified by SetInputName.
 */
static const char *kInputFile = "noname.tif";
// Sentinel font name: it is the default value of classify_font_name, and
// ExtractFontName only parses the filename while the parameter still holds it.
static const char kUnknownFontName[] = "UnknownFont";

static STRING_VAR(classify_font_name, kUnknownFontName,
                  "Default font name to be used in training");
| 122 | |
| 123 // Finds the name of the training font and returns it in fontname, by cutting | |
| 124 // it out based on the expectation that the filename is of the form: | |
| 125 // /path/to/dir/[lang].[fontname].exp[num] | |
| 126 // The [lang], [fontname] and [num] fields should not have '.' characters. | |
| 127 // If the global parameter classify_font_name is set, its value is used instead. | |
| 128 static void ExtractFontName(const char* filename, std::string* fontname) { | |
| 129 *fontname = classify_font_name; | |
| 130 if (*fontname == kUnknownFontName) { | |
| 131 // filename is expected to be of the form [lang].[fontname].exp[num] | |
| 132 // The [lang], [fontname] and [num] fields should not have '.' characters. | |
| 133 const char *basename = strrchr(filename, '/'); | |
| 134 const char *firstdot = strchr(basename ? basename : filename, '.'); | |
| 135 const char *lastdot = strrchr(filename, '.'); | |
| 136 if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) { | |
| 137 ++firstdot; | |
| 138 *fontname = firstdot; | |
| 139 fontname->resize(lastdot - firstdot); | |
| 140 } | |
| 141 } | |
| 142 } | |
| 143 #endif | |
| 144 | |
/* Add all available languages recursively.
 *
 * Walks datadir and all of its subdirectories (following directory symlinks
 * and skipping directories that may not be read) and appends one entry to
 * *langs for every path that contains ".traineddata". The entry is the path
 * relative to datadir with everything from the last ".traineddata" onwards
 * removed, e.g. "script/Latin" for "<datadir>/script/Latin.traineddata".
 *
 * A missing or unreadable datadir, or an error while iterating, is not an
 * error for the caller: the function never throws because of the file system
 * and simply returns the languages found so far.
 */
static void addAvailableLanguages(const std::string &datadir,
                                  std::vector<std::string> *langs) {
  namespace fs = std::filesystem;
  std::error_code ec;
  // Use the error_code overloads: the throwing ones raise
  // std::filesystem::filesystem_error, e.g. when datadir does not exist.
  fs::recursive_directory_iterator it(
      datadir,
      fs::directory_options::follow_directory_symlink |
          fs::directory_options::skip_permission_denied,
      ec);
  if (ec) {
    return;
  }
  const fs::recursive_directory_iterator end;
  while (it != end) {
    auto path = it->path().lexically_relative(datadir).string();
    auto extPos = path.rfind(".traineddata");
    if (extPos != std::string::npos) {
      langs->push_back(path.substr(0, extPos));
    }
    it.increment(ec);
    if (ec) {
      break; // Stop at the first iteration error, keep what was found.
    }
  }
}
| 160 | |
| 161 TessBaseAPI::TessBaseAPI() | |
| 162 : tesseract_(nullptr) | |
| 163 , osd_tesseract_(nullptr) | |
| 164 , equ_detect_(nullptr) | |
| 165 , reader_(nullptr) | |
| 166 , | |
| 167 // thresholder_ is initialized to nullptr here, but will be set before use | |
| 168 // by: A constructor of a derived API or created | |
| 169 // implicitly when used in InternalSetImage. | |
| 170 thresholder_(nullptr) | |
| 171 , paragraph_models_(nullptr) | |
| 172 , block_list_(nullptr) | |
| 173 , page_res_(nullptr) | |
| 174 , last_oem_requested_(OEM_DEFAULT) | |
| 175 , recognition_done_(false) | |
| 176 , rect_left_(0) | |
| 177 , rect_top_(0) | |
| 178 , rect_width_(0) | |
| 179 , rect_height_(0) | |
| 180 , image_width_(0) | |
| 181 , image_height_(0) { | |
| 182 } | |
| 183 | |
| 184 TessBaseAPI::~TessBaseAPI() { | |
| 185 End(); | |
| 186 } | |
| 187 | |
| 188 /** | |
| 189 * Returns the version identifier as a static string. Do not delete. | |
| 190 */ | |
| 191 const char *TessBaseAPI::Version() { | |
| 192 return TESSERACT_VERSION_STR; | |
| 193 } | |
| 194 | |
| 195 /** | |
| 196 * Set the name of the input file. Needed only for training and | |
| 197 * loading a UNLV zone file. | |
| 198 */ | |
| 199 void TessBaseAPI::SetInputName(const char *name) { | |
| 200 input_file_ = name ? name : ""; | |
| 201 } | |
| 202 | |
| 203 /** Set the name of the output files. Needed only for debugging. */ | |
| 204 void TessBaseAPI::SetOutputName(const char *name) { | |
| 205 output_file_ = name ? name : ""; | |
| 206 } | |
| 207 | |
| 208 bool TessBaseAPI::SetVariable(const char *name, const char *value) { | |
| 209 if (tesseract_ == nullptr) { | |
| 210 tesseract_ = new Tesseract; | |
| 211 } | |
| 212 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY, | |
| 213 tesseract_->params()); | |
| 214 } | |
| 215 | |
| 216 bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) { | |
| 217 if (tesseract_ == nullptr) { | |
| 218 tesseract_ = new Tesseract; | |
| 219 } | |
| 220 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params()); | |
| 221 } | |
| 222 | |
| 223 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const { | |
| 224 auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params, | |
| 225 tesseract_->params()->int_params); | |
| 226 if (p == nullptr) { | |
| 227 return false; | |
| 228 } | |
| 229 *value = (int32_t)(*p); | |
| 230 return true; | |
| 231 } | |
| 232 | |
| 233 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const { | |
| 234 auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params, | |
| 235 tesseract_->params()->bool_params); | |
| 236 if (p == nullptr) { | |
| 237 return false; | |
| 238 } | |
| 239 *value = bool(*p); | |
| 240 return true; | |
| 241 } | |
| 242 | |
| 243 const char *TessBaseAPI::GetStringVariable(const char *name) const { | |
| 244 auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params, | |
| 245 tesseract_->params()->string_params); | |
| 246 return (p != nullptr) ? p->c_str() : nullptr; | |
| 247 } | |
| 248 | |
| 249 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const { | |
| 250 auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params, | |
| 251 tesseract_->params()->double_params); | |
| 252 if (p == nullptr) { | |
| 253 return false; | |
| 254 } | |
| 255 *value = (double)(*p); | |
| 256 return true; | |
| 257 } | |
| 258 | |
| 259 /** Get value of named variable as a string, if it exists. */ | |
| 260 bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const { | |
| 261 return ParamUtils::GetParamAsString(name, tesseract_->params(), val); | |
| 262 } | |
| 263 | |
| 264 #ifndef DISABLED_LEGACY_ENGINE | |
| 265 | |
| 266 /** Print Tesseract fonts table to the given file. */ | |
| 267 void TessBaseAPI::PrintFontsTable(FILE *fp) const { | |
| 268 const int fontinfo_size = tesseract_->get_fontinfo_table().size(); | |
| 269 for (int font_index = 1; font_index < fontinfo_size; ++font_index) { | |
| 270 FontInfo font = tesseract_->get_fontinfo_table().at(font_index); | |
| 271 fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s" | |
| 272 " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n", | |
| 273 font_index, font.name, | |
| 274 font.is_italic() ? "true" : "false", | |
| 275 font.is_bold() ? "true" : "false", | |
| 276 font.is_fixed_pitch() ? "true" : "false", | |
| 277 font.is_serif() ? "true" : "false", | |
| 278 font.is_fraktur() ? "true" : "false"); | |
| 279 } | |
| 280 } | |
| 281 | |
| 282 #endif | |
| 283 | |
| 284 /** Print Tesseract parameters to the given file. */ | |
| 285 void TessBaseAPI::PrintVariables(FILE *fp) const { | |
| 286 ParamUtils::PrintParams(fp, tesseract_->params()); | |
| 287 } | |
| 288 | |
| 289 /** | |
| 290 * The datapath must be the name of the data directory or | |
| 291 * some other file in which the data directory resides (for instance argv[0].) | |
| 292 * The language is (usually) an ISO 639-3 string or nullptr will default to eng. | |
| 293 * If numeric_mode is true, then only digits and Roman numerals will | |
| 294 * be returned. | |
| 295 * @return: 0 on success and -1 on initialization failure. | |
| 296 */ | |
| 297 int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs, | |
| 298 int configs_size, const std::vector<std::string> *vars_vec, | |
| 299 const std::vector<std::string> *vars_values, bool set_only_non_debug_params) { | |
| 300 return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values, | |
| 301 set_only_non_debug_params, nullptr); | |
| 302 } | |
| 303 | |
| 304 // In-memory version reads the traineddata file directly from the given | |
| 305 // data[data_size] array. Also implements the version with a datapath in data, | |
| 306 // flagged by data_size = 0. | |
| 307 int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem, | |
| 308 char **configs, int configs_size, const std::vector<std::string> *vars_vec, | |
| 309 const std::vector<std::string> *vars_values, bool set_only_non_debug_params, | |
| 310 FileReader reader) { | |
| 311 if (language == nullptr) { | |
| 312 language = ""; | |
| 313 } | |
| 314 if (data == nullptr) { | |
| 315 data = ""; | |
| 316 } | |
| 317 std::string datapath = data_size == 0 ? data : language; | |
| 318 // If the datapath, OcrEngineMode or the language have changed - start again. | |
| 319 // Note that the language_ field stores the last requested language that was | |
| 320 // initialized successfully, while tesseract_->lang stores the language | |
| 321 // actually used. They differ only if the requested language was nullptr, in | |
| 322 // which case tesseract_->lang is set to the Tesseract default ("eng"). | |
| 323 if (tesseract_ != nullptr && | |
| 324 (datapath_.empty() || language_.empty() || datapath_ != datapath || | |
| 325 last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) { | |
| 326 delete tesseract_; | |
| 327 tesseract_ = nullptr; | |
| 328 } | |
| 329 bool reset_classifier = true; | |
| 330 if (tesseract_ == nullptr) { | |
| 331 reset_classifier = false; | |
| 332 tesseract_ = new Tesseract; | |
| 333 if (reader != nullptr) { | |
| 334 reader_ = reader; | |
| 335 } | |
| 336 TessdataManager mgr(reader_); | |
| 337 if (data_size != 0) { | |
| 338 mgr.LoadMemBuffer(language, data, data_size); | |
| 339 } | |
| 340 if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs, | |
| 341 configs_size, vars_vec, vars_values, set_only_non_debug_params, | |
| 342 &mgr) != 0) { | |
| 343 return -1; | |
| 344 } | |
| 345 } | |
| 346 | |
| 347 // Update datapath and language requested for the last valid initialization. | |
| 348 datapath_ = std::move(datapath); | |
| 349 if (datapath_.empty() && !tesseract_->datadir.empty()) { | |
| 350 datapath_ = tesseract_->datadir; | |
| 351 } | |
| 352 | |
| 353 language_ = language; | |
| 354 last_oem_requested_ = oem; | |
| 355 | |
| 356 #ifndef DISABLED_LEGACY_ENGINE | |
| 357 // For same language and datapath, just reset the adaptive classifier. | |
| 358 if (reset_classifier) { | |
| 359 tesseract_->ResetAdaptiveClassifier(); | |
| 360 } | |
| 361 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 362 return 0; | |
| 363 } | |
| 364 | |
| 365 /** | |
| 366 * Returns the languages string used in the last valid initialization. | |
| 367 * If the last initialization specified "deu+hin" then that will be | |
| 368 * returned. If hin loaded eng automatically as well, then that will | |
| 369 * not be included in this list. To find the languages actually | |
| 370 * loaded use GetLoadedLanguagesAsVector. | |
| 371 * The returned string should NOT be deleted. | |
| 372 */ | |
| 373 const char *TessBaseAPI::GetInitLanguagesAsString() const { | |
| 374 return language_.c_str(); | |
| 375 } | |
| 376 | |
| 377 /** | |
| 378 * Returns the loaded languages in the vector of std::string. | |
| 379 * Includes all languages loaded by the last Init, including those loaded | |
| 380 * as dependencies of other loaded languages. | |
| 381 */ | |
| 382 void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const { | |
| 383 langs->clear(); | |
| 384 if (tesseract_ != nullptr) { | |
| 385 langs->push_back(tesseract_->lang); | |
| 386 int num_subs = tesseract_->num_sub_langs(); | |
| 387 for (int i = 0; i < num_subs; ++i) { | |
| 388 langs->push_back(tesseract_->get_sub_lang(i)->lang); | |
| 389 } | |
| 390 } | |
| 391 } | |
| 392 | |
| 393 /** | |
| 394 * Returns the available languages in the sorted vector of std::string. | |
| 395 */ | |
| 396 void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const { | |
| 397 langs->clear(); | |
| 398 if (tesseract_ != nullptr) { | |
| 399 addAvailableLanguages(tesseract_->datadir, langs); | |
| 400 std::sort(langs->begin(), langs->end()); | |
| 401 } | |
| 402 } | |
| 403 | |
| 404 /** | |
| 405 * Init only for page layout analysis. Use only for calls to SetImage and | |
| 406 * AnalysePage. Calls that attempt recognition will generate an error. | |
| 407 */ | |
| 408 void TessBaseAPI::InitForAnalysePage() { | |
| 409 if (tesseract_ == nullptr) { | |
| 410 tesseract_ = new Tesseract; | |
| 411 #ifndef DISABLED_LEGACY_ENGINE | |
| 412 tesseract_->InitAdaptiveClassifier(nullptr); | |
| 413 #endif | |
| 414 } | |
| 415 } | |
| 416 | |
| 417 /** | |
| 418 * Read a "config" file containing a set of parameter name, value pairs. | |
| 419 * Searches the standard places: tessdata/configs, tessdata/tessconfigs | |
| 420 * and also accepts a relative or absolute path name. | |
| 421 */ | |
| 422 void TessBaseAPI::ReadConfigFile(const char *filename) { | |
| 423 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY); | |
| 424 } | |
| 425 | |
| 426 /** Same as above, but only set debug params from the given config file. */ | |
| 427 void TessBaseAPI::ReadDebugConfigFile(const char *filename) { | |
| 428 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY); | |
| 429 } | |
| 430 | |
| 431 /** | |
| 432 * Set the current page segmentation mode. Defaults to PSM_AUTO. | |
| 433 * The mode is stored as an IntParam so it can also be modified by | |
| 434 * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). | |
| 435 */ | |
| 436 void TessBaseAPI::SetPageSegMode(PageSegMode mode) { | |
| 437 if (tesseract_ == nullptr) { | |
| 438 tesseract_ = new Tesseract; | |
| 439 } | |
| 440 tesseract_->tessedit_pageseg_mode.set_value(mode); | |
| 441 } | |
| 442 | |
| 443 /** Return the current page segmentation mode. */ | |
| 444 PageSegMode TessBaseAPI::GetPageSegMode() const { | |
| 445 if (tesseract_ == nullptr) { | |
| 446 return PSM_SINGLE_BLOCK; | |
| 447 } | |
| 448 return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode)); | |
| 449 } | |
| 450 | |
| 451 /** | |
| 452 * Recognize a rectangle from an image and return the result as a string. | |
| 453 * May be called many times for a single Init. | |
| 454 * Currently has no error checking. | |
| 455 * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. | |
| 456 * Palette color images will not work properly and must be converted to | |
| 457 * 24 bit. | |
| 458 * Binary images of 1 bit per pixel may also be given but they must be | |
| 459 * byte packed with the MSB of the first byte being the first pixel, and a | |
| 460 * one pixel is WHITE. For binary images set bytes_per_pixel=0. | |
| 461 * The recognized text is returned as a char* which is coded | |
| 462 * as UTF8 and must be freed with the delete [] operator. | |
| 463 */ | |
| 464 char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, | |
| 465 int bytes_per_line, int left, int top, int width, int height) { | |
| 466 if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) { | |
| 467 return nullptr; // Nothing worth doing. | |
| 468 } | |
| 469 | |
| 470 // Since this original api didn't give the exact size of the image, | |
| 471 // we have to invent a reasonable value. | |
| 472 int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; | |
| 473 SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel, | |
| 474 bytes_per_line); | |
| 475 SetRectangle(left, top, width, height); | |
| 476 | |
| 477 return GetUTF8Text(); | |
| 478 } | |
| 479 | |
| 480 #ifndef DISABLED_LEGACY_ENGINE | |
| 481 /** | |
| 482 * Call between pages or documents etc to free up memory and forget | |
| 483 * adaptive data. | |
| 484 */ | |
| 485 void TessBaseAPI::ClearAdaptiveClassifier() { | |
| 486 if (tesseract_ == nullptr) { | |
| 487 return; | |
| 488 } | |
| 489 tesseract_->ResetAdaptiveClassifier(); | |
| 490 tesseract_->ResetDocumentDictionary(); | |
| 491 } | |
| 492 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 493 | |
| 494 /** | |
| 495 * Provide an image for Tesseract to recognize. Format is as | |
| 496 * TesseractRect above. Copies the image buffer and converts to Pix. | |
| 497 * SetImage clears all recognition results, and sets the rectangle to the | |
| 498 * full image, so it may be followed immediately by a GetUTF8Text, and it | |
| 499 * will automatically perform recognition. | |
| 500 */ | |
| 501 void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height, | |
| 502 int bytes_per_pixel, int bytes_per_line) { | |
| 503 if (InternalSetImage()) { | |
| 504 thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line); | |
| 505 SetInputImage(thresholder_->GetPixRect()); | |
| 506 } | |
| 507 } | |
| 508 | |
| 509 void TessBaseAPI::SetSourceResolution(int ppi) { | |
| 510 if (thresholder_) { | |
| 511 thresholder_->SetSourceYResolution(ppi); | |
| 512 } else { | |
| 513 tprintf("Please call SetImage before SetSourceResolution.\n"); | |
| 514 } | |
| 515 } | |
| 516 | |
| 517 /** | |
| 518 * Provide an image for Tesseract to recognize. As with SetImage above, | |
| 519 * Tesseract takes its own copy of the image, so it need not persist until | |
| 520 * after Recognize. | |
| 521 * Pix vs raw, which to use? | |
| 522 * Use Pix where possible. Tesseract uses Pix as its internal representation | |
| 523 * and it is therefore more efficient to provide a Pix directly. | |
| 524 */ | |
| 525 void TessBaseAPI::SetImage(Pix *pix) { | |
| 526 if (InternalSetImage()) { | |
| 527 if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) { | |
| 528 // remove alpha channel from png | |
| 529 Pix *p1 = pixRemoveAlpha(pix); | |
| 530 pixSetSpp(p1, 3); | |
| 531 (void)pixCopy(pix, p1); | |
| 532 pixDestroy(&p1); | |
| 533 } | |
| 534 thresholder_->SetImage(pix); | |
| 535 SetInputImage(thresholder_->GetPixRect()); | |
| 536 } | |
| 537 } | |
| 538 | |
| 539 /** | |
| 540 * Restrict recognition to a sub-rectangle of the image. Call after SetImage. | |
| 541 * Each SetRectangle clears the recognition results so multiple rectangles | |
| 542 * can be recognized with the same image. | |
| 543 */ | |
| 544 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { | |
| 545 if (thresholder_ == nullptr) { | |
| 546 return; | |
| 547 } | |
| 548 thresholder_->SetRectangle(left, top, width, height); | |
| 549 ClearResults(); | |
| 550 } | |
| 551 | |
| 552 /** | |
| 553 * ONLY available after SetImage if you have Leptonica installed. | |
| 554 * Get a copy of the internal thresholded image from Tesseract. | |
| 555 */ | |
| 556 Pix *TessBaseAPI::GetThresholdedImage() { | |
| 557 if (tesseract_ == nullptr || thresholder_ == nullptr) { | |
| 558 return nullptr; | |
| 559 } | |
| 560 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { | |
| 561 return nullptr; | |
| 562 } | |
| 563 return tesseract_->pix_binary().clone(); | |
| 564 } | |
| 565 | |
| 566 /** | |
| 567 * Get the result of page layout analysis as a leptonica-style | |
| 568 * Boxa, Pixa pair, in reading order. | |
| 569 * Can be called before or after Recognize. | |
| 570 */ | |
| 571 Boxa *TessBaseAPI::GetRegions(Pixa **pixa) { | |
| 572 return GetComponentImages(RIL_BLOCK, false, pixa, nullptr); | |
| 573 } | |
| 574 | |
| 575 /** | |
| 576 * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order. | |
| 577 * Can be called before or after Recognize. | |
| 578 * If blockids is not nullptr, the block-id of each line is also returned as an | |
| 579 * array of one element per line. delete [] after use. | |
| 580 * If paraids is not nullptr, the paragraph-id of each line within its block is | |
| 581 * also returned as an array of one element per line. delete [] after use. | |
| 582 */ | |
| 583 Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa, | |
| 584 int **blockids, int **paraids) { | |
| 585 return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids); | |
| 586 } | |
| 587 | |
| 588 /** | |
| 589 * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa | |
| 590 * pair, in reading order. Enables downstream handling of non-rectangular | |
| 591 * regions. | |
| 592 * Can be called before or after Recognize. | |
| 593 * If blockids is not nullptr, the block-id of each line is also returned as an | |
| 594 * array of one element per line. delete [] after use. | |
| 595 */ | |
| 596 Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) { | |
| 597 return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids); | |
| 598 } | |
| 599 | |
| 600 /** | |
| 601 * Get the words as a leptonica-style | |
| 602 * Boxa, Pixa pair, in reading order. | |
| 603 * Can be called before or after Recognize. | |
| 604 */ | |
| 605 Boxa *TessBaseAPI::GetWords(Pixa **pixa) { | |
| 606 return GetComponentImages(RIL_WORD, true, pixa, nullptr); | |
| 607 } | |
| 608 | |
| 609 /** | |
| 610 * Gets the individual connected (text) components (created | |
| 611 * after pages segmentation step, but before recognition) | |
| 612 * as a leptonica-style Boxa, Pixa pair, in reading order. | |
| 613 * Can be called before or after Recognize. | |
| 614 */ | |
| 615 Boxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) { | |
| 616 return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr); | |
| 617 } | |
| 618 | |
| 619 /** | |
| 620 * Get the given level kind of components (block, textline, word etc.) as a | |
| 621 * leptonica-style Boxa, Pixa pair, in reading order. | |
| 622 * Can be called before or after Recognize. | |
| 623 * If blockids is not nullptr, the block-id of each component is also returned | |
| 624 * as an array of one element per component. delete [] after use. | |
| 625 * If text_only is true, then only text components are returned. | |
| 626 */ | |
| 627 Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, | |
| 628 const int raw_padding, Pixa **pixa, int **blockids, | |
| 629 int **paraids) { | |
| 630 /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator()); | |
| 631 if (page_it == nullptr) { | |
| 632 page_it.reset(AnalyseLayout()); | |
| 633 } | |
| 634 if (page_it == nullptr) { | |
| 635 return nullptr; // Failed. | |
| 636 } | |
| 637 | |
| 638 // Count the components to get a size for the arrays. | |
| 639 int component_count = 0; | |
| 640 int left, top, right, bottom; | |
| 641 | |
| 642 if (raw_image) { | |
| 643 // Get bounding box in original raw image with padding. | |
| 644 do { | |
| 645 if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) && | |
| 646 (!text_only || PTIsTextType(page_it->BlockType()))) { | |
| 647 ++component_count; | |
| 648 } | |
| 649 } while (page_it->Next(level)); | |
| 650 } else { | |
| 651 // Get bounding box from binarized imaged. Note that this could be | |
| 652 // differently scaled from the original image. | |
| 653 do { | |
| 654 if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) && | |
| 655 (!text_only || PTIsTextType(page_it->BlockType()))) { | |
| 656 ++component_count; | |
| 657 } | |
| 658 } while (page_it->Next(level)); | |
| 659 } | |
| 660 | |
| 661 Boxa *boxa = boxaCreate(component_count); | |
| 662 if (pixa != nullptr) { | |
| 663 *pixa = pixaCreate(component_count); | |
| 664 } | |
| 665 if (blockids != nullptr) { | |
| 666 *blockids = new int[component_count]; | |
| 667 } | |
| 668 if (paraids != nullptr) { | |
| 669 *paraids = new int[component_count]; | |
| 670 } | |
| 671 | |
| 672 int blockid = 0; | |
| 673 int paraid = 0; | |
| 674 int component_index = 0; | |
| 675 page_it->Begin(); | |
| 676 do { | |
| 677 bool got_bounding_box; | |
| 678 if (raw_image) { | |
| 679 got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom); | |
| 680 } else { | |
| 681 got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom); | |
| 682 } | |
| 683 if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) { | |
| 684 Box *lbox = boxCreate(left, top, right - left, bottom - top); | |
| 685 boxaAddBox(boxa, lbox, L_INSERT); | |
| 686 if (pixa != nullptr) { | |
| 687 Pix *pix = nullptr; | |
| 688 if (raw_image) { | |
| 689 pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top); | |
| 690 } else { | |
| 691 pix = page_it->GetBinaryImage(level); | |
| 692 } | |
| 693 pixaAddPix(*pixa, pix, L_INSERT); | |
| 694 pixaAddBox(*pixa, lbox, L_CLONE); | |
| 695 } | |
| 696 if (paraids != nullptr) { | |
| 697 (*paraids)[component_index] = paraid; | |
| 698 if (page_it->IsAtFinalElement(RIL_PARA, level)) { | |
| 699 ++paraid; | |
| 700 } | |
| 701 } | |
| 702 if (blockids != nullptr) { | |
| 703 (*blockids)[component_index] = blockid; | |
| 704 if (page_it->IsAtFinalElement(RIL_BLOCK, level)) { | |
| 705 ++blockid; | |
| 706 paraid = 0; | |
| 707 } | |
| 708 } | |
| 709 ++component_index; | |
| 710 } | |
| 711 } while (page_it->Next(level)); | |
| 712 return boxa; | |
| 713 } | |
| 714 | |
| 715 int TessBaseAPI::GetThresholdedImageScaleFactor() const { | |
| 716 if (thresholder_ == nullptr) { | |
| 717 return 0; | |
| 718 } | |
| 719 return thresholder_->GetScaleFactor(); | |
| 720 } | |
| 721 | |
| 722 /** | |
| 723 * Runs page layout analysis in the mode set by SetPageSegMode. | |
| 724 * May optionally be called prior to Recognize to get access to just | |
| 725 * the page layout results. Returns an iterator to the results. | |
| 726 * If merge_similar_words is true, words are combined where suitable for use | |
| 727 * with a line recognizer. Use if you want to use AnalyseLayout to find the | |
| 728 * textlines, and then want to process textline fragments with an external | |
| 729 * line recognizer. | |
| 730 * Returns nullptr on error or an empty page. | |
| 731 * The returned iterator must be deleted after use. | |
| 732 * WARNING! This class points to data held within the TessBaseAPI class, and | |
| 733 * therefore can only be used while the TessBaseAPI class still exists and | |
| 734 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End | |
| 735 * DetectOS, or anything else that changes the internal PAGE_RES. | |
| 736 */ | |
| 737 PageIterator *TessBaseAPI::AnalyseLayout() { | |
| 738 return AnalyseLayout(false); | |
| 739 } | |
| 740 | |
| 741 PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) { | |
| 742 if (FindLines() == 0) { | |
| 743 if (block_list_->empty()) { | |
| 744 return nullptr; // The page was empty. | |
| 745 } | |
| 746 page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr); | |
| 747 DetectParagraphs(false); | |
| 748 return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), | |
| 749 thresholder_->GetScaledYResolution(), rect_left_, rect_top_, | |
| 750 rect_width_, rect_height_); | |
| 751 } | |
| 752 return nullptr; | |
| 753 } | |
| 754 | |
| 755 /** | |
| 756 * Recognize the tesseract global image and return the result as Tesseract | |
| 757 * internal structures. | |
| 758 */ | |
| 759 int TessBaseAPI::Recognize(ETEXT_DESC *monitor) { | |
| 760 if (tesseract_ == nullptr) { | |
| 761 return -1; | |
| 762 } | |
| 763 if (FindLines() != 0) { | |
| 764 return -1; | |
| 765 } | |
| 766 delete page_res_; | |
| 767 if (block_list_->empty()) { | |
| 768 page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_); | |
| 769 return 0; // Empty page. | |
| 770 } | |
| 771 | |
| 772 tesseract_->SetBlackAndWhitelist(); | |
| 773 recognition_done_ = true; | |
| 774 #ifndef DISABLED_LEGACY_ENGINE | |
| 775 if (tesseract_->tessedit_resegment_from_line_boxes) { | |
| 776 page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_); | |
| 777 } else if (tesseract_->tessedit_resegment_from_boxes) { | |
| 778 page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_); | |
| 779 } else | |
| 780 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 781 { | |
| 782 page_res_ = | |
| 783 new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_); | |
| 784 } | |
| 785 | |
| 786 if (page_res_ == nullptr) { | |
| 787 return -1; | |
| 788 } | |
| 789 | |
| 790 if (tesseract_->tessedit_train_line_recognizer) { | |
| 791 if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) { | |
| 792 return -1; | |
| 793 } | |
| 794 tesseract_->CorrectClassifyWords(page_res_); | |
| 795 return 0; | |
| 796 } | |
| 797 #ifndef DISABLED_LEGACY_ENGINE | |
| 798 if (tesseract_->tessedit_make_boxes_from_boxes) { | |
| 799 tesseract_->CorrectClassifyWords(page_res_); | |
| 800 return 0; | |
| 801 } | |
| 802 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 803 | |
| 804 int result = 0; | |
| 805 if (tesseract_->interactive_display_mode) { | |
| 806 #ifndef GRAPHICS_DISABLED | |
| 807 tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_); | |
| 808 #endif // !GRAPHICS_DISABLED | |
| 809 // The page_res is invalid after an interactive session, so cleanup | |
| 810 // in a way that lets us continue to the next page without crashing. | |
| 811 delete page_res_; | |
| 812 page_res_ = nullptr; | |
| 813 return -1; | |
| 814 #ifndef DISABLED_LEGACY_ENGINE | |
| 815 } else if (tesseract_->tessedit_train_from_boxes) { | |
| 816 std::string fontname; | |
| 817 ExtractFontName(output_file_.c_str(), &fontname); | |
| 818 tesseract_->ApplyBoxTraining(fontname, page_res_); | |
| 819 } else if (tesseract_->tessedit_ambigs_training) { | |
| 820 FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str()); | |
| 821 // OCR the page segmented into words by tesseract. | |
| 822 tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor, | |
| 823 training_output_file); | |
| 824 fclose(training_output_file); | |
| 825 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 826 } else { | |
| 827 // Now run the main recognition. | |
| 828 bool wait_for_text = true; | |
| 829 GetBoolVariable("paragraph_text_based", &wait_for_text); | |
| 830 if (!wait_for_text) { | |
| 831 DetectParagraphs(false); | |
| 832 } | |
| 833 if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) { | |
| 834 if (wait_for_text) { | |
| 835 DetectParagraphs(true); | |
| 836 } | |
| 837 } else { | |
| 838 result = -1; | |
| 839 } | |
| 840 } | |
| 841 return result; | |
| 842 } | |
| 843 | |
| 844 // Takes ownership of the input pix. | |
| 845 void TessBaseAPI::SetInputImage(Pix *pix) { | |
| 846 tesseract_->set_pix_original(pix); | |
| 847 } | |
| 848 | |
| 849 Pix *TessBaseAPI::GetInputImage() { | |
| 850 return tesseract_->pix_original(); | |
| 851 } | |
| 852 | |
| 853 const char *TessBaseAPI::GetInputName() { | |
| 854 if (!input_file_.empty()) { | |
| 855 return input_file_.c_str(); | |
| 856 } | |
| 857 return nullptr; | |
| 858 } | |
| 859 | |
| 860 const char *TessBaseAPI::GetDatapath() { | |
| 861 return tesseract_->datadir.c_str(); | |
| 862 } | |
| 863 | |
| 864 int TessBaseAPI::GetSourceYResolution() { | |
| 865 if (thresholder_ == nullptr) | |
| 866 return -1; | |
| 867 return thresholder_->GetSourceYResolution(); | |
| 868 } | |
| 869 | |
| 870 // If flist exists, get data from there. Otherwise get data from buf. | |
| 871 // Seems convoluted, but is the easiest way I know of to meet multiple | |
| 872 // goals. Support streaming from stdin, and also work on platforms | |
| 873 // lacking fmemopen. | |
| 874 // TODO: check different logic for flist/buf and simplify. | |
| 875 bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config, | |
| 876 int timeout_millisec, TessResultRenderer *renderer, | |
| 877 int tessedit_page_number) { | |
| 878 if (!flist && !buf) { | |
| 879 return false; | |
| 880 } | |
| 881 unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; | |
| 882 char pagename[MAX_PATH]; | |
| 883 | |
| 884 std::vector<std::string> lines; | |
| 885 if (!flist) { | |
| 886 std::string line; | |
| 887 for (const auto ch : *buf) { | |
| 888 if (ch == '\n') { | |
| 889 lines.push_back(line); | |
| 890 line.clear(); | |
| 891 } else { | |
| 892 line.push_back(ch); | |
| 893 } | |
| 894 } | |
| 895 if (!line.empty()) { | |
| 896 // Add last line without terminating LF. | |
| 897 lines.push_back(line); | |
| 898 } | |
| 899 if (lines.empty()) { | |
| 900 return false; | |
| 901 } | |
| 902 } | |
| 903 | |
| 904 // Skip to the requested page number. | |
| 905 for (unsigned i = 0; i < page; i++) { | |
| 906 if (flist) { | |
| 907 if (fgets(pagename, sizeof(pagename), flist) == nullptr) { | |
| 908 break; | |
| 909 } | |
| 910 } | |
| 911 } | |
| 912 | |
| 913 // Begin producing output | |
| 914 if (renderer && !renderer->BeginDocument(document_title.c_str())) { | |
| 915 return false; | |
| 916 } | |
| 917 | |
| 918 // Loop over all pages - or just the requested one | |
| 919 while (true) { | |
| 920 if (flist) { | |
| 921 if (fgets(pagename, sizeof(pagename), flist) == nullptr) { | |
| 922 break; | |
| 923 } | |
| 924 } else { | |
| 925 if (page >= lines.size()) { | |
| 926 break; | |
| 927 } | |
| 928 snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str()); | |
| 929 } | |
| 930 chomp_string(pagename); | |
| 931 Pix *pix = pixRead(pagename); | |
| 932 if (pix == nullptr) { | |
| 933 tprintf("Image file %s cannot be read!\n", pagename); | |
| 934 return false; | |
| 935 } | |
| 936 tprintf("Page %u : %s\n", page, pagename); | |
| 937 bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer); | |
| 938 pixDestroy(&pix); | |
| 939 if (!r) { | |
| 940 return false; | |
| 941 } | |
| 942 if (tessedit_page_number >= 0) { | |
| 943 break; | |
| 944 } | |
| 945 ++page; | |
| 946 } | |
| 947 | |
| 948 // Finish producing output | |
| 949 if (renderer && !renderer->EndDocument()) { | |
| 950 return false; | |
| 951 } | |
| 952 return true; | |
| 953 } | |
| 954 | |
| 955 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename, | |
| 956 const char *retry_config, int timeout_millisec, | |
| 957 TessResultRenderer *renderer, | |
| 958 int tessedit_page_number) { | |
| 959 Pix *pix = nullptr; | |
| 960 int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; | |
| 961 size_t offset = 0; | |
| 962 for (;; ++page) { | |
| 963 if (tessedit_page_number >= 0) { | |
| 964 page = tessedit_page_number; | |
| 965 pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page); | |
| 966 } else { | |
| 967 pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset) | |
| 968 : pixReadFromMultipageTiff(filename, &offset); | |
| 969 } | |
| 970 if (pix == nullptr) { | |
| 971 break; | |
| 972 } | |
| 973 if (offset || page > 0) { | |
| 974 // Only print page number for multipage TIFF file. | |
| 975 tprintf("Page %d\n", page + 1); | |
| 976 } | |
| 977 auto page_string = std::to_string(page); | |
| 978 SetVariable("applybox_page", page_string.c_str()); | |
| 979 bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer); | |
| 980 pixDestroy(&pix); | |
| 981 if (!r) { | |
| 982 return false; | |
| 983 } | |
| 984 if (tessedit_page_number >= 0) { | |
| 985 break; | |
| 986 } | |
| 987 if (!offset) { | |
| 988 break; | |
| 989 } | |
| 990 } | |
| 991 return true; | |
| 992 } | |
| 993 | |
| 994 // Master ProcessPages calls ProcessPagesInternal and then does any post- | |
| 995 // processing required due to being in a training mode. | |
| 996 bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, | |
| 997 TessResultRenderer *renderer) { | |
| 998 bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer); | |
| 999 #ifndef DISABLED_LEGACY_ENGINE | |
| 1000 if (result) { | |
| 1001 if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) { | |
| 1002 tprintf("Write of TR file failed: %s\n", output_file_.c_str()); | |
| 1003 return false; | |
| 1004 } | |
| 1005 } | |
| 1006 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1007 return result; | |
| 1008 } | |
| 1009 | |
| 1010 #ifdef HAVE_LIBCURL | |
// Write callback used with CURLOPT_WRITEFUNCTION: appends the size * nmemb
// bytes at contents to the std::string that userp points to and returns the
// number of bytes appended.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t byte_count = size * nmemb;
  auto *sink = static_cast<std::string *>(userp);
  sink->append(static_cast<const char *>(contents), byte_count);
  return byte_count;
}
| 1017 #endif | |
| 1018 | |
| 1019 // In the ideal scenario, Tesseract will start working on data as soon | |
| 1020 // as it can. For example, if you stream a filelist through stdin, we | |
| 1021 // should start the OCR process as soon as the first filename is | |
| 1022 // available. This is particularly useful when hooking Tesseract up to | |
| 1023 // slow hardware such as a book scanning machine. | |
| 1024 // | |
| 1025 // Unfortunately there are tradeoffs. You can't seek on stdin. That | |
| 1026 // makes automatic detection of datatype (TIFF? filelist? PNG?) | |
| 1027 // impractical. So we support a command line flag to explicitly | |
| 1028 // identify the scenario that really matters: filelists on | |
| 1029 // stdin. We'll still do our best if the user likes pipes. | |
| 1030 bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config, | |
| 1031 int timeout_millisec, TessResultRenderer *renderer) { | |
| 1032 bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-"); | |
| 1033 if (stdInput) { | |
| 1034 #ifdef WIN32 | |
| 1035 if (_setmode(_fileno(stdin), _O_BINARY) == -1) | |
| 1036 tprintf("ERROR: cin to binary: %s", strerror(errno)); | |
| 1037 #endif // WIN32 | |
| 1038 } | |
| 1039 | |
| 1040 if (stream_filelist) { | |
| 1041 return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer, | |
| 1042 tesseract_->tessedit_page_number); | |
| 1043 } | |
| 1044 | |
| 1045 // At this point we are officially in autodection territory. | |
| 1046 // That means any data in stdin must be buffered, to make it | |
| 1047 // seekable. | |
| 1048 std::string buf; | |
| 1049 const l_uint8 *data = nullptr; | |
| 1050 if (stdInput) { | |
| 1051 buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>())); | |
| 1052 data = reinterpret_cast<const l_uint8 *>(buf.data()); | |
| 1053 } else if (strstr(filename, "://") != nullptr) { | |
| 1054 // Get image or image list by URL. | |
| 1055 #ifdef HAVE_LIBCURL | |
| 1056 CURL *curl = curl_easy_init(); | |
| 1057 if (curl == nullptr) { | |
| 1058 fprintf(stderr, "Error, curl_easy_init failed\n"); | |
| 1059 return false; | |
| 1060 } else { | |
| 1061 CURLcode curlcode; | |
| 1062 auto error = [curl, &curlcode](const char *function) { | |
| 1063 fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode)); | |
| 1064 curl_easy_cleanup(curl); | |
| 1065 return false; | |
| 1066 }; | |
| 1067 curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename); | |
| 1068 if (curlcode != CURLE_OK) { | |
| 1069 return error("curl_easy_setopt"); | |
| 1070 } | |
| 1071 curlcode = curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); | |
| 1072 if (curlcode != CURLE_OK) { | |
| 1073 return error("curl_easy_setopt"); | |
| 1074 } | |
| 1075 // Follow HTTP, HTTPS, FTP and FTPS redirects. | |
| 1076 curlcode = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); | |
| 1077 if (curlcode != CURLE_OK) { | |
| 1078 return error("curl_easy_setopt"); | |
| 1079 } | |
| 1080 // Allow no more than 8 redirections to prevent endless loops. | |
| 1081 curlcode = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 8); | |
| 1082 if (curlcode != CURLE_OK) { | |
| 1083 return error("curl_easy_setopt"); | |
| 1084 } | |
| 1085 int timeout = curl_timeout; | |
| 1086 if (timeout > 0) { | |
| 1087 curlcode = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L); | |
| 1088 if (curlcode != CURLE_OK) { | |
| 1089 return error("curl_easy_setopt"); | |
| 1090 } | |
| 1091 curlcode = curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); | |
| 1092 if (curlcode != CURLE_OK) { | |
| 1093 return error("curl_easy_setopt"); | |
| 1094 } | |
| 1095 } | |
| 1096 std::string cookiefile = curl_cookiefile; | |
| 1097 if (!cookiefile.empty()) { | |
| 1098 curlcode = curl_easy_setopt(curl, CURLOPT_COOKIEFILE, cookiefile.c_str()); | |
| 1099 if (curlcode != CURLE_OK) { | |
| 1100 return error("curl_easy_setopt"); | |
| 1101 } | |
| 1102 } | |
| 1103 curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); | |
| 1104 if (curlcode != CURLE_OK) { | |
| 1105 return error("curl_easy_setopt"); | |
| 1106 } | |
| 1107 curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf); | |
| 1108 if (curlcode != CURLE_OK) { | |
| 1109 return error("curl_easy_setopt"); | |
| 1110 } | |
| 1111 curlcode = curl_easy_setopt(curl, CURLOPT_USERAGENT, "Tesseract OCR"); | |
| 1112 if (curlcode != CURLE_OK) { | |
| 1113 return error("curl_easy_setopt"); | |
| 1114 } | |
| 1115 curlcode = curl_easy_perform(curl); | |
| 1116 if (curlcode != CURLE_OK) { | |
| 1117 return error("curl_easy_perform"); | |
| 1118 } | |
| 1119 curl_easy_cleanup(curl); | |
| 1120 data = reinterpret_cast<const l_uint8 *>(buf.data()); | |
| 1121 } | |
| 1122 #else | |
| 1123 fprintf(stderr, "Error, this tesseract has no URL support\n"); | |
| 1124 return false; | |
| 1125 #endif | |
| 1126 } else { | |
| 1127 // Check whether the input file can be read. | |
| 1128 if (FILE *file = fopen(filename, "rb")) { | |
| 1129 fclose(file); | |
| 1130 } else { | |
| 1131 fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno)); | |
| 1132 return false; | |
| 1133 } | |
| 1134 } | |
| 1135 | |
| 1136 // Here is our autodetection | |
| 1137 int format; | |
| 1138 int r = | |
| 1139 (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format); | |
| 1140 | |
| 1141 // Maybe we have a filelist | |
| 1142 if (r != 0 || format == IFF_UNKNOWN) { | |
| 1143 std::string s; | |
| 1144 if (data != nullptr) { | |
| 1145 s = buf.c_str(); | |
| 1146 } else { | |
| 1147 std::ifstream t(filename); | |
| 1148 std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>()); | |
| 1149 s = u.c_str(); | |
| 1150 } | |
| 1151 return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer, | |
| 1152 tesseract_->tessedit_page_number); | |
| 1153 } | |
| 1154 | |
| 1155 // Maybe we have a TIFF which is potentially multipage | |
| 1156 bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE || | |
| 1157 format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW || | |
| 1158 #if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76 | |
| 1159 format == IFF_TIFF_JPEG || | |
| 1160 #endif | |
| 1161 format == IFF_TIFF_ZIP); | |
| 1162 | |
| 1163 // Fail early if we can, before producing any output | |
| 1164 Pix *pix = nullptr; | |
| 1165 if (!tiff) { | |
| 1166 pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename); | |
| 1167 if (pix == nullptr) { | |
| 1168 return false; | |
| 1169 } | |
| 1170 } | |
| 1171 | |
| 1172 // Begin the output | |
| 1173 if (renderer && !renderer->BeginDocument(document_title.c_str())) { | |
| 1174 pixDestroy(&pix); | |
| 1175 return false; | |
| 1176 } | |
| 1177 | |
| 1178 // Produce output | |
| 1179 r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec, | |
| 1180 renderer, tesseract_->tessedit_page_number) | |
| 1181 : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer); | |
| 1182 | |
| 1183 // Clean up memory as needed | |
| 1184 pixDestroy(&pix); | |
| 1185 | |
| 1186 // End the output | |
| 1187 if (!r || (renderer && !renderer->EndDocument())) { | |
| 1188 return false; | |
| 1189 } | |
| 1190 return true; | |
| 1191 } | |
| 1192 | |
| 1193 bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename, | |
| 1194 const char *retry_config, int timeout_millisec, | |
| 1195 TessResultRenderer *renderer) { | |
| 1196 SetInputName(filename); | |
| 1197 SetImage(pix); | |
| 1198 bool failed = false; | |
| 1199 | |
| 1200 if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) { | |
| 1201 // Disabled character recognition | |
| 1202 if (! std::unique_ptr<const PageIterator>(AnalyseLayout())) { | |
| 1203 failed = true; | |
| 1204 } | |
| 1205 } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) { | |
| 1206 failed = FindLines() != 0; | |
| 1207 } else if (timeout_millisec > 0) { | |
| 1208 // Running with a timeout. | |
| 1209 ETEXT_DESC monitor; | |
| 1210 monitor.cancel = nullptr; | |
| 1211 monitor.cancel_this = nullptr; | |
| 1212 monitor.set_deadline_msecs(timeout_millisec); | |
| 1213 | |
| 1214 // Now run the main recognition. | |
| 1215 failed = Recognize(&monitor) < 0; | |
| 1216 } else { | |
| 1217 // Normal layout and character recognition with no timeout. | |
| 1218 failed = Recognize(nullptr) < 0; | |
| 1219 } | |
| 1220 | |
| 1221 if (tesseract_->tessedit_write_images) { | |
| 1222 Pix *page_pix = GetThresholdedImage(); | |
| 1223 std::string output_filename = output_file_ + ".processed"; | |
| 1224 if (page_index > 0) { | |
| 1225 output_filename += std::to_string(page_index); | |
| 1226 } | |
| 1227 output_filename += ".tif"; | |
| 1228 pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4); | |
| 1229 pixDestroy(&page_pix); | |
| 1230 } | |
| 1231 | |
| 1232 if (failed && retry_config != nullptr && retry_config[0] != '\0') { | |
| 1233 // Save current config variables before switching modes. | |
| 1234 FILE *fp = fopen(kOldVarsFile, "wb"); | |
| 1235 if (fp == nullptr) { | |
| 1236 tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile); | |
| 1237 } else { | |
| 1238 PrintVariables(fp); | |
| 1239 fclose(fp); | |
| 1240 } | |
| 1241 // Switch to alternate mode for retry. | |
| 1242 ReadConfigFile(retry_config); | |
| 1243 SetImage(pix); | |
| 1244 Recognize(nullptr); | |
| 1245 // Restore saved config variables. | |
| 1246 ReadConfigFile(kOldVarsFile); | |
| 1247 } | |
| 1248 | |
| 1249 if (renderer && !failed) { | |
| 1250 failed = !renderer->AddImage(this); | |
| 1251 } | |
| 1252 | |
| 1253 return !failed; | |
| 1254 } | |
| 1255 | |
| 1256 /** | |
| 1257 * Get a left-to-right iterator to the results of LayoutAnalysis and/or | |
| 1258 * Recognize. The returned iterator must be deleted after use. | |
| 1259 */ | |
| 1260 LTRResultIterator *TessBaseAPI::GetLTRIterator() { | |
| 1261 if (tesseract_ == nullptr || page_res_ == nullptr) { | |
| 1262 return nullptr; | |
| 1263 } | |
| 1264 return new LTRResultIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), | |
| 1265 thresholder_->GetScaledYResolution(), rect_left_, rect_top_, | |
| 1266 rect_width_, rect_height_); | |
| 1267 } | |
| 1268 | |
| 1269 /** | |
| 1270 * Get a reading-order iterator to the results of LayoutAnalysis and/or | |
| 1271 * Recognize. The returned iterator must be deleted after use. | |
| 1272 * WARNING! This class points to data held within the TessBaseAPI class, and | |
| 1273 * therefore can only be used while the TessBaseAPI class still exists and | |
| 1274 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End | |
| 1275 * DetectOS, or anything else that changes the internal PAGE_RES. | |
| 1276 */ | |
| 1277 ResultIterator *TessBaseAPI::GetIterator() { | |
| 1278 if (tesseract_ == nullptr || page_res_ == nullptr) { | |
| 1279 return nullptr; | |
| 1280 } | |
| 1281 return ResultIterator::StartOfParagraph(LTRResultIterator( | |
| 1282 page_res_, tesseract_, thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(), | |
| 1283 rect_left_, rect_top_, rect_width_, rect_height_)); | |
| 1284 } | |
| 1285 | |
| 1286 /** | |
| 1287 * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. | |
| 1288 * The returned iterator must be deleted after use. | |
| 1289 * WARNING! This class points to data held within the TessBaseAPI class, and | |
| 1290 * therefore can only be used while the TessBaseAPI class still exists and | |
| 1291 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End | |
| 1292 * DetectOS, or anything else that changes the internal PAGE_RES. | |
| 1293 */ | |
| 1294 MutableIterator *TessBaseAPI::GetMutableIterator() { | |
| 1295 if (tesseract_ == nullptr || page_res_ == nullptr) { | |
| 1296 return nullptr; | |
| 1297 } | |
| 1298 return new MutableIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), | |
| 1299 thresholder_->GetScaledYResolution(), rect_left_, rect_top_, | |
| 1300 rect_width_, rect_height_); | |
| 1301 } | |
| 1302 | |
| 1303 /** Make a text string from the internal data structures. */ | |
| 1304 char *TessBaseAPI::GetUTF8Text() { | |
| 1305 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { | |
| 1306 return nullptr; | |
| 1307 } | |
| 1308 std::string text(""); | |
| 1309 const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator()); | |
| 1310 do { | |
| 1311 if (it->Empty(RIL_PARA)) { | |
| 1312 continue; | |
| 1313 } | |
| 1314 auto block_type = it->BlockType(); | |
| 1315 switch (block_type) { | |
| 1316 case PT_FLOWING_IMAGE: | |
| 1317 case PT_HEADING_IMAGE: | |
| 1318 case PT_PULLOUT_IMAGE: | |
| 1319 case PT_HORZ_LINE: | |
| 1320 case PT_VERT_LINE: | |
| 1321 // Ignore images and lines for text output. | |
| 1322 continue; | |
| 1323 case PT_NOISE: | |
| 1324 tprintf("TODO: Please report image which triggers the noise case.\n"); | |
| 1325 ASSERT_HOST(false); | |
| 1326 default: | |
| 1327 break; | |
| 1328 } | |
| 1329 | |
| 1330 const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA)); | |
| 1331 text += para_text.get(); | |
| 1332 } while (it->Next(RIL_PARA)); | |
| 1333 return copy_string(text); | |
| 1334 } | |
| 1335 | |
| 1336 static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) { | |
| 1337 int left, top, right, bottom; | |
| 1338 it->BoundingBox(level, &left, &top, &right, &bottom); | |
| 1339 text += "\t" + std::to_string(left); | |
| 1340 text += "\t" + std::to_string(top); | |
| 1341 text += "\t" + std::to_string(right - left); | |
| 1342 text += "\t" + std::to_string(bottom - top); | |
| 1343 } | |
| 1344 | |
| 1345 /** | |
| 1346 * Make a TSV-formatted string from the internal data structures. | |
| 1347 * page_number is 0-based but will appear in the output as 1-based. | |
| 1348 * Returned string must be freed with the delete [] operator. | |
| 1349 */ | |
| 1350 char *TessBaseAPI::GetTSVText(int page_number) { | |
| 1351 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) { | |
| 1352 return nullptr; | |
| 1353 } | |
| 1354 | |
| 1355 #if !defined(NDEBUG) | |
| 1356 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; | |
| 1357 #endif | |
| 1358 int page_id = page_number + 1; // we use 1-based page numbers. | |
| 1359 | |
| 1360 int page_num = page_id; | |
| 1361 int block_num = 0; | |
| 1362 int par_num = 0; | |
| 1363 int line_num = 0; | |
| 1364 int word_num = 0; | |
| 1365 | |
| 1366 std::string tsv_str; | |
| 1367 tsv_str += "1\t" + std::to_string(page_num); // level 1 - page | |
| 1368 tsv_str += "\t" + std::to_string(block_num); | |
| 1369 tsv_str += "\t" + std::to_string(par_num); | |
| 1370 tsv_str += "\t" + std::to_string(line_num); | |
| 1371 tsv_str += "\t" + std::to_string(word_num); | |
| 1372 tsv_str += "\t" + std::to_string(rect_left_); | |
| 1373 tsv_str += "\t" + std::to_string(rect_top_); | |
| 1374 tsv_str += "\t" + std::to_string(rect_width_); | |
| 1375 tsv_str += "\t" + std::to_string(rect_height_); | |
| 1376 tsv_str += "\t-1\t\n"; | |
| 1377 | |
| 1378 const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator()); | |
| 1379 while (!res_it->Empty(RIL_BLOCK)) { | |
| 1380 if (res_it->Empty(RIL_WORD)) { | |
| 1381 res_it->Next(RIL_WORD); | |
| 1382 continue; | |
| 1383 } | |
| 1384 | |
| 1385 // Add rows for any new block/paragraph/textline. | |
| 1386 if (res_it->IsAtBeginningOf(RIL_BLOCK)) { | |
| 1387 block_num++; | |
| 1388 par_num = 0; | |
| 1389 line_num = 0; | |
| 1390 word_num = 0; | |
| 1391 tsv_str += "2\t" + std::to_string(page_num); // level 2 - block | |
| 1392 tsv_str += "\t" + std::to_string(block_num); | |
| 1393 tsv_str += "\t" + std::to_string(par_num); | |
| 1394 tsv_str += "\t" + std::to_string(line_num); | |
| 1395 tsv_str += "\t" + std::to_string(word_num); | |
| 1396 AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str); | |
| 1397 tsv_str += "\t-1\t\n"; // end of row for block | |
| 1398 } | |
| 1399 if (res_it->IsAtBeginningOf(RIL_PARA)) { | |
| 1400 par_num++; | |
| 1401 line_num = 0; | |
| 1402 word_num = 0; | |
| 1403 tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph | |
| 1404 tsv_str += "\t" + std::to_string(block_num); | |
| 1405 tsv_str += "\t" + std::to_string(par_num); | |
| 1406 tsv_str += "\t" + std::to_string(line_num); | |
| 1407 tsv_str += "\t" + std::to_string(word_num); | |
| 1408 AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str); | |
| 1409 tsv_str += "\t-1\t\n"; // end of row for para | |
| 1410 } | |
| 1411 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { | |
| 1412 line_num++; | |
| 1413 word_num = 0; | |
| 1414 tsv_str += "4\t" + std::to_string(page_num); // level 4 - line | |
| 1415 tsv_str += "\t" + std::to_string(block_num); | |
| 1416 tsv_str += "\t" + std::to_string(par_num); | |
| 1417 tsv_str += "\t" + std::to_string(line_num); | |
| 1418 tsv_str += "\t" + std::to_string(word_num); | |
| 1419 AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str); | |
| 1420 tsv_str += "\t-1\t\n"; // end of row for line | |
| 1421 } | |
| 1422 | |
| 1423 // Now, process the word... | |
| 1424 int left, top, right, bottom; | |
| 1425 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); | |
| 1426 word_num++; | |
| 1427 tsv_str += "5\t" + std::to_string(page_num); // level 5 - word | |
| 1428 tsv_str += "\t" + std::to_string(block_num); | |
| 1429 tsv_str += "\t" + std::to_string(par_num); | |
| 1430 tsv_str += "\t" + std::to_string(line_num); | |
| 1431 tsv_str += "\t" + std::to_string(word_num); | |
| 1432 tsv_str += "\t" + std::to_string(left); | |
| 1433 tsv_str += "\t" + std::to_string(top); | |
| 1434 tsv_str += "\t" + std::to_string(right - left); | |
| 1435 tsv_str += "\t" + std::to_string(bottom - top); | |
| 1436 tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD)); | |
| 1437 tsv_str += "\t"; | |
| 1438 | |
| 1439 #if !defined(NDEBUG) | |
| 1440 // Increment counts if at end of block/paragraph/textline. | |
| 1441 if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) { | |
| 1442 lcnt++; | |
| 1443 } | |
| 1444 if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) { | |
| 1445 pcnt++; | |
| 1446 } | |
| 1447 if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) { | |
| 1448 bcnt++; | |
| 1449 } | |
| 1450 #endif | |
| 1451 | |
| 1452 do { | |
| 1453 tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get(); | |
| 1454 res_it->Next(RIL_SYMBOL); | |
| 1455 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); | |
| 1456 tsv_str += "\n"; // end of row | |
| 1457 #if !defined(NDEBUG) | |
| 1458 wcnt++; | |
| 1459 #endif | |
| 1460 } | |
| 1461 | |
| 1462 return copy_string(tsv_str); | |
| 1463 } | |
| 1464 | |
| 1465 /** The 5 numbers output for each box (the usual 4 and a page number.) */ | |
| 1466 const int kNumbersPerBlob = 5; | |
| 1467 /** | |
| 1468 * The number of bytes taken by each number. Since we use int16_t for ICOORD, | |
| 1469 * assume only 5 digits max. | |
| 1470 */ | |
| 1471 const int kBytesPerNumber = 5; | |
| 1472 /** | |
| 1473 * Multiplier for max expected textlength assumes (kBytesPerNumber + space) | |
| 1474 * * kNumbersPerBlob plus the newline. Add to this the | |
| 1475 * original UTF8 characters, and one kMaxBytesPerLine for safety. | |
| 1476 */ | |
| 1477 const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1; | |
| 1478 /** Max bytes in the decimal representation of int64_t. */ | |
| 1479 const int kBytesPer64BitNumber = 20; | |
| 1480 /** | |
| 1481 * A maximal single box could occupy kNumbersPerBlob numbers at | |
| 1482 * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a | |
| 1483 * space plus the newline and the maximum length of a UNICHAR. | |
| 1484 * Test against this on each iteration for safety. | |
| 1485 */ | |
| 1486 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN; | |
| 1487 | |
| 1488 /** | |
| 1489 * The recognized text is returned as a char* which is coded | |
| 1490 * as a UTF8 box file. | |
| 1491 * page_number is a 0-base page index that will appear in the box file. | |
| 1492 * Returned string must be freed with the delete [] operator. | |
| 1493 */ | |
| 1494 char *TessBaseAPI::GetBoxText(int page_number) { | |
| 1495 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { | |
| 1496 return nullptr; | |
| 1497 } | |
| 1498 int blob_count; | |
| 1499 int utf8_length = TextLength(&blob_count); | |
| 1500 int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine; | |
| 1501 char *result = new char[total_length]; | |
| 1502 result[0] = '\0'; | |
| 1503 int output_length = 0; | |
| 1504 LTRResultIterator *it = GetLTRIterator(); | |
| 1505 do { | |
| 1506 int left, top, right, bottom; | |
| 1507 if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) { | |
| 1508 const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL)); | |
| 1509 // Tesseract uses space for recognition failure. Fix to a reject | |
| 1510 // character, kTesseractReject so we don't create illegal box files. | |
| 1511 for (int i = 0; text[i] != '\0'; ++i) { | |
| 1512 if (text[i] == ' ') { | |
| 1513 text[i] = kTesseractReject; | |
| 1514 } | |
| 1515 } | |
| 1516 snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n", | |
| 1517 text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number); | |
| 1518 output_length += strlen(result + output_length); | |
| 1519 // Just in case... | |
| 1520 if (output_length + kMaxBytesPerLine > total_length) { | |
| 1521 break; | |
| 1522 } | |
| 1523 } | |
| 1524 } while (it->Next(RIL_SYMBOL)); | |
| 1525 delete it; | |
| 1526 return result; | |
| 1527 } | |
| 1528 | |
/**
 * Conversion table for non-latin characters: code points outside the latin
 * set that are mapped into it. The entry at index j is replaced by
 * kLatinChs[j]; the table is terminated by 0.
 * TODO(rays) incorporate this translation into unicharset.
 */
const int kUniChs[] = {
    0x20ac, // euro sign
    0x201c, // left double quotation mark
    0x201d, // right double quotation mark
    0x2018, // left single quotation mark
    0x2019, // right single quotation mark
    0x2022, // bullet
    0x2014, // em dash
    0,      // terminator
};
/** Latin chars corresponding, index by index, to the unicode chars above. */
const int kLatinChs[] = {
    0x00a2, // cent sign
    0x0022, // quotation mark
    0x0022, // quotation mark
    0x0027, // apostrophe
    0x0027, // apostrophe
    0x00b7, // middle dot
    0x002d, // hyphen-minus
    0,      // terminator
};
| 1537 | |
| 1538 /** | |
| 1539 * The recognized text is returned as a char* which is coded | |
| 1540 * as UNLV format Latin-1 with specific reject and suspect codes. | |
| 1541 * Returned string must be freed with the delete [] operator. | |
| 1542 */ | |
| 1543 char *TessBaseAPI::GetUNLVText() { | |
| 1544 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { | |
| 1545 return nullptr; | |
| 1546 } | |
| 1547 bool tilde_crunch_written = false; | |
| 1548 bool last_char_was_newline = true; | |
| 1549 bool last_char_was_tilde = false; | |
| 1550 | |
| 1551 int total_length = TextLength(nullptr); | |
| 1552 PAGE_RES_IT page_res_it(page_res_); | |
| 1553 char *result = new char[total_length]; | |
| 1554 char *ptr = result; | |
| 1555 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { | |
| 1556 WERD_RES *word = page_res_it.word(); | |
| 1557 // Process the current word. | |
| 1558 if (word->unlv_crunch_mode != CR_NONE) { | |
| 1559 if (word->unlv_crunch_mode != CR_DELETE && | |
| 1560 (!tilde_crunch_written || | |
| 1561 (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 && | |
| 1562 !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) { | |
| 1563 if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) && | |
| 1564 !word->word->flag(W_FUZZY_SP)) { | |
| 1565 /* Write a space to separate from preceding good text */ | |
| 1566 *ptr++ = ' '; | |
| 1567 last_char_was_tilde = false; | |
| 1568 } | |
| 1569 if (!last_char_was_tilde) { | |
| 1570 // Write a reject char. | |
| 1571 last_char_was_tilde = true; | |
| 1572 *ptr++ = kUNLVReject; | |
| 1573 tilde_crunch_written = true; | |
| 1574 last_char_was_newline = false; | |
| 1575 } | |
| 1576 } | |
| 1577 } else { | |
| 1578 // NORMAL PROCESSING of non tilde crunched words. | |
| 1579 tilde_crunch_written = false; | |
| 1580 tesseract_->set_unlv_suspects(word); | |
| 1581 const char *wordstr = word->best_choice->unichar_string().c_str(); | |
| 1582 const auto &lengths = word->best_choice->unichar_lengths(); | |
| 1583 int length = lengths.length(); | |
| 1584 int i = 0; | |
| 1585 int offset = 0; | |
| 1586 | |
| 1587 if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') { | |
| 1588 // Prevent adjacent tilde across words - we know that adjacent tildes | |
| 1589 // within words have been removed. | |
| 1590 // Skip the first character. | |
| 1591 offset = lengths[i++]; | |
| 1592 } | |
| 1593 if (i < length && wordstr[offset] != 0) { | |
| 1594 if (!last_char_was_newline) { | |
| 1595 *ptr++ = ' '; | |
| 1596 } else { | |
| 1597 last_char_was_newline = false; | |
| 1598 } | |
| 1599 for (; i < length; offset += lengths[i++]) { | |
| 1600 if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) { | |
| 1601 *ptr++ = kUNLVReject; | |
| 1602 last_char_was_tilde = true; | |
| 1603 } else { | |
| 1604 if (word->reject_map[i].rejected()) { | |
| 1605 *ptr++ = kUNLVSuspect; | |
| 1606 } | |
| 1607 UNICHAR ch(wordstr + offset, lengths[i]); | |
| 1608 int uni_ch = ch.first_uni(); | |
| 1609 for (int j = 0; kUniChs[j] != 0; ++j) { | |
| 1610 if (kUniChs[j] == uni_ch) { | |
| 1611 uni_ch = kLatinChs[j]; | |
| 1612 break; | |
| 1613 } | |
| 1614 } | |
| 1615 if (uni_ch <= 0xff) { | |
| 1616 *ptr++ = static_cast<char>(uni_ch); | |
| 1617 last_char_was_tilde = false; | |
| 1618 } else { | |
| 1619 *ptr++ = kUNLVReject; | |
| 1620 last_char_was_tilde = true; | |
| 1621 } | |
| 1622 } | |
| 1623 } | |
| 1624 } | |
| 1625 } | |
| 1626 if (word->word->flag(W_EOL) && !last_char_was_newline) { | |
| 1627 /* Add a new line output */ | |
| 1628 *ptr++ = '\n'; | |
| 1629 tilde_crunch_written = false; | |
| 1630 last_char_was_newline = true; | |
| 1631 last_char_was_tilde = false; | |
| 1632 } | |
| 1633 } | |
| 1634 *ptr++ = '\n'; | |
| 1635 *ptr = '\0'; | |
| 1636 return result; | |
| 1637 } | |
| 1638 | |
| 1639 #ifndef DISABLED_LEGACY_ENGINE | |
| 1640 | |
| 1641 /** | |
| 1642 * Detect the orientation of the input image and apparent script (alphabet). | |
| 1643 * orient_deg is the detected clockwise rotation of the input image in degrees | |
| 1644 * (0, 90, 180, 270) | |
| 1645 * orient_conf is the confidence (15.0 is reasonably confident) | |
| 1646 * script_name is an ASCII string, the name of the script, e.g. "Latin" | |
| 1647 * script_conf is confidence level in the script | |
| 1648 * Returns true on success and writes values to each parameter as an output | |
| 1649 */ | |
| 1650 bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf, | |
| 1651 const char **script_name, float *script_conf) { | |
| 1652 OSResults osr; | |
| 1653 | |
| 1654 bool osd = DetectOS(&osr); | |
| 1655 if (!osd) { | |
| 1656 return false; | |
| 1657 } | |
| 1658 | |
| 1659 int orient_id = osr.best_result.orientation_id; | |
| 1660 int script_id = osr.get_best_script(orient_id); | |
| 1661 if (orient_conf) { | |
| 1662 *orient_conf = osr.best_result.oconfidence; | |
| 1663 } | |
| 1664 if (orient_deg) { | |
| 1665 *orient_deg = orient_id * 90; // convert quadrant to degrees | |
| 1666 } | |
| 1667 | |
| 1668 if (script_name) { | |
| 1669 const char *script = osr.unicharset->get_script_from_script_id(script_id); | |
| 1670 | |
| 1671 *script_name = script; | |
| 1672 } | |
| 1673 | |
| 1674 if (script_conf) { | |
| 1675 *script_conf = osr.best_result.sconfidence; | |
| 1676 } | |
| 1677 | |
| 1678 return true; | |
| 1679 } | |
| 1680 | |
| 1681 /** | |
| 1682 * The recognized text is returned as a char* which is coded | |
| 1683 * as UTF8 and must be freed with the delete [] operator. | |
| 1684 * page_number is a 0-based page index that will appear in the osd file. | |
| 1685 */ | |
| 1686 char *TessBaseAPI::GetOsdText(int page_number) { | |
| 1687 int orient_deg; | |
| 1688 float orient_conf; | |
| 1689 const char *script_name; | |
| 1690 float script_conf; | |
| 1691 | |
| 1692 if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) { | |
| 1693 return nullptr; | |
| 1694 } | |
| 1695 | |
| 1696 // clockwise rotation needed to make the page upright | |
| 1697 int rotate = OrientationIdToValue(orient_deg / 90); | |
| 1698 | |
| 1699 std::stringstream stream; | |
| 1700 // Use "C" locale (needed for float values orient_conf and script_conf). | |
| 1701 stream.imbue(std::locale::classic()); | |
| 1702 // Use fixed notation with 2 digits after the decimal point for float values. | |
| 1703 stream.precision(2); | |
| 1704 stream << std::fixed << "Page number: " << page_number << "\n" | |
| 1705 << "Orientation in degrees: " << orient_deg << "\n" | |
| 1706 << "Rotate: " << rotate << "\n" | |
| 1707 << "Orientation confidence: " << orient_conf << "\n" | |
| 1708 << "Script: " << script_name << "\n" | |
| 1709 << "Script confidence: " << script_conf << "\n"; | |
| 1710 return copy_string(stream.str()); | |
| 1711 } | |
| 1712 | |
| 1713 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1714 | |
| 1715 /** Returns the average word confidence for Tesseract page result. */ | |
| 1716 int TessBaseAPI::MeanTextConf() { | |
| 1717 int *conf = AllWordConfidences(); | |
| 1718 if (!conf) { | |
| 1719 return 0; | |
| 1720 } | |
| 1721 int sum = 0; | |
| 1722 int *pt = conf; | |
| 1723 while (*pt >= 0) { | |
| 1724 sum += *pt++; | |
| 1725 } | |
| 1726 if (pt != conf) { | |
| 1727 sum /= pt - conf; | |
| 1728 } | |
| 1729 delete[] conf; | |
| 1730 return sum; | |
| 1731 } | |
| 1732 | |
| 1733 /** Returns an array of all word confidences, terminated by -1. */ | |
| 1734 int *TessBaseAPI::AllWordConfidences() { | |
| 1735 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { | |
| 1736 return nullptr; | |
| 1737 } | |
| 1738 int n_word = 0; | |
| 1739 PAGE_RES_IT res_it(page_res_); | |
| 1740 for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) { | |
| 1741 n_word++; | |
| 1742 } | |
| 1743 | |
| 1744 int *conf = new int[n_word + 1]; | |
| 1745 n_word = 0; | |
| 1746 for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) { | |
| 1747 WERD_RES *word = res_it.word(); | |
| 1748 WERD_CHOICE *choice = word->best_choice; | |
| 1749 int w_conf = static_cast<int>(100 + 5 * choice->certainty()); | |
| 1750 // This is the eq for converting Tesseract confidence to 1..100 | |
| 1751 if (w_conf < 0) { | |
| 1752 w_conf = 0; | |
| 1753 } | |
| 1754 if (w_conf > 100) { | |
| 1755 w_conf = 100; | |
| 1756 } | |
| 1757 conf[n_word++] = w_conf; | |
| 1758 } | |
| 1759 conf[n_word] = -1; | |
| 1760 return conf; | |
| 1761 } | |
| 1762 | |
| 1763 #ifndef DISABLED_LEGACY_ENGINE | |
| 1764 /** | |
| 1765 * Applies the given word to the adaptive classifier if possible. | |
| 1766 * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can | |
| 1767 * tell the boundaries of the graphemes. | |
| 1768 * Assumes that SetImage/SetRectangle have been used to set the image | |
| 1769 * to the given word. The mode arg should be PSM_SINGLE_WORD or | |
| 1770 * PSM_CIRCLE_WORD, as that will be used to control layout analysis. | |
| 1771 * The currently set PageSegMode is preserved. | |
| 1772 * Returns false if adaption was not possible for some reason. | |
| 1773 */ | |
| 1774 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) { | |
| 1775 int debug = 0; | |
| 1776 GetIntVariable("applybox_debug", &debug); | |
| 1777 bool success = true; | |
| 1778 PageSegMode current_psm = GetPageSegMode(); | |
| 1779 SetPageSegMode(mode); | |
| 1780 SetVariable("classify_enable_learning", "0"); | |
| 1781 const std::unique_ptr<const char[]> text(GetUTF8Text()); | |
| 1782 if (debug) { | |
| 1783 tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr); | |
| 1784 } | |
| 1785 if (text != nullptr) { | |
| 1786 PAGE_RES_IT it(page_res_); | |
| 1787 WERD_RES *word_res = it.word(); | |
| 1788 if (word_res != nullptr) { | |
| 1789 word_res->word->set_text(wordstr); | |
| 1790 // Check to see if text matches wordstr. | |
| 1791 int w = 0; | |
| 1792 int t; | |
| 1793 for (t = 0; text[t] != '\0'; ++t) { | |
| 1794 if (text[t] == '\n' || text[t] == ' ') { | |
| 1795 continue; | |
| 1796 } | |
| 1797 while (wordstr[w] == ' ') { | |
| 1798 ++w; | |
| 1799 } | |
| 1800 if (text[t] != wordstr[w]) { | |
| 1801 break; | |
| 1802 } | |
| 1803 ++w; | |
| 1804 } | |
| 1805 if (text[t] != '\0' || wordstr[w] != '\0') { | |
| 1806 // No match. | |
| 1807 delete page_res_; | |
| 1808 std::vector<TBOX> boxes; | |
| 1809 page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_); | |
| 1810 tesseract_->ReSegmentByClassification(page_res_); | |
| 1811 tesseract_->TidyUp(page_res_); | |
| 1812 PAGE_RES_IT pr_it(page_res_); | |
| 1813 if (pr_it.word() == nullptr) { | |
| 1814 success = false; | |
| 1815 } else { | |
| 1816 word_res = pr_it.word(); | |
| 1817 } | |
| 1818 } else { | |
| 1819 word_res->BestChoiceToCorrectText(); | |
| 1820 } | |
| 1821 if (success) { | |
| 1822 tesseract_->EnableLearning = true; | |
| 1823 tesseract_->LearnWord(nullptr, word_res); | |
| 1824 } | |
| 1825 } else { | |
| 1826 success = false; | |
| 1827 } | |
| 1828 } else { | |
| 1829 success = false; | |
| 1830 } | |
| 1831 SetPageSegMode(current_psm); | |
| 1832 return success; | |
| 1833 } | |
| 1834 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1835 | |
| 1836 /** | |
| 1837 * Free up recognition results and any stored image data, without actually | |
| 1838 * freeing any recognition data that would be time-consuming to reload. | |
| 1839 * Afterwards, you must call SetImage or TesseractRect before doing | |
| 1840 * any Recognize or Get* operation. | |
| 1841 */ | |
| 1842 void TessBaseAPI::Clear() { | |
| 1843 if (thresholder_ != nullptr) { | |
| 1844 thresholder_->Clear(); | |
| 1845 } | |
| 1846 ClearResults(); | |
| 1847 if (tesseract_ != nullptr) { | |
| 1848 SetInputImage(nullptr); | |
| 1849 } | |
| 1850 } | |
| 1851 | |
| 1852 /** | |
| 1853 * Close down tesseract and free up all memory. End() is equivalent to | |
| 1854 * destructing and reconstructing your TessBaseAPI. | |
| 1855 * Once End() has been used, none of the other API functions may be used | |
| 1856 * other than Init and anything declared above it in the class definition. | |
| 1857 */ | |
| 1858 void TessBaseAPI::End() { | |
| 1859 Clear(); | |
| 1860 delete thresholder_; | |
| 1861 thresholder_ = nullptr; | |
| 1862 delete page_res_; | |
| 1863 page_res_ = nullptr; | |
| 1864 delete block_list_; | |
| 1865 block_list_ = nullptr; | |
| 1866 if (paragraph_models_ != nullptr) { | |
| 1867 for (auto model : *paragraph_models_) { | |
| 1868 delete model; | |
| 1869 } | |
| 1870 delete paragraph_models_; | |
| 1871 paragraph_models_ = nullptr; | |
| 1872 } | |
| 1873 #ifndef DISABLED_LEGACY_ENGINE | |
| 1874 if (osd_tesseract_ == tesseract_) { | |
| 1875 osd_tesseract_ = nullptr; | |
| 1876 } | |
| 1877 delete osd_tesseract_; | |
| 1878 osd_tesseract_ = nullptr; | |
| 1879 delete equ_detect_; | |
| 1880 equ_detect_ = nullptr; | |
| 1881 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 1882 delete tesseract_; | |
| 1883 tesseract_ = nullptr; | |
| 1884 input_file_.clear(); | |
| 1885 output_file_.clear(); | |
| 1886 datapath_.clear(); | |
| 1887 language_.clear(); | |
| 1888 } | |
| 1889 | |
| 1890 // Clear any library-level memory caches. | |
| 1891 // There are a variety of expensive-to-load constant data structures (mostly | |
| 1892 // language dictionaries) that are cached globally -- surviving the Init() | |
| 1893 // and End() of individual TessBaseAPI's. This function allows the clearing | |
| 1894 // of these caches. | |
| 1895 void TessBaseAPI::ClearPersistentCache() { | |
| 1896 Dict::GlobalDawgCache()->DeleteUnusedDawgs(); | |
| 1897 } | |
| 1898 | |
| 1899 /** | |
| 1900 * Check whether a word is valid according to Tesseract's language model | |
| 1901 * returns 0 if the word is invalid, non-zero if valid | |
| 1902 */ | |
| 1903 int TessBaseAPI::IsValidWord(const char *word) const { | |
| 1904 return tesseract_->getDict().valid_word(word); | |
| 1905 } | |
| 1906 // Returns true if utf8_character is defined in the UniCharset. | |
| 1907 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const { | |
| 1908 return tesseract_->unicharset.contains_unichar(utf8_character); | |
| 1909 } | |
| 1910 | |
| 1911 // TODO(rays) Obsolete this function and replace with a more aptly named | |
| 1912 // function that returns image coordinates rather than tesseract coordinates. | |
| 1913 bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) { | |
| 1914 const std::unique_ptr<const PageIterator> it(AnalyseLayout()); | |
| 1915 if (it == nullptr) { | |
| 1916 return false; | |
| 1917 } | |
| 1918 int x1, x2, y1, y2; | |
| 1919 it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); | |
| 1920 // Calculate offset and slope (NOTE: Kind of ugly) | |
| 1921 if (x2 <= x1) { | |
| 1922 x2 = x1 + 1; | |
| 1923 } | |
| 1924 // Convert the point pair to slope/offset of the baseline (in image coords.) | |
| 1925 *out_slope = static_cast<float>(y2 - y1) / (x2 - x1); | |
| 1926 *out_offset = static_cast<int>(y1 - *out_slope * x1); | |
| 1927 // Get the y-coord of the baseline at the left and right edges of the | |
| 1928 // textline's bounding box. | |
| 1929 int left, top, right, bottom; | |
| 1930 if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) { | |
| 1931 return false; | |
| 1932 } | |
| 1933 int left_y = IntCastRounded(*out_slope * left + *out_offset); | |
| 1934 int right_y = IntCastRounded(*out_slope * right + *out_offset); | |
| 1935 // Shift the baseline down so it passes through the nearest bottom-corner | |
| 1936 // of the textline's bounding box. This is the difference between the y | |
| 1937 // at the lowest (max) edge of the box and the actual box bottom. | |
| 1938 *out_offset += bottom - std::max(left_y, right_y); | |
| 1939 // Switch back to bottom-up tesseract coordinates. Requires negation of | |
| 1940 // the slope and height - offset for the offset. | |
| 1941 *out_slope = -*out_slope; | |
| 1942 *out_offset = rect_height_ - *out_offset; | |
| 1943 | |
| 1944 return true; | |
| 1945 } | |
| 1946 | |
| 1947 /** Sets Dict::letter_is_okay_ function to point to the given function. */ | |
| 1948 void TessBaseAPI::SetDictFunc(DictFunc f) { | |
| 1949 if (tesseract_ != nullptr) { | |
| 1950 tesseract_->getDict().letter_is_okay_ = f; | |
| 1951 } | |
| 1952 } | |
| 1953 | |
| 1954 /** | |
| 1955 * Sets Dict::probability_in_context_ function to point to the given | |
| 1956 * function. | |
| 1957 * | |
| 1958 * @param f A single function that returns the probability of the current | |
| 1959 * "character" (in general a utf-8 string), given the context of a previous | |
| 1960 * utf-8 string. | |
| 1961 */ | |
| 1962 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) { | |
| 1963 if (tesseract_ != nullptr) { | |
| 1964 tesseract_->getDict().probability_in_context_ = f; | |
| 1965 // Set it for the sublangs too. | |
| 1966 int num_subs = tesseract_->num_sub_langs(); | |
| 1967 for (int i = 0; i < num_subs; ++i) { | |
| 1968 tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f; | |
| 1969 } | |
| 1970 } | |
| 1971 } | |
| 1972 | |
| 1973 /** Common code for setting the image. */ | |
| 1974 bool TessBaseAPI::InternalSetImage() { | |
| 1975 if (tesseract_ == nullptr) { | |
| 1976 tprintf("Please call Init before attempting to set an image.\n"); | |
| 1977 return false; | |
| 1978 } | |
| 1979 if (thresholder_ == nullptr) { | |
| 1980 thresholder_ = new ImageThresholder; | |
| 1981 } | |
| 1982 ClearResults(); | |
| 1983 return true; | |
| 1984 } | |
| 1985 | |
| 1986 /** | |
| 1987 * Run the thresholder to make the thresholded image, returned in pix, | |
| 1988 * which must not be nullptr. *pix must be initialized to nullptr, or point | |
| 1989 * to an existing pixDestroyable Pix. | |
| 1990 * The usual argument to Threshold is Tesseract::mutable_pix_binary(). | |
| 1991 */ | |
| 1992 bool TessBaseAPI::Threshold(Pix **pix) { | |
| 1993 ASSERT_HOST(pix != nullptr); | |
| 1994 if (*pix != nullptr) { | |
| 1995 pixDestroy(pix); | |
| 1996 } | |
| 1997 // Zero resolution messes up the algorithms, so make sure it is credible. | |
| 1998 int user_dpi = 0; | |
| 1999 GetIntVariable("user_defined_dpi", &user_dpi); | |
| 2000 int y_res = thresholder_->GetScaledYResolution(); | |
| 2001 if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) { | |
| 2002 tprintf( | |
| 2003 "Warning: User defined image dpi is outside of expected range " | |
| 2004 "(%d - %d)!\n", | |
| 2005 kMinCredibleResolution, kMaxCredibleResolution); | |
| 2006 } | |
| 2007 // Always use user defined dpi | |
| 2008 if (user_dpi) { | |
| 2009 thresholder_->SetSourceYResolution(user_dpi); | |
| 2010 } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) { | |
| 2011 if (y_res != 0) { | |
| 2012 // Show warning only if a resolution was given. | |
| 2013 tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n", | |
| 2014 y_res, kMinCredibleResolution); | |
| 2015 } | |
| 2016 thresholder_->SetSourceYResolution(kMinCredibleResolution); | |
| 2017 } | |
| 2018 | |
| 2019 auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method)); | |
| 2020 | |
| 2021 if (thresholding_method == ThresholdMethod::Otsu) { | |
| 2022 Image pix_binary(*pix); | |
| 2023 if (!thresholder_->ThresholdToPix(&pix_binary)) { | |
| 2024 return false; | |
| 2025 } | |
| 2026 *pix = pix_binary; | |
| 2027 | |
| 2028 if (!thresholder_->IsBinary()) { | |
| 2029 tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds()); | |
| 2030 tesseract_->set_pix_grey(thresholder_->GetPixRectGrey()); | |
| 2031 } else { | |
| 2032 tesseract_->set_pix_thresholds(nullptr); | |
| 2033 tesseract_->set_pix_grey(nullptr); | |
| 2034 } | |
| 2035 } else { | |
| 2036 auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method); | |
| 2037 | |
| 2038 if (!ok) { | |
| 2039 return false; | |
| 2040 } | |
| 2041 *pix = pix_binary; | |
| 2042 | |
| 2043 tesseract_->set_pix_thresholds(pix_thresholds); | |
| 2044 tesseract_->set_pix_grey(pix_grey); | |
| 2045 } | |
| 2046 | |
| 2047 thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_, | |
| 2048 &image_height_); | |
| 2049 | |
| 2050 // Set the internal resolution that is used for layout parameters from the | |
| 2051 // estimated resolution, rather than the image resolution, which may be | |
| 2052 // fabricated, but we will use the image resolution, if there is one, to | |
| 2053 // report output point sizes. | |
| 2054 int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(), | |
| 2055 kMinCredibleResolution, kMaxCredibleResolution); | |
| 2056 if (estimated_res != thresholder_->GetScaledEstimatedResolution()) { | |
| 2057 tprintf( | |
| 2058 "Estimated internal resolution %d out of range! " | |
| 2059 "Corrected to %d.\n", | |
| 2060 thresholder_->GetScaledEstimatedResolution(), estimated_res); | |
| 2061 } | |
| 2062 tesseract_->set_source_resolution(estimated_res); | |
| 2063 return true; | |
| 2064 } | |
| 2065 | |
| 2066 /** Find lines from the image making the BLOCK_LIST. */ | |
| 2067 int TessBaseAPI::FindLines() { | |
| 2068 if (thresholder_ == nullptr || thresholder_->IsEmpty()) { | |
| 2069 tprintf("Please call SetImage before attempting recognition.\n"); | |
| 2070 return -1; | |
| 2071 } | |
| 2072 if (recognition_done_) { | |
| 2073 ClearResults(); | |
| 2074 } | |
| 2075 if (!block_list_->empty()) { | |
| 2076 return 0; | |
| 2077 } | |
| 2078 if (tesseract_ == nullptr) { | |
| 2079 tesseract_ = new Tesseract; | |
| 2080 #ifndef DISABLED_LEGACY_ENGINE | |
| 2081 tesseract_->InitAdaptiveClassifier(nullptr); | |
| 2082 #endif | |
| 2083 } | |
| 2084 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { | |
| 2085 return -1; | |
| 2086 } | |
| 2087 | |
| 2088 tesseract_->PrepareForPageseg(); | |
| 2089 | |
| 2090 #ifndef DISABLED_LEGACY_ENGINE | |
| 2091 if (tesseract_->textord_equation_detect) { | |
| 2092 if (equ_detect_ == nullptr && !datapath_.empty()) { | |
| 2093 equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr); | |
| 2094 } | |
| 2095 if (equ_detect_ == nullptr) { | |
| 2096 tprintf("Warning: Could not set equation detector\n"); | |
| 2097 } else { | |
| 2098 tesseract_->SetEquationDetect(equ_detect_); | |
| 2099 } | |
| 2100 } | |
| 2101 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 2102 | |
| 2103 Tesseract *osd_tess = osd_tesseract_; | |
| 2104 OSResults osr; | |
| 2105 #ifndef DISABLED_LEGACY_ENGINE | |
| 2106 if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) { | |
| 2107 if (strcmp(language_.c_str(), "osd") == 0) { | |
| 2108 osd_tess = tesseract_; | |
| 2109 } else { | |
| 2110 osd_tesseract_ = new Tesseract; | |
| 2111 TessdataManager mgr(reader_); | |
| 2112 if (datapath_.empty()) { | |
| 2113 tprintf( | |
| 2114 "Warning: Auto orientation and script detection requested," | |
| 2115 " but data path is undefined\n"); | |
| 2116 delete osd_tesseract_; | |
| 2117 osd_tesseract_ = nullptr; | |
| 2118 } else if (osd_tesseract_->init_tesseract(datapath_, "", "osd", OEM_TESSERACT_ONLY, | |
| 2119 nullptr, 0, nullptr, nullptr, false, &mgr) == 0) { | |
| 2120 osd_tess = osd_tesseract_; | |
| 2121 osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution()); | |
| 2122 } else { | |
| 2123 tprintf( | |
| 2124 "Warning: Auto orientation and script detection requested," | |
| 2125 " but osd language failed to load\n"); | |
| 2126 delete osd_tesseract_; | |
| 2127 osd_tesseract_ = nullptr; | |
| 2128 } | |
| 2129 } | |
| 2130 } | |
| 2131 #endif // ndef DISABLED_LEGACY_ENGINE | |
| 2132 | |
| 2133 if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) { | |
| 2134 return -1; | |
| 2135 } | |
| 2136 | |
| 2137 // If Devanagari is being recognized, we use different images for page seg | |
| 2138 // and for OCR. | |
| 2139 tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr); | |
| 2140 return 0; | |
| 2141 } | |
| 2142 | |
| 2143 /** | |
| 2144 * Return average gradient of lines on page. | |
| 2145 */ | |
| 2146 float TessBaseAPI::GetGradient() { | |
| 2147 return tesseract_->gradient(); | |
| 2148 } | |
| 2149 | |
| 2150 /** Delete the pageres and clear the block list ready for a new page. */ | |
| 2151 void TessBaseAPI::ClearResults() { | |
| 2152 if (tesseract_ != nullptr) { | |
| 2153 tesseract_->Clear(); | |
| 2154 } | |
| 2155 delete page_res_; | |
| 2156 page_res_ = nullptr; | |
| 2157 recognition_done_ = false; | |
| 2158 if (block_list_ == nullptr) { | |
| 2159 block_list_ = new BLOCK_LIST; | |
| 2160 } else { | |
| 2161 block_list_->clear(); | |
| 2162 } | |
| 2163 if (paragraph_models_ != nullptr) { | |
| 2164 for (auto model : *paragraph_models_) { | |
| 2165 delete model; | |
| 2166 } | |
| 2167 delete paragraph_models_; | |
| 2168 paragraph_models_ = nullptr; | |
| 2169 } | |
| 2170 } | |
| 2171 | |
| 2172 /** | |
| 2173 * Return the length of the output text string, as UTF8, assuming | |
| 2174 * liberally two spacing marks after each word (as paragraphs end with two | |
| 2175 * newlines), and assuming a single character reject marker for each rejected | |
| 2176 * character. | |
| 2177 * Also return the number of recognized blobs in blob_count. | |
| 2178 */ | |
| 2179 int TessBaseAPI::TextLength(int *blob_count) const { | |
| 2180 if (tesseract_ == nullptr || page_res_ == nullptr) { | |
| 2181 return 0; | |
| 2182 } | |
| 2183 | |
| 2184 PAGE_RES_IT page_res_it(page_res_); | |
| 2185 int total_length = 2; | |
| 2186 int total_blobs = 0; | |
| 2187 // Iterate over the data structures to extract the recognition result. | |
| 2188 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { | |
| 2189 WERD_RES *word = page_res_it.word(); | |
| 2190 WERD_CHOICE *choice = word->best_choice; | |
| 2191 if (choice != nullptr) { | |
| 2192 total_blobs += choice->length() + 2; | |
| 2193 total_length += choice->unichar_string().length() + 2; | |
| 2194 for (int i = 0; i < word->reject_map.length(); ++i) { | |
| 2195 if (word->reject_map[i].rejected()) { | |
| 2196 ++total_length; | |
| 2197 } | |
| 2198 } | |
| 2199 } | |
| 2200 } | |
| 2201 if (blob_count != nullptr) { | |
| 2202 *blob_count = total_blobs; | |
| 2203 } | |
| 2204 return total_length; | |
| 2205 } | |
| 2206 | |
| 2207 #ifndef DISABLED_LEGACY_ENGINE | |
| 2208 /** | |
| 2209 * Estimates the Orientation And Script of the image. | |
| 2210 * Returns true if the image was processed successfully. | |
| 2211 */ | |
| 2212 bool TessBaseAPI::DetectOS(OSResults *osr) { | |
| 2213 if (tesseract_ == nullptr) { | |
| 2214 return false; | |
| 2215 } | |
| 2216 ClearResults(); | |
| 2217 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { | |
| 2218 return false; | |
| 2219 } | |
| 2220 | |
| 2221 if (input_file_.empty()) { | |
| 2222 input_file_ = kInputFile; | |
| 2223 } | |
| 2224 return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0; | |
| 2225 } | |
| 2226 #endif // #ifndef DISABLED_LEGACY_ENGINE | |
| 2227 | |
| 2228 void TessBaseAPI::set_min_orientation_margin(double margin) { | |
| 2229 tesseract_->min_orientation_margin.set_value(margin); | |
| 2230 } | |
| 2231 | |
| 2232 /** | |
| 2233 * Return text orientation of each block as determined in an earlier page layout | |
| 2234 * analysis operation. Orientation is returned as the number of ccw 90-degree | |
| 2235 * rotations (in [0..3]) required to make the text in the block upright | |
| 2236 * (readable). Note that this may not necessary be the block orientation | |
| 2237 * preferred for recognition (such as the case of vertical CJK text). | |
| 2238 * | |
| 2239 * Also returns whether the text in the block is believed to have vertical | |
| 2240 * writing direction (when in an upright page orientation). | |
| 2241 * | |
| 2242 * The returned array is of length equal to the number of text blocks, which may | |
| 2243 * be less than the total number of blocks. The ordering is intended to be | |
| 2244 * consistent with GetTextLines(). | |
| 2245 */ | |
| 2246 void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) { | |
| 2247 delete[] * block_orientation; | |
| 2248 *block_orientation = nullptr; | |
| 2249 delete[] * vertical_writing; | |
| 2250 *vertical_writing = nullptr; | |
| 2251 BLOCK_IT block_it(block_list_); | |
| 2252 | |
| 2253 block_it.move_to_first(); | |
| 2254 int num_blocks = 0; | |
| 2255 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 2256 if (!block_it.data()->pdblk.poly_block()->IsText()) { | |
| 2257 continue; | |
| 2258 } | |
| 2259 ++num_blocks; | |
| 2260 } | |
| 2261 if (!num_blocks) { | |
| 2262 tprintf("WARNING: Found no blocks\n"); | |
| 2263 return; | |
| 2264 } | |
| 2265 *block_orientation = new int[num_blocks]; | |
| 2266 *vertical_writing = new bool[num_blocks]; | |
| 2267 block_it.move_to_first(); | |
| 2268 int i = 0; | |
| 2269 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | |
| 2270 if (!block_it.data()->pdblk.poly_block()->IsText()) { | |
| 2271 continue; | |
| 2272 } | |
| 2273 FCOORD re_rotation = block_it.data()->re_rotation(); | |
| 2274 float re_theta = re_rotation.angle(); | |
| 2275 FCOORD classify_rotation = block_it.data()->classify_rotation(); | |
| 2276 float classify_theta = classify_rotation.angle(); | |
| 2277 double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI; | |
| 2278 if (rot_theta < 0) { | |
| 2279 rot_theta += 4; | |
| 2280 } | |
| 2281 int num_rotations = static_cast<int>(rot_theta + 0.5); | |
| 2282 (*block_orientation)[i] = num_rotations; | |
| 2283 // The classify_rotation is non-zero only if the text has vertical | |
| 2284 // writing direction. | |
| 2285 (*vertical_writing)[i] = classify_rotation.y() != 0.0f; | |
| 2286 ++i; | |
| 2287 } | |
| 2288 } | |
| 2289 | |
| 2290 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) { | |
| 2291 int debug_level = 0; | |
| 2292 GetIntVariable("paragraph_debug_level", &debug_level); | |
| 2293 if (paragraph_models_ == nullptr) { | |
| 2294 paragraph_models_ = new std::vector<ParagraphModel *>; | |
| 2295 } | |
| 2296 MutableIterator *result_it = GetMutableIterator(); | |
| 2297 do { // Detect paragraphs for this block | |
| 2298 std::vector<ParagraphModel *> models; | |
| 2299 ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models); | |
| 2300 paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end()); | |
| 2301 } while (result_it->Next(RIL_BLOCK)); | |
| 2302 delete result_it; | |
| 2303 } | |
| 2304 | |
| 2305 /** This method returns the string form of the specified unichar. */ | |
| 2306 const char *TessBaseAPI::GetUnichar(int unichar_id) const { | |
| 2307 return tesseract_->unicharset.id_to_unichar(unichar_id); | |
| 2308 } | |
| 2309 | |
| 2310 /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ | |
| 2311 const Dawg *TessBaseAPI::GetDawg(int i) const { | |
| 2312 if (tesseract_ == nullptr || i >= NumDawgs()) { | |
| 2313 return nullptr; | |
| 2314 } | |
| 2315 return tesseract_->getDict().GetDawg(i); | |
| 2316 } | |
| 2317 | |
| 2318 /** Return the number of dawgs loaded into tesseract_ object. */ | |
| 2319 int TessBaseAPI::NumDawgs() const { | |
| 2320 return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs(); | |
| 2321 } | |
| 2322 | |
/**
 * Escape a char string - replace <>&"' with HTML codes
 * (&lt; &gt; &amp; &quot; &#39;). All other bytes are copied unchanged.
 * A null text yields an empty string.
 */
std::string HOcrEscape(const char *text) {
  std::string ret;
  if (text == nullptr) {
    return ret;
  }
  for (const char *ptr = text; *ptr; ptr++) {
    switch (*ptr) {
      case '<':
        ret += "&lt;";
        break;
      case '>':
        ret += "&gt;";
        break;
      case '&':
        ret += "&amp;";
        break;
      case '"':
        ret += "&quot;";
        break;
      case '\'':
        ret += "&#39;";
        break;
      default:
        ret += *ptr;
    }
  }
  return ret;
}
| 2350 | |
| 2351 } // namespace tesseract |
