comparison mupdf-source/thirdparty/tesseract/src/api/baseapi.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: baseapi.cpp
3 * Description: Simple API for calling tesseract.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 2006, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #define _USE_MATH_DEFINES // for M_PI
20
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h"
24 #endif
25
26 #include "boxword.h" // for BoxWord
27 #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST
28 #include "dawg_cache.h" // for DawgCache
29 #include "dict.h" // for Dict
30 #include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH
31 #include "environ.h" // for l_uint8
32 #ifndef DISABLED_LEGACY_ENGINE
33 #include "equationdetect.h" // for EquationDetect, destructor of equ_detect_
34 #endif // ndef DISABLED_LEGACY_ENGINE
35 #include "errcode.h" // for ASSERT_HOST
36 #include "helpers.h" // for IntCastRounded, chomp_string, copy_string
37 #include "host.h" // for MAX_PATH
38 #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
39 #ifndef DISABLED_LEGACY_ENGINE
40 # include "intfx.h" // for INT_FX_RESULT_STRUCT
41 #endif
42 #include "mutableiterator.h" // for MutableIterator
43 #include "normalis.h" // for kBlnBaselineOffset, kBlnXHeight
44 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
45 #include "paragraphs.h" // for DetectParagraphs
46 #include "params.h" // for BoolParam, IntParam, DoubleParam, Stri...
47 #include "pdblock.h" // for PDBLK
48 #include "points.h" // for FCOORD
49 #include "polyblk.h" // for POLY_BLOCK
50 #include "rect.h" // for TBOX
51 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
52 #include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
53 #include "tesseractclass.h" // for Tesseract
54 #include "tprintf.h" // for tprintf
55 #include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
56 #include "thresholder.h" // for ImageThresholder
57
58 #include <tesseract/baseapi.h>
59 #include <tesseract/ocrclass.h> // for ETEXT_DESC
60 #include <tesseract/osdetect.h> // for OSResults, OSBestResult, OrientationId...
61 #include <tesseract/renderer.h> // for TessResultRenderer
62 #include <tesseract/resultiterator.h> // for ResultIterator
63
64 #include <cmath> // for round, M_PI
65 #include <cstdint> // for int32_t
66 #include <cstring> // for strcmp, strcpy
67 #include <filesystem> // for std::filesystem
68 #include <fstream> // for size_t
69 #include <iostream> // for std::cin
70 #include <locale> // for std::locale::classic
71 #include <memory> // for std::unique_ptr
72 #include <set> // for std::pair
73 #include <sstream> // for std::stringstream
74 #include <vector> // for std::vector
75
76 #include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
77 #ifdef HAVE_LIBCURL
78 # include <curl/curl.h>
79 #endif
80
81 #ifdef __linux__
82 # include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
83 #endif
84
85 #if defined(_WIN32)
86 # include <fcntl.h> // for _O_BINARY
87 # include <io.h> // for _setmode
88 #endif
89
90 namespace tesseract {
91
92 static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
93 static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
94 #ifdef HAVE_LIBCURL
95 static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
96 static STRING_VAR(curl_cookiefile, "", "File with cookie data for curl");
97 #endif
98
99 /** Minimum sensible image size to be worth running Tesseract. */
100 const int kMinRectSize = 10;
101 /** Character returned when Tesseract couldn't recognize as anything. */
102 const char kTesseractReject = '~';
103 /** Character used by UNLV error counter as a reject. */
104 const char kUNLVReject = '~';
105 /** Character used by UNLV as a suspect marker. */
106 const char kUNLVSuspect = '^';
107 /**
108 * Temp file used for storing current parameters before applying retry values.
109 */
110 static const char *kOldVarsFile = "failed_vars.txt";
111
112 #ifndef DISABLED_LEGACY_ENGINE
113 /**
114 * Filename used for input image file, from which to derive a name to search
115 * for a possible UNLV zone file, if none is specified by SetInputName.
116 */
117 static const char *kInputFile = "noname.tif";
118 static const char kUnknownFontName[] = "UnknownFont";
119
120 static STRING_VAR(classify_font_name, kUnknownFontName,
121 "Default font name to be used in training");
122
123 // Finds the name of the training font and returns it in fontname, by cutting
124 // it out based on the expectation that the filename is of the form:
125 // /path/to/dir/[lang].[fontname].exp[num]
126 // The [lang], [fontname] and [num] fields should not have '.' characters.
127 // If the global parameter classify_font_name is set, its value is used instead.
128 static void ExtractFontName(const char* filename, std::string* fontname) {
129 *fontname = classify_font_name;
130 if (*fontname == kUnknownFontName) {
131 // filename is expected to be of the form [lang].[fontname].exp[num]
132 // The [lang], [fontname] and [num] fields should not have '.' characters.
133 const char *basename = strrchr(filename, '/');
134 const char *firstdot = strchr(basename ? basename : filename, '.');
135 const char *lastdot = strrchr(filename, '.');
136 if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
137 ++firstdot;
138 *fontname = firstdot;
139 fontname->resize(lastdot - firstdot);
140 }
141 }
142 }
143 #endif
144
/* Add all available languages recursively.
 *
 * Walks datadir (following directory symlinks, skipping directories that
 * may not be read) and appends to langs the path, relative to datadir and
 * without its extension, of every entry whose extension is exactly
 * ".traineddata".
 *
 * Entries that merely contain ".traineddata" somewhere in their name
 * (for instance "eng.traineddata.bak") are ignored, as is a file that is
 * named just ".traineddata". A missing or unreadable datadir adds nothing
 * instead of raising std::filesystem::filesystem_error.
 */
static void addAvailableLanguages(const std::string &datadir,
                                  std::vector<std::string> *langs) {
  namespace fs = std::filesystem;
  const auto options = fs::directory_options::follow_directory_symlink |
                       fs::directory_options::skip_permission_denied;
  // Use the non-throwing overloads: any error simply ends the scan.
  std::error_code ec;
  fs::recursive_directory_iterator it(datadir, options, ec);
  const fs::recursive_directory_iterator end;
  for (; !ec && it != end; it.increment(ec)) {
    auto path = it->path().lexically_relative(datadir);
    if (path.extension() == ".traineddata") {
      path.replace_extension();
      langs->push_back(path.string());
    }
  }
}
160
161 TessBaseAPI::TessBaseAPI()
162 : tesseract_(nullptr)
163 , osd_tesseract_(nullptr)
164 , equ_detect_(nullptr)
165 , reader_(nullptr)
166 ,
167 // thresholder_ is initialized to nullptr here, but will be set before use
168 // by: A constructor of a derived API or created
169 // implicitly when used in InternalSetImage.
170 thresholder_(nullptr)
171 , paragraph_models_(nullptr)
172 , block_list_(nullptr)
173 , page_res_(nullptr)
174 , last_oem_requested_(OEM_DEFAULT)
175 , recognition_done_(false)
176 , rect_left_(0)
177 , rect_top_(0)
178 , rect_width_(0)
179 , rect_height_(0)
180 , image_width_(0)
181 , image_height_(0) {
182 }
183
184 TessBaseAPI::~TessBaseAPI() {
185 End();
186 }
187
188 /**
189 * Returns the version identifier as a static string. Do not delete.
190 */
191 const char *TessBaseAPI::Version() {
192 return TESSERACT_VERSION_STR;
193 }
194
195 /**
196 * Set the name of the input file. Needed only for training and
197 * loading a UNLV zone file.
198 */
199 void TessBaseAPI::SetInputName(const char *name) {
200 input_file_ = name ? name : "";
201 }
202
203 /** Set the name of the output files. Needed only for debugging. */
204 void TessBaseAPI::SetOutputName(const char *name) {
205 output_file_ = name ? name : "";
206 }
207
208 bool TessBaseAPI::SetVariable(const char *name, const char *value) {
209 if (tesseract_ == nullptr) {
210 tesseract_ = new Tesseract;
211 }
212 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
213 tesseract_->params());
214 }
215
216 bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) {
217 if (tesseract_ == nullptr) {
218 tesseract_ = new Tesseract;
219 }
220 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params());
221 }
222
223 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
224 auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params,
225 tesseract_->params()->int_params);
226 if (p == nullptr) {
227 return false;
228 }
229 *value = (int32_t)(*p);
230 return true;
231 }
232
233 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
234 auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params,
235 tesseract_->params()->bool_params);
236 if (p == nullptr) {
237 return false;
238 }
239 *value = bool(*p);
240 return true;
241 }
242
243 const char *TessBaseAPI::GetStringVariable(const char *name) const {
244 auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params,
245 tesseract_->params()->string_params);
246 return (p != nullptr) ? p->c_str() : nullptr;
247 }
248
249 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
250 auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params,
251 tesseract_->params()->double_params);
252 if (p == nullptr) {
253 return false;
254 }
255 *value = (double)(*p);
256 return true;
257 }
258
259 /** Get value of named variable as a string, if it exists. */
260 bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const {
261 return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
262 }
263
264 #ifndef DISABLED_LEGACY_ENGINE
265
266 /** Print Tesseract fonts table to the given file. */
267 void TessBaseAPI::PrintFontsTable(FILE *fp) const {
268 const int fontinfo_size = tesseract_->get_fontinfo_table().size();
269 for (int font_index = 1; font_index < fontinfo_size; ++font_index) {
270 FontInfo font = tesseract_->get_fontinfo_table().at(font_index);
271 fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s"
272 " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n",
273 font_index, font.name,
274 font.is_italic() ? "true" : "false",
275 font.is_bold() ? "true" : "false",
276 font.is_fixed_pitch() ? "true" : "false",
277 font.is_serif() ? "true" : "false",
278 font.is_fraktur() ? "true" : "false");
279 }
280 }
281
282 #endif
283
284 /** Print Tesseract parameters to the given file. */
285 void TessBaseAPI::PrintVariables(FILE *fp) const {
286 ParamUtils::PrintParams(fp, tesseract_->params());
287 }
288
289 /**
290 * The datapath must be the name of the data directory or
291 * some other file in which the data directory resides (for instance argv[0].)
292 * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
293 * If numeric_mode is true, then only digits and Roman numerals will
294 * be returned.
295 * @return: 0 on success and -1 on initialization failure.
296 */
297 int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,
298 int configs_size, const std::vector<std::string> *vars_vec,
299 const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {
300 return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,
301 set_only_non_debug_params, nullptr);
302 }
303
304 // In-memory version reads the traineddata file directly from the given
305 // data[data_size] array. Also implements the version with a datapath in data,
306 // flagged by data_size = 0.
307 int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem,
308 char **configs, int configs_size, const std::vector<std::string> *vars_vec,
309 const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
310 FileReader reader) {
311 if (language == nullptr) {
312 language = "";
313 }
314 if (data == nullptr) {
315 data = "";
316 }
317 std::string datapath = data_size == 0 ? data : language;
318 // If the datapath, OcrEngineMode or the language have changed - start again.
319 // Note that the language_ field stores the last requested language that was
320 // initialized successfully, while tesseract_->lang stores the language
321 // actually used. They differ only if the requested language was nullptr, in
322 // which case tesseract_->lang is set to the Tesseract default ("eng").
323 if (tesseract_ != nullptr &&
324 (datapath_.empty() || language_.empty() || datapath_ != datapath ||
325 last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) {
326 delete tesseract_;
327 tesseract_ = nullptr;
328 }
329 bool reset_classifier = true;
330 if (tesseract_ == nullptr) {
331 reset_classifier = false;
332 tesseract_ = new Tesseract;
333 if (reader != nullptr) {
334 reader_ = reader;
335 }
336 TessdataManager mgr(reader_);
337 if (data_size != 0) {
338 mgr.LoadMemBuffer(language, data, data_size);
339 }
340 if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs,
341 configs_size, vars_vec, vars_values, set_only_non_debug_params,
342 &mgr) != 0) {
343 return -1;
344 }
345 }
346
347 // Update datapath and language requested for the last valid initialization.
348 datapath_ = std::move(datapath);
349 if (datapath_.empty() && !tesseract_->datadir.empty()) {
350 datapath_ = tesseract_->datadir;
351 }
352
353 language_ = language;
354 last_oem_requested_ = oem;
355
356 #ifndef DISABLED_LEGACY_ENGINE
357 // For same language and datapath, just reset the adaptive classifier.
358 if (reset_classifier) {
359 tesseract_->ResetAdaptiveClassifier();
360 }
361 #endif // ndef DISABLED_LEGACY_ENGINE
362 return 0;
363 }
364
365 /**
366 * Returns the languages string used in the last valid initialization.
367 * If the last initialization specified "deu+hin" then that will be
368 * returned. If hin loaded eng automatically as well, then that will
369 * not be included in this list. To find the languages actually
370 * loaded use GetLoadedLanguagesAsVector.
371 * The returned string should NOT be deleted.
372 */
373 const char *TessBaseAPI::GetInitLanguagesAsString() const {
374 return language_.c_str();
375 }
376
377 /**
378 * Returns the loaded languages in the vector of std::string.
379 * Includes all languages loaded by the last Init, including those loaded
380 * as dependencies of other loaded languages.
381 */
382 void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const {
383 langs->clear();
384 if (tesseract_ != nullptr) {
385 langs->push_back(tesseract_->lang);
386 int num_subs = tesseract_->num_sub_langs();
387 for (int i = 0; i < num_subs; ++i) {
388 langs->push_back(tesseract_->get_sub_lang(i)->lang);
389 }
390 }
391 }
392
393 /**
394 * Returns the available languages in the sorted vector of std::string.
395 */
396 void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {
397 langs->clear();
398 if (tesseract_ != nullptr) {
399 addAvailableLanguages(tesseract_->datadir, langs);
400 std::sort(langs->begin(), langs->end());
401 }
402 }
403
404 /**
405 * Init only for page layout analysis. Use only for calls to SetImage and
406 * AnalysePage. Calls that attempt recognition will generate an error.
407 */
408 void TessBaseAPI::InitForAnalysePage() {
409 if (tesseract_ == nullptr) {
410 tesseract_ = new Tesseract;
411 #ifndef DISABLED_LEGACY_ENGINE
412 tesseract_->InitAdaptiveClassifier(nullptr);
413 #endif
414 }
415 }
416
417 /**
418 * Read a "config" file containing a set of parameter name, value pairs.
419 * Searches the standard places: tessdata/configs, tessdata/tessconfigs
420 * and also accepts a relative or absolute path name.
421 */
422 void TessBaseAPI::ReadConfigFile(const char *filename) {
423 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
424 }
425
426 /** Same as above, but only set debug params from the given config file. */
427 void TessBaseAPI::ReadDebugConfigFile(const char *filename) {
428 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
429 }
430
431 /**
432 * Set the current page segmentation mode. Defaults to PSM_AUTO.
433 * The mode is stored as an IntParam so it can also be modified by
434 * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
435 */
436 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
437 if (tesseract_ == nullptr) {
438 tesseract_ = new Tesseract;
439 }
440 tesseract_->tessedit_pageseg_mode.set_value(mode);
441 }
442
443 /** Return the current page segmentation mode. */
444 PageSegMode TessBaseAPI::GetPageSegMode() const {
445 if (tesseract_ == nullptr) {
446 return PSM_SINGLE_BLOCK;
447 }
448 return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode));
449 }
450
451 /**
452 * Recognize a rectangle from an image and return the result as a string.
453 * May be called many times for a single Init.
454 * Currently has no error checking.
455 * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
456 * Palette color images will not work properly and must be converted to
457 * 24 bit.
458 * Binary images of 1 bit per pixel may also be given but they must be
459 * byte packed with the MSB of the first byte being the first pixel, and a
460 * one pixel is WHITE. For binary images set bytes_per_pixel=0.
461 * The recognized text is returned as a char* which is coded
462 * as UTF8 and must be freed with the delete [] operator.
463 */
464 char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
465 int bytes_per_line, int left, int top, int width, int height) {
466 if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) {
467 return nullptr; // Nothing worth doing.
468 }
469
470 // Since this original api didn't give the exact size of the image,
471 // we have to invent a reasonable value.
472 int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
473 SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel,
474 bytes_per_line);
475 SetRectangle(left, top, width, height);
476
477 return GetUTF8Text();
478 }
479
480 #ifndef DISABLED_LEGACY_ENGINE
481 /**
482 * Call between pages or documents etc to free up memory and forget
483 * adaptive data.
484 */
485 void TessBaseAPI::ClearAdaptiveClassifier() {
486 if (tesseract_ == nullptr) {
487 return;
488 }
489 tesseract_->ResetAdaptiveClassifier();
490 tesseract_->ResetDocumentDictionary();
491 }
492 #endif // ndef DISABLED_LEGACY_ENGINE
493
494 /**
495 * Provide an image for Tesseract to recognize. Format is as
496 * TesseractRect above. Copies the image buffer and converts to Pix.
497 * SetImage clears all recognition results, and sets the rectangle to the
498 * full image, so it may be followed immediately by a GetUTF8Text, and it
499 * will automatically perform recognition.
500 */
501 void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height,
502 int bytes_per_pixel, int bytes_per_line) {
503 if (InternalSetImage()) {
504 thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
505 SetInputImage(thresholder_->GetPixRect());
506 }
507 }
508
509 void TessBaseAPI::SetSourceResolution(int ppi) {
510 if (thresholder_) {
511 thresholder_->SetSourceYResolution(ppi);
512 } else {
513 tprintf("Please call SetImage before SetSourceResolution.\n");
514 }
515 }
516
517 /**
518 * Provide an image for Tesseract to recognize. As with SetImage above,
519 * Tesseract takes its own copy of the image, so it need not persist until
520 * after Recognize.
521 * Pix vs raw, which to use?
522 * Use Pix where possible. Tesseract uses Pix as its internal representation
523 * and it is therefore more efficient to provide a Pix directly.
524 */
525 void TessBaseAPI::SetImage(Pix *pix) {
526 if (InternalSetImage()) {
527 if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) {
528 // remove alpha channel from png
529 Pix *p1 = pixRemoveAlpha(pix);
530 pixSetSpp(p1, 3);
531 (void)pixCopy(pix, p1);
532 pixDestroy(&p1);
533 }
534 thresholder_->SetImage(pix);
535 SetInputImage(thresholder_->GetPixRect());
536 }
537 }
538
539 /**
540 * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
541 * Each SetRectangle clears the recognition results so multiple rectangles
542 * can be recognized with the same image.
543 */
544 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
545 if (thresholder_ == nullptr) {
546 return;
547 }
548 thresholder_->SetRectangle(left, top, width, height);
549 ClearResults();
550 }
551
552 /**
553 * ONLY available after SetImage if you have Leptonica installed.
554 * Get a copy of the internal thresholded image from Tesseract.
555 */
556 Pix *TessBaseAPI::GetThresholdedImage() {
557 if (tesseract_ == nullptr || thresholder_ == nullptr) {
558 return nullptr;
559 }
560 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
561 return nullptr;
562 }
563 return tesseract_->pix_binary().clone();
564 }
565
566 /**
567 * Get the result of page layout analysis as a leptonica-style
568 * Boxa, Pixa pair, in reading order.
569 * Can be called before or after Recognize.
570 */
571 Boxa *TessBaseAPI::GetRegions(Pixa **pixa) {
572 return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
573 }
574
575 /**
576 * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order.
577 * Can be called before or after Recognize.
578 * If blockids is not nullptr, the block-id of each line is also returned as an
579 * array of one element per line. delete [] after use.
580 * If paraids is not nullptr, the paragraph-id of each line within its block is
581 * also returned as an array of one element per line. delete [] after use.
582 */
583 Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,
584 int **blockids, int **paraids) {
585 return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids);
586 }
587
588 /**
589 * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
590 * pair, in reading order. Enables downstream handling of non-rectangular
591 * regions.
592 * Can be called before or after Recognize.
593 * If blockids is not nullptr, the block-id of each line is also returned as an
594 * array of one element per line. delete [] after use.
595 */
596 Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {
597 return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
598 }
599
600 /**
601 * Get the words as a leptonica-style
602 * Boxa, Pixa pair, in reading order.
603 * Can be called before or after Recognize.
604 */
605 Boxa *TessBaseAPI::GetWords(Pixa **pixa) {
606 return GetComponentImages(RIL_WORD, true, pixa, nullptr);
607 }
608
609 /**
610 * Gets the individual connected (text) components (created
611 * after pages segmentation step, but before recognition)
612 * as a leptonica-style Boxa, Pixa pair, in reading order.
613 * Can be called before or after Recognize.
614 */
615 Boxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) {
616 return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
617 }
618
619 /**
620 * Get the given level kind of components (block, textline, word etc.) as a
621 * leptonica-style Boxa, Pixa pair, in reading order.
622 * Can be called before or after Recognize.
623 * If blockids is not nullptr, the block-id of each component is also returned
624 * as an array of one element per component. delete [] after use.
625 * If text_only is true, then only text components are returned.
626 */
627 Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image,
628 const int raw_padding, Pixa **pixa, int **blockids,
629 int **paraids) {
630 /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator());
631 if (page_it == nullptr) {
632 page_it.reset(AnalyseLayout());
633 }
634 if (page_it == nullptr) {
635 return nullptr; // Failed.
636 }
637
638 // Count the components to get a size for the arrays.
639 int component_count = 0;
640 int left, top, right, bottom;
641
642 if (raw_image) {
643 // Get bounding box in original raw image with padding.
644 do {
645 if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) &&
646 (!text_only || PTIsTextType(page_it->BlockType()))) {
647 ++component_count;
648 }
649 } while (page_it->Next(level));
650 } else {
651 // Get bounding box from binarized imaged. Note that this could be
652 // differently scaled from the original image.
653 do {
654 if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
655 (!text_only || PTIsTextType(page_it->BlockType()))) {
656 ++component_count;
657 }
658 } while (page_it->Next(level));
659 }
660
661 Boxa *boxa = boxaCreate(component_count);
662 if (pixa != nullptr) {
663 *pixa = pixaCreate(component_count);
664 }
665 if (blockids != nullptr) {
666 *blockids = new int[component_count];
667 }
668 if (paraids != nullptr) {
669 *paraids = new int[component_count];
670 }
671
672 int blockid = 0;
673 int paraid = 0;
674 int component_index = 0;
675 page_it->Begin();
676 do {
677 bool got_bounding_box;
678 if (raw_image) {
679 got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom);
680 } else {
681 got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);
682 }
683 if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) {
684 Box *lbox = boxCreate(left, top, right - left, bottom - top);
685 boxaAddBox(boxa, lbox, L_INSERT);
686 if (pixa != nullptr) {
687 Pix *pix = nullptr;
688 if (raw_image) {
689 pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top);
690 } else {
691 pix = page_it->GetBinaryImage(level);
692 }
693 pixaAddPix(*pixa, pix, L_INSERT);
694 pixaAddBox(*pixa, lbox, L_CLONE);
695 }
696 if (paraids != nullptr) {
697 (*paraids)[component_index] = paraid;
698 if (page_it->IsAtFinalElement(RIL_PARA, level)) {
699 ++paraid;
700 }
701 }
702 if (blockids != nullptr) {
703 (*blockids)[component_index] = blockid;
704 if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
705 ++blockid;
706 paraid = 0;
707 }
708 }
709 ++component_index;
710 }
711 } while (page_it->Next(level));
712 return boxa;
713 }
714
715 int TessBaseAPI::GetThresholdedImageScaleFactor() const {
716 if (thresholder_ == nullptr) {
717 return 0;
718 }
719 return thresholder_->GetScaleFactor();
720 }
721
722 /**
723 * Runs page layout analysis in the mode set by SetPageSegMode.
724 * May optionally be called prior to Recognize to get access to just
725 * the page layout results. Returns an iterator to the results.
726 * If merge_similar_words is true, words are combined where suitable for use
727 * with a line recognizer. Use if you want to use AnalyseLayout to find the
728 * textlines, and then want to process textline fragments with an external
729 * line recognizer.
730 * Returns nullptr on error or an empty page.
731 * The returned iterator must be deleted after use.
732 * WARNING! This class points to data held within the TessBaseAPI class, and
733 * therefore can only be used while the TessBaseAPI class still exists and
734 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
735 * DetectOS, or anything else that changes the internal PAGE_RES.
736 */
737 PageIterator *TessBaseAPI::AnalyseLayout() {
738 return AnalyseLayout(false);
739 }
740
741 PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
742 if (FindLines() == 0) {
743 if (block_list_->empty()) {
744 return nullptr; // The page was empty.
745 }
746 page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
747 DetectParagraphs(false);
748 return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
749 thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
750 rect_width_, rect_height_);
751 }
752 return nullptr;
753 }
754
755 /**
756 * Recognize the tesseract global image and return the result as Tesseract
757 * internal structures.
758 */
759 int TessBaseAPI::Recognize(ETEXT_DESC *monitor) {
760 if (tesseract_ == nullptr) {
761 return -1;
762 }
763 if (FindLines() != 0) {
764 return -1;
765 }
766 delete page_res_;
767 if (block_list_->empty()) {
768 page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_);
769 return 0; // Empty page.
770 }
771
772 tesseract_->SetBlackAndWhitelist();
773 recognition_done_ = true;
774 #ifndef DISABLED_LEGACY_ENGINE
775 if (tesseract_->tessedit_resegment_from_line_boxes) {
776 page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_);
777 } else if (tesseract_->tessedit_resegment_from_boxes) {
778 page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_);
779 } else
780 #endif // ndef DISABLED_LEGACY_ENGINE
781 {
782 page_res_ =
783 new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_);
784 }
785
786 if (page_res_ == nullptr) {
787 return -1;
788 }
789
790 if (tesseract_->tessedit_train_line_recognizer) {
791 if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) {
792 return -1;
793 }
794 tesseract_->CorrectClassifyWords(page_res_);
795 return 0;
796 }
797 #ifndef DISABLED_LEGACY_ENGINE
798 if (tesseract_->tessedit_make_boxes_from_boxes) {
799 tesseract_->CorrectClassifyWords(page_res_);
800 return 0;
801 }
802 #endif // ndef DISABLED_LEGACY_ENGINE
803
804 int result = 0;
805 if (tesseract_->interactive_display_mode) {
806 #ifndef GRAPHICS_DISABLED
807 tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
808 #endif // !GRAPHICS_DISABLED
809 // The page_res is invalid after an interactive session, so cleanup
810 // in a way that lets us continue to the next page without crashing.
811 delete page_res_;
812 page_res_ = nullptr;
813 return -1;
814 #ifndef DISABLED_LEGACY_ENGINE
815 } else if (tesseract_->tessedit_train_from_boxes) {
816 std::string fontname;
817 ExtractFontName(output_file_.c_str(), &fontname);
818 tesseract_->ApplyBoxTraining(fontname, page_res_);
819 } else if (tesseract_->tessedit_ambigs_training) {
820 FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str());
821 // OCR the page segmented into words by tesseract.
822 tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor,
823 training_output_file);
824 fclose(training_output_file);
825 #endif // ndef DISABLED_LEGACY_ENGINE
826 } else {
827 // Now run the main recognition.
828 bool wait_for_text = true;
829 GetBoolVariable("paragraph_text_based", &wait_for_text);
830 if (!wait_for_text) {
831 DetectParagraphs(false);
832 }
833 if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
834 if (wait_for_text) {
835 DetectParagraphs(true);
836 }
837 } else {
838 result = -1;
839 }
840 }
841 return result;
842 }
843
844 // Takes ownership of the input pix.
845 void TessBaseAPI::SetInputImage(Pix *pix) {
846 tesseract_->set_pix_original(pix);
847 }
848
849 Pix *TessBaseAPI::GetInputImage() {
850 return tesseract_->pix_original();
851 }
852
853 const char *TessBaseAPI::GetInputName() {
854 if (!input_file_.empty()) {
855 return input_file_.c_str();
856 }
857 return nullptr;
858 }
859
860 const char *TessBaseAPI::GetDatapath() {
861 return tesseract_->datadir.c_str();
862 }
863
864 int TessBaseAPI::GetSourceYResolution() {
865 if (thresholder_ == nullptr)
866 return -1;
867 return thresholder_->GetSourceYResolution();
868 }
869
870 // If flist exists, get data from there. Otherwise get data from buf.
871 // Seems convoluted, but is the easiest way I know of to meet multiple
872 // goals. Support streaming from stdin, and also work on platforms
873 // lacking fmemopen.
874 // TODO: check different logic for flist/buf and simplify.
875 bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config,
876 int timeout_millisec, TessResultRenderer *renderer,
877 int tessedit_page_number) {
878 if (!flist && !buf) {
879 return false;
880 }
881 unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
882 char pagename[MAX_PATH];
883
884 std::vector<std::string> lines;
885 if (!flist) {
886 std::string line;
887 for (const auto ch : *buf) {
888 if (ch == '\n') {
889 lines.push_back(line);
890 line.clear();
891 } else {
892 line.push_back(ch);
893 }
894 }
895 if (!line.empty()) {
896 // Add last line without terminating LF.
897 lines.push_back(line);
898 }
899 if (lines.empty()) {
900 return false;
901 }
902 }
903
904 // Skip to the requested page number.
905 for (unsigned i = 0; i < page; i++) {
906 if (flist) {
907 if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
908 break;
909 }
910 }
911 }
912
913 // Begin producing output
914 if (renderer && !renderer->BeginDocument(document_title.c_str())) {
915 return false;
916 }
917
918 // Loop over all pages - or just the requested one
919 while (true) {
920 if (flist) {
921 if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
922 break;
923 }
924 } else {
925 if (page >= lines.size()) {
926 break;
927 }
928 snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
929 }
930 chomp_string(pagename);
931 Pix *pix = pixRead(pagename);
932 if (pix == nullptr) {
933 tprintf("Image file %s cannot be read!\n", pagename);
934 return false;
935 }
936 tprintf("Page %u : %s\n", page, pagename);
937 bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer);
938 pixDestroy(&pix);
939 if (!r) {
940 return false;
941 }
942 if (tessedit_page_number >= 0) {
943 break;
944 }
945 ++page;
946 }
947
948 // Finish producing output
949 if (renderer && !renderer->EndDocument()) {
950 return false;
951 }
952 return true;
953 }
954
955 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename,
956 const char *retry_config, int timeout_millisec,
957 TessResultRenderer *renderer,
958 int tessedit_page_number) {
959 Pix *pix = nullptr;
960 int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
961 size_t offset = 0;
962 for (;; ++page) {
963 if (tessedit_page_number >= 0) {
964 page = tessedit_page_number;
965 pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page);
966 } else {
967 pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
968 : pixReadFromMultipageTiff(filename, &offset);
969 }
970 if (pix == nullptr) {
971 break;
972 }
973 if (offset || page > 0) {
974 // Only print page number for multipage TIFF file.
975 tprintf("Page %d\n", page + 1);
976 }
977 auto page_string = std::to_string(page);
978 SetVariable("applybox_page", page_string.c_str());
979 bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer);
980 pixDestroy(&pix);
981 if (!r) {
982 return false;
983 }
984 if (tessedit_page_number >= 0) {
985 break;
986 }
987 if (!offset) {
988 break;
989 }
990 }
991 return true;
992 }
993
994 // Master ProcessPages calls ProcessPagesInternal and then does any post-
995 // processing required due to being in a training mode.
996 bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,
997 TessResultRenderer *renderer) {
998 bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
999 #ifndef DISABLED_LEGACY_ENGINE
1000 if (result) {
1001 if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) {
1002 tprintf("Write of TR file failed: %s\n", output_file_.c_str());
1003 return false;
1004 }
1005 }
1006 #endif // ndef DISABLED_LEGACY_ENGINE
1007 return result;
1008 }
1009
1010 #ifdef HAVE_LIBCURL
// Write callback handed to libcurl (CURLOPT_WRITEFUNCTION): appends the
// size * nmemb bytes at contents to the std::string that userp points to and
// returns the number of bytes appended.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t byte_count = size * nmemb;
  const char *chunk = static_cast<const char *>(contents);
  std::string *sink = static_cast<std::string *>(userp);
  sink->append(chunk, byte_count);
  return byte_count;
}
1017 #endif
1018
1019 // In the ideal scenario, Tesseract will start working on data as soon
1020 // as it can. For example, if you stream a filelist through stdin, we
1021 // should start the OCR process as soon as the first filename is
1022 // available. This is particularly useful when hooking Tesseract up to
1023 // slow hardware such as a book scanning machine.
1024 //
1025 // Unfortunately there are tradeoffs. You can't seek on stdin. That
1026 // makes automatic detection of datatype (TIFF? filelist? PNG?)
1027 // impractical. So we support a command line flag to explicitly
1028 // identify the scenario that really matters: filelists on
1029 // stdin. We'll still do our best if the user likes pipes.
1030 bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config,
1031 int timeout_millisec, TessResultRenderer *renderer) {
1032 bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1033 if (stdInput) {
1034 #ifdef WIN32
1035 if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1036 tprintf("ERROR: cin to binary: %s", strerror(errno));
1037 #endif // WIN32
1038 }
1039
1040 if (stream_filelist) {
1041 return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer,
1042 tesseract_->tessedit_page_number);
1043 }
1044
1045 // At this point we are officially in autodection territory.
1046 // That means any data in stdin must be buffered, to make it
1047 // seekable.
1048 std::string buf;
1049 const l_uint8 *data = nullptr;
1050 if (stdInput) {
1051 buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>()));
1052 data = reinterpret_cast<const l_uint8 *>(buf.data());
1053 } else if (strstr(filename, "://") != nullptr) {
1054 // Get image or image list by URL.
1055 #ifdef HAVE_LIBCURL
1056 CURL *curl = curl_easy_init();
1057 if (curl == nullptr) {
1058 fprintf(stderr, "Error, curl_easy_init failed\n");
1059 return false;
1060 } else {
1061 CURLcode curlcode;
1062 auto error = [curl, &curlcode](const char *function) {
1063 fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode));
1064 curl_easy_cleanup(curl);
1065 return false;
1066 };
1067 curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
1068 if (curlcode != CURLE_OK) {
1069 return error("curl_easy_setopt");
1070 }
1071 curlcode = curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
1072 if (curlcode != CURLE_OK) {
1073 return error("curl_easy_setopt");
1074 }
1075 // Follow HTTP, HTTPS, FTP and FTPS redirects.
1076 curlcode = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
1077 if (curlcode != CURLE_OK) {
1078 return error("curl_easy_setopt");
1079 }
1080 // Allow no more than 8 redirections to prevent endless loops.
1081 curlcode = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 8);
1082 if (curlcode != CURLE_OK) {
1083 return error("curl_easy_setopt");
1084 }
1085 int timeout = curl_timeout;
1086 if (timeout > 0) {
1087 curlcode = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
1088 if (curlcode != CURLE_OK) {
1089 return error("curl_easy_setopt");
1090 }
1091 curlcode = curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
1092 if (curlcode != CURLE_OK) {
1093 return error("curl_easy_setopt");
1094 }
1095 }
1096 std::string cookiefile = curl_cookiefile;
1097 if (!cookiefile.empty()) {
1098 curlcode = curl_easy_setopt(curl, CURLOPT_COOKIEFILE, cookiefile.c_str());
1099 if (curlcode != CURLE_OK) {
1100 return error("curl_easy_setopt");
1101 }
1102 }
1103 curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
1104 if (curlcode != CURLE_OK) {
1105 return error("curl_easy_setopt");
1106 }
1107 curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
1108 if (curlcode != CURLE_OK) {
1109 return error("curl_easy_setopt");
1110 }
1111 curlcode = curl_easy_setopt(curl, CURLOPT_USERAGENT, "Tesseract OCR");
1112 if (curlcode != CURLE_OK) {
1113 return error("curl_easy_setopt");
1114 }
1115 curlcode = curl_easy_perform(curl);
1116 if (curlcode != CURLE_OK) {
1117 return error("curl_easy_perform");
1118 }
1119 curl_easy_cleanup(curl);
1120 data = reinterpret_cast<const l_uint8 *>(buf.data());
1121 }
1122 #else
1123 fprintf(stderr, "Error, this tesseract has no URL support\n");
1124 return false;
1125 #endif
1126 } else {
1127 // Check whether the input file can be read.
1128 if (FILE *file = fopen(filename, "rb")) {
1129 fclose(file);
1130 } else {
1131 fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno));
1132 return false;
1133 }
1134 }
1135
1136 // Here is our autodetection
1137 int format;
1138 int r =
1139 (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format);
1140
1141 // Maybe we have a filelist
1142 if (r != 0 || format == IFF_UNKNOWN) {
1143 std::string s;
1144 if (data != nullptr) {
1145 s = buf.c_str();
1146 } else {
1147 std::ifstream t(filename);
1148 std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
1149 s = u.c_str();
1150 }
1151 return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer,
1152 tesseract_->tessedit_page_number);
1153 }
1154
1155 // Maybe we have a TIFF which is potentially multipage
1156 bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE ||
1157 format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1158 #if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
1159 format == IFF_TIFF_JPEG ||
1160 #endif
1161 format == IFF_TIFF_ZIP);
1162
1163 // Fail early if we can, before producing any output
1164 Pix *pix = nullptr;
1165 if (!tiff) {
1166 pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
1167 if (pix == nullptr) {
1168 return false;
1169 }
1170 }
1171
1172 // Begin the output
1173 if (renderer && !renderer->BeginDocument(document_title.c_str())) {
1174 pixDestroy(&pix);
1175 return false;
1176 }
1177
1178 // Produce output
1179 r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec,
1180 renderer, tesseract_->tessedit_page_number)
1181 : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer);
1182
1183 // Clean up memory as needed
1184 pixDestroy(&pix);
1185
1186 // End the output
1187 if (!r || (renderer && !renderer->EndDocument())) {
1188 return false;
1189 }
1190 return true;
1191 }
1192
1193 bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
1194 const char *retry_config, int timeout_millisec,
1195 TessResultRenderer *renderer) {
1196 SetInputName(filename);
1197 SetImage(pix);
1198 bool failed = false;
1199
1200 if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
1201 // Disabled character recognition
1202 if (! std::unique_ptr<const PageIterator>(AnalyseLayout())) {
1203 failed = true;
1204 }
1205 } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
1206 failed = FindLines() != 0;
1207 } else if (timeout_millisec > 0) {
1208 // Running with a timeout.
1209 ETEXT_DESC monitor;
1210 monitor.cancel = nullptr;
1211 monitor.cancel_this = nullptr;
1212 monitor.set_deadline_msecs(timeout_millisec);
1213
1214 // Now run the main recognition.
1215 failed = Recognize(&monitor) < 0;
1216 } else {
1217 // Normal layout and character recognition with no timeout.
1218 failed = Recognize(nullptr) < 0;
1219 }
1220
1221 if (tesseract_->tessedit_write_images) {
1222 Pix *page_pix = GetThresholdedImage();
1223 std::string output_filename = output_file_ + ".processed";
1224 if (page_index > 0) {
1225 output_filename += std::to_string(page_index);
1226 }
1227 output_filename += ".tif";
1228 pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4);
1229 pixDestroy(&page_pix);
1230 }
1231
1232 if (failed && retry_config != nullptr && retry_config[0] != '\0') {
1233 // Save current config variables before switching modes.
1234 FILE *fp = fopen(kOldVarsFile, "wb");
1235 if (fp == nullptr) {
1236 tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
1237 } else {
1238 PrintVariables(fp);
1239 fclose(fp);
1240 }
1241 // Switch to alternate mode for retry.
1242 ReadConfigFile(retry_config);
1243 SetImage(pix);
1244 Recognize(nullptr);
1245 // Restore saved config variables.
1246 ReadConfigFile(kOldVarsFile);
1247 }
1248
1249 if (renderer && !failed) {
1250 failed = !renderer->AddImage(this);
1251 }
1252
1253 return !failed;
1254 }
1255
1256 /**
1257 * Get a left-to-right iterator to the results of LayoutAnalysis and/or
1258 * Recognize. The returned iterator must be deleted after use.
1259 */
1260 LTRResultIterator *TessBaseAPI::GetLTRIterator() {
1261 if (tesseract_ == nullptr || page_res_ == nullptr) {
1262 return nullptr;
1263 }
1264 return new LTRResultIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
1265 thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
1266 rect_width_, rect_height_);
1267 }
1268
1269 /**
1270 * Get a reading-order iterator to the results of LayoutAnalysis and/or
1271 * Recognize. The returned iterator must be deleted after use.
1272 * WARNING! This class points to data held within the TessBaseAPI class, and
1273 * therefore can only be used while the TessBaseAPI class still exists and
1274 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
1275 * DetectOS, or anything else that changes the internal PAGE_RES.
1276 */
1277 ResultIterator *TessBaseAPI::GetIterator() {
1278 if (tesseract_ == nullptr || page_res_ == nullptr) {
1279 return nullptr;
1280 }
1281 return ResultIterator::StartOfParagraph(LTRResultIterator(
1282 page_res_, tesseract_, thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
1283 rect_left_, rect_top_, rect_width_, rect_height_));
1284 }
1285
1286 /**
1287 * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
1288 * The returned iterator must be deleted after use.
1289 * WARNING! This class points to data held within the TessBaseAPI class, and
1290 * therefore can only be used while the TessBaseAPI class still exists and
1291 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
1292 * DetectOS, or anything else that changes the internal PAGE_RES.
1293 */
1294 MutableIterator *TessBaseAPI::GetMutableIterator() {
1295 if (tesseract_ == nullptr || page_res_ == nullptr) {
1296 return nullptr;
1297 }
1298 return new MutableIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
1299 thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
1300 rect_width_, rect_height_);
1301 }
1302
1303 /** Make a text string from the internal data structures. */
1304 char *TessBaseAPI::GetUTF8Text() {
1305 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1306 return nullptr;
1307 }
1308 std::string text("");
1309 const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator());
1310 do {
1311 if (it->Empty(RIL_PARA)) {
1312 continue;
1313 }
1314 auto block_type = it->BlockType();
1315 switch (block_type) {
1316 case PT_FLOWING_IMAGE:
1317 case PT_HEADING_IMAGE:
1318 case PT_PULLOUT_IMAGE:
1319 case PT_HORZ_LINE:
1320 case PT_VERT_LINE:
1321 // Ignore images and lines for text output.
1322 continue;
1323 case PT_NOISE:
1324 tprintf("TODO: Please report image which triggers the noise case.\n");
1325 ASSERT_HOST(false);
1326 default:
1327 break;
1328 }
1329
1330 const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1331 text += para_text.get();
1332 } while (it->Next(RIL_PARA));
1333 return copy_string(text);
1334 }
1335
1336 static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {
1337 int left, top, right, bottom;
1338 it->BoundingBox(level, &left, &top, &right, &bottom);
1339 text += "\t" + std::to_string(left);
1340 text += "\t" + std::to_string(top);
1341 text += "\t" + std::to_string(right - left);
1342 text += "\t" + std::to_string(bottom - top);
1343 }
1344
1345 /**
1346 * Make a TSV-formatted string from the internal data structures.
1347 * page_number is 0-based but will appear in the output as 1-based.
1348 * Returned string must be freed with the delete [] operator.
1349 */
1350 char *TessBaseAPI::GetTSVText(int page_number) {
1351 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
1352 return nullptr;
1353 }
1354
1355 #if !defined(NDEBUG)
1356 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1357 #endif
1358 int page_id = page_number + 1; // we use 1-based page numbers.
1359
1360 int page_num = page_id;
1361 int block_num = 0;
1362 int par_num = 0;
1363 int line_num = 0;
1364 int word_num = 0;
1365
1366 std::string tsv_str;
1367 tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
1368 tsv_str += "\t" + std::to_string(block_num);
1369 tsv_str += "\t" + std::to_string(par_num);
1370 tsv_str += "\t" + std::to_string(line_num);
1371 tsv_str += "\t" + std::to_string(word_num);
1372 tsv_str += "\t" + std::to_string(rect_left_);
1373 tsv_str += "\t" + std::to_string(rect_top_);
1374 tsv_str += "\t" + std::to_string(rect_width_);
1375 tsv_str += "\t" + std::to_string(rect_height_);
1376 tsv_str += "\t-1\t\n";
1377
1378 const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
1379 while (!res_it->Empty(RIL_BLOCK)) {
1380 if (res_it->Empty(RIL_WORD)) {
1381 res_it->Next(RIL_WORD);
1382 continue;
1383 }
1384
1385 // Add rows for any new block/paragraph/textline.
1386 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1387 block_num++;
1388 par_num = 0;
1389 line_num = 0;
1390 word_num = 0;
1391 tsv_str += "2\t" + std::to_string(page_num); // level 2 - block
1392 tsv_str += "\t" + std::to_string(block_num);
1393 tsv_str += "\t" + std::to_string(par_num);
1394 tsv_str += "\t" + std::to_string(line_num);
1395 tsv_str += "\t" + std::to_string(word_num);
1396 AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
1397 tsv_str += "\t-1\t\n"; // end of row for block
1398 }
1399 if (res_it->IsAtBeginningOf(RIL_PARA)) {
1400 par_num++;
1401 line_num = 0;
1402 word_num = 0;
1403 tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph
1404 tsv_str += "\t" + std::to_string(block_num);
1405 tsv_str += "\t" + std::to_string(par_num);
1406 tsv_str += "\t" + std::to_string(line_num);
1407 tsv_str += "\t" + std::to_string(word_num);
1408 AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
1409 tsv_str += "\t-1\t\n"; // end of row for para
1410 }
1411 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1412 line_num++;
1413 word_num = 0;
1414 tsv_str += "4\t" + std::to_string(page_num); // level 4 - line
1415 tsv_str += "\t" + std::to_string(block_num);
1416 tsv_str += "\t" + std::to_string(par_num);
1417 tsv_str += "\t" + std::to_string(line_num);
1418 tsv_str += "\t" + std::to_string(word_num);
1419 AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
1420 tsv_str += "\t-1\t\n"; // end of row for line
1421 }
1422
1423 // Now, process the word...
1424 int left, top, right, bottom;
1425 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1426 word_num++;
1427 tsv_str += "5\t" + std::to_string(page_num); // level 5 - word
1428 tsv_str += "\t" + std::to_string(block_num);
1429 tsv_str += "\t" + std::to_string(par_num);
1430 tsv_str += "\t" + std::to_string(line_num);
1431 tsv_str += "\t" + std::to_string(word_num);
1432 tsv_str += "\t" + std::to_string(left);
1433 tsv_str += "\t" + std::to_string(top);
1434 tsv_str += "\t" + std::to_string(right - left);
1435 tsv_str += "\t" + std::to_string(bottom - top);
1436 tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
1437 tsv_str += "\t";
1438
1439 #if !defined(NDEBUG)
1440 // Increment counts if at end of block/paragraph/textline.
1441 if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
1442 lcnt++;
1443 }
1444 if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) {
1445 pcnt++;
1446 }
1447 if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) {
1448 bcnt++;
1449 }
1450 #endif
1451
1452 do {
1453 tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1454 res_it->Next(RIL_SYMBOL);
1455 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1456 tsv_str += "\n"; // end of row
1457 #if !defined(NDEBUG)
1458 wcnt++;
1459 #endif
1460 }
1461
1462 return copy_string(tsv_str);
1463 }
1464
/** Count of numbers written per box: the usual 4 coordinates plus a page number. */
constexpr int kNumbersPerBlob = 5;
/**
 * Bytes reserved for each of those numbers. ICOORD uses int16_t, so at most
 * 5 digits are assumed.
 */
constexpr int kBytesPerNumber = 5;
/**
 * Per-box multiplier for the maximum expected text length:
 * (kBytesPerNumber + space) * kNumbersPerBlob plus the newline. Add to this
 * the original UTF8 characters, and one kMaxBytesPerLine for safety.
 */
constexpr int kBytesPerBoxFileLine = kNumbersPerBlob * (kBytesPerNumber + 1) + 1;
/** Max bytes in the decimal representation of int64_t. */
constexpr int kBytesPer64BitNumber = 20;
1480 /**
1481 * A maximal single box could occupy kNumbersPerBlob numbers at
1482 * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a
1483 * space plus the newline and the maximum length of a UNICHAR.
1484 * Test against this on each iteration for safety.
1485 */
1486 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN;
1487
1488 /**
1489 * The recognized text is returned as a char* which is coded
1490 * as a UTF8 box file.
1491 * page_number is a 0-base page index that will appear in the box file.
1492 * Returned string must be freed with the delete [] operator.
1493 */
1494 char *TessBaseAPI::GetBoxText(int page_number) {
1495 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1496 return nullptr;
1497 }
1498 int blob_count;
1499 int utf8_length = TextLength(&blob_count);
1500 int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine;
1501 char *result = new char[total_length];
1502 result[0] = '\0';
1503 int output_length = 0;
1504 LTRResultIterator *it = GetLTRIterator();
1505 do {
1506 int left, top, right, bottom;
1507 if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1508 const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
1509 // Tesseract uses space for recognition failure. Fix to a reject
1510 // character, kTesseractReject so we don't create illegal box files.
1511 for (int i = 0; text[i] != '\0'; ++i) {
1512 if (text[i] == ' ') {
1513 text[i] = kTesseractReject;
1514 }
1515 }
1516 snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n",
1517 text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number);
1518 output_length += strlen(result + output_length);
1519 // Just in case...
1520 if (output_length + kMaxBytesPerLine > total_length) {
1521 break;
1522 }
1523 }
1524 } while (it->Next(RIL_SYMBOL));
1525 delete it;
1526 return result;
1527 }
1528
/**
 * Conversion table for non-latin characters.
 * Maps characters out of the latin set into the latin set; entry i of
 * kUniChs is replaced by entry i of kLatinChs. Both tables end with 0.
 * TODO(rays) incorporate this translation into unicharset.
 */
const int kUniChs[] = {
    0x20ac, // euro sign
    0x201c, // left double quotation mark
    0x201d, // right double quotation mark
    0x2018, // left single quotation mark
    0x2019, // right single quotation mark
    0x2022, // bullet
    0x2014, // em dash
    0,      // terminator
};
/** Latin chars corresponding to the unicode chars above. */
const int kLatinChs[] = {
    0x00a2, // cent sign
    0x0022, // quotation mark
    0x0022, // quotation mark
    0x0027, // apostrophe
    0x0027, // apostrophe
    0x00b7, // middle dot
    0x002d, // hyphen-minus
    0,      // terminator
};
1537
1538 /**
1539 * The recognized text is returned as a char* which is coded
1540 * as UNLV format Latin-1 with specific reject and suspect codes.
1541 * Returned string must be freed with the delete [] operator.
1542 */
1543 char *TessBaseAPI::GetUNLVText() {
1544 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1545 return nullptr;
1546 }
1547 bool tilde_crunch_written = false;
1548 bool last_char_was_newline = true;
1549 bool last_char_was_tilde = false;
1550
1551 int total_length = TextLength(nullptr);
1552 PAGE_RES_IT page_res_it(page_res_);
1553 char *result = new char[total_length];
1554 char *ptr = result;
1555 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
1556 WERD_RES *word = page_res_it.word();
1557 // Process the current word.
1558 if (word->unlv_crunch_mode != CR_NONE) {
1559 if (word->unlv_crunch_mode != CR_DELETE &&
1560 (!tilde_crunch_written ||
1561 (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 &&
1562 !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
1563 if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) &&
1564 !word->word->flag(W_FUZZY_SP)) {
1565 /* Write a space to separate from preceding good text */
1566 *ptr++ = ' ';
1567 last_char_was_tilde = false;
1568 }
1569 if (!last_char_was_tilde) {
1570 // Write a reject char.
1571 last_char_was_tilde = true;
1572 *ptr++ = kUNLVReject;
1573 tilde_crunch_written = true;
1574 last_char_was_newline = false;
1575 }
1576 }
1577 } else {
1578 // NORMAL PROCESSING of non tilde crunched words.
1579 tilde_crunch_written = false;
1580 tesseract_->set_unlv_suspects(word);
1581 const char *wordstr = word->best_choice->unichar_string().c_str();
1582 const auto &lengths = word->best_choice->unichar_lengths();
1583 int length = lengths.length();
1584 int i = 0;
1585 int offset = 0;
1586
1587 if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') {
1588 // Prevent adjacent tilde across words - we know that adjacent tildes
1589 // within words have been removed.
1590 // Skip the first character.
1591 offset = lengths[i++];
1592 }
1593 if (i < length && wordstr[offset] != 0) {
1594 if (!last_char_was_newline) {
1595 *ptr++ = ' ';
1596 } else {
1597 last_char_was_newline = false;
1598 }
1599 for (; i < length; offset += lengths[i++]) {
1600 if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) {
1601 *ptr++ = kUNLVReject;
1602 last_char_was_tilde = true;
1603 } else {
1604 if (word->reject_map[i].rejected()) {
1605 *ptr++ = kUNLVSuspect;
1606 }
1607 UNICHAR ch(wordstr + offset, lengths[i]);
1608 int uni_ch = ch.first_uni();
1609 for (int j = 0; kUniChs[j] != 0; ++j) {
1610 if (kUniChs[j] == uni_ch) {
1611 uni_ch = kLatinChs[j];
1612 break;
1613 }
1614 }
1615 if (uni_ch <= 0xff) {
1616 *ptr++ = static_cast<char>(uni_ch);
1617 last_char_was_tilde = false;
1618 } else {
1619 *ptr++ = kUNLVReject;
1620 last_char_was_tilde = true;
1621 }
1622 }
1623 }
1624 }
1625 }
1626 if (word->word->flag(W_EOL) && !last_char_was_newline) {
1627 /* Add a new line output */
1628 *ptr++ = '\n';
1629 tilde_crunch_written = false;
1630 last_char_was_newline = true;
1631 last_char_was_tilde = false;
1632 }
1633 }
1634 *ptr++ = '\n';
1635 *ptr = '\0';
1636 return result;
1637 }
1638
1639 #ifndef DISABLED_LEGACY_ENGINE
1640
1641 /**
1642 * Detect the orientation of the input image and apparent script (alphabet).
1643 * orient_deg is the detected clockwise rotation of the input image in degrees
1644 * (0, 90, 180, 270)
1645 * orient_conf is the confidence (15.0 is reasonably confident)
1646 * script_name is an ASCII string, the name of the script, e.g. "Latin"
1647 * script_conf is confidence level in the script
1648 * Returns true on success and writes values to each parameter as an output
1649 */
1650 bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf,
1651 const char **script_name, float *script_conf) {
1652 OSResults osr;
1653
1654 bool osd = DetectOS(&osr);
1655 if (!osd) {
1656 return false;
1657 }
1658
1659 int orient_id = osr.best_result.orientation_id;
1660 int script_id = osr.get_best_script(orient_id);
1661 if (orient_conf) {
1662 *orient_conf = osr.best_result.oconfidence;
1663 }
1664 if (orient_deg) {
1665 *orient_deg = orient_id * 90; // convert quadrant to degrees
1666 }
1667
1668 if (script_name) {
1669 const char *script = osr.unicharset->get_script_from_script_id(script_id);
1670
1671 *script_name = script;
1672 }
1673
1674 if (script_conf) {
1675 *script_conf = osr.best_result.sconfidence;
1676 }
1677
1678 return true;
1679 }
1680
1681 /**
1682 * The recognized text is returned as a char* which is coded
1683 * as UTF8 and must be freed with the delete [] operator.
1684 * page_number is a 0-based page index that will appear in the osd file.
1685 */
1686 char *TessBaseAPI::GetOsdText(int page_number) {
1687 int orient_deg;
1688 float orient_conf;
1689 const char *script_name;
1690 float script_conf;
1691
1692 if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) {
1693 return nullptr;
1694 }
1695
1696 // clockwise rotation needed to make the page upright
1697 int rotate = OrientationIdToValue(orient_deg / 90);
1698
1699 std::stringstream stream;
1700 // Use "C" locale (needed for float values orient_conf and script_conf).
1701 stream.imbue(std::locale::classic());
1702 // Use fixed notation with 2 digits after the decimal point for float values.
1703 stream.precision(2);
1704 stream << std::fixed << "Page number: " << page_number << "\n"
1705 << "Orientation in degrees: " << orient_deg << "\n"
1706 << "Rotate: " << rotate << "\n"
1707 << "Orientation confidence: " << orient_conf << "\n"
1708 << "Script: " << script_name << "\n"
1709 << "Script confidence: " << script_conf << "\n";
1710 return copy_string(stream.str());
1711 }
1712
1713 #endif // ndef DISABLED_LEGACY_ENGINE
1714
1715 /** Returns the average word confidence for Tesseract page result. */
1716 int TessBaseAPI::MeanTextConf() {
1717 int *conf = AllWordConfidences();
1718 if (!conf) {
1719 return 0;
1720 }
1721 int sum = 0;
1722 int *pt = conf;
1723 while (*pt >= 0) {
1724 sum += *pt++;
1725 }
1726 if (pt != conf) {
1727 sum /= pt - conf;
1728 }
1729 delete[] conf;
1730 return sum;
1731 }
1732
1733 /** Returns an array of all word confidences, terminated by -1. */
1734 int *TessBaseAPI::AllWordConfidences() {
1735 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1736 return nullptr;
1737 }
1738 int n_word = 0;
1739 PAGE_RES_IT res_it(page_res_);
1740 for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1741 n_word++;
1742 }
1743
1744 int *conf = new int[n_word + 1];
1745 n_word = 0;
1746 for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1747 WERD_RES *word = res_it.word();
1748 WERD_CHOICE *choice = word->best_choice;
1749 int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1750 // This is the eq for converting Tesseract confidence to 1..100
1751 if (w_conf < 0) {
1752 w_conf = 0;
1753 }
1754 if (w_conf > 100) {
1755 w_conf = 100;
1756 }
1757 conf[n_word++] = w_conf;
1758 }
1759 conf[n_word] = -1;
1760 return conf;
1761 }
1762
1763 #ifndef DISABLED_LEGACY_ENGINE
1764 /**
1765 * Applies the given word to the adaptive classifier if possible.
1766 * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
1767 * tell the boundaries of the graphemes.
1768 * Assumes that SetImage/SetRectangle have been used to set the image
1769 * to the given word. The mode arg should be PSM_SINGLE_WORD or
1770 * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
1771 * The currently set PageSegMode is preserved.
1772 * Returns false if adaption was not possible for some reason.
1773 */
1774 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) {
1775 int debug = 0;
1776 GetIntVariable("applybox_debug", &debug);
1777 bool success = true;
1778 PageSegMode current_psm = GetPageSegMode();
1779 SetPageSegMode(mode);
1780 SetVariable("classify_enable_learning", "0");
1781 const std::unique_ptr<const char[]> text(GetUTF8Text());
1782 if (debug) {
1783 tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1784 }
1785 if (text != nullptr) {
1786 PAGE_RES_IT it(page_res_);
1787 WERD_RES *word_res = it.word();
1788 if (word_res != nullptr) {
1789 word_res->word->set_text(wordstr);
1790 // Check to see if text matches wordstr.
1791 int w = 0;
1792 int t;
1793 for (t = 0; text[t] != '\0'; ++t) {
1794 if (text[t] == '\n' || text[t] == ' ') {
1795 continue;
1796 }
1797 while (wordstr[w] == ' ') {
1798 ++w;
1799 }
1800 if (text[t] != wordstr[w]) {
1801 break;
1802 }
1803 ++w;
1804 }
1805 if (text[t] != '\0' || wordstr[w] != '\0') {
1806 // No match.
1807 delete page_res_;
1808 std::vector<TBOX> boxes;
1809 page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
1810 tesseract_->ReSegmentByClassification(page_res_);
1811 tesseract_->TidyUp(page_res_);
1812 PAGE_RES_IT pr_it(page_res_);
1813 if (pr_it.word() == nullptr) {
1814 success = false;
1815 } else {
1816 word_res = pr_it.word();
1817 }
1818 } else {
1819 word_res->BestChoiceToCorrectText();
1820 }
1821 if (success) {
1822 tesseract_->EnableLearning = true;
1823 tesseract_->LearnWord(nullptr, word_res);
1824 }
1825 } else {
1826 success = false;
1827 }
1828 } else {
1829 success = false;
1830 }
1831 SetPageSegMode(current_psm);
1832 return success;
1833 }
1834 #endif // ndef DISABLED_LEGACY_ENGINE
1835
1836 /**
1837 * Free up recognition results and any stored image data, without actually
1838 * freeing any recognition data that would be time-consuming to reload.
1839 * Afterwards, you must call SetImage or TesseractRect before doing
1840 * any Recognize or Get* operation.
1841 */
1842 void TessBaseAPI::Clear() {
1843 if (thresholder_ != nullptr) {
1844 thresholder_->Clear();
1845 }
1846 ClearResults();
1847 if (tesseract_ != nullptr) {
1848 SetInputImage(nullptr);
1849 }
1850 }
1851
1852 /**
1853 * Close down tesseract and free up all memory. End() is equivalent to
1854 * destructing and reconstructing your TessBaseAPI.
1855 * Once End() has been used, none of the other API functions may be used
1856 * other than Init and anything declared above it in the class definition.
1857 */
1858 void TessBaseAPI::End() {
1859 Clear();
1860 delete thresholder_;
1861 thresholder_ = nullptr;
1862 delete page_res_;
1863 page_res_ = nullptr;
1864 delete block_list_;
1865 block_list_ = nullptr;
1866 if (paragraph_models_ != nullptr) {
1867 for (auto model : *paragraph_models_) {
1868 delete model;
1869 }
1870 delete paragraph_models_;
1871 paragraph_models_ = nullptr;
1872 }
1873 #ifndef DISABLED_LEGACY_ENGINE
1874 if (osd_tesseract_ == tesseract_) {
1875 osd_tesseract_ = nullptr;
1876 }
1877 delete osd_tesseract_;
1878 osd_tesseract_ = nullptr;
1879 delete equ_detect_;
1880 equ_detect_ = nullptr;
1881 #endif // ndef DISABLED_LEGACY_ENGINE
1882 delete tesseract_;
1883 tesseract_ = nullptr;
1884 input_file_.clear();
1885 output_file_.clear();
1886 datapath_.clear();
1887 language_.clear();
1888 }
1889
1890 // Clear any library-level memory caches.
1891 // There are a variety of expensive-to-load constant data structures (mostly
1892 // language dictionaries) that are cached globally -- surviving the Init()
1893 // and End() of individual TessBaseAPI's. This function allows the clearing
1894 // of these caches.
1895 void TessBaseAPI::ClearPersistentCache() {
1896 Dict::GlobalDawgCache()->DeleteUnusedDawgs();
1897 }
1898
1899 /**
1900 * Check whether a word is valid according to Tesseract's language model
1901 * returns 0 if the word is invalid, non-zero if valid
1902 */
1903 int TessBaseAPI::IsValidWord(const char *word) const {
1904 return tesseract_->getDict().valid_word(word);
1905 }
1906 // Returns true if utf8_character is defined in the UniCharset.
1907 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const {
1908 return tesseract_->unicharset.contains_unichar(utf8_character);
1909 }
1910
1911 // TODO(rays) Obsolete this function and replace with a more aptly named
1912 // function that returns image coordinates rather than tesseract coordinates.
1913 bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) {
1914 const std::unique_ptr<const PageIterator> it(AnalyseLayout());
1915 if (it == nullptr) {
1916 return false;
1917 }
1918 int x1, x2, y1, y2;
1919 it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
1920 // Calculate offset and slope (NOTE: Kind of ugly)
1921 if (x2 <= x1) {
1922 x2 = x1 + 1;
1923 }
1924 // Convert the point pair to slope/offset of the baseline (in image coords.)
1925 *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
1926 *out_offset = static_cast<int>(y1 - *out_slope * x1);
1927 // Get the y-coord of the baseline at the left and right edges of the
1928 // textline's bounding box.
1929 int left, top, right, bottom;
1930 if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
1931 return false;
1932 }
1933 int left_y = IntCastRounded(*out_slope * left + *out_offset);
1934 int right_y = IntCastRounded(*out_slope * right + *out_offset);
1935 // Shift the baseline down so it passes through the nearest bottom-corner
1936 // of the textline's bounding box. This is the difference between the y
1937 // at the lowest (max) edge of the box and the actual box bottom.
1938 *out_offset += bottom - std::max(left_y, right_y);
1939 // Switch back to bottom-up tesseract coordinates. Requires negation of
1940 // the slope and height - offset for the offset.
1941 *out_slope = -*out_slope;
1942 *out_offset = rect_height_ - *out_offset;
1943
1944 return true;
1945 }
1946
1947 /** Sets Dict::letter_is_okay_ function to point to the given function. */
1948 void TessBaseAPI::SetDictFunc(DictFunc f) {
1949 if (tesseract_ != nullptr) {
1950 tesseract_->getDict().letter_is_okay_ = f;
1951 }
1952 }
1953
1954 /**
1955 * Sets Dict::probability_in_context_ function to point to the given
1956 * function.
1957 *
1958 * @param f A single function that returns the probability of the current
1959 * "character" (in general a utf-8 string), given the context of a previous
1960 * utf-8 string.
1961 */
1962 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
1963 if (tesseract_ != nullptr) {
1964 tesseract_->getDict().probability_in_context_ = f;
1965 // Set it for the sublangs too.
1966 int num_subs = tesseract_->num_sub_langs();
1967 for (int i = 0; i < num_subs; ++i) {
1968 tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;
1969 }
1970 }
1971 }
1972
1973 /** Common code for setting the image. */
1974 bool TessBaseAPI::InternalSetImage() {
1975 if (tesseract_ == nullptr) {
1976 tprintf("Please call Init before attempting to set an image.\n");
1977 return false;
1978 }
1979 if (thresholder_ == nullptr) {
1980 thresholder_ = new ImageThresholder;
1981 }
1982 ClearResults();
1983 return true;
1984 }
1985
1986 /**
1987 * Run the thresholder to make the thresholded image, returned in pix,
1988 * which must not be nullptr. *pix must be initialized to nullptr, or point
1989 * to an existing pixDestroyable Pix.
1990 * The usual argument to Threshold is Tesseract::mutable_pix_binary().
1991 */
1992 bool TessBaseAPI::Threshold(Pix **pix) {
1993 ASSERT_HOST(pix != nullptr);
1994 if (*pix != nullptr) {
1995 pixDestroy(pix);
1996 }
1997 // Zero resolution messes up the algorithms, so make sure it is credible.
1998 int user_dpi = 0;
1999 GetIntVariable("user_defined_dpi", &user_dpi);
2000 int y_res = thresholder_->GetScaledYResolution();
2001 if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) {
2002 tprintf(
2003 "Warning: User defined image dpi is outside of expected range "
2004 "(%d - %d)!\n",
2005 kMinCredibleResolution, kMaxCredibleResolution);
2006 }
2007 // Always use user defined dpi
2008 if (user_dpi) {
2009 thresholder_->SetSourceYResolution(user_dpi);
2010 } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2011 if (y_res != 0) {
2012 // Show warning only if a resolution was given.
2013 tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n",
2014 y_res, kMinCredibleResolution);
2015 }
2016 thresholder_->SetSourceYResolution(kMinCredibleResolution);
2017 }
2018
2019 auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method));
2020
2021 if (thresholding_method == ThresholdMethod::Otsu) {
2022 Image pix_binary(*pix);
2023 if (!thresholder_->ThresholdToPix(&pix_binary)) {
2024 return false;
2025 }
2026 *pix = pix_binary;
2027
2028 if (!thresholder_->IsBinary()) {
2029 tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());
2030 tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
2031 } else {
2032 tesseract_->set_pix_thresholds(nullptr);
2033 tesseract_->set_pix_grey(nullptr);
2034 }
2035 } else {
2036 auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method);
2037
2038 if (!ok) {
2039 return false;
2040 }
2041 *pix = pix_binary;
2042
2043 tesseract_->set_pix_thresholds(pix_thresholds);
2044 tesseract_->set_pix_grey(pix_grey);
2045 }
2046
2047 thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_,
2048 &image_height_);
2049
2050 // Set the internal resolution that is used for layout parameters from the
2051 // estimated resolution, rather than the image resolution, which may be
2052 // fabricated, but we will use the image resolution, if there is one, to
2053 // report output point sizes.
2054 int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
2055 kMinCredibleResolution, kMaxCredibleResolution);
2056 if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2057 tprintf(
2058 "Estimated internal resolution %d out of range! "
2059 "Corrected to %d.\n",
2060 thresholder_->GetScaledEstimatedResolution(), estimated_res);
2061 }
2062 tesseract_->set_source_resolution(estimated_res);
2063 return true;
2064 }
2065
2066 /** Find lines from the image making the BLOCK_LIST. */
2067 int TessBaseAPI::FindLines() {
2068 if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
2069 tprintf("Please call SetImage before attempting recognition.\n");
2070 return -1;
2071 }
2072 if (recognition_done_) {
2073 ClearResults();
2074 }
2075 if (!block_list_->empty()) {
2076 return 0;
2077 }
2078 if (tesseract_ == nullptr) {
2079 tesseract_ = new Tesseract;
2080 #ifndef DISABLED_LEGACY_ENGINE
2081 tesseract_->InitAdaptiveClassifier(nullptr);
2082 #endif
2083 }
2084 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2085 return -1;
2086 }
2087
2088 tesseract_->PrepareForPageseg();
2089
2090 #ifndef DISABLED_LEGACY_ENGINE
2091 if (tesseract_->textord_equation_detect) {
2092 if (equ_detect_ == nullptr && !datapath_.empty()) {
2093 equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr);
2094 }
2095 if (equ_detect_ == nullptr) {
2096 tprintf("Warning: Could not set equation detector\n");
2097 } else {
2098 tesseract_->SetEquationDetect(equ_detect_);
2099 }
2100 }
2101 #endif // ndef DISABLED_LEGACY_ENGINE
2102
2103 Tesseract *osd_tess = osd_tesseract_;
2104 OSResults osr;
2105 #ifndef DISABLED_LEGACY_ENGINE
2106 if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) {
2107 if (strcmp(language_.c_str(), "osd") == 0) {
2108 osd_tess = tesseract_;
2109 } else {
2110 osd_tesseract_ = new Tesseract;
2111 TessdataManager mgr(reader_);
2112 if (datapath_.empty()) {
2113 tprintf(
2114 "Warning: Auto orientation and script detection requested,"
2115 " but data path is undefined\n");
2116 delete osd_tesseract_;
2117 osd_tesseract_ = nullptr;
2118 } else if (osd_tesseract_->init_tesseract(datapath_, "", "osd", OEM_TESSERACT_ONLY,
2119 nullptr, 0, nullptr, nullptr, false, &mgr) == 0) {
2120 osd_tess = osd_tesseract_;
2121 osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution());
2122 } else {
2123 tprintf(
2124 "Warning: Auto orientation and script detection requested,"
2125 " but osd language failed to load\n");
2126 delete osd_tesseract_;
2127 osd_tesseract_ = nullptr;
2128 }
2129 }
2130 }
2131 #endif // ndef DISABLED_LEGACY_ENGINE
2132
2133 if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) {
2134 return -1;
2135 }
2136
2137 // If Devanagari is being recognized, we use different images for page seg
2138 // and for OCR.
2139 tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
2140 return 0;
2141 }
2142
2143 /**
2144 * Return average gradient of lines on page.
2145 */
2146 float TessBaseAPI::GetGradient() {
2147 return tesseract_->gradient();
2148 }
2149
2150 /** Delete the pageres and clear the block list ready for a new page. */
2151 void TessBaseAPI::ClearResults() {
2152 if (tesseract_ != nullptr) {
2153 tesseract_->Clear();
2154 }
2155 delete page_res_;
2156 page_res_ = nullptr;
2157 recognition_done_ = false;
2158 if (block_list_ == nullptr) {
2159 block_list_ = new BLOCK_LIST;
2160 } else {
2161 block_list_->clear();
2162 }
2163 if (paragraph_models_ != nullptr) {
2164 for (auto model : *paragraph_models_) {
2165 delete model;
2166 }
2167 delete paragraph_models_;
2168 paragraph_models_ = nullptr;
2169 }
2170 }
2171
2172 /**
2173 * Return the length of the output text string, as UTF8, assuming
2174 * liberally two spacing marks after each word (as paragraphs end with two
2175 * newlines), and assuming a single character reject marker for each rejected
2176 * character.
2177 * Also return the number of recognized blobs in blob_count.
2178 */
2179 int TessBaseAPI::TextLength(int *blob_count) const {
2180 if (tesseract_ == nullptr || page_res_ == nullptr) {
2181 return 0;
2182 }
2183
2184 PAGE_RES_IT page_res_it(page_res_);
2185 int total_length = 2;
2186 int total_blobs = 0;
2187 // Iterate over the data structures to extract the recognition result.
2188 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2189 WERD_RES *word = page_res_it.word();
2190 WERD_CHOICE *choice = word->best_choice;
2191 if (choice != nullptr) {
2192 total_blobs += choice->length() + 2;
2193 total_length += choice->unichar_string().length() + 2;
2194 for (int i = 0; i < word->reject_map.length(); ++i) {
2195 if (word->reject_map[i].rejected()) {
2196 ++total_length;
2197 }
2198 }
2199 }
2200 }
2201 if (blob_count != nullptr) {
2202 *blob_count = total_blobs;
2203 }
2204 return total_length;
2205 }
2206
2207 #ifndef DISABLED_LEGACY_ENGINE
2208 /**
2209 * Estimates the Orientation And Script of the image.
2210 * Returns true if the image was processed successfully.
2211 */
2212 bool TessBaseAPI::DetectOS(OSResults *osr) {
2213 if (tesseract_ == nullptr) {
2214 return false;
2215 }
2216 ClearResults();
2217 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2218 return false;
2219 }
2220
2221 if (input_file_.empty()) {
2222 input_file_ = kInputFile;
2223 }
2224 return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0;
2225 }
2226 #endif // #ifndef DISABLED_LEGACY_ENGINE
2227
2228 void TessBaseAPI::set_min_orientation_margin(double margin) {
2229 tesseract_->min_orientation_margin.set_value(margin);
2230 }
2231
2232 /**
2233 * Return text orientation of each block as determined in an earlier page layout
2234 * analysis operation. Orientation is returned as the number of ccw 90-degree
2235 * rotations (in [0..3]) required to make the text in the block upright
2236 * (readable). Note that this may not necessary be the block orientation
2237 * preferred for recognition (such as the case of vertical CJK text).
2238 *
2239 * Also returns whether the text in the block is believed to have vertical
2240 * writing direction (when in an upright page orientation).
2241 *
2242 * The returned array is of length equal to the number of text blocks, which may
2243 * be less than the total number of blocks. The ordering is intended to be
2244 * consistent with GetTextLines().
2245 */
2246 void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) {
2247 delete[] * block_orientation;
2248 *block_orientation = nullptr;
2249 delete[] * vertical_writing;
2250 *vertical_writing = nullptr;
2251 BLOCK_IT block_it(block_list_);
2252
2253 block_it.move_to_first();
2254 int num_blocks = 0;
2255 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2256 if (!block_it.data()->pdblk.poly_block()->IsText()) {
2257 continue;
2258 }
2259 ++num_blocks;
2260 }
2261 if (!num_blocks) {
2262 tprintf("WARNING: Found no blocks\n");
2263 return;
2264 }
2265 *block_orientation = new int[num_blocks];
2266 *vertical_writing = new bool[num_blocks];
2267 block_it.move_to_first();
2268 int i = 0;
2269 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2270 if (!block_it.data()->pdblk.poly_block()->IsText()) {
2271 continue;
2272 }
2273 FCOORD re_rotation = block_it.data()->re_rotation();
2274 float re_theta = re_rotation.angle();
2275 FCOORD classify_rotation = block_it.data()->classify_rotation();
2276 float classify_theta = classify_rotation.angle();
2277 double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI;
2278 if (rot_theta < 0) {
2279 rot_theta += 4;
2280 }
2281 int num_rotations = static_cast<int>(rot_theta + 0.5);
2282 (*block_orientation)[i] = num_rotations;
2283 // The classify_rotation is non-zero only if the text has vertical
2284 // writing direction.
2285 (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2286 ++i;
2287 }
2288 }
2289
2290 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2291 int debug_level = 0;
2292 GetIntVariable("paragraph_debug_level", &debug_level);
2293 if (paragraph_models_ == nullptr) {
2294 paragraph_models_ = new std::vector<ParagraphModel *>;
2295 }
2296 MutableIterator *result_it = GetMutableIterator();
2297 do { // Detect paragraphs for this block
2298 std::vector<ParagraphModel *> models;
2299 ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models);
2300 paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end());
2301 } while (result_it->Next(RIL_BLOCK));
2302 delete result_it;
2303 }
2304
2305 /** This method returns the string form of the specified unichar. */
2306 const char *TessBaseAPI::GetUnichar(int unichar_id) const {
2307 return tesseract_->unicharset.id_to_unichar(unichar_id);
2308 }
2309
2310 /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
2311 const Dawg *TessBaseAPI::GetDawg(int i) const {
2312 if (tesseract_ == nullptr || i >= NumDawgs()) {
2313 return nullptr;
2314 }
2315 return tesseract_->getDict().GetDawg(i);
2316 }
2317
2318 /** Return the number of dawgs loaded into tesseract_ object. */
2319 int TessBaseAPI::NumDawgs() const {
2320 return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
2321 }
2322
/**
 * Escape a char string - replace <>&"' with HTML codes.
 * All other bytes are copied unchanged. A nullptr input yields an empty
 * string instead of being dereferenced.
 */
std::string HOcrEscape(const char *text) {
  std::string ret;
  if (text == nullptr) {
    return ret;
  }
  for (const char *ptr = text; *ptr != '\0'; ++ptr) {
    switch (*ptr) {
      case '<':
        ret += "&lt;";
        break;
      case '>':
        ret += "&gt;";
        break;
      case '&':
        ret += "&amp;";
        break;
      case '"':
        ret += "&quot;";
        break;
      case '\'':
        ret += "&#39;";
        break;
      default:
        ret += *ptr;
        break;
    }
  }
  return ret;
}
2350
2351 } // namespace tesseract