Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/api/baseapi.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/api/baseapi.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,2351 @@ +/********************************************************************** + * File: baseapi.cpp + * Description: Simple API for calling tesseract. + * Author: Ray Smith + * + * (C) Copyright 2006, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#define _USE_MATH_DEFINES // for M_PI + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "boxword.h" // for BoxWord +#include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST +#include "dawg_cache.h" // for DawgCache +#include "dict.h" // for Dict +#include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH +#include "environ.h" // for l_uint8 +#ifndef DISABLED_LEGACY_ENGINE +#include "equationdetect.h" // for EquationDetect, destructor of equ_detect_ +#endif // ndef DISABLED_LEGACY_ENGINE +#include "errcode.h" // for ASSERT_HOST +#include "helpers.h" // for IntCastRounded, chomp_string, copy_string +#include "host.h" // for MAX_PATH +#include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ... 
#ifndef DISABLED_LEGACY_ENGINE
#  include "intfx.h" // for INT_FX_RESULT_STRUCT
#endif
#include "mutableiterator.h" // for MutableIterator
#include "normalis.h"        // for kBlnBaselineOffset, kBlnXHeight
#include "pageres.h"         // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
#include "paragraphs.h"      // for DetectParagraphs
#include "params.h"          // for BoolParam, IntParam, DoubleParam, Stri...
#include "pdblock.h"         // for PDBLK
#include "points.h"          // for FCOORD
#include "polyblk.h"         // for POLY_BLOCK
#include "rect.h"            // for TBOX
#include "stepblob.h"        // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
#include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
#include "tesseractclass.h"  // for Tesseract
#include "tprintf.h"         // for tprintf
#include "werd.h"            // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
#include "thresholder.h"     // for ImageThresholder

#include <tesseract/baseapi.h>
#include <tesseract/ocrclass.h>       // for ETEXT_DESC
#include <tesseract/osdetect.h>       // for OSResults, OSBestResult, OrientationId...
#include <tesseract/renderer.h>       // for TessResultRenderer
#include <tesseract/resultiterator.h> // for ResultIterator

#include <algorithm>    // for std::sort
#include <cmath>        // for round, M_PI
#include <cstdint>      // for int32_t
#include <cstring>      // for strcmp, strcpy
#include <filesystem>   // for std::filesystem
#include <fstream>      // for size_t
#include <iostream>     // for std::cin
#include <locale>       // for std::locale::classic
#include <memory>       // for std::unique_ptr
#include <set>          // for std::pair
#include <sstream>      // for std::stringstream
#include <string>       // for std::string, std::to_string
#include <system_error> // for std::error_code
#include <vector>       // for std::vector

#include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
#ifdef HAVE_LIBCURL
#  include <curl/curl.h>
#endif

#ifdef __linux__
#  include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
#endif

#if defined(_WIN32)
#  include <fcntl.h> // for _O_BINARY
#  include <io.h>    // for _setmode
#endif

namespace tesseract {

// File-local run-time parameters.
static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
#ifdef HAVE_LIBCURL
static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
static STRING_VAR(curl_cookiefile, "", "File with cookie data for curl");
#endif

/**
 * Minimum sensible image size to be worth running Tesseract.
 * TesseractRect returns nullptr for rectangles narrower or shorter than this.
 */
const int kMinRectSize = 10;
/** Character returned when Tesseract couldn't recognize as anything. */
const char kTesseractReject = '~';
/** Character used by UNLV error counter as a reject (same as kTesseractReject). */
const char kUNLVReject = '~';
/** Character used by UNLV as a suspect marker. */
const char kUNLVSuspect = '^';
/**
 * Temp file used for storing current parameters before applying retry values.
 */
static const char *kOldVarsFile = "failed_vars.txt";

#ifndef DISABLED_LEGACY_ENGINE
/**
 * Filename used for input image file, from which to derive a name to search
 * for a possible UNLV zone file, if none is specified by SetInputName.
 */
static const char *kInputFile = "noname.tif";
/** Value of classify_font_name that means "no font name was configured". */
static const char kUnknownFontName[] = "UnknownFont";

static STRING_VAR(classify_font_name, kUnknownFontName,
                  "Default font name to be used in training");

// Finds the name of the training font and returns it in fontname, by cutting
// it out based on the expectation that the filename is of the form:
// /path/to/dir/[lang].[fontname].exp[num]
// The [lang], [fontname] and [num] fields should not have '.' characters.
// If the global parameter classify_font_name is set, its value is used instead.
+static void ExtractFontName(const char* filename, std::string* fontname) { + *fontname = classify_font_name; + if (*fontname == kUnknownFontName) { + // filename is expected to be of the form [lang].[fontname].exp[num] + // The [lang], [fontname] and [num] fields should not have '.' characters. + const char *basename = strrchr(filename, '/'); + const char *firstdot = strchr(basename ? basename : filename, '.'); + const char *lastdot = strrchr(filename, '.'); + if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) { + ++firstdot; + *fontname = firstdot; + fontname->resize(lastdot - firstdot); + } + } +} +#endif + +/* Add all available languages recursively. + */ +static void addAvailableLanguages(const std::string &datadir, + std::vector<std::string> *langs) { + for (const auto& entry : + std::filesystem::recursive_directory_iterator(datadir, + std::filesystem::directory_options::follow_directory_symlink | + std::filesystem::directory_options::skip_permission_denied)) { + auto path = entry.path().lexically_relative(datadir).string(); + auto extPos = path.rfind(".traineddata"); + if (extPos != std::string::npos) { + langs->push_back(path.substr(0, extPos)); + } + } +} + +TessBaseAPI::TessBaseAPI() + : tesseract_(nullptr) + , osd_tesseract_(nullptr) + , equ_detect_(nullptr) + , reader_(nullptr) + , + // thresholder_ is initialized to nullptr here, but will be set before use + // by: A constructor of a derived API or created + // implicitly when used in InternalSetImage. + thresholder_(nullptr) + , paragraph_models_(nullptr) + , block_list_(nullptr) + , page_res_(nullptr) + , last_oem_requested_(OEM_DEFAULT) + , recognition_done_(false) + , rect_left_(0) + , rect_top_(0) + , rect_width_(0) + , rect_height_(0) + , image_width_(0) + , image_height_(0) { +} + +TessBaseAPI::~TessBaseAPI() { + End(); +} + +/** + * Returns the version identifier as a static string. Do not delete. 
+ */ +const char *TessBaseAPI::Version() { + return TESSERACT_VERSION_STR; +} + +/** + * Set the name of the input file. Needed only for training and + * loading a UNLV zone file. + */ +void TessBaseAPI::SetInputName(const char *name) { + input_file_ = name ? name : ""; +} + +/** Set the name of the output files. Needed only for debugging. */ +void TessBaseAPI::SetOutputName(const char *name) { + output_file_ = name ? name : ""; +} + +bool TessBaseAPI::SetVariable(const char *name, const char *value) { + if (tesseract_ == nullptr) { + tesseract_ = new Tesseract; + } + return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY, + tesseract_->params()); +} + +bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) { + if (tesseract_ == nullptr) { + tesseract_ = new Tesseract; + } + return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params()); +} + +bool TessBaseAPI::GetIntVariable(const char *name, int *value) const { + auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params, + tesseract_->params()->int_params); + if (p == nullptr) { + return false; + } + *value = (int32_t)(*p); + return true; +} + +bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const { + auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params, + tesseract_->params()->bool_params); + if (p == nullptr) { + return false; + } + *value = bool(*p); + return true; +} + +const char *TessBaseAPI::GetStringVariable(const char *name) const { + auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params, + tesseract_->params()->string_params); + return (p != nullptr) ? 
p->c_str() : nullptr; +} + +bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const { + auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params, + tesseract_->params()->double_params); + if (p == nullptr) { + return false; + } + *value = (double)(*p); + return true; +} + +/** Get value of named variable as a string, if it exists. */ +bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const { + return ParamUtils::GetParamAsString(name, tesseract_->params(), val); +} + +#ifndef DISABLED_LEGACY_ENGINE + +/** Print Tesseract fonts table to the given file. */ +void TessBaseAPI::PrintFontsTable(FILE *fp) const { + const int fontinfo_size = tesseract_->get_fontinfo_table().size(); + for (int font_index = 1; font_index < fontinfo_size; ++font_index) { + FontInfo font = tesseract_->get_fontinfo_table().at(font_index); + fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s" + " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n", + font_index, font.name, + font.is_italic() ? "true" : "false", + font.is_bold() ? "true" : "false", + font.is_fixed_pitch() ? "true" : "false", + font.is_serif() ? "true" : "false", + font.is_fraktur() ? "true" : "false"); + } +} + +#endif + +/** Print Tesseract parameters to the given file. */ +void TessBaseAPI::PrintVariables(FILE *fp) const { + ParamUtils::PrintParams(fp, tesseract_->params()); +} + +/** + * The datapath must be the name of the data directory or + * some other file in which the data directory resides (for instance argv[0].) + * The language is (usually) an ISO 639-3 string or nullptr will default to eng. + * If numeric_mode is true, then only digits and Roman numerals will + * be returned. + * @return: 0 on success and -1 on initialization failure. 
+ */ +int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs, + int configs_size, const std::vector<std::string> *vars_vec, + const std::vector<std::string> *vars_values, bool set_only_non_debug_params) { + return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values, + set_only_non_debug_params, nullptr); +} + +// In-memory version reads the traineddata file directly from the given +// data[data_size] array. Also implements the version with a datapath in data, +// flagged by data_size = 0. +int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem, + char **configs, int configs_size, const std::vector<std::string> *vars_vec, + const std::vector<std::string> *vars_values, bool set_only_non_debug_params, + FileReader reader) { + if (language == nullptr) { + language = ""; + } + if (data == nullptr) { + data = ""; + } + std::string datapath = data_size == 0 ? data : language; + // If the datapath, OcrEngineMode or the language have changed - start again. + // Note that the language_ field stores the last requested language that was + // initialized successfully, while tesseract_->lang stores the language + // actually used. They differ only if the requested language was nullptr, in + // which case tesseract_->lang is set to the Tesseract default ("eng"). 
+ if (tesseract_ != nullptr && + (datapath_.empty() || language_.empty() || datapath_ != datapath || + last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) { + delete tesseract_; + tesseract_ = nullptr; + } + bool reset_classifier = true; + if (tesseract_ == nullptr) { + reset_classifier = false; + tesseract_ = new Tesseract; + if (reader != nullptr) { + reader_ = reader; + } + TessdataManager mgr(reader_); + if (data_size != 0) { + mgr.LoadMemBuffer(language, data, data_size); + } + if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs, + configs_size, vars_vec, vars_values, set_only_non_debug_params, + &mgr) != 0) { + return -1; + } + } + + // Update datapath and language requested for the last valid initialization. + datapath_ = std::move(datapath); + if (datapath_.empty() && !tesseract_->datadir.empty()) { + datapath_ = tesseract_->datadir; + } + + language_ = language; + last_oem_requested_ = oem; + +#ifndef DISABLED_LEGACY_ENGINE + // For same language and datapath, just reset the adaptive classifier. + if (reset_classifier) { + tesseract_->ResetAdaptiveClassifier(); + } +#endif // ndef DISABLED_LEGACY_ENGINE + return 0; +} + +/** + * Returns the languages string used in the last valid initialization. + * If the last initialization specified "deu+hin" then that will be + * returned. If hin loaded eng automatically as well, then that will + * not be included in this list. To find the languages actually + * loaded use GetLoadedLanguagesAsVector. + * The returned string should NOT be deleted. + */ +const char *TessBaseAPI::GetInitLanguagesAsString() const { + return language_.c_str(); +} + +/** + * Returns the loaded languages in the vector of std::string. + * Includes all languages loaded by the last Init, including those loaded + * as dependencies of other loaded languages. 
+ */ +void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const { + langs->clear(); + if (tesseract_ != nullptr) { + langs->push_back(tesseract_->lang); + int num_subs = tesseract_->num_sub_langs(); + for (int i = 0; i < num_subs; ++i) { + langs->push_back(tesseract_->get_sub_lang(i)->lang); + } + } +} + +/** + * Returns the available languages in the sorted vector of std::string. + */ +void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const { + langs->clear(); + if (tesseract_ != nullptr) { + addAvailableLanguages(tesseract_->datadir, langs); + std::sort(langs->begin(), langs->end()); + } +} + +/** + * Init only for page layout analysis. Use only for calls to SetImage and + * AnalysePage. Calls that attempt recognition will generate an error. + */ +void TessBaseAPI::InitForAnalysePage() { + if (tesseract_ == nullptr) { + tesseract_ = new Tesseract; +#ifndef DISABLED_LEGACY_ENGINE + tesseract_->InitAdaptiveClassifier(nullptr); +#endif + } +} + +/** + * Read a "config" file containing a set of parameter name, value pairs. + * Searches the standard places: tessdata/configs, tessdata/tessconfigs + * and also accepts a relative or absolute path name. + */ +void TessBaseAPI::ReadConfigFile(const char *filename) { + tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY); +} + +/** Same as above, but only set debug params from the given config file. */ +void TessBaseAPI::ReadDebugConfigFile(const char *filename) { + tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY); +} + +/** + * Set the current page segmentation mode. Defaults to PSM_AUTO. + * The mode is stored as an IntParam so it can also be modified by + * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). 
+ */ +void TessBaseAPI::SetPageSegMode(PageSegMode mode) { + if (tesseract_ == nullptr) { + tesseract_ = new Tesseract; + } + tesseract_->tessedit_pageseg_mode.set_value(mode); +} + +/** Return the current page segmentation mode. */ +PageSegMode TessBaseAPI::GetPageSegMode() const { + if (tesseract_ == nullptr) { + return PSM_SINGLE_BLOCK; + } + return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode)); +} + +/** + * Recognize a rectangle from an image and return the result as a string. + * May be called many times for a single Init. + * Currently has no error checking. + * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. + * Palette color images will not work properly and must be converted to + * 24 bit. + * Binary images of 1 bit per pixel may also be given but they must be + * byte packed with the MSB of the first byte being the first pixel, and a + * one pixel is WHITE. For binary images set bytes_per_pixel=0. + * The recognized text is returned as a char* which is coded + * as UTF8 and must be freed with the delete [] operator. + */ +char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, + int bytes_per_line, int left, int top, int width, int height) { + if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) { + return nullptr; // Nothing worth doing. + } + + // Since this original api didn't give the exact size of the image, + // we have to invent a reasonable value. + int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; + SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel, + bytes_per_line); + SetRectangle(left, top, width, height); + + return GetUTF8Text(); +} + +#ifndef DISABLED_LEGACY_ENGINE +/** + * Call between pages or documents etc to free up memory and forget + * adaptive data. 
+ */ +void TessBaseAPI::ClearAdaptiveClassifier() { + if (tesseract_ == nullptr) { + return; + } + tesseract_->ResetAdaptiveClassifier(); + tesseract_->ResetDocumentDictionary(); +} +#endif // ndef DISABLED_LEGACY_ENGINE + +/** + * Provide an image for Tesseract to recognize. Format is as + * TesseractRect above. Copies the image buffer and converts to Pix. + * SetImage clears all recognition results, and sets the rectangle to the + * full image, so it may be followed immediately by a GetUTF8Text, and it + * will automatically perform recognition. + */ +void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height, + int bytes_per_pixel, int bytes_per_line) { + if (InternalSetImage()) { + thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line); + SetInputImage(thresholder_->GetPixRect()); + } +} + +void TessBaseAPI::SetSourceResolution(int ppi) { + if (thresholder_) { + thresholder_->SetSourceYResolution(ppi); + } else { + tprintf("Please call SetImage before SetSourceResolution.\n"); + } +} + +/** + * Provide an image for Tesseract to recognize. As with SetImage above, + * Tesseract takes its own copy of the image, so it need not persist until + * after Recognize. + * Pix vs raw, which to use? + * Use Pix where possible. Tesseract uses Pix as its internal representation + * and it is therefore more efficient to provide a Pix directly. + */ +void TessBaseAPI::SetImage(Pix *pix) { + if (InternalSetImage()) { + if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) { + // remove alpha channel from png + Pix *p1 = pixRemoveAlpha(pix); + pixSetSpp(p1, 3); + (void)pixCopy(pix, p1); + pixDestroy(&p1); + } + thresholder_->SetImage(pix); + SetInputImage(thresholder_->GetPixRect()); + } +} + +/** + * Restrict recognition to a sub-rectangle of the image. Call after SetImage. + * Each SetRectangle clears the recognition results so multiple rectangles + * can be recognized with the same image. 
+ */ +void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { + if (thresholder_ == nullptr) { + return; + } + thresholder_->SetRectangle(left, top, width, height); + ClearResults(); +} + +/** + * ONLY available after SetImage if you have Leptonica installed. + * Get a copy of the internal thresholded image from Tesseract. + */ +Pix *TessBaseAPI::GetThresholdedImage() { + if (tesseract_ == nullptr || thresholder_ == nullptr) { + return nullptr; + } + if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { + return nullptr; + } + return tesseract_->pix_binary().clone(); +} + +/** + * Get the result of page layout analysis as a leptonica-style + * Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + */ +Boxa *TessBaseAPI::GetRegions(Pixa **pixa) { + return GetComponentImages(RIL_BLOCK, false, pixa, nullptr); +} + +/** + * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + * If blockids is not nullptr, the block-id of each line is also returned as an + * array of one element per line. delete [] after use. + * If paraids is not nullptr, the paragraph-id of each line within its block is + * also returned as an array of one element per line. delete [] after use. + */ +Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa, + int **blockids, int **paraids) { + return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids); +} + +/** + * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa + * pair, in reading order. Enables downstream handling of non-rectangular + * regions. + * Can be called before or after Recognize. + * If blockids is not nullptr, the block-id of each line is also returned as an + * array of one element per line. delete [] after use. 
+ */ +Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) { + return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids); +} + +/** + * Get the words as a leptonica-style + * Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + */ +Boxa *TessBaseAPI::GetWords(Pixa **pixa) { + return GetComponentImages(RIL_WORD, true, pixa, nullptr); +} + +/** + * Gets the individual connected (text) components (created + * after pages segmentation step, but before recognition) + * as a leptonica-style Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + */ +Boxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) { + return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr); +} + +/** + * Get the given level kind of components (block, textline, word etc.) as a + * leptonica-style Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + * If blockids is not nullptr, the block-id of each component is also returned + * as an array of one element per component. delete [] after use. + * If text_only is true, then only text components are returned. + */ +Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, + const int raw_padding, Pixa **pixa, int **blockids, + int **paraids) { + /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator()); + if (page_it == nullptr) { + page_it.reset(AnalyseLayout()); + } + if (page_it == nullptr) { + return nullptr; // Failed. + } + + // Count the components to get a size for the arrays. + int component_count = 0; + int left, top, right, bottom; + + if (raw_image) { + // Get bounding box in original raw image with padding. + do { + if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) && + (!text_only || PTIsTextType(page_it->BlockType()))) { + ++component_count; + } + } while (page_it->Next(level)); + } else { + // Get bounding box from binarized imaged. 
Note that this could be + // differently scaled from the original image. + do { + if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) && + (!text_only || PTIsTextType(page_it->BlockType()))) { + ++component_count; + } + } while (page_it->Next(level)); + } + + Boxa *boxa = boxaCreate(component_count); + if (pixa != nullptr) { + *pixa = pixaCreate(component_count); + } + if (blockids != nullptr) { + *blockids = new int[component_count]; + } + if (paraids != nullptr) { + *paraids = new int[component_count]; + } + + int blockid = 0; + int paraid = 0; + int component_index = 0; + page_it->Begin(); + do { + bool got_bounding_box; + if (raw_image) { + got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom); + } else { + got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom); + } + if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) { + Box *lbox = boxCreate(left, top, right - left, bottom - top); + boxaAddBox(boxa, lbox, L_INSERT); + if (pixa != nullptr) { + Pix *pix = nullptr; + if (raw_image) { + pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top); + } else { + pix = page_it->GetBinaryImage(level); + } + pixaAddPix(*pixa, pix, L_INSERT); + pixaAddBox(*pixa, lbox, L_CLONE); + } + if (paraids != nullptr) { + (*paraids)[component_index] = paraid; + if (page_it->IsAtFinalElement(RIL_PARA, level)) { + ++paraid; + } + } + if (blockids != nullptr) { + (*blockids)[component_index] = blockid; + if (page_it->IsAtFinalElement(RIL_BLOCK, level)) { + ++blockid; + paraid = 0; + } + } + ++component_index; + } + } while (page_it->Next(level)); + return boxa; +} + +int TessBaseAPI::GetThresholdedImageScaleFactor() const { + if (thresholder_ == nullptr) { + return 0; + } + return thresholder_->GetScaleFactor(); +} + +/** + * Runs page layout analysis in the mode set by SetPageSegMode. 
+ * May optionally be called prior to Recognize to get access to just + * the page layout results. Returns an iterator to the results. + * If merge_similar_words is true, words are combined where suitable for use + * with a line recognizer. Use if you want to use AnalyseLayout to find the + * textlines, and then want to process textline fragments with an external + * line recognizer. + * Returns nullptr on error or an empty page. + * The returned iterator must be deleted after use. + * WARNING! This class points to data held within the TessBaseAPI class, and + * therefore can only be used while the TessBaseAPI class still exists and + * has not been subjected to a call of Init, SetImage, Recognize, Clear, End + * DetectOS, or anything else that changes the internal PAGE_RES. + */ +PageIterator *TessBaseAPI::AnalyseLayout() { + return AnalyseLayout(false); +} + +PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) { + if (FindLines() == 0) { + if (block_list_->empty()) { + return nullptr; // The page was empty. + } + page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr); + DetectParagraphs(false); + return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), + thresholder_->GetScaledYResolution(), rect_left_, rect_top_, + rect_width_, rect_height_); + } + return nullptr; +} + +/** + * Recognize the tesseract global image and return the result as Tesseract + * internal structures. + */ +int TessBaseAPI::Recognize(ETEXT_DESC *monitor) { + if (tesseract_ == nullptr) { + return -1; + } + if (FindLines() != 0) { + return -1; + } + delete page_res_; + if (block_list_->empty()) { + page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_); + return 0; // Empty page. 
+ } + + tesseract_->SetBlackAndWhitelist(); + recognition_done_ = true; +#ifndef DISABLED_LEGACY_ENGINE + if (tesseract_->tessedit_resegment_from_line_boxes) { + page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_); + } else if (tesseract_->tessedit_resegment_from_boxes) { + page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_); + } else +#endif // ndef DISABLED_LEGACY_ENGINE + { + page_res_ = + new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_); + } + + if (page_res_ == nullptr) { + return -1; + } + + if (tesseract_->tessedit_train_line_recognizer) { + if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) { + return -1; + } + tesseract_->CorrectClassifyWords(page_res_); + return 0; + } +#ifndef DISABLED_LEGACY_ENGINE + if (tesseract_->tessedit_make_boxes_from_boxes) { + tesseract_->CorrectClassifyWords(page_res_); + return 0; + } +#endif // ndef DISABLED_LEGACY_ENGINE + + int result = 0; + if (tesseract_->interactive_display_mode) { +#ifndef GRAPHICS_DISABLED + tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_); +#endif // !GRAPHICS_DISABLED + // The page_res is invalid after an interactive session, so cleanup + // in a way that lets us continue to the next page without crashing. + delete page_res_; + page_res_ = nullptr; + return -1; +#ifndef DISABLED_LEGACY_ENGINE + } else if (tesseract_->tessedit_train_from_boxes) { + std::string fontname; + ExtractFontName(output_file_.c_str(), &fontname); + tesseract_->ApplyBoxTraining(fontname, page_res_); + } else if (tesseract_->tessedit_ambigs_training) { + FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str()); + // OCR the page segmented into words by tesseract. 
+ tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor, + training_output_file); + fclose(training_output_file); +#endif // ndef DISABLED_LEGACY_ENGINE + } else { + // Now run the main recognition. + bool wait_for_text = true; + GetBoolVariable("paragraph_text_based", &wait_for_text); + if (!wait_for_text) { + DetectParagraphs(false); + } + if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) { + if (wait_for_text) { + DetectParagraphs(true); + } + } else { + result = -1; + } + } + return result; +} + +// Takes ownership of the input pix. +void TessBaseAPI::SetInputImage(Pix *pix) { + tesseract_->set_pix_original(pix); +} + +Pix *TessBaseAPI::GetInputImage() { + return tesseract_->pix_original(); +} + +const char *TessBaseAPI::GetInputName() { + if (!input_file_.empty()) { + return input_file_.c_str(); + } + return nullptr; +} + +const char *TessBaseAPI::GetDatapath() { + return tesseract_->datadir.c_str(); +} + +int TessBaseAPI::GetSourceYResolution() { + if (thresholder_ == nullptr) + return -1; + return thresholder_->GetSourceYResolution(); +} + +// If flist exists, get data from there. Otherwise get data from buf. +// Seems convoluted, but is the easiest way I know of to meet multiple +// goals. Support streaming from stdin, and also work on platforms +// lacking fmemopen. +// TODO: check different logic for flist/buf and simplify. +bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config, + int timeout_millisec, TessResultRenderer *renderer, + int tessedit_page_number) { + if (!flist && !buf) { + return false; + } + unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; + char pagename[MAX_PATH]; + + std::vector<std::string> lines; + if (!flist) { + std::string line; + for (const auto ch : *buf) { + if (ch == '\n') { + lines.push_back(line); + line.clear(); + } else { + line.push_back(ch); + } + } + if (!line.empty()) { + // Add last line without terminating LF. 
+ lines.push_back(line); + } + if (lines.empty()) { + return false; + } + } + + // Skip to the requested page number. + for (unsigned i = 0; i < page; i++) { + if (flist) { + if (fgets(pagename, sizeof(pagename), flist) == nullptr) { + break; + } + } + } + + // Begin producing output + if (renderer && !renderer->BeginDocument(document_title.c_str())) { + return false; + } + + // Loop over all pages - or just the requested one + while (true) { + if (flist) { + if (fgets(pagename, sizeof(pagename), flist) == nullptr) { + break; + } + } else { + if (page >= lines.size()) { + break; + } + snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str()); + } + chomp_string(pagename); + Pix *pix = pixRead(pagename); + if (pix == nullptr) { + tprintf("Image file %s cannot be read!\n", pagename); + return false; + } + tprintf("Page %u : %s\n", page, pagename); + bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer); + pixDestroy(&pix); + if (!r) { + return false; + } + if (tessedit_page_number >= 0) { + break; + } + ++page; + } + + // Finish producing output + if (renderer && !renderer->EndDocument()) { + return false; + } + return true; +} + +bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename, + const char *retry_config, int timeout_millisec, + TessResultRenderer *renderer, + int tessedit_page_number) { + Pix *pix = nullptr; + int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; + size_t offset = 0; + for (;; ++page) { + if (tessedit_page_number >= 0) { + page = tessedit_page_number; + pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page); + } else { + pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset) + : pixReadFromMultipageTiff(filename, &offset); + } + if (pix == nullptr) { + break; + } + if (offset || page > 0) { + // Only print page number for multipage TIFF file. 
+ tprintf("Page %d\n", page + 1); + } + auto page_string = std::to_string(page); + SetVariable("applybox_page", page_string.c_str()); + bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer); + pixDestroy(&pix); + if (!r) { + return false; + } + if (tessedit_page_number >= 0) { + break; + } + if (!offset) { + break; + } + } + return true; +} + +// Master ProcessPages calls ProcessPagesInternal and then does any post- +// processing required due to being in a training mode. +bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, + TessResultRenderer *renderer) { + bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer); +#ifndef DISABLED_LEGACY_ENGINE + if (result) { + if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) { + tprintf("Write of TR file failed: %s\n", output_file_.c_str()); + return false; + } + } +#endif // ndef DISABLED_LEGACY_ENGINE + return result; +} + +#ifdef HAVE_LIBCURL +static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) { + size = size * nmemb; + auto *buf = reinterpret_cast<std::string *>(userp); + buf->append(reinterpret_cast<const char *>(contents), size); + return size; +} +#endif + +// In the ideal scenario, Tesseract will start working on data as soon +// as it can. For example, if you stream a filelist through stdin, we +// should start the OCR process as soon as the first filename is +// available. This is particularly useful when hooking Tesseract up to +// slow hardware such as a book scanning machine. +// +// Unfortunately there are tradeoffs. You can't seek on stdin. That +// makes automatic detection of datatype (TIFF? filelist? PNG?) +// impractical. So we support a command line flag to explicitly +// identify the scenario that really matters: filelists on +// stdin. We'll still do our best if the user likes pipes. 
+bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config, + int timeout_millisec, TessResultRenderer *renderer) { + bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-"); + if (stdInput) { +#ifdef WIN32 + if (_setmode(_fileno(stdin), _O_BINARY) == -1) + tprintf("ERROR: cin to binary: %s", strerror(errno)); +#endif // WIN32 + } + + if (stream_filelist) { + return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer, + tesseract_->tessedit_page_number); + } + + // At this point we are officially in autodection territory. + // That means any data in stdin must be buffered, to make it + // seekable. + std::string buf; + const l_uint8 *data = nullptr; + if (stdInput) { + buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>())); + data = reinterpret_cast<const l_uint8 *>(buf.data()); + } else if (strstr(filename, "://") != nullptr) { + // Get image or image list by URL. +#ifdef HAVE_LIBCURL + CURL *curl = curl_easy_init(); + if (curl == nullptr) { + fprintf(stderr, "Error, curl_easy_init failed\n"); + return false; + } else { + CURLcode curlcode; + auto error = [curl, &curlcode](const char *function) { + fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode)); + curl_easy_cleanup(curl); + return false; + }; + curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + curlcode = curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + // Follow HTTP, HTTPS, FTP and FTPS redirects. + curlcode = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + // Allow no more than 8 redirections to prevent endless loops. 
+ curlcode = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 8); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + int timeout = curl_timeout; + if (timeout > 0) { + curlcode = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + curlcode = curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + } + std::string cookiefile = curl_cookiefile; + if (!cookiefile.empty()) { + curlcode = curl_easy_setopt(curl, CURLOPT_COOKIEFILE, cookiefile.c_str()); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + } + curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + curlcode = curl_easy_setopt(curl, CURLOPT_USERAGENT, "Tesseract OCR"); + if (curlcode != CURLE_OK) { + return error("curl_easy_setopt"); + } + curlcode = curl_easy_perform(curl); + if (curlcode != CURLE_OK) { + return error("curl_easy_perform"); + } + curl_easy_cleanup(curl); + data = reinterpret_cast<const l_uint8 *>(buf.data()); + } +#else + fprintf(stderr, "Error, this tesseract has no URL support\n"); + return false; +#endif + } else { + // Check whether the input file can be read. + if (FILE *file = fopen(filename, "rb")) { + fclose(file); + } else { + fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno)); + return false; + } + } + + // Here is our autodetection + int format; + int r = + (data != nullptr) ? 
findFileFormatBuffer(data, &format) : findFileFormat(filename, &format); + + // Maybe we have a filelist + if (r != 0 || format == IFF_UNKNOWN) { + std::string s; + if (data != nullptr) { + s = buf.c_str(); + } else { + std::ifstream t(filename); + std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>()); + s = u.c_str(); + } + return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer, + tesseract_->tessedit_page_number); + } + + // Maybe we have a TIFF which is potentially multipage + bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE || + format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW || +#if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76 + format == IFF_TIFF_JPEG || +#endif + format == IFF_TIFF_ZIP); + + // Fail early if we can, before producing any output + Pix *pix = nullptr; + if (!tiff) { + pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename); + if (pix == nullptr) { + return false; + } + } + + // Begin the output + if (renderer && !renderer->BeginDocument(document_title.c_str())) { + pixDestroy(&pix); + return false; + } + + // Produce output + r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec, + renderer, tesseract_->tessedit_page_number) + : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer); + + // Clean up memory as needed + pixDestroy(&pix); + + // End the output + if (!r || (renderer && !renderer->EndDocument())) { + return false; + } + return true; +} + +bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename, + const char *retry_config, int timeout_millisec, + TessResultRenderer *renderer) { + SetInputName(filename); + SetImage(pix); + bool failed = false; + + if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) { + // Disabled character recognition + if (! 
std::unique_ptr<const PageIterator>(AnalyseLayout())) { + failed = true; + } + } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) { + failed = FindLines() != 0; + } else if (timeout_millisec > 0) { + // Running with a timeout. + ETEXT_DESC monitor; + monitor.cancel = nullptr; + monitor.cancel_this = nullptr; + monitor.set_deadline_msecs(timeout_millisec); + + // Now run the main recognition. + failed = Recognize(&monitor) < 0; + } else { + // Normal layout and character recognition with no timeout. + failed = Recognize(nullptr) < 0; + } + + if (tesseract_->tessedit_write_images) { + Pix *page_pix = GetThresholdedImage(); + std::string output_filename = output_file_ + ".processed"; + if (page_index > 0) { + output_filename += std::to_string(page_index); + } + output_filename += ".tif"; + pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4); + pixDestroy(&page_pix); + } + + if (failed && retry_config != nullptr && retry_config[0] != '\0') { + // Save current config variables before switching modes. + FILE *fp = fopen(kOldVarsFile, "wb"); + if (fp == nullptr) { + tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile); + } else { + PrintVariables(fp); + fclose(fp); + } + // Switch to alternate mode for retry. + ReadConfigFile(retry_config); + SetImage(pix); + Recognize(nullptr); + // Restore saved config variables. + ReadConfigFile(kOldVarsFile); + } + + if (renderer && !failed) { + failed = !renderer->AddImage(this); + } + + return !failed; +} + +/** + * Get a left-to-right iterator to the results of LayoutAnalysis and/or + * Recognize. The returned iterator must be deleted after use. 
+ */ +LTRResultIterator *TessBaseAPI::GetLTRIterator() { + if (tesseract_ == nullptr || page_res_ == nullptr) { + return nullptr; + } + return new LTRResultIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), + thresholder_->GetScaledYResolution(), rect_left_, rect_top_, + rect_width_, rect_height_); +} + +/** + * Get a reading-order iterator to the results of LayoutAnalysis and/or + * Recognize. The returned iterator must be deleted after use. + * WARNING! This class points to data held within the TessBaseAPI class, and + * therefore can only be used while the TessBaseAPI class still exists and + * has not been subjected to a call of Init, SetImage, Recognize, Clear, End + * DetectOS, or anything else that changes the internal PAGE_RES. + */ +ResultIterator *TessBaseAPI::GetIterator() { + if (tesseract_ == nullptr || page_res_ == nullptr) { + return nullptr; + } + return ResultIterator::StartOfParagraph(LTRResultIterator( + page_res_, tesseract_, thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(), + rect_left_, rect_top_, rect_width_, rect_height_)); +} + +/** + * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. + * The returned iterator must be deleted after use. + * WARNING! This class points to data held within the TessBaseAPI class, and + * therefore can only be used while the TessBaseAPI class still exists and + * has not been subjected to a call of Init, SetImage, Recognize, Clear, End + * DetectOS, or anything else that changes the internal PAGE_RES. + */ +MutableIterator *TessBaseAPI::GetMutableIterator() { + if (tesseract_ == nullptr || page_res_ == nullptr) { + return nullptr; + } + return new MutableIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), + thresholder_->GetScaledYResolution(), rect_left_, rect_top_, + rect_width_, rect_height_); +} + +/** Make a text string from the internal data structures. 
*/ +char *TessBaseAPI::GetUTF8Text() { + if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { + return nullptr; + } + std::string text(""); + const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator()); + do { + if (it->Empty(RIL_PARA)) { + continue; + } + auto block_type = it->BlockType(); + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Ignore images and lines for text output. + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + + const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA)); + text += para_text.get(); + } while (it->Next(RIL_PARA)); + return copy_string(text); +} + +static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) { + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + text += "\t" + std::to_string(left); + text += "\t" + std::to_string(top); + text += "\t" + std::to_string(right - left); + text += "\t" + std::to_string(bottom - top); +} + +/** + * Make a TSV-formatted string from the internal data structures. + * page_number is 0-based but will appear in the output as 1-based. + * Returned string must be freed with the delete [] operator. + */ +char *TessBaseAPI::GetTSVText(int page_number) { + if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) { + return nullptr; + } + +#if !defined(NDEBUG) + int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; +#endif + int page_id = page_number + 1; // we use 1-based page numbers. 
+ + int page_num = page_id; + int block_num = 0; + int par_num = 0; + int line_num = 0; + int word_num = 0; + + std::string tsv_str; + tsv_str += "1\t" + std::to_string(page_num); // level 1 - page + tsv_str += "\t" + std::to_string(block_num); + tsv_str += "\t" + std::to_string(par_num); + tsv_str += "\t" + std::to_string(line_num); + tsv_str += "\t" + std::to_string(word_num); + tsv_str += "\t" + std::to_string(rect_left_); + tsv_str += "\t" + std::to_string(rect_top_); + tsv_str += "\t" + std::to_string(rect_width_); + tsv_str += "\t" + std::to_string(rect_height_); + tsv_str += "\t-1\t\n"; + + const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator()); + while (!res_it->Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + + // Add rows for any new block/paragraph/textline. + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + block_num++; + par_num = 0; + line_num = 0; + word_num = 0; + tsv_str += "2\t" + std::to_string(page_num); // level 2 - block + tsv_str += "\t" + std::to_string(block_num); + tsv_str += "\t" + std::to_string(par_num); + tsv_str += "\t" + std::to_string(line_num); + tsv_str += "\t" + std::to_string(word_num); + AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str); + tsv_str += "\t-1\t\n"; // end of row for block + } + if (res_it->IsAtBeginningOf(RIL_PARA)) { + par_num++; + line_num = 0; + word_num = 0; + tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph + tsv_str += "\t" + std::to_string(block_num); + tsv_str += "\t" + std::to_string(par_num); + tsv_str += "\t" + std::to_string(line_num); + tsv_str += "\t" + std::to_string(word_num); + AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str); + tsv_str += "\t-1\t\n"; // end of row for para + } + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + line_num++; + word_num = 0; + tsv_str += "4\t" + std::to_string(page_num); // level 4 - line + tsv_str += "\t" + std::to_string(block_num); + tsv_str += "\t" + std::to_string(par_num); + tsv_str += "\t" 
+ std::to_string(line_num); + tsv_str += "\t" + std::to_string(word_num); + AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str); + tsv_str += "\t-1\t\n"; // end of row for line + } + + // Now, process the word... + int left, top, right, bottom; + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + word_num++; + tsv_str += "5\t" + std::to_string(page_num); // level 5 - word + tsv_str += "\t" + std::to_string(block_num); + tsv_str += "\t" + std::to_string(par_num); + tsv_str += "\t" + std::to_string(line_num); + tsv_str += "\t" + std::to_string(word_num); + tsv_str += "\t" + std::to_string(left); + tsv_str += "\t" + std::to_string(top); + tsv_str += "\t" + std::to_string(right - left); + tsv_str += "\t" + std::to_string(bottom - top); + tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD)); + tsv_str += "\t"; + +#if !defined(NDEBUG) + // Increment counts if at end of block/paragraph/textline. + if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) { + lcnt++; + } + if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) { + pcnt++; + } + if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) { + bcnt++; + } +#endif + + do { + tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get(); + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + tsv_str += "\n"; // end of row +#if !defined(NDEBUG) + wcnt++; +#endif + } + + return copy_string(tsv_str); +} + +/** The 5 numbers output for each box (the usual 4 and a page number.) */ +const int kNumbersPerBlob = 5; +/** + * The number of bytes taken by each number. Since we use int16_t for ICOORD, + * assume only 5 digits max. + */ +const int kBytesPerNumber = 5; +/** + * Multiplier for max expected textlength assumes (kBytesPerNumber + space) + * * kNumbersPerBlob plus the newline. Add to this the + * original UTF8 characters, and one kMaxBytesPerLine for safety. 
+ */ +const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1; +/** Max bytes in the decimal representation of int64_t. */ +const int kBytesPer64BitNumber = 20; +/** + * A maximal single box could occupy kNumbersPerBlob numbers at + * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a + * space plus the newline and the maximum length of a UNICHAR. + * Test against this on each iteration for safety. + */ +const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN; + +/** + * The recognized text is returned as a char* which is coded + * as a UTF8 box file. + * page_number is a 0-base page index that will appear in the box file. + * Returned string must be freed with the delete [] operator. + */ +char *TessBaseAPI::GetBoxText(int page_number) { + if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { + return nullptr; + } + int blob_count; + int utf8_length = TextLength(&blob_count); + int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine; + char *result = new char[total_length]; + result[0] = '\0'; + int output_length = 0; + LTRResultIterator *it = GetLTRIterator(); + do { + int left, top, right, bottom; + if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) { + const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL)); + // Tesseract uses space for recognition failure. Fix to a reject + // character, kTesseractReject so we don't create illegal box files. + for (int i = 0; text[i] != '\0'; ++i) { + if (text[i] == ' ') { + text[i] = kTesseractReject; + } + } + snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n", + text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number); + output_length += strlen(result + output_length); + // Just in case... 
+ if (output_length + kMaxBytesPerLine > total_length) { + break; + } + } + } while (it->Next(RIL_SYMBOL)); + delete it; + return result; +} + +/** + * Conversion table for non-latin characters. + * Maps characters out of the latin set into the latin set. + * TODO(rays) incorporate this translation into unicharset. + */ +const int kUniChs[] = {0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0}; +/** Latin chars corresponding to the unicode chars above. */ +const int kLatinChs[] = {0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0}; + +/** + * The recognized text is returned as a char* which is coded + * as UNLV format Latin-1 with specific reject and suspect codes. + * Returned string must be freed with the delete [] operator. + */ +char *TessBaseAPI::GetUNLVText() { + if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { + return nullptr; + } + bool tilde_crunch_written = false; + bool last_char_was_newline = true; + bool last_char_was_tilde = false; + + int total_length = TextLength(nullptr); + PAGE_RES_IT page_res_it(page_res_); + char *result = new char[total_length]; + char *ptr = result; + for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + // Process the current word. + if (word->unlv_crunch_mode != CR_NONE) { + if (word->unlv_crunch_mode != CR_DELETE && + (!tilde_crunch_written || + (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 && + !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) { + if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) && + !word->word->flag(W_FUZZY_SP)) { + /* Write a space to separate from preceding good text */ + *ptr++ = ' '; + last_char_was_tilde = false; + } + if (!last_char_was_tilde) { + // Write a reject char. 
+ last_char_was_tilde = true; + *ptr++ = kUNLVReject; + tilde_crunch_written = true; + last_char_was_newline = false; + } + } + } else { + // NORMAL PROCESSING of non tilde crunched words. + tilde_crunch_written = false; + tesseract_->set_unlv_suspects(word); + const char *wordstr = word->best_choice->unichar_string().c_str(); + const auto &lengths = word->best_choice->unichar_lengths(); + int length = lengths.length(); + int i = 0; + int offset = 0; + + if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') { + // Prevent adjacent tilde across words - we know that adjacent tildes + // within words have been removed. + // Skip the first character. + offset = lengths[i++]; + } + if (i < length && wordstr[offset] != 0) { + if (!last_char_was_newline) { + *ptr++ = ' '; + } else { + last_char_was_newline = false; + } + for (; i < length; offset += lengths[i++]) { + if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) { + *ptr++ = kUNLVReject; + last_char_was_tilde = true; + } else { + if (word->reject_map[i].rejected()) { + *ptr++ = kUNLVSuspect; + } + UNICHAR ch(wordstr + offset, lengths[i]); + int uni_ch = ch.first_uni(); + for (int j = 0; kUniChs[j] != 0; ++j) { + if (kUniChs[j] == uni_ch) { + uni_ch = kLatinChs[j]; + break; + } + } + if (uni_ch <= 0xff) { + *ptr++ = static_cast<char>(uni_ch); + last_char_was_tilde = false; + } else { + *ptr++ = kUNLVReject; + last_char_was_tilde = true; + } + } + } + } + } + if (word->word->flag(W_EOL) && !last_char_was_newline) { + /* Add a new line output */ + *ptr++ = '\n'; + tilde_crunch_written = false; + last_char_was_newline = true; + last_char_was_tilde = false; + } + } + *ptr++ = '\n'; + *ptr = '\0'; + return result; +} + +#ifndef DISABLED_LEGACY_ENGINE + +/** + * Detect the orientation of the input image and apparent script (alphabet). 
+ * orient_deg is the detected clockwise rotation of the input image in degrees + * (0, 90, 180, 270) + * orient_conf is the confidence (15.0 is reasonably confident) + * script_name is an ASCII string, the name of the script, e.g. "Latin" + * script_conf is confidence level in the script + * Returns true on success and writes values to each parameter as an output + */ +bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf, + const char **script_name, float *script_conf) { + OSResults osr; + + bool osd = DetectOS(&osr); + if (!osd) { + return false; + } + + int orient_id = osr.best_result.orientation_id; + int script_id = osr.get_best_script(orient_id); + if (orient_conf) { + *orient_conf = osr.best_result.oconfidence; + } + if (orient_deg) { + *orient_deg = orient_id * 90; // convert quadrant to degrees + } + + if (script_name) { + const char *script = osr.unicharset->get_script_from_script_id(script_id); + + *script_name = script; + } + + if (script_conf) { + *script_conf = osr.best_result.sconfidence; + } + + return true; +} + +/** + * The recognized text is returned as a char* which is coded + * as UTF8 and must be freed with the delete [] operator. + * page_number is a 0-based page index that will appear in the osd file. + */ +char *TessBaseAPI::GetOsdText(int page_number) { + int orient_deg; + float orient_conf; + const char *script_name; + float script_conf; + + if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) { + return nullptr; + } + + // clockwise rotation needed to make the page upright + int rotate = OrientationIdToValue(orient_deg / 90); + + std::stringstream stream; + // Use "C" locale (needed for float values orient_conf and script_conf). + stream.imbue(std::locale::classic()); + // Use fixed notation with 2 digits after the decimal point for float values. 
+ stream.precision(2); + stream << std::fixed << "Page number: " << page_number << "\n" + << "Orientation in degrees: " << orient_deg << "\n" + << "Rotate: " << rotate << "\n" + << "Orientation confidence: " << orient_conf << "\n" + << "Script: " << script_name << "\n" + << "Script confidence: " << script_conf << "\n"; + return copy_string(stream.str()); +} + +#endif // ndef DISABLED_LEGACY_ENGINE + +/** Returns the average word confidence for Tesseract page result. */ +int TessBaseAPI::MeanTextConf() { + int *conf = AllWordConfidences(); + if (!conf) { + return 0; + } + int sum = 0; + int *pt = conf; + while (*pt >= 0) { + sum += *pt++; + } + if (pt != conf) { + sum /= pt - conf; + } + delete[] conf; + return sum; +} + +/** Returns an array of all word confidences, terminated by -1. */ +int *TessBaseAPI::AllWordConfidences() { + if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { + return nullptr; + } + int n_word = 0; + PAGE_RES_IT res_it(page_res_); + for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) { + n_word++; + } + + int *conf = new int[n_word + 1]; + n_word = 0; + for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) { + WERD_RES *word = res_it.word(); + WERD_CHOICE *choice = word->best_choice; + int w_conf = static_cast<int>(100 + 5 * choice->certainty()); + // This is the eq for converting Tesseract confidence to 1..100 + if (w_conf < 0) { + w_conf = 0; + } + if (w_conf > 100) { + w_conf = 100; + } + conf[n_word++] = w_conf; + } + conf[n_word] = -1; + return conf; +} + +#ifndef DISABLED_LEGACY_ENGINE +/** + * Applies the given word to the adaptive classifier if possible. + * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can + * tell the boundaries of the graphemes. + * Assumes that SetImage/SetRectangle have been used to set the image + * to the given word. The mode arg should be PSM_SINGLE_WORD or + * PSM_CIRCLE_WORD, as that will be used to control layout analysis. 
+ * The currently set PageSegMode is preserved. + * Returns false if adaption was not possible for some reason. + */ +bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) { + int debug = 0; + GetIntVariable("applybox_debug", &debug); + bool success = true; + PageSegMode current_psm = GetPageSegMode(); + SetPageSegMode(mode); + SetVariable("classify_enable_learning", "0"); + const std::unique_ptr<const char[]> text(GetUTF8Text()); + if (debug) { + tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr); + } + if (text != nullptr) { + PAGE_RES_IT it(page_res_); + WERD_RES *word_res = it.word(); + if (word_res != nullptr) { + word_res->word->set_text(wordstr); + // Check to see if text matches wordstr. + int w = 0; + int t; + for (t = 0; text[t] != '\0'; ++t) { + if (text[t] == '\n' || text[t] == ' ') { + continue; + } + while (wordstr[w] == ' ') { + ++w; + } + if (text[t] != wordstr[w]) { + break; + } + ++w; + } + if (text[t] != '\0' || wordstr[w] != '\0') { + // No match. + delete page_res_; + std::vector<TBOX> boxes; + page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_); + tesseract_->ReSegmentByClassification(page_res_); + tesseract_->TidyUp(page_res_); + PAGE_RES_IT pr_it(page_res_); + if (pr_it.word() == nullptr) { + success = false; + } else { + word_res = pr_it.word(); + } + } else { + word_res->BestChoiceToCorrectText(); + } + if (success) { + tesseract_->EnableLearning = true; + tesseract_->LearnWord(nullptr, word_res); + } + } else { + success = false; + } + } else { + success = false; + } + SetPageSegMode(current_psm); + return success; +} +#endif // ndef DISABLED_LEGACY_ENGINE + +/** + * Free up recognition results and any stored image data, without actually + * freeing any recognition data that would be time-consuming to reload. + * Afterwards, you must call SetImage or TesseractRect before doing + * any Recognize or Get* operation. 
+ */ +void TessBaseAPI::Clear() { + if (thresholder_ != nullptr) { + thresholder_->Clear(); + } + ClearResults(); + if (tesseract_ != nullptr) { + SetInputImage(nullptr); + } +} + +/** + * Close down tesseract and free up all memory. End() is equivalent to + * destructing and reconstructing your TessBaseAPI. + * Once End() has been used, none of the other API functions may be used + * other than Init and anything declared above it in the class definition. + */ +void TessBaseAPI::End() { + Clear(); + delete thresholder_; + thresholder_ = nullptr; + delete page_res_; + page_res_ = nullptr; + delete block_list_; + block_list_ = nullptr; + if (paragraph_models_ != nullptr) { + for (auto model : *paragraph_models_) { + delete model; + } + delete paragraph_models_; + paragraph_models_ = nullptr; + } +#ifndef DISABLED_LEGACY_ENGINE + if (osd_tesseract_ == tesseract_) { + osd_tesseract_ = nullptr; + } + delete osd_tesseract_; + osd_tesseract_ = nullptr; + delete equ_detect_; + equ_detect_ = nullptr; +#endif // ndef DISABLED_LEGACY_ENGINE + delete tesseract_; + tesseract_ = nullptr; + input_file_.clear(); + output_file_.clear(); + datapath_.clear(); + language_.clear(); +} + +// Clear any library-level memory caches. +// There are a variety of expensive-to-load constant data structures (mostly +// language dictionaries) that are cached globally -- surviving the Init() +// and End() of individual TessBaseAPI's. This function allows the clearing +// of these caches. +void TessBaseAPI::ClearPersistentCache() { + Dict::GlobalDawgCache()->DeleteUnusedDawgs(); +} + +/** + * Check whether a word is valid according to Tesseract's language model + * returns 0 if the word is invalid, non-zero if valid + */ +int TessBaseAPI::IsValidWord(const char *word) const { + return tesseract_->getDict().valid_word(word); +} +// Returns true if utf8_character is defined in the UniCharset. 
+bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const { + return tesseract_->unicharset.contains_unichar(utf8_character); +} + +// TODO(rays) Obsolete this function and replace with a more aptly named +// function that returns image coordinates rather than tesseract coordinates. +bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) { + const std::unique_ptr<const PageIterator> it(AnalyseLayout()); + if (it == nullptr) { + return false; + } + int x1, x2, y1, y2; + it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); + // Calculate offset and slope (NOTE: Kind of ugly) + if (x2 <= x1) { + x2 = x1 + 1; + } + // Convert the point pair to slope/offset of the baseline (in image coords.) + *out_slope = static_cast<float>(y2 - y1) / (x2 - x1); + *out_offset = static_cast<int>(y1 - *out_slope * x1); + // Get the y-coord of the baseline at the left and right edges of the + // textline's bounding box. + int left, top, right, bottom; + if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) { + return false; + } + int left_y = IntCastRounded(*out_slope * left + *out_offset); + int right_y = IntCastRounded(*out_slope * right + *out_offset); + // Shift the baseline down so it passes through the nearest bottom-corner + // of the textline's bounding box. This is the difference between the y + // at the lowest (max) edge of the box and the actual box bottom. + *out_offset += bottom - std::max(left_y, right_y); + // Switch back to bottom-up tesseract coordinates. Requires negation of + // the slope and height - offset for the offset. + *out_slope = -*out_slope; + *out_offset = rect_height_ - *out_offset; + + return true; +} + +/** Sets Dict::letter_is_okay_ function to point to the given function. */ +void TessBaseAPI::SetDictFunc(DictFunc f) { + if (tesseract_ != nullptr) { + tesseract_->getDict().letter_is_okay_ = f; + } +} + +/** + * Sets Dict::probability_in_context_ function to point to the given + * function. 
+ * + * @param f A single function that returns the probability of the current + * "character" (in general a utf-8 string), given the context of a previous + * utf-8 string. + */ +void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) { + if (tesseract_ != nullptr) { + tesseract_->getDict().probability_in_context_ = f; + // Set it for the sublangs too. + int num_subs = tesseract_->num_sub_langs(); + for (int i = 0; i < num_subs; ++i) { + tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f; + } + } +} + +/** Common code for setting the image. */ +bool TessBaseAPI::InternalSetImage() { + if (tesseract_ == nullptr) { + tprintf("Please call Init before attempting to set an image.\n"); + return false; + } + if (thresholder_ == nullptr) { + thresholder_ = new ImageThresholder; + } + ClearResults(); + return true; +} + +/** + * Run the thresholder to make the thresholded image, returned in pix, + * which must not be nullptr. *pix must be initialized to nullptr, or point + * to an existing pixDestroyable Pix. + * The usual argument to Threshold is Tesseract::mutable_pix_binary(). + */ +bool TessBaseAPI::Threshold(Pix **pix) { + ASSERT_HOST(pix != nullptr); + if (*pix != nullptr) { + pixDestroy(pix); + } + // Zero resolution messes up the algorithms, so make sure it is credible. + int user_dpi = 0; + GetIntVariable("user_defined_dpi", &user_dpi); + int y_res = thresholder_->GetScaledYResolution(); + if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) { + tprintf( + "Warning: User defined image dpi is outside of expected range " + "(%d - %d)!\n", + kMinCredibleResolution, kMaxCredibleResolution); + } + // Always use user defined dpi + if (user_dpi) { + thresholder_->SetSourceYResolution(user_dpi); + } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) { + if (y_res != 0) { + // Show warning only if a resolution was given. + tprintf("Warning: Invalid resolution %d dpi. 
Using %d instead.\n", + y_res, kMinCredibleResolution); + } + thresholder_->SetSourceYResolution(kMinCredibleResolution); + } + + auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method)); + + if (thresholding_method == ThresholdMethod::Otsu) { + Image pix_binary(*pix); + if (!thresholder_->ThresholdToPix(&pix_binary)) { + return false; + } + *pix = pix_binary; + + if (!thresholder_->IsBinary()) { + tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds()); + tesseract_->set_pix_grey(thresholder_->GetPixRectGrey()); + } else { + tesseract_->set_pix_thresholds(nullptr); + tesseract_->set_pix_grey(nullptr); + } + } else { + auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method); + + if (!ok) { + return false; + } + *pix = pix_binary; + + tesseract_->set_pix_thresholds(pix_thresholds); + tesseract_->set_pix_grey(pix_grey); + } + + thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_, + &image_height_); + + // Set the internal resolution that is used for layout parameters from the + // estimated resolution, rather than the image resolution, which may be + // fabricated, but we will use the image resolution, if there is one, to + // report output point sizes. + int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(), + kMinCredibleResolution, kMaxCredibleResolution); + if (estimated_res != thresholder_->GetScaledEstimatedResolution()) { + tprintf( + "Estimated internal resolution %d out of range! " + "Corrected to %d.\n", + thresholder_->GetScaledEstimatedResolution(), estimated_res); + } + tesseract_->set_source_resolution(estimated_res); + return true; +} + +/** Find lines from the image making the BLOCK_LIST. 
*/ +int TessBaseAPI::FindLines() { + if (thresholder_ == nullptr || thresholder_->IsEmpty()) { + tprintf("Please call SetImage before attempting recognition.\n"); + return -1; + } + if (recognition_done_) { + ClearResults(); + } + if (!block_list_->empty()) { + return 0; + } + if (tesseract_ == nullptr) { + tesseract_ = new Tesseract; +#ifndef DISABLED_LEGACY_ENGINE + tesseract_->InitAdaptiveClassifier(nullptr); +#endif + } + if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { + return -1; + } + + tesseract_->PrepareForPageseg(); + +#ifndef DISABLED_LEGACY_ENGINE + if (tesseract_->textord_equation_detect) { + if (equ_detect_ == nullptr && !datapath_.empty()) { + equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr); + } + if (equ_detect_ == nullptr) { + tprintf("Warning: Could not set equation detector\n"); + } else { + tesseract_->SetEquationDetect(equ_detect_); + } + } +#endif // ndef DISABLED_LEGACY_ENGINE + + Tesseract *osd_tess = osd_tesseract_; + OSResults osr; +#ifndef DISABLED_LEGACY_ENGINE + if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) { + if (strcmp(language_.c_str(), "osd") == 0) { + osd_tess = tesseract_; + } else { + osd_tesseract_ = new Tesseract; + TessdataManager mgr(reader_); + if (datapath_.empty()) { + tprintf( + "Warning: Auto orientation and script detection requested," + " but data path is undefined\n"); + delete osd_tesseract_; + osd_tesseract_ = nullptr; + } else if (osd_tesseract_->init_tesseract(datapath_, "", "osd", OEM_TESSERACT_ONLY, + nullptr, 0, nullptr, nullptr, false, &mgr) == 0) { + osd_tess = osd_tesseract_; + osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution()); + } else { + tprintf( + "Warning: Auto orientation and script detection requested," + " but osd language failed to load\n"); + delete osd_tesseract_; + osd_tesseract_ = nullptr; + } + } + } +#endif // ndef DISABLED_LEGACY_ENGINE + + if 
(tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) { + return -1; + } + + // If Devanagari is being recognized, we use different images for page seg + // and for OCR. + tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr); + return 0; +} + +/** + * Return average gradient of lines on page. + */ +float TessBaseAPI::GetGradient() { + return tesseract_->gradient(); +} + +/** Delete the pageres and clear the block list ready for a new page. */ +void TessBaseAPI::ClearResults() { + if (tesseract_ != nullptr) { + tesseract_->Clear(); + } + delete page_res_; + page_res_ = nullptr; + recognition_done_ = false; + if (block_list_ == nullptr) { + block_list_ = new BLOCK_LIST; + } else { + block_list_->clear(); + } + if (paragraph_models_ != nullptr) { + for (auto model : *paragraph_models_) { + delete model; + } + delete paragraph_models_; + paragraph_models_ = nullptr; + } +} + +/** + * Return the length of the output text string, as UTF8, assuming + * liberally two spacing marks after each word (as paragraphs end with two + * newlines), and assuming a single character reject marker for each rejected + * character. + * Also return the number of recognized blobs in blob_count. + */ +int TessBaseAPI::TextLength(int *blob_count) const { + if (tesseract_ == nullptr || page_res_ == nullptr) { + return 0; + } + + PAGE_RES_IT page_res_it(page_res_); + int total_length = 2; + int total_blobs = 0; + // Iterate over the data structures to extract the recognition result. 
+ for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + WERD_CHOICE *choice = word->best_choice; + if (choice != nullptr) { + total_blobs += choice->length() + 2; + total_length += choice->unichar_string().length() + 2; + for (int i = 0; i < word->reject_map.length(); ++i) { + if (word->reject_map[i].rejected()) { + ++total_length; + } + } + } + } + if (blob_count != nullptr) { + *blob_count = total_blobs; + } + return total_length; +} + +#ifndef DISABLED_LEGACY_ENGINE +/** + * Estimates the Orientation And Script of the image. + * Returns true if the image was processed successfully. + */ +bool TessBaseAPI::DetectOS(OSResults *osr) { + if (tesseract_ == nullptr) { + return false; + } + ClearResults(); + if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { + return false; + } + + if (input_file_.empty()) { + input_file_ = kInputFile; + } + return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0; +} +#endif // #ifndef DISABLED_LEGACY_ENGINE + +void TessBaseAPI::set_min_orientation_margin(double margin) { + tesseract_->min_orientation_margin.set_value(margin); +} + +/** + * Return text orientation of each block as determined in an earlier page layout + * analysis operation. Orientation is returned as the number of ccw 90-degree + * rotations (in [0..3]) required to make the text in the block upright + * (readable). Note that this may not necessary be the block orientation + * preferred for recognition (such as the case of vertical CJK text). + * + * Also returns whether the text in the block is believed to have vertical + * writing direction (when in an upright page orientation). + * + * The returned array is of length equal to the number of text blocks, which may + * be less than the total number of blocks. The ordering is intended to be + * consistent with GetTextLines(). 
+ */ +void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) { + delete[] * block_orientation; + *block_orientation = nullptr; + delete[] * vertical_writing; + *vertical_writing = nullptr; + BLOCK_IT block_it(block_list_); + + block_it.move_to_first(); + int num_blocks = 0; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + if (!block_it.data()->pdblk.poly_block()->IsText()) { + continue; + } + ++num_blocks; + } + if (!num_blocks) { + tprintf("WARNING: Found no blocks\n"); + return; + } + *block_orientation = new int[num_blocks]; + *vertical_writing = new bool[num_blocks]; + block_it.move_to_first(); + int i = 0; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + if (!block_it.data()->pdblk.poly_block()->IsText()) { + continue; + } + FCOORD re_rotation = block_it.data()->re_rotation(); + float re_theta = re_rotation.angle(); + FCOORD classify_rotation = block_it.data()->classify_rotation(); + float classify_theta = classify_rotation.angle(); + double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI; + if (rot_theta < 0) { + rot_theta += 4; + } + int num_rotations = static_cast<int>(rot_theta + 0.5); + (*block_orientation)[i] = num_rotations; + // The classify_rotation is non-zero only if the text has vertical + // writing direction. 
+ (*vertical_writing)[i] = classify_rotation.y() != 0.0f; + ++i; + } +} + +void TessBaseAPI::DetectParagraphs(bool after_text_recognition) { + int debug_level = 0; + GetIntVariable("paragraph_debug_level", &debug_level); + if (paragraph_models_ == nullptr) { + paragraph_models_ = new std::vector<ParagraphModel *>; + } + MutableIterator *result_it = GetMutableIterator(); + do { // Detect paragraphs for this block + std::vector<ParagraphModel *> models; + ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models); + paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end()); + } while (result_it->Next(RIL_BLOCK)); + delete result_it; +} + +/** This method returns the string form of the specified unichar. */ +const char *TessBaseAPI::GetUnichar(int unichar_id) const { + return tesseract_->unicharset.id_to_unichar(unichar_id); +} + +/** Return the pointer to the i-th dawg loaded into tesseract_ object. */ +const Dawg *TessBaseAPI::GetDawg(int i) const { + if (tesseract_ == nullptr || i >= NumDawgs()) { + return nullptr; + } + return tesseract_->getDict().GetDawg(i); +} + +/** Return the number of dawgs loaded into tesseract_ object. */ +int TessBaseAPI::NumDawgs() const { + return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs(); +} + +/** Escape a char string - replace <>&"' with HTML codes. */ +std::string HOcrEscape(const char *text) { + std::string ret; + const char *ptr; + for (ptr = text; *ptr; ptr++) { + switch (*ptr) { + case '<': + ret += "<"; + break; + case '>': + ret += ">"; + break; + case '&': + ret += "&"; + break; + case '"': + ret += """; + break; + case '\'': + ret += "'"; + break; + default: + ret += *ptr; + } + } + return ret; +} + +} // namespace tesseract
