Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/tesseract/src/api/baseapi.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/api/baseapi.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,2351 @@
+/**********************************************************************
+ * File:        baseapi.cpp
+ * Description: Simple API for calling tesseract.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 2006, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#define _USE_MATH_DEFINES // for M_PI
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "boxword.h"    // for BoxWord
+#include "coutln.h"     // for C_OUTLINE_IT, C_OUTLINE_LIST
+#include "dawg_cache.h" // for DawgCache
+#include "dict.h"       // for Dict
+#include "elst.h"       // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH
+#include "environ.h"    // for l_uint8
+#ifndef DISABLED_LEGACY_ENGINE
+#include "equationdetect.h" // for EquationDetect, destructor of equ_detect_
+#endif // ndef DISABLED_LEGACY_ENGINE
+#include "errcode.h" // for ASSERT_HOST
+#include "helpers.h" // for IntCastRounded, chomp_string, copy_string
+#include "host.h"    // for MAX_PATH
+#include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "intfx.h" // for INT_FX_RESULT_STRUCT
+#endif
+#include "mutableiterator.h" // for MutableIterator
+#include "normalis.h"        // for kBlnBaselineOffset, kBlnXHeight
+#include "pageres.h"         // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
+#include "paragraphs.h"      // for DetectParagraphs
+#include "params.h"          // for BoolParam, IntParam, DoubleParam, Stri...
+#include "pdblock.h"         // for PDBLK
+#include "points.h"          // for FCOORD
+#include "polyblk.h"         // for POLY_BLOCK
+#include "rect.h"            // for TBOX
+#include "stepblob.h"        // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
+#include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
+#include "tesseractclass.h"  // for Tesseract
+#include "tprintf.h"         // for tprintf
+#include "werd.h"            // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
+#include "thresholder.h"     // for ImageThresholder
+
+#include <tesseract/baseapi.h>
+#include <tesseract/ocrclass.h>       // for ETEXT_DESC
+#include <tesseract/osdetect.h>       // for OSResults, OSBestResult, OrientationId...
+#include <tesseract/renderer.h>       // for TessResultRenderer
+#include <tesseract/resultiterator.h> // for ResultIterator
+
+#include <cmath>    // for round, M_PI
+#include <cstdint>  // for int32_t
+#include <cstring>  // for strcmp, strcpy
+#include <filesystem> // for std::filesystem
+#include <fstream>  // for size_t
+#include <iostream> // for std::cin
+#include <locale>   // for std::locale::classic
+#include <memory>   // for std::unique_ptr
+#include <set>      // for std::pair
+#include <sstream>  // for std::stringstream
+#include <vector>   // for std::vector
+
+#include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
+#ifdef HAVE_LIBCURL
+#  include <curl/curl.h>
+#endif
+
+#ifdef __linux__
+#  include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
+#endif
+
+#if defined(_WIN32)
+#  include <fcntl.h> // for _O_BINARY
+#  include <io.h>    // for _setmode
+#endif
+
+namespace tesseract {
+
+static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
+static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
+#ifdef HAVE_LIBCURL // The curl parameters exist only in builds with libcurl.
+static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
+static STRING_VAR(curl_cookiefile, "", "File with cookie data for curl");
+#endif // HAVE_LIBCURL
+
+/** Minimum width and height of a rectangle worth running Tesseract on. */
+const int kMinRectSize = 10;
+/** Character returned when Tesseract could not recognize a symbol as anything. */
+const char kTesseractReject = '~';
+/** Character used by the UNLV error counter as a reject. */
+const char kUNLVReject = '~';
+/** Character used by UNLV as a suspect marker. */
+const char kUNLVSuspect = '^';
+/**
+ * Temp file used for storing the current parameters before retry values are applied.
+ */
+static const char *kOldVarsFile = "failed_vars.txt";
+
+#ifndef DISABLED_LEGACY_ENGINE
+/**
+ * Default name of the input image file, from which a name is derived to search
+ * for a possible UNLV zone file when none was specified by SetInputName.
+ */
+static const char *kInputFile = "noname.tif";
+static const char kUnknownFontName[] = "UnknownFont"; // Sentinel default of classify_font_name.
+
+static STRING_VAR(classify_font_name, kUnknownFontName,
+                  "Default font name to be used in training");
+
+// Determines the training font name and stores it in *fontname.
+// If the global parameter classify_font_name was changed from its default,
+// that value is used as is. Otherwise the name is cut out of filename, which
+// is expected to be of the form:
+// /path/to/dir/[lang].[fontname].exp[num]
+// The [lang], [fontname] and [num] fields should not have '.' characters.
+static void ExtractFontName(const char* filename, std::string* fontname) {
+  *fontname = classify_font_name;
+  if (*fontname != kUnknownFontName) {
+    return; // An explicit font name was configured: keep it.
+  }
+  // First '.' after the last '/' (or of the whole name) and the very last '.'.
+  const char *last_slash = strrchr(filename, '/');
+  const char *lead_dot = strchr(last_slash != nullptr ? last_slash : filename, '.');
+  const char *tail_dot = strrchr(filename, '.');
+  if (lead_dot == nullptr || tail_dot == nullptr || lead_dot == tail_dot) {
+    return; // No two distinct dots to cut between: the default name stays.
+  }
+  fontname->assign(lead_dot + 1, static_cast<size_t>(tail_dot - lead_dot - 1));
+}
+#endif
+
+/* Add all available languages recursively: every entry below datadir whose
+ * extension is exactly ".traineddata", named relative to datadir. */
+static void addAvailableLanguages(const std::string &datadir,
+                                  std::vector<std::string> *langs) {
+  for (const auto& entry :
+       std::filesystem::recursive_directory_iterator(datadir,
+         std::filesystem::directory_options::follow_directory_symlink |
+         std::filesystem::directory_options::skip_permission_denied)) {
+    auto path = entry.path().lexically_relative(datadir);
+    // Match on the real extension: "x.traineddata.bak" is not a language.
+    if (path.extension() == ".traineddata") {
+      langs->push_back(path.replace_extension().string());
+    }
+  }
+}
+
+// Creates an API object without an engine; every member starts out empty.
+TessBaseAPI::TessBaseAPI()
+    : tesseract_(nullptr),
+      osd_tesseract_(nullptr),
+      equ_detect_(nullptr),
+      reader_(nullptr),
+      // No thresholder exists at construction time. One is supplied before
+      // use, either by the constructor of a derived API or implicitly by
+      // InternalSetImage.
+      thresholder_(nullptr),
+      paragraph_models_(nullptr),
+      block_list_(nullptr),
+      page_res_(nullptr),
+      last_oem_requested_(OEM_DEFAULT),
+      recognition_done_(false),
+      rect_left_(0),
+      rect_top_(0),
+      rect_width_(0),
+      rect_height_(0),
+      image_width_(0),
+      image_height_(0) {
+}
+
+TessBaseAPI::~TessBaseAPI() {
+  End(); // All cleanup is delegated to End().
+}
+
+/**
+ * Returns the version identifier TESSERACT_VERSION_STR. Do not delete or free it.
+ */
+const char *TessBaseAPI::Version() {
+  return TESSERACT_VERSION_STR;
+}
+
+/**
+ * Set the name of the input file (a null name is stored as an empty string).
+ * Needed only for training and for loading a UNLV zone file.
+ */
+void TessBaseAPI::SetInputName(const char *name) {
+  input_file_ = name ? name : "";
+}
+
+/** Set the name of the output files (null becomes ""). Needed only for debugging. */
+void TessBaseAPI::SetOutputName(const char *name) {
+  output_file_ = name ? name : "";
+}
+
+bool TessBaseAPI::SetVariable(const char *name, const char *value) {
+  if (tesseract_ == nullptr) { // Create the engine on demand.
+    tesseract_ = new Tesseract;
+  }
+  return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY, // result of SetParam
+                              tesseract_->params());
+}
+
+bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) {
+  if (tesseract_ == nullptr) { // Create the engine on demand.
+    tesseract_ = new Tesseract;
+  }
+  return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params());
+} // Same as SetVariable, but with the DEBUG_ONLY constraint.
+
+bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
+  // Without an engine there is nothing to search; do not touch tesseract_.
+  if (tesseract_ == nullptr) return false;
+  auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params,
+                                            tesseract_->params()->int_params);
+  if (p == nullptr) return false; // Unknown name: *value stays untouched.
+  *value = static_cast<int32_t>(*p);
+  return true;
+}
+
+bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
+  // Without an engine there is nothing to search; do not touch tesseract_.
+  if (tesseract_ == nullptr) return false;
+  auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params,
+                                             tesseract_->params()->bool_params);
+  if (p == nullptr) return false; // Unknown name: *value stays untouched.
+  *value = static_cast<bool>(*p);
+  return true;
+}
+
+const char *TessBaseAPI::GetStringVariable(const char *name) const {
+  auto *p = (tesseract_ == nullptr) ? nullptr : ParamUtils::FindParam<StringParam>(
+      name, GlobalParams()->string_params, tesseract_->params()->string_params);
+  return (p != nullptr) ? p->c_str() : nullptr; // nullptr: unknown name or no engine yet
+}
+
+bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
+  // Without an engine there is nothing to search; do not touch tesseract_.
+  if (tesseract_ == nullptr) return false;
+  auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params,
+                                               tesseract_->params()->double_params);
+  if (p == nullptr) return false; // Unknown name: *value stays untouched.
+  *value = static_cast<double>(*p);
+  return true;
+}
+
+/** Get value of named variable as a string, if it exists (false when there is no engine). */
+bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const {
+  return tesseract_ != nullptr && ParamUtils::GetParamAsString(name, tesseract_->params(), val);
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+/** Print the Tesseract fonts table to the given file, one line per font. */
+void TessBaseAPI::PrintFontsTable(FILE *fp) const {
+  const auto flag = [](bool set) { return set ? "true" : "false"; };
+  const int num_fonts = tesseract_->get_fontinfo_table().size();
+  int id = 1; // Index 0 is not printed.
+  while (id < num_fonts) {
+    FontInfo info = tesseract_->get_fontinfo_table().at(id);
+    // Each property is spelled out as the word "true" or "false".
+    fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s"
+                " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n",
+            id, info.name, flag(info.is_italic()), flag(info.is_bold()),
+            flag(info.is_fixed_pitch()), flag(info.is_serif()), flag(info.is_fraktur()));
+    ++id;
+  }
+}
+
+#endif
+
+/** Print Tesseract parameters to the given file. */
+void TessBaseAPI::PrintVariables(FILE *fp) const {
+  ParamUtils::PrintParams(fp, tesseract_->params()); // NOTE(review): tesseract_ is not null-checked.
+}
+
+/**
+ * The datapath must be the name of the data directory or
+ * some other file in which the data directory resides (for instance argv[0].)
+ * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
+ * Forwards to the in-memory Init overload with data_size = 0, which marks the
+ * first argument as a datapath, and without a custom FileReader.
+ * @return: 0 on success and -1 on initialization failure.
+ */
+int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,
+                      int configs_size, const std::vector<std::string> *vars_vec,
+                      const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {
+  return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,
+              set_only_non_debug_params, nullptr);
+}
+
+// In-memory version reads the traineddata file directly from the given
+// data[data_size] array. Also implements the version with a datapath in data,
+// flagged by data_size = 0. Returns 0 on success and -1 on failure.
+int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem,
+                      char **configs, int configs_size, const std::vector<std::string> *vars_vec,
+                      const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
+                      FileReader reader) {
+  if (language == nullptr) {
+    language = "";
+  }
+  if (data == nullptr) {
+    data = "";
+  }
+  std::string datapath = data_size == 0 ? data : language;
+  // If the datapath, OcrEngineMode or the language have changed - start again.
+  // language_ stores the last requested language that was initialized
+  // successfully, tesseract_->lang the language actually used. They differ only
+  // if none was requested: tesseract_->lang is then the default ("eng").
+  if (tesseract_ != nullptr &&
+      (datapath_.empty() || language_.empty() || datapath_ != datapath ||
+       last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) {
+    delete tesseract_;
+    tesseract_ = nullptr;
+  }
+  bool reset_classifier = true;
+  if (tesseract_ == nullptr) {
+    reset_classifier = false;
+    tesseract_ = new Tesseract;
+    if (reader != nullptr) {
+      reader_ = reader;
+    }
+    TessdataManager mgr(reader_);
+    if (data_size != 0) {
+      mgr.LoadMemBuffer(language, data, data_size);
+    }
+    if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs,
+                                   configs_size, vars_vec, vars_values, set_only_non_debug_params,
+                                   &mgr) != 0) {
+      // tesseract_ now holds a failed instance: forget the last good language
+      // so that the next Init starts again instead of reusing that instance.
+      language_.clear();
+      return -1;
+    }
+  }
+
+  // Update datapath and language requested for the last valid initialization.
+  datapath_ = std::move(datapath);
+  if (datapath_.empty() && !tesseract_->datadir.empty()) {
+    datapath_ = tesseract_->datadir;
+  }
+  language_ = language;
+  last_oem_requested_ = oem;
+#ifndef DISABLED_LEGACY_ENGINE
+  // For same language and datapath, just reset the adaptive classifier.
+  if (reset_classifier) {
+    tesseract_->ResetAdaptiveClassifier();
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+  return 0;
+}
+
+/**
+ * Returns the languages string used in the last valid initialization.
+ * If the last initialization specified "deu+hin" then that will be
+ * returned. If hin loaded eng automatically as well, then that will
+ * not be included in this list. To find the languages actually
+ * loaded use GetLoadedLanguagesAsVector.
+ * The returned string is owned by this object and should NOT be deleted.
+ */
+const char *TessBaseAPI::GetInitLanguagesAsString() const {
+  return language_.c_str();
+}
+
+/**
+ * Fills langs (emptied first) with every language loaded by the last Init:
+ * the main language, then sub-languages, including dependency-loaded ones.
+ */
+void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const {
+  langs->clear();
+  if (tesseract_ == nullptr) {
+    return; // Nothing is loaded.
+  }
+  langs->push_back(tesseract_->lang);
+  const int sub_count = tesseract_->num_sub_langs();
+  for (int sub = 0; sub < sub_count; ++sub) {
+    langs->push_back(tesseract_->get_sub_lang(sub)->lang);
+  }
+}
+
+/** Fills langs (emptied first) with the sorted names of the available languages. */
+void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {
+  langs->clear();
+  if (tesseract_ == nullptr) {
+    return; // No engine, hence no data directory to scan.
+  }
+  addAvailableLanguages(tesseract_->datadir, langs);
+  // The result is documented as sorted.
+  std::sort(langs->begin(), langs->end());
+}
+
+/**
+ * Initialization for page layout analysis only. Use it solely for calls to
+ * SetImage and AnalysePage; calls that attempt recognition generate an error.
+ */
+void TessBaseAPI::InitForAnalysePage() {
+  if (tesseract_ != nullptr) return; // An engine already exists: keep it.
+  tesseract_ = new Tesseract;
+#ifndef DISABLED_LEGACY_ENGINE
+  // Legacy builds also initialize the adaptive classifier.
+  tesseract_->InitAdaptiveClassifier(nullptr);
+#endif
+}
+
+/**
+ * Read a "config" file containing a set of parameter name, value pairs.
+ * Searches the standard places: tessdata/configs, tessdata/tessconfigs
+ * and also accepts a relative or absolute path name.
+ */
+void TessBaseAPI::ReadConfigFile(const char *filename) {
+  tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY); // NOTE(review): tesseract_ unchecked
+}
+
+/** Same as ReadConfigFile, but only set debug params from the given config file. */
+void TessBaseAPI::ReadDebugConfigFile(const char *filename) {
+  tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY); // NOTE(review): tesseract_ unchecked
+}
+
+/**
+ * Set the current page segmentation mode. Defaults to PSM_AUTO.
+ * The mode is stored as an IntParam so it can also be modified by
+ * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
+ */
+void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
+  if (tesseract_ == nullptr) { // Create the engine on demand.
+    tesseract_ = new Tesseract;
+  }
+  tesseract_->tessedit_pageseg_mode.set_value(mode);
+}
+
+/** Return the current page segmentation mode (PSM_SINGLE_BLOCK without an engine). */
+PageSegMode TessBaseAPI::GetPageSegMode() const {
+  // The mode is read from its parameter as an int and converted back.
+  return (tesseract_ == nullptr)
+             ? PSM_SINGLE_BLOCK
+             : static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode));
+}
+
+/**
+ * Recognize a rectangle from an image and return the result as a string.
+ * May be called many times for a single Init.
+ * There is currently no error checking beyond rejecting a missing engine and
+ * rectangles smaller than kMinRectSize (nullptr is returned for those).
+ * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
+ * Palette color images will not work properly; convert them to 24 bit.
+ * Binary images of 1 bit per pixel may also be given but they must be
+ * byte packed with the MSB of the first byte being the first pixel, and a
+ * one pixel is WHITE. For binary images set bytes_per_pixel=0.
+ * The recognized text is returned as a UTF8-coded char* which must be
+ * freed with the delete [] operator.
+ */
+char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
+                                 int bytes_per_line, int left, int top, int width, int height) {
+  const bool too_small = width < kMinRectSize || height < kMinRectSize;
+  if (tesseract_ == nullptr || too_small) {
+    return nullptr; // Nothing worth doing.
+  }
+  // This original api does not pass the exact image size, so a reasonable
+  // one is invented from the line stride and the rectangle's bottom edge.
+  const int bits_per_pixel = (bytes_per_pixel == 0) ? 1 : bytes_per_pixel * 8;
+  const int assumed_width = bytes_per_line * 8 / bits_per_pixel;
+  const int assumed_height = height + top;
+  SetImage(imagedata, assumed_width, assumed_height, bytes_per_pixel, bytes_per_line);
+  SetRectangle(left, top, width, height);
+  return GetUTF8Text();
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+/**
+ * Call between pages or documents etc to free up memory and forget
+ * adaptive data: resets the adaptive classifier and the document
+ * dictionary. Does nothing when no engine exists.
+ */
+void TessBaseAPI::ClearAdaptiveClassifier() {
+  if (tesseract_ != nullptr) {
+    tesseract_->ResetAdaptiveClassifier();
+    tesseract_->ResetDocumentDictionary();
+  }
+}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/**
+ * Provide an image for Tesseract to recognize, in the format described for
+ * TesseractRect above. The buffer is copied and converted to Pix. All
+ * recognition results are cleared and the rectangle is set to the full image,
+ * so GetUTF8Text may follow immediately and will perform recognition.
+ */
+void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height,
+                           int bytes_per_pixel, int bytes_per_line) {
+  if (!InternalSetImage()) {
+    return;
+  }
+  thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
+  SetInputImage(thresholder_->GetPixRect());
+}
+
+void TessBaseAPI::SetSourceResolution(int ppi) {
+  if (thresholder_ == nullptr) { // No thresholder has been set up yet.
+    tprintf("Please call SetImage before SetSourceResolution.\n");
+    return;
+  }
+  thresholder_->SetSourceYResolution(ppi);
+}
+
+/**
+ * Provide an image for Tesseract to recognize. As with SetImage above,
+ * Tesseract takes its own copy, so the image need not persist until after
+ * Recognize. Prefer Pix over raw data: Pix is Tesseract's internal
+ * representation. A 4-sample PNG has its alpha channel removed in pix itself.
+ */
+void TessBaseAPI::SetImage(Pix *pix) {
+  if (!InternalSetImage()) {
+    return;
+  }
+  const bool png_with_alpha = pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG;
+  if (png_with_alpha) {
+    // Strip the alpha channel and write the 3-sample result back into pix.
+    Pix *opaque = pixRemoveAlpha(pix);
+    pixSetSpp(opaque, 3);
+    (void)pixCopy(pix, opaque);
+    pixDestroy(&opaque);
+  }
+  thresholder_->SetImage(pix);
+  SetInputImage(thresholder_->GetPixRect());
+}
+
+/**
+ * Restrict recognition to a sub-rectangle of the image. Call after SetImage;
+ * without a thresholder the call is silently ignored.
+ * Each SetRectangle clears the recognition results so multiple rectangles
+ * can be recognized with the same image.
+ */
+void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
+  if (thresholder_ != nullptr) {
+    thresholder_->SetRectangle(left, top, width, height);
+    ClearResults();
+  }
+}
+
+/** ONLY available after SetImage if you have Leptonica installed.
+ * Returns a clone of the internal thresholded image, or nullptr on failure. */
+Pix *TessBaseAPI::GetThresholdedImage() {
+  if (tesseract_ == nullptr || thresholder_ == nullptr) {
+    return nullptr;
+  }
+  if (tesseract_->pix_binary() == nullptr) { // Not thresholded yet: do it now.
+    if (!Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
+      return nullptr;
+    }
+  }
+  return tesseract_->pix_binary().clone();
+}
+
+/**
+ * Get the result of page layout analysis as a leptonica-style
+ * Boxa, Pixa pair of blocks (RIL_BLOCK), in reading order.
+ * Can be called before or after Recognize.
+ */
+Boxa *TessBaseAPI::GetRegions(Pixa **pixa) {
+  return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
+}
+
+/**
+ * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order.
+ * Can be called before or after Recognize. If raw_image is true, the boxes
+ * refer to the original image and are padded by raw_padding.
+ * If blockids is not nullptr, the block-id of each line is also returned; if
+ * paraids is not nullptr, so is the paragraph-id of each line within its
+ * block. Both are arrays of one element per line. delete [] after use.
+ */
+Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,
+                                int **blockids, int **paraids) {
+  return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids);
+}
+
+/**
+ * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
+ * pair, in reading order. Enables downstream handling of non-rectangular
+ * regions. Works at RIL_TEXTLINE level.
+ * Can be called before or after Recognize.
+ * If blockids is not nullptr, the block-id of each line is also returned as an
+ * array of one element per line. delete [] after use.
+ */
+Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {
+  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
+}
+
+/**
+ * Get the words (RIL_WORD components) as a leptonica-style
+ * Boxa, Pixa pair, in reading order.
+ * Can be called before or after Recognize.
+ */
+Boxa *TessBaseAPI::GetWords(Pixa **pixa) {
+  return GetComponentImages(RIL_WORD, true, pixa, nullptr);
+}
+
+/**
+ * Gets the individual connected (text) components (created
+ * after the page segmentation step, but before recognition)
+ * as a leptonica-style Boxa, Pixa pair, in reading order (RIL_SYMBOL level).
+ * Can be called before or after Recognize.
+ */
+Boxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) {
+  return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
+}
+
+/**
+ * Get the given level kind of components (block, textline, word etc.) as a
+ * leptonica-style Boxa, Pixa pair, in reading order, or nullptr on failure.
+ * Can be called before or after Recognize.
+ * If blockids / paraids is not nullptr, the block-id / paragraph-id within its
+ * block of each component is also returned as an array of one element per
+ * component. delete [] after use. If text_only is true, only text is returned.
+ */
+Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image,
+                                      const int raw_padding, Pixa **pixa, int **blockids,
+                                      int **paraids) {
+  /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator());
+  if (page_it == nullptr) {
+    page_it.reset(AnalyseLayout());
+  }
+  if (page_it == nullptr) {
+    return nullptr; // Failed.
+  }
+
+  // Count the components to get a size for the arrays.
+  int component_count = 0;
+  int left, top, right, bottom;
+
+  if (raw_image) {
+    // Get bounding box in original raw image with padding.
+    do {
+      if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) &&
+          (!text_only || PTIsTextType(page_it->BlockType()))) {
+        ++component_count;
+      }
+    } while (page_it->Next(level));
+  } else {
+    // Get bounding box from binarized imaged. Note that this could be
+    // differently scaled from the original image.
+    do {
+      if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
+          (!text_only || PTIsTextType(page_it->BlockType()))) {
+        ++component_count;
+      }
+    } while (page_it->Next(level));
+  }
+
+  Boxa *boxa = boxaCreate(component_count);
+  if (pixa != nullptr) {
+    *pixa = pixaCreate(component_count);
+  }
+  if (blockids != nullptr) {
+    *blockids = new int[component_count];
+  }
+  if (paraids != nullptr) {
+    *paraids = new int[component_count];
+  }
+
+  int blockid = 0;
+  int paraid = 0;
+  int component_index = 0;
+  page_it->Begin();
+  do {
+    bool got_bounding_box;
+    if (raw_image) {
+      got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom);
+    } else {
+      got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);
+    }
+    if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) {
+      Box *lbox = boxCreate(left, top, right - left, bottom - top);
+      boxaAddBox(boxa, lbox, L_INSERT);
+      if (pixa != nullptr) {
+        Pix *pix = nullptr;
+        if (raw_image) {
+          pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top);
+        } else {
+          pix = page_it->GetBinaryImage(level);
+        }
+        pixaAddPix(*pixa, pix, L_INSERT);
+        pixaAddBox(*pixa, lbox, L_CLONE);
+      }
+      if (paraids != nullptr) {
+        (*paraids)[component_index] = paraid;
+      }
+      if (blockids != nullptr) {
+        (*blockids)[component_index] = blockid;
+      }
+      // Count blocks even without blockids so paragraph ids restart in each block.
+      if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
+        ++blockid;
+        paraid = 0;
+      } else if (page_it->IsAtFinalElement(RIL_PARA, level)) {
+        ++paraid;
+      }
+      ++component_index;
+    }
+  } while (page_it->Next(level));
+  return boxa;
+}
+
+int TessBaseAPI::GetThresholdedImageScaleFactor() const {
+  // Scale factor reported by the thresholder, or 0 when none has been
+  // set up yet.
+  const bool have_thresholder = thresholder_ != nullptr;
+  return have_thresholder ? thresholder_->GetScaleFactor() : 0;
+}
+
+/**
+ * Runs page layout analysis in the mode set by SetPageSegMode.
+ * May optionally be called prior to Recognize to get access to just
+ * the page layout results. Returns an iterator to the results.
+ * If merge_similar_words is true, words are combined where suitable for use
+ * with a line recognizer. Use if you want to use AnalyseLayout to find the
+ * textlines, and then want to process textline fragments with an external
+ * line recognizer. This overload passes merge_similar_words = false.
+ * Returns nullptr on error or an empty page.
+ * The returned iterator must be deleted after use.
+ * WARNING! This class points to data held within the TessBaseAPI class, and
+ * therefore can only be used while the TessBaseAPI class still exists and
+ * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+ * DetectOS, or anything else that changes the internal PAGE_RES.
+ */
+PageIterator *TessBaseAPI::AnalyseLayout() {
+  return AnalyseLayout(false);
+}
+
+PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
+  if (FindLines() != 0 || block_list_->empty()) {
+    return nullptr; // Line finding failed or the page was empty.
+  }
+  // Free the results of any earlier analysis instead of leaking them; as
+  // documented, iterators handed out before this call become invalid.
+  delete page_res_;
+  page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
+  DetectParagraphs(false);
+  return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
+                          thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
+                          rect_width_, rect_height_);
+}
+
+/**
+ * Recognize the tesseract global image and return the result as Tesseract
+ * internal structures. Returns 0 on success (or an empty page), -1 on failure.
+ */
+int TessBaseAPI::Recognize(ETEXT_DESC *monitor) {
+  if (tesseract_ == nullptr) {
+    return -1;
+  }
+  if (FindLines() != 0) {
+    return -1;
+  }
+  delete page_res_; // Discard the results of any previous run.
+  if (block_list_->empty()) {
+    page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_);
+    return 0; // Empty page.
+  }
+
+  tesseract_->SetBlackAndWhitelist();
+  recognition_done_ = true;
+#ifndef DISABLED_LEGACY_ENGINE
+  if (tesseract_->tessedit_resegment_from_line_boxes) {
+    page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_);
+  } else if (tesseract_->tessedit_resegment_from_boxes) {
+    page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_);
+  } else
+#endif // ndef DISABLED_LEGACY_ENGINE
+  {
+    page_res_ =
+        new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_);
+  }
+
+  if (page_res_ == nullptr) {
+    return -1;
+  }
+
+  if (tesseract_->tessedit_train_line_recognizer) {
+    if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) {
+      return -1;
+    }
+    tesseract_->CorrectClassifyWords(page_res_);
+    return 0;
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  if (tesseract_->tessedit_make_boxes_from_boxes) {
+    tesseract_->CorrectClassifyWords(page_res_);
+    return 0;
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+  int result = 0;
+  if (tesseract_->interactive_display_mode) {
+#ifndef GRAPHICS_DISABLED
+    tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
+#endif // !GRAPHICS_DISABLED
+       // The page_res is invalid after an interactive session, so clean up
+       // in a way that lets us continue to the next page without crashing.
+    delete page_res_;
+    page_res_ = nullptr;
+    return -1;
+#ifndef DISABLED_LEGACY_ENGINE
+  } else if (tesseract_->tessedit_train_from_boxes) {
+    std::string fontname;
+    ExtractFontName(output_file_.c_str(), &fontname);
+    tesseract_->ApplyBoxTraining(fontname, page_res_);
+  } else if (tesseract_->tessedit_ambigs_training) {
+    FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str());
+    // OCR the page segmented into words by tesseract.
+    tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor,
+                                         training_output_file);
+    fclose(training_output_file); // NOTE(review): not checked for nullptr before closing.
+#endif // ndef DISABLED_LEGACY_ENGINE
+  } else {
+    // Now run the main recognition.
+    bool wait_for_text = true; // Stays true if the variable lookup below fails.
+    GetBoolVariable("paragraph_text_based", &wait_for_text);
+    if (!wait_for_text) {
+      DetectParagraphs(false);
+    }
+    if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
+      if (wait_for_text) {
+        DetectParagraphs(true);
+      }
+    } else {
+      result = -1;
+    }
+  }
+  return result;
+}
+
+// Takes ownership of the input pix, which is handed to set_pix_original.
+void TessBaseAPI::SetInputImage(Pix *pix) {
+  tesseract_->set_pix_original(pix); // NOTE(review): tesseract_ is not null-checked.
+}
+
+/** Returns the image reported by the engine's pix_original(). */
+Pix *TessBaseAPI::GetInputImage() {
+  Pix *original = tesseract_->pix_original();
+  return original;
+}
+
+/** Returns the stored input file name, or nullptr while that name is empty. */
+const char *TessBaseAPI::GetInputName() {
+  return input_file_.empty() ? nullptr : input_file_.c_str();
+}
+
+/** Returns the engine's datadir as a C string owned by the engine. */
+const char *TessBaseAPI::GetDatapath() {
+  const auto &dir = tesseract_->datadir;
+  return dir.c_str();
+}
+
+/** Returns the thresholder's source Y resolution, or -1 when no thresholder exists. */
+int TessBaseAPI::GetSourceYResolution() {
+  return (thresholder_ != nullptr) ? thresholder_->GetSourceYResolution() : -1;
+}
+
+/**
+ * Runs ProcessPage on every image named in a file list, one path per line.
+ * The list is read from flist when it is non-null; otherwise it is taken
+ * from the text in buf. Supporting both looks convoluted, but it allows
+ * streaming from stdin and also works on platforms lacking fmemopen.
+ * A non-negative tessedit_page_number selects that single (0-based) entry;
+ * a negative value processes every entry.
+ * Returns false when neither source is given, the text in buf holds no
+ * lines, an image cannot be read, a page fails, or the renderer refuses to
+ * begin or end the document.
+ * TODO: check different logic for flist/buf and simplify.
+ */
+bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config,
+                                       int timeout_millisec, TessResultRenderer *renderer,
+                                       int tessedit_page_number) {
+  if (flist == nullptr && buf == nullptr) {
+    return false;
+  }
+  const bool single_page = tessedit_page_number >= 0;
+  unsigned page = single_page ? tessedit_page_number : 0;
+  char pagename[MAX_PATH];
+
+  std::vector<std::string> lines;
+  if (flist == nullptr) {
+    // Cut the buffer at every LF; empty lines in the middle are kept.
+    std::string::size_type start = 0;
+    for (auto eol = buf->find('\n', start); eol != std::string::npos;
+         eol = buf->find('\n', start)) {
+      lines.emplace_back(*buf, start, eol - start);
+      start = eol + 1;
+    }
+    if (start < buf->size()) {
+      // Add last line without terminating LF.
+      lines.emplace_back(*buf, start, std::string::npos);
+    }
+    if (lines.empty()) {
+      return false;
+    }
+  } else {
+    // Skip to the requested page number by consuming that many lines.
+    for (unsigned skipped = 0; skipped < page; ++skipped) {
+      if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
+        break;
+      }
+    }
+  }
+
+  // Begin producing output
+  if (renderer != nullptr && !renderer->BeginDocument(document_title.c_str())) {
+    return false;
+  }
+
+  // Loop over all pages - or just the requested one
+  for (;; ++page) {
+    if (flist != nullptr) {
+      if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
+        break;
+      }
+    } else if (page < lines.size()) {
+      snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
+    } else {
+      break;
+    }
+    chomp_string(pagename);
+    Pix *pix = pixRead(pagename);
+    if (pix == nullptr) {
+      tprintf("Image file %s cannot be read!\n", pagename);
+      return false;
+    }
+    tprintf("Page %u : %s\n", page, pagename);
+    const bool page_ok = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer);
+    pixDestroy(&pix);
+    if (!page_ok) {
+      return false;
+    }
+    if (single_page) {
+      break;
+    }
+  }
+
+  // Finish producing output
+  return renderer == nullptr || renderer->EndDocument();
+}
+
+/**
+ * Runs ProcessPage on the pages of a TIFF that may hold several pages.
+ * The image is decoded from the memory block data/size when data is
+ * non-null, otherwise from the file called filename.
+ * A non-negative tessedit_page_number reads only that page; otherwise
+ * pages are read one after another until the reader returns no image or
+ * leaves the read offset at zero.
+ * Returns false as soon as ProcessPage fails for a page, true otherwise.
+ */
+bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename,
+                                            const char *retry_config, int timeout_millisec,
+                                            TessResultRenderer *renderer,
+                                            int tessedit_page_number) {
+  const bool single_page = tessedit_page_number >= 0;
+  const bool from_memory = data != nullptr;
+  int page = single_page ? tessedit_page_number : 0;
+  size_t offset = 0;
+  while (true) {
+    Pix *pix = nullptr;
+    if (single_page) {
+      pix = from_memory ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page);
+    } else {
+      pix = from_memory ? pixReadMemFromMultipageTiff(data, size, &offset)
+                        : pixReadFromMultipageTiff(filename, &offset);
+    }
+    if (pix == nullptr) {
+      break;
+    }
+    // Only print page number for multipage TIFF file.
+    if (offset != 0 || page > 0) {
+      tprintf("Page %d\n", page + 1);
+    }
+    const std::string page_string = std::to_string(page);
+    SetVariable("applybox_page", page_string.c_str());
+    const bool page_ok = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer);
+    pixDestroy(&pix);
+    if (!page_ok) {
+      return false;
+    }
+    if (single_page || offset == 0) {
+      break;
+    }
+    ++page;
+  }
+  return true;
+}
+
+// Master ProcessPages: delegates to ProcessPagesInternal and then does the
+// post-processing required by training modes. Unless the legacy engine is
+// compiled out, a successful run with tessedit_train_from_boxes set writes
+// the TR file for output_file_; a failed write is reported and turns the
+// result into false.
+bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,
+                               TessResultRenderer *renderer) {
+  if (!ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer)) {
+    return false;
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) {
+    tprintf("Write of TR file failed: %s\n", output_file_.c_str());
+    return false;
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+  return true;
+}
+
+#ifdef HAVE_LIBCURL
+// Write callback handed to libcurl: appends the size * nmemb bytes found at
+// contents to the std::string that userp points to and returns that byte
+// count.
+static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
+  const size_t total = size * nmemb;
+  auto *sink = static_cast<std::string *>(userp);
+  sink->append(static_cast<const char *>(contents), total);
+  return total;
+}
+#endif
+
+// Works out what kind of input "filename" names and runs the matching page
+// loop. "stdin" or "-" means standard input; a name containing "://" is
+// downloaded with libcurl when built with HAVE_LIBCURL. Content of unknown
+// format is treated as a list of image paths, TIFF goes through the
+// multipage reader, and anything else is processed as a single image.
+// Returns false on any read, download, page or renderer failure.
+//
+// Ideally OCR starts as soon as data arrives (e.g. file names streamed
+// from a slow book scanner), but stdin cannot be seeked, which makes
+// format autodetection impractical. So the stream_filelist flag names the
+// one streaming case that matters; other stdin input is buffered first.
+bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config,
+                                       int timeout_millisec, TessResultRenderer *renderer) {
+  bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
+  if (stdInput) {
+#ifdef WIN32
+    if (_setmode(_fileno(stdin), _O_BINARY) == -1)
+      tprintf("ERROR: cin to binary: %s", strerror(errno)); // NOTE(review): message has no trailing newline
+#endif // WIN32
+  }
+
+  if (stream_filelist) {
+    return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer,
+                                tesseract_->tessedit_page_number);
+  }
+
+  // At this point we are officially in autodetection territory.
+  // That means any data in stdin must be buffered, to make it
+  // seekable.
+  std::string buf;
+  const l_uint8 *data = nullptr; // stays nullptr when the input is a regular file
+  if (stdInput) {
+    buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>()));
+    data = reinterpret_cast<const l_uint8 *>(buf.data());
+  } else if (strstr(filename, "://") != nullptr) {
+    // Get image or image list by URL; the response body is collected in buf.
+#ifdef HAVE_LIBCURL
+    CURL *curl = curl_easy_init();
+    if (curl == nullptr) {
+      fprintf(stderr, "Error, curl_easy_init failed\n");
+      return false;
+    } else {
+      CURLcode curlcode;
+      auto error = [curl, &curlcode](const char *function) {
+        fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode));
+        curl_easy_cleanup(curl);
+        return false;
+      };
+      curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
+      if (curlcode != CURLE_OK) {
+        return error("curl_easy_setopt");
+      }
+      curlcode = curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
+      if (curlcode != CURLE_OK) {
+        return error("curl_easy_setopt");
+      }
+      // Follow HTTP, HTTPS, FTP and FTPS redirects.
+      curlcode = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
+      if (curlcode != CURLE_OK) {
+        return error("curl_easy_setopt");
+      }
+      // Allow no more than 8 redirections to prevent endless loops.
+      curlcode = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 8);
+      if (curlcode != CURLE_OK) {
+        return error("curl_easy_setopt");
+      }
+      int timeout = curl_timeout;
+      if (timeout > 0) {
+        curlcode = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
+        if (curlcode != CURLE_OK) {
+          return error("curl_easy_setopt");
+        }
+        curlcode = curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
+        if (curlcode != CURLE_OK) {
+          return error("curl_easy_setopt");
+        }
+      }
+      std::string cookiefile = curl_cookiefile;
+      if (!cookiefile.empty()) {
+        curlcode = curl_easy_setopt(curl, CURLOPT_COOKIEFILE, cookiefile.c_str());
+        if (curlcode != CURLE_OK) {
+          return error("curl_easy_setopt");
+        }
+      }
+      curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+      if (curlcode != CURLE_OK) {
+        return error("curl_easy_setopt");
+      }
+      curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
+      if (curlcode != CURLE_OK) {
+        return error("curl_easy_setopt");
+      }
+      curlcode = curl_easy_setopt(curl, CURLOPT_USERAGENT, "Tesseract OCR");
+      if (curlcode != CURLE_OK) {
+        return error("curl_easy_setopt");
+      }
+      curlcode = curl_easy_perform(curl);
+      if (curlcode != CURLE_OK) {
+        return error("curl_easy_perform");
+      }
+      curl_easy_cleanup(curl);
+      data = reinterpret_cast<const l_uint8 *>(buf.data());
+    }
+#else
+    fprintf(stderr, "Error, this tesseract has no URL support\n");
+    return false;
+#endif
+  } else {
+    // Check whether the input file can be read; it is reopened by the readers below.
+    if (FILE *file = fopen(filename, "rb")) {
+      fclose(file);
+    } else {
+      fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno));
+      return false;
+    }
+  }
+
+  // Here is our autodetection: buffered data is inspected in memory, a file by name.
+  int format; // only read below when r == 0
+  int r =
+      (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format);
+
+  // Maybe we have a filelist: detection failed or reported an unknown format.
+  if (r != 0 || format == IFF_UNKNOWN) {
+    std::string s;
+    if (data != nullptr) {
+      s = buf.c_str(); // copy stops at the first NUL byte
+    } else {
+      std::ifstream t(filename);
+      std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
+      s = u.c_str(); // copy stops at the first NUL byte
+    }
+    return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer,
+                                tesseract_->tessedit_page_number);
+  }
+
+  // Maybe we have a TIFF which is potentially multipage
+  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE ||
+               format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
+#if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
+               format == IFF_TIFF_JPEG ||
+#endif
+               format == IFF_TIFF_ZIP);
+
+  // Fail early if we can, before producing any output (non-TIFF input is decoded here).
+  Pix *pix = nullptr;
+  if (!tiff) {
+    pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
+    if (pix == nullptr) {
+      return false;
+    }
+  }
+
+  // Begin the output
+  if (renderer && !renderer->BeginDocument(document_title.c_str())) {
+    pixDestroy(&pix);
+    return false;
+  }
+
+  // Produce output: TIFF pages are read by the multipage loop, other images are page 0.
+  r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec,
+                                         renderer, tesseract_->tessedit_page_number)
+             : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer);
+
+  // Clean up memory as needed (pix is still nullptr in the TIFF case).
+  pixDestroy(&pix);
+
+  // End the output; EndDocument is not called when processing failed.
+  if (!r || (renderer && !renderer->EndDocument())) {
+    return false;
+  }
+  return true;
+}
+
+/**
+ * Processes one page image: registers filename and pix as the current
+ * input, runs the step selected by tessedit_pageseg_mode (layout analysis
+ * only, FindLines only, or full recognition, optionally with a deadline of
+ * timeout_millisec), and on success passes the result to renderer.
+ * With tessedit_write_images set, the thresholded image is written to
+ * "<output_file_>.processed[<page_index>].tif".
+ * If the first attempt failed and retry_config names a config file, the
+ * current variables are saved to kOldVarsFile, recognition is repeated
+ * with retry_config and the saved variables are read back; the page is
+ * still reported as failed.
+ * Returns true when the page was processed and rendered without failure.
+ */
+bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
+                              const char *retry_config, int timeout_millisec,
+                              TessResultRenderer *renderer) {
+  SetInputName(filename);
+  SetImage(pix);
+
+  bool failed = false;
+  if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
+    // Disabled character recognition
+    const std::unique_ptr<const PageIterator> layout(AnalyseLayout());
+    failed = (layout == nullptr);
+  } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
+    failed = (FindLines() != 0);
+  } else if (timeout_millisec <= 0) {
+    // Normal layout and character recognition with no timeout.
+    failed = (Recognize(nullptr) < 0);
+  } else {
+    // Running with a timeout.
+    ETEXT_DESC monitor;
+    monitor.cancel = nullptr;
+    monitor.cancel_this = nullptr;
+    monitor.set_deadline_msecs(timeout_millisec);
+
+    // Now run the main recognition.
+    failed = (Recognize(&monitor) < 0);
+  }
+
+  if (tesseract_->tessedit_write_images) {
+    Pix *page_pix = GetThresholdedImage();
+    // Page 0 gets no numeric suffix.
+    const std::string page_suffix = (page_index > 0) ? std::to_string(page_index) : std::string();
+    const std::string output_filename = output_file_ + ".processed" + page_suffix + ".tif";
+    pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4);
+    pixDestroy(&page_pix);
+  }
+
+  const bool want_retry = failed && retry_config != nullptr && retry_config[0] != '\0';
+  if (want_retry) {
+    // Save current config variables before switching modes.
+    if (FILE *fp = fopen(kOldVarsFile, "wb")) {
+      PrintVariables(fp);
+      fclose(fp);
+    } else {
+      tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
+    }
+    // Switch to alternate mode for retry.
+    ReadConfigFile(retry_config);
+    SetImage(pix);
+    Recognize(nullptr);
+    // Restore saved config variables.
+    ReadConfigFile(kOldVarsFile);
+  }
+
+  if (!failed && renderer != nullptr) {
+    failed = !renderer->AddImage(this);
+  }
+
+  return !failed;
+}
+
+/**
+ * Creates a left-to-right iterator over the results of LayoutAnalysis
+ * and/or Recognize. Returns nullptr when there is no engine or no page
+ * result yet. The caller must delete the returned iterator after use.
+ */
+LTRResultIterator *TessBaseAPI::GetLTRIterator() {
+  const bool have_results = (tesseract_ != nullptr) && (page_res_ != nullptr);
+  if (!have_results) {
+    return nullptr;
+  }
+  const auto scale = thresholder_->GetScaleFactor();
+  const auto scaled_yres = thresholder_->GetScaledYResolution();
+  return new LTRResultIterator(page_res_, tesseract_, scale, scaled_yres, rect_left_, rect_top_,
+                               rect_width_, rect_height_);
+}
+
+/**
+ * Creates a reading-order iterator over the results of LayoutAnalysis
+ * and/or Recognize. Returns nullptr when there is no engine or no page
+ * result yet. The caller must delete the returned iterator after use.
+ * WARNING! The iterator points to data held within the TessBaseAPI class,
+ * and therefore can only be used while the TessBaseAPI class still exists
+ * and has not been subjected to a call of Init, SetImage, Recognize, Clear,
+ * End, DetectOS, or anything else that changes the internal PAGE_RES.
+ */
+ResultIterator *TessBaseAPI::GetIterator() {
+  const bool have_results = (tesseract_ != nullptr) && (page_res_ != nullptr);
+  if (!have_results) {
+    return nullptr;
+  }
+  const auto scale = thresholder_->GetScaleFactor();
+  const auto scaled_yres = thresholder_->GetScaledYResolution();
+  const LTRResultIterator ltr_view(page_res_, tesseract_, scale, scaled_yres, rect_left_,
+                                   rect_top_, rect_width_, rect_height_);
+  return ResultIterator::StartOfParagraph(ltr_view);
+}
+
+/**
+ * Creates a mutable iterator over the results of LayoutAnalysis and/or
+ * Recognize. Returns nullptr when there is no engine or no page result
+ * yet. The caller must delete the returned iterator after use.
+ * WARNING! The iterator points to data held within the TessBaseAPI class,
+ * and therefore can only be used while the TessBaseAPI class still exists
+ * and has not been subjected to a call of Init, SetImage, Recognize, Clear,
+ * End, DetectOS, or anything else that changes the internal PAGE_RES.
+ */
+MutableIterator *TessBaseAPI::GetMutableIterator() {
+  const bool have_results = (tesseract_ != nullptr) && (page_res_ != nullptr);
+  if (!have_results) {
+    return nullptr;
+  }
+  const auto scale = thresholder_->GetScaleFactor();
+  const auto scaled_yres = thresholder_->GetScaledYResolution();
+  return new MutableIterator(page_res_, tesseract_, scale, scaled_yres, rect_left_, rect_top_,
+                             rect_width_, rect_height_);
+}
+
+/**
+ * Make a text string from the internal data structures.
+ * Runs Recognize first if recognition has not been done yet.
+ * The text of every non-empty paragraph is concatenated in iterator order;
+ * image and line blocks are skipped.
+ * Returns nullptr if there is no engine, recognition fails, or no result
+ * iterator is available (no page result); otherwise the string produced by
+ * copy_string.
+ */
+char *TessBaseAPI::GetUTF8Text() {
+  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
+    return nullptr;
+  }
+  const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator());
+  if (it == nullptr) {
+    // GetIterator() returns nullptr when page_res_ is null, which can happen
+    // even though recognition_done_ is set; do not dereference it.
+    return nullptr;
+  }
+  std::string text;
+  do {
+    if (it->Empty(RIL_PARA)) {
+      continue;
+    }
+    auto block_type = it->BlockType();
+    switch (block_type) {
+      case PT_FLOWING_IMAGE:
+      case PT_HEADING_IMAGE:
+      case PT_PULLOUT_IMAGE:
+      case PT_HORZ_LINE:
+      case PT_VERT_LINE:
+        // Ignore images and lines for text output.
+        continue;
+      case PT_NOISE:
+        tprintf("TODO: Please report image which triggers the noise case.\n");
+        ASSERT_HOST(false);
+        break;
+      default:
+        break;
+    }
+
+    const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
+    if (para_text != nullptr) {
+      // Appending a null char pointer to std::string is undefined.
+      text += para_text.get();
+    }
+  } while (it->Next(RIL_PARA));
+  return copy_string(text);
+}
+
+/**
+ * Appends the bounding box of the iterator's current element at the given
+ * level to text as four tab-prefixed fields: left, top, width, height.
+ * The coordinates start at 0, so a failed BoundingBox call yields zeros
+ * instead of indeterminate values.
+ */
+static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {
+  int left = 0;
+  int top = 0;
+  int right = 0;
+  int bottom = 0;
+  it->BoundingBox(level, &left, &top, &right, &bottom);
+  text += "\t" + std::to_string(left);
+  text += "\t" + std::to_string(top);
+  text += "\t" + std::to_string(right - left);
+  text += "\t" + std::to_string(bottom - top);
+}
+
+/**
+ * Make a TSV-formatted string from the internal data structures.
+ * page_number is 0-based but will appear in the output as 1-based.
+ * Returned string must be freed with the delete [] operator.
+ *
+ * Every row has the tab-separated columns
+ *   level, page, block, paragraph, line, word, left, top, width, height,
+ *   confidence, text
+ * where level is 1 (page), 2 (block), 3 (paragraph), 4 (line) or 5 (word).
+ * Rows of level 1-4 carry confidence -1 and an empty text column.
+ * Returns nullptr if there is no engine, recognition fails, or no result
+ * iterator is available.
+ */
+char *TessBaseAPI::GetTSVText(int page_number) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
+    return nullptr;
+  }
+  const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
+  if (res_it == nullptr) {
+    // GetIterator() returns nullptr when there is no page result.
+    return nullptr;
+  }
+
+  const int page_num = page_number + 1; // we use 1-based page numbers.
+  int block_num = 0;
+  int par_num = 0;
+  int line_num = 0;
+  int word_num = 0;
+
+  std::string tsv_str;
+  // Appends "<level>\t<page>\t<block>\t<par>\t<line>\t<word>" using the
+  // current counter values.
+  const auto append_ids = [&](int level) {
+    tsv_str += std::to_string(level);
+    for (const int id : {page_num, block_num, par_num, line_num, word_num}) {
+      tsv_str += "\t" + std::to_string(id);
+    }
+  };
+
+  append_ids(1); // level 1 - page
+  tsv_str += "\t" + std::to_string(rect_left_);
+  tsv_str += "\t" + std::to_string(rect_top_);
+  tsv_str += "\t" + std::to_string(rect_width_);
+  tsv_str += "\t" + std::to_string(rect_height_);
+  tsv_str += "\t-1\t\n";
+
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+
+    // Add rows for any new block/paragraph/textline.
+    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+      block_num++;
+      par_num = 0;
+      line_num = 0;
+      word_num = 0;
+      append_ids(2); // level 2 - block
+      AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
+      tsv_str += "\t-1\t\n"; // end of row for block
+    }
+    if (res_it->IsAtBeginningOf(RIL_PARA)) {
+      par_num++;
+      line_num = 0;
+      word_num = 0;
+      append_ids(3); // level 3 - paragraph
+      AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
+      tsv_str += "\t-1\t\n"; // end of row for para
+    }
+    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+      line_num++;
+      word_num = 0;
+      append_ids(4); // level 4 - line
+      AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
+      tsv_str += "\t-1\t\n"; // end of row for line
+    }
+
+    // Now, process the word...
+    // Zero-initialized so a failed BoundingBox call cannot leak
+    // indeterminate values into the output.
+    int left = 0;
+    int top = 0;
+    int right = 0;
+    int bottom = 0;
+    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
+    word_num++;
+    append_ids(5); // level 5 - word
+    tsv_str += "\t" + std::to_string(left);
+    tsv_str += "\t" + std::to_string(top);
+    tsv_str += "\t" + std::to_string(right - left);
+    tsv_str += "\t" + std::to_string(bottom - top);
+    tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
+    tsv_str += "\t";
+
+    // The word text is the concatenation of its symbols; the iterator ends
+    // up at the start of the next word (or past the last block).
+    do {
+      const std::unique_ptr<const char[]> symbol(res_it->GetUTF8Text(RIL_SYMBOL));
+      if (symbol != nullptr) {
+        tsv_str += symbol.get();
+      }
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+    tsv_str += "\n"; // end of row
+  }
+
+  return copy_string(tsv_str);
+}
+
+/**
+ * The 5 numbers output for each box: the usual 4 box coordinates and a
+ * page number.
+ */
+const int kNumbersPerBlob = 5;
+/**
+ * The number of bytes taken by each number. Since we use int16_t for ICOORD,
+ * assume only 5 digits max.
+ */
+const int kBytesPerNumber = 5;
+/**
+ * Expected bytes per box file line, not counting the character itself:
+ * (kBytesPerNumber + one space) for each of the kNumbersPerBlob numbers,
+ * plus the newline. GetBoxText multiplies this by the blob count, then adds
+ * the original UTF8 characters and one kMaxBytesPerLine for safety.
+ */
+const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
+/** Max bytes in the decimal representation of int64_t. */
+const int kBytesPer64BitNumber = 20;
+/**
+ * A maximal single box could occupy kNumbersPerBlob numbers at
+ * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a
+ * space, plus the newline and the maximum length of a UNICHAR.
+ * GetBoxText tests the remaining space against this on each iteration for
+ * safety.
+ */
+const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN;
+
+/**
+ * The recognized text is returned as a char* which is coded
+ * as a UTF8 box file.
+ * page_number is a 0-base page index that will appear in the box file.
+ * Returned string must be freed with the delete [] operator.
+ *
+ * Each symbol with a bounding box produces one line
+ *   "<text> <left> <image_height_ - bottom> <right> <image_height_ - top> <page>".
+ * Returns nullptr if there is no engine, recognition fails, or no result
+ * iterator is available.
+ */
+char *TessBaseAPI::GetBoxText(int page_number) {
+  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
+    return nullptr;
+  }
+  // Owning the iterator through unique_ptr releases it on every exit path.
+  const std::unique_ptr<LTRResultIterator> it(GetLTRIterator());
+  if (it == nullptr) {
+    // GetLTRIterator() returns nullptr when there is no page result.
+    return nullptr;
+  }
+  // Initialized in case TextLength leaves it untouched on failure.
+  int blob_count = 0;
+  const int utf8_length = TextLength(&blob_count);
+  const int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine;
+  char *result = new char[total_length];
+  result[0] = '\0';
+  int output_length = 0;
+  do {
+    int left, top, right, bottom;
+    if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
+      const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
+      if (text == nullptr) {
+        // Nothing to write for this symbol; move on to the next one.
+        continue;
+      }
+      // Tesseract uses space for recognition failure. Fix to a reject
+      // character, kTesseractReject so we don't create illegal box files.
+      for (int i = 0; text[i] != '\0'; ++i) {
+        if (text[i] == ' ') {
+          text[i] = kTesseractReject;
+        }
+      }
+      snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n",
+               text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number);
+      output_length += strlen(result + output_length);
+      // Just in case: stop while a maximal line would still fit.
+      if (output_length + kMaxBytesPerLine > total_length) {
+        break;
+      }
+    }
+  } while (it->Next(RIL_SYMBOL));
+  return result;
+}
+
+/**
+ * Conversion table for non-latin characters.
+ * Maps characters out of the latin set into the latin set.
+ * Entries are Unicode code points (euro sign, left/right double quotes,
+ * left/right single quotes, bullet, em dash); the final 0 ends the table
+ * and is what the lookup loop in GetUNLVText stops on.
+ * TODO(rays) incorporate this translation into unicharset.
+ */
+const int kUniChs[] = {0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0};
+/**
+ * Latin chars corresponding to the unicode chars above, index by index
+ * (cent sign, quotation mark twice, apostrophe twice, middle dot,
+ * hyphen-minus), with the same 0 terminator.
+ */
+const int kLatinChs[] = {0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0};
+
+/**
+ * The recognized text is returned as a char* which is coded
+ * as UNLV format Latin-1 with specific reject and suspect codes.
+ * Returned string must be freed with the delete [] operator.
+ *
+ * Runs Recognize first if recognition has not been done yet and returns
+ * nullptr if there is no engine or recognition fails.
+ * Words whose unlv_crunch_mode is not CR_NONE are reduced to reject
+ * characters (kUNLVReject). All other words are written symbol by symbol:
+ * kUNLVSuspect precedes symbols marked rejected in the reject map, code
+ * points listed in kUniChs are replaced by their kLatinChs counterpart,
+ * and anything still above 0xff becomes kUNLVReject.
+ * NOTE(review): the output buffer is sized only by TextLength(nullptr);
+ * confirm there that it also covers the inserted spaces, markers, newlines
+ * and the terminating NUL written below.
+ */
+char *TessBaseAPI::GetUNLVText() {
+  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
+    return nullptr;
+  }
+  bool tilde_crunch_written = false; // a reject char was already written for the current crunched run
+  bool last_char_was_newline = true; // true at the start so the first word gets no leading space
+  bool last_char_was_tilde = false;  // last output char was kUNLVReject
+
+  int total_length = TextLength(nullptr);
+  PAGE_RES_IT page_res_it(page_res_);
+  char *result = new char[total_length];
+  char *ptr = result; // write position inside result
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    WERD_RES *word = page_res_it.word();
+    // Process the current word.
+    if (word->unlv_crunch_mode != CR_NONE) {
+      if (word->unlv_crunch_mode != CR_DELETE &&
+          (!tilde_crunch_written ||
+           (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 &&
+            !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
+        if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) &&
+            !word->word->flag(W_FUZZY_SP)) {
+          /* Write a space to separate from preceding good text */
+          *ptr++ = ' ';
+          last_char_was_tilde = false;
+        }
+        if (!last_char_was_tilde) {
+          // Write a reject char.
+          last_char_was_tilde = true;
+          *ptr++ = kUNLVReject;
+          tilde_crunch_written = true;
+          last_char_was_newline = false;
+        }
+      }
+    } else {
+      // NORMAL PROCESSING of non tilde crunched words.
+      tilde_crunch_written = false;
+      tesseract_->set_unlv_suspects(word);
+      const char *wordstr = word->best_choice->unichar_string().c_str();
+      const auto &lengths = word->best_choice->unichar_lengths();
+      int length = lengths.length();
+      int i = 0;      // index of the current symbol
+      int offset = 0; // byte offset of the current symbol within wordstr
+
+      if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') {
+        // Prevent adjacent tilde across words - we know that adjacent tildes
+        // within words have been removed.
+        // Skip the first character.
+        offset = lengths[i++];
+      }
+      if (i < length && wordstr[offset] != 0) {
+        if (!last_char_was_newline) {
+          *ptr++ = ' ';
+        } else {
+          last_char_was_newline = false;
+        }
+        for (; i < length; offset += lengths[i++]) {
+          if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) {
+            *ptr++ = kUNLVReject;
+            last_char_was_tilde = true;
+          } else {
+            if (word->reject_map[i].rejected()) {
+              *ptr++ = kUNLVSuspect;
+            }
+            UNICHAR ch(wordstr + offset, lengths[i]);
+            int uni_ch = ch.first_uni();
+            for (int j = 0; kUniChs[j] != 0; ++j) { // table lookup, 0 terminates kUniChs
+              if (kUniChs[j] == uni_ch) {
+                uni_ch = kLatinChs[j];
+                break;
+              }
+            }
+            if (uni_ch <= 0xff) {
+              *ptr++ = static_cast<char>(uni_ch);
+              last_char_was_tilde = false;
+            } else {
+              *ptr++ = kUNLVReject;
+              last_char_was_tilde = true;
+            }
+          }
+        }
+      }
+    }
+    if (word->word->flag(W_EOL) && !last_char_was_newline) {
+      /* Add a new line output */
+      *ptr++ = '\n';
+      tilde_crunch_written = false;
+      last_char_was_newline = true;
+      last_char_was_tilde = false;
+    }
+  }
+  *ptr++ = '\n'; // final newline is always written, followed by the terminator
+  *ptr = '\0';
+  return result;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+/**
+ * Detects the orientation of the input image and its apparent script
+ * (alphabet) by running DetectOS.
+ * Each output pointer may be nullptr, in which case that value is skipped:
+ *  - orient_deg: detected clockwise rotation of the input image in degrees
+ *    (0, 90, 180, 270)
+ *  - orient_conf: confidence of the orientation (15.0 is reasonably
+ *    confident)
+ *  - script_name: ASCII name of the script, e.g. "Latin"
+ *  - script_conf: confidence level in the script
+ * Returns true on success, false if DetectOS fails (nothing is written).
+ */
+bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf,
+                                          const char **script_name, float *script_conf) {
+  OSResults osr;
+  if (!DetectOS(&osr)) {
+    return false;
+  }
+
+  const int orient_id = osr.best_result.orientation_id;
+  const int script_id = osr.get_best_script(orient_id);
+
+  if (orient_deg != nullptr) {
+    *orient_deg = orient_id * 90; // convert quadrant to degrees
+  }
+  if (orient_conf != nullptr) {
+    *orient_conf = osr.best_result.oconfidence;
+  }
+  if (script_name != nullptr) {
+    *script_name = osr.unicharset->get_script_from_script_id(script_id);
+  }
+  if (script_conf != nullptr) {
+    *script_conf = osr.best_result.sconfidence;
+  }
+  return true;
+}
+
+/**
+ * Returns the orientation and script detection result as UTF8 text that
+ * must be freed with the delete [] operator, or nullptr if detection
+ * fails.
+ * page_number is a 0-based page index that will appear in the osd file.
+ */
+char *TessBaseAPI::GetOsdText(int page_number) {
+  int orient_deg;
+  float orient_conf;
+  const char *script_name;
+  float script_conf;
+  const bool detected =
+      DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf);
+  if (!detected) {
+    return nullptr;
+  }
+
+  // clockwise rotation needed to make the page upright
+  const int rotate = OrientationIdToValue(orient_deg / 90);
+
+  std::stringstream stream;
+  // Use "C" locale (needed for float values orient_conf and script_conf).
+  stream.imbue(std::locale::classic());
+  // Use fixed notation with 2 digits after the decimal point for float values.
+  stream.precision(2);
+  stream << std::fixed;
+  stream << "Page number: " << page_number << "\n";
+  stream << "Orientation in degrees: " << orient_deg << "\n";
+  stream << "Rotate: " << rotate << "\n";
+  stream << "Orientation confidence: " << orient_conf << "\n";
+  stream << "Script: " << script_name << "\n";
+  stream << "Script confidence: " << script_conf << "\n";
+  return copy_string(stream.str());
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/**
+ * Returns the average word confidence for Tesseract page result.
+ * The result is 0 when AllWordConfidences yields no array or no words.
+ */
+int TessBaseAPI::MeanTextConf() {
+  int *conf = AllWordConfidences();
+  if (conf == nullptr) {
+    return 0;
+  }
+  // The array ends at the first negative entry.
+  int total = 0;
+  int count = 0;
+  for (; conf[count] >= 0; ++count) {
+    total += conf[count];
+  }
+  delete[] conf;
+  return (count != 0) ? total / count : total;
+}
+
+/**
+ * Returns an array of all word confidences, terminated by -1.
+ * Runs Recognize first if recognition has not been done yet and returns
+ * nullptr if there is no engine or recognition fails.
+ * The array is allocated with new[].
+ */
+int *TessBaseAPI::AllWordConfidences() {
+  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
+    return nullptr;
+  }
+  PAGE_RES_IT res_it(page_res_);
+
+  // First pass: count the words to size the array.
+  int word_count = 0;
+  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
+    ++word_count;
+  }
+
+  // Second pass: fill in one confidence per word.
+  int *conf = new int[word_count + 1];
+  int next = 0;
+  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
+    const WERD_CHOICE *choice = res_it.word()->best_choice;
+    // This is the eq for converting Tesseract confidence to 1..100
+    const int raw = static_cast<int>(100 + 5 * choice->certainty());
+    conf[next++] = (raw < 0) ? 0 : ((raw > 100) ? 100 : raw);
+  }
+  conf[next] = -1;
+  return conf;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+/**
+ * Applies the given word to the adaptive classifier if possible.
+ * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
+ * tell the boundaries of the graphemes.
+ * Assumes that SetImage/SetRectangle have been used to set the image
+ * to the given word. The mode arg should be PSM_SINGLE_WORD or
+ * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
+ * The currently set PageSegMode is preserved.
+ * Returns false if adaption was not possible for some reason, which
+ * includes a null wordstr and the absence of any recognized text.
+ */
+bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) {
+  int debug = 0;
+  GetIntVariable("applybox_debug", &debug);
+  bool success = true;
+  PageSegMode current_psm = GetPageSegMode();
+  SetPageSegMode(mode);
+  SetVariable("classify_enable_learning", "0");
+  const std::unique_ptr<const char[]> text(GetUTF8Text());
+  if (debug) {
+    // Either string can be null at this point, and a null pointer must not
+    // be handed to a "%s" conversion, so print a placeholder instead.
+    tprintf("Trying to adapt \"%s\" to \"%s\"\n", text != nullptr ? text.get() : "(null)",
+            wordstr != nullptr ? wordstr : "(null)");
+  }
+  if (text != nullptr && wordstr != nullptr) {
+    PAGE_RES_IT it(page_res_);
+    WERD_RES *word_res = it.word();
+    if (word_res != nullptr) {
+      word_res->word->set_text(wordstr);
+      // Check to see if text matches wordstr. Newlines and spaces in text
+      // and spaces in wordstr are skipped during the comparison.
+      int w = 0;
+      int t;
+      for (t = 0; text[t] != '\0'; ++t) {
+        if (text[t] == '\n' || text[t] == ' ') {
+          continue;
+        }
+        while (wordstr[w] == ' ') {
+          ++w;
+        }
+        if (text[t] != wordstr[w]) {
+          break;
+        }
+        ++w;
+      }
+      if (text[t] != '\0' || wordstr[w] != '\0') {
+        // No match: rebuild the page results and resegment them.
+        delete page_res_;
+        std::vector<TBOX> boxes;
+        page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
+        tesseract_->ReSegmentByClassification(page_res_);
+        tesseract_->TidyUp(page_res_);
+        PAGE_RES_IT pr_it(page_res_);
+        if (pr_it.word() == nullptr) {
+          success = false;
+        } else {
+          word_res = pr_it.word();
+        }
+      } else {
+        word_res->BestChoiceToCorrectText();
+      }
+      if (success) {
+        tesseract_->EnableLearning = true;
+        tesseract_->LearnWord(nullptr, word_res);
+      }
+    } else {
+      success = false;
+    }
+  } else {
+    success = false;
+  }
+  // Restore the caller's page segmentation mode on every path.
+  SetPageSegMode(current_psm);
+  return success;
+}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/**
+ * Releases the recognition results and any image data held by the
+ * thresholder, while keeping recognition data that is expensive to reload.
+ * After this call, SetImage or TesseractRect must be called again before
+ * any Recognize or Get* operation.
+ */
+void TessBaseAPI::Clear() {
+  if (thresholder_) {
+    thresholder_->Clear();
+  }
+  ClearResults();
+  if (tesseract_ == nullptr) {
+    return;
+  }
+  SetInputImage(nullptr);
+}
+
+/**
+ * Shuts tesseract down and releases all memory. Calling End() has the same
+ * effect as destroying the TessBaseAPI and constructing a new one.
+ * After End(), only Init and the functions declared above it in the class
+ * definition may be used.
+ */
+void TessBaseAPI::End() {
+  Clear();
+
+  delete thresholder_;
+  thresholder_ = nullptr;
+
+  delete page_res_;
+  page_res_ = nullptr;
+
+  delete block_list_;
+  block_list_ = nullptr;
+
+  if (paragraph_models_ != nullptr) {
+    // The vector owns the models it points to.
+    for (ParagraphModel *entry : *paragraph_models_) {
+      delete entry;
+    }
+    delete paragraph_models_;
+    paragraph_models_ = nullptr;
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  // When the OSD engine is the main engine it is deleted once, below.
+  if (osd_tesseract_ != tesseract_) {
+    delete osd_tesseract_;
+  }
+  osd_tesseract_ = nullptr;
+
+  delete equ_detect_;
+  equ_detect_ = nullptr;
+#endif // ndef DISABLED_LEGACY_ENGINE
+  delete tesseract_;
+  tesseract_ = nullptr;
+
+  input_file_.clear();
+  output_file_.clear();
+  datapath_.clear();
+  language_.clear();
+}
+
+// Clears the library-level memory caches.
+// Several expensive-to-load constant data structures (mostly language
+// dictionaries) are cached globally and therefore survive the Init() and
+// End() of individual TessBaseAPI's. This function asks the global dawg
+// cache to delete the dawgs that are no longer in use.
+void TessBaseAPI::ClearPersistentCache() {
+  auto *dawg_cache = Dict::GlobalDawgCache();
+  dawg_cache->DeleteUnusedDawgs();
+}
+
+/**
+ * Check whether a word is valid according to Tesseract's language model.
+ * Returns 0 if the word is invalid, non-zero if valid.
+ * Also returns 0 if no engine has been initialized or word is nullptr,
+ * matching the null handling of NumDawgs() and GetDawg().
+ */
+int TessBaseAPI::IsValidWord(const char *word) const {
+  if (tesseract_ == nullptr || word == nullptr) {
+    return 0;
+  }
+  return tesseract_->getDict().valid_word(word);
+}
+// Returns true if utf8_character is defined in the UniCharset.
+// Returns false if no engine has been initialized or utf8_character is
+// nullptr, instead of dereferencing a null pointer.
+bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const {
+  if (tesseract_ == nullptr || utf8_character == nullptr) {
+    return false;
+  }
+  return tesseract_->unicharset.contains_unichar(utf8_character);
+}
+
+// TODO(rays) Obsolete this function and replace with a more aptly named
+// function that returns image coordinates rather than tesseract coordinates.
+//
+// Reports the baseline of the text line that the layout iterator initially
+// points at, as an offset and a slope in bottom-up tesseract coordinates.
+// Returns false, leaving *out_offset and *out_slope untouched, if layout
+// analysis yields no iterator or if the baseline or the bounding box of the
+// text line cannot be obtained.
+bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) {
+  const std::unique_ptr<const PageIterator> it(AnalyseLayout());
+  if (it == nullptr) {
+    return false;
+  }
+  int x1 = 0, x2 = 0, y1 = 0, y2 = 0;
+  // A failed lookup would otherwise leave the end points without a value.
+  if (!it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2)) {
+    return false;
+  }
+  // Calculate offset and slope (NOTE: Kind of ugly)
+  // Force a positive run so that the division below is well defined.
+  if (x2 <= x1) {
+    x2 = x1 + 1;
+  }
+  // Convert the point pair to slope/offset of the baseline (in image coords.)
+  float slope = static_cast<float>(y2 - y1) / (x2 - x1);
+  int offset = static_cast<int>(y1 - slope * x1);
+  // Get the y-coord of the baseline at the left and right edges of the
+  // textline's bounding box.
+  int left, top, right, bottom;
+  if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
+    return false;
+  }
+  int left_y = IntCastRounded(slope * left + offset);
+  int right_y = IntCastRounded(slope * right + offset);
+  // Shift the baseline down so it passes through the nearest bottom-corner
+  // of the textline's bounding box. This is the difference between the y
+  // at the lowest (max) edge of the box and the actual box bottom.
+  offset += bottom - std::max(left_y, right_y);
+  // Switch back to bottom-up tesseract coordinates. Requires negation of
+  // the slope and height - offset for the offset.
+  *out_slope = -slope;
+  *out_offset = rect_height_ - offset;
+
+  return true;
+}
+
+/**
+ * Points Dict::letter_is_okay_ at the given function.
+ * Does nothing if no engine has been initialized.
+ */
+void TessBaseAPI::SetDictFunc(DictFunc f) {
+  if (tesseract_ == nullptr) {
+    return;
+  }
+  auto &dict = tesseract_->getDict();
+  dict.letter_is_okay_ = f;
+}
+
+/**
+ * Points Dict::probability_in_context_ at the given function, both for the
+ * main engine and for each of its sub-languages. Does nothing if no engine
+ * has been initialized.
+ *
+ * @param f A single function that returns the probability of the current
+ * "character" (in general a utf-8 string), given the context of a previous
+ * utf-8 string.
+ */
+void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
+  if (tesseract_ == nullptr) {
+    return;
+  }
+  tesseract_->getDict().probability_in_context_ = f;
+  // The sub-languages carry their own dictionaries; update those as well.
+  for (int lang = 0, lang_count = tesseract_->num_sub_langs(); lang < lang_count; ++lang) {
+    tesseract_->get_sub_lang(lang)->getDict().probability_in_context_ = f;
+  }
+}
+
+/**
+ * Shared preparation for setting an image: requires an initialized engine,
+ * creates the thresholder on first use and clears previous results.
+ * Returns false (after printing a hint) if Init has not been called.
+ */
+bool TessBaseAPI::InternalSetImage() {
+  if (!tesseract_) {
+    tprintf("Please call Init before attempting to set an image.\n");
+    return false;
+  }
+  if (!thresholder_) {
+    thresholder_ = new ImageThresholder;
+  }
+  ClearResults();
+  return true;
+}
+
+/**
+ * Run the thresholder to make the thresholded image, returned in pix,
+ * which must not be nullptr. *pix must be initialized to nullptr, or point
+ * to an existing pixDestroyable Pix.
+ * The usual argument to Threshold is Tesseract::mutable_pix_binary().
+ *
+ * Steps, in order:
+ *  - Any Pix already held in *pix is destroyed.
+ *  - The source resolution is made credible: a non-zero "user_defined_dpi"
+ *    variable is always applied (with a warning if it lies outside
+ *    kMinCredibleResolution..kMaxCredibleResolution); otherwise a scaled
+ *    y-resolution outside that range is replaced by kMinCredibleResolution.
+ *  - The image is thresholded: with ThresholdToPix() for the Otsu method,
+ *    with the thresholder's Threshold() for every other method. The
+ *    threshold and grey images are passed on to tesseract_ (as nullptr for
+ *    already-binary input in the Otsu case).
+ *  - The rect_* and image_* members are refreshed from the thresholder, and
+ *    the estimated resolution, clipped to the credible range, is set as the
+ *    source resolution of tesseract_.
+ *
+ * @param pix Receives the binary image; must not be nullptr.
+ * @return false if thresholding failed, true otherwise.
+ */
+bool TessBaseAPI::Threshold(Pix **pix) {
+  ASSERT_HOST(pix != nullptr);
+  if (*pix != nullptr) {
+    pixDestroy(pix);
+  }
+  // Zero resolution messes up the algorithms, so make sure it is credible.
+  int user_dpi = 0;
+  GetIntVariable("user_defined_dpi", &user_dpi);
+  int y_res = thresholder_->GetScaledYResolution(); // Read before any override below.
+  if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) {
+    tprintf(
+        "Warning: User defined image dpi is outside of expected range "
+        "(%d - %d)!\n",
+        kMinCredibleResolution, kMaxCredibleResolution);
+  }
+  // Always use the user defined dpi when one is set, even if it is out of range.
+  if (user_dpi) {
+    thresholder_->SetSourceYResolution(user_dpi);
+  } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
+    if (y_res != 0) {
+      // Show warning only if a resolution was given.
+      tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n",
+              y_res, kMinCredibleResolution);
+    }
+    thresholder_->SetSourceYResolution(kMinCredibleResolution);
+  }
+
+  auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method));
+
+  if (thresholding_method == ThresholdMethod::Otsu) {
+    Image pix_binary(*pix);
+    if (!thresholder_->ThresholdToPix(&pix_binary)) {
+      return false;
+    }
+    *pix = pix_binary;
+
+    if (!thresholder_->IsBinary()) { // Extra images exist only for non-binary input.
+      tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());
+      tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
+    } else {
+      tesseract_->set_pix_thresholds(nullptr);
+      tesseract_->set_pix_grey(nullptr);
+    }
+  } else { // Every other method returns all images from a single call.
+    auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method);
+
+    if (!ok) {
+      return false;
+    }
+    *pix = pix_binary;
+
+    tesseract_->set_pix_thresholds(pix_thresholds);
+    tesseract_->set_pix_grey(pix_grey);
+  }
+
+  thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_,
+                              &image_height_);
+
+  // Set the internal resolution that is used for layout parameters from the
+  // estimated resolution, rather than the image resolution, which may be
+  // fabricated, but we will use the image resolution, if there is one, to
+  // report output point sizes.
+  int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
+                                  kMinCredibleResolution, kMaxCredibleResolution);
+  if (estimated_res != thresholder_->GetScaledEstimatedResolution()) { // Clipping changed it.
+    tprintf(
+        "Estimated internal resolution %d out of range! "
+        "Corrected to %d.\n",
+        thresholder_->GetScaledEstimatedResolution(), estimated_res);
+  }
+  tesseract_->set_source_resolution(estimated_res);
+  return true;
+}
+
+/**
+ * Find lines from the image making the BLOCK_LIST.
+ *
+ * Requires a non-empty thresholder. Results of a completed recognition are
+ * cleared first; if block_list_ already holds blocks, nothing is redone and
+ * 0 is returned. A Tesseract instance is created on demand when tesseract_
+ * is nullptr, and the binary image is produced with Threshold() if it does
+ * not exist yet.
+ *
+ * In builds with the legacy engine, an equation detector is attached when
+ * textord_equation_detect is set, and an engine for orientation and script
+ * detection is chosen when the page segmentation mode enables OSD: either
+ * tesseract_ itself (language "osd") or a separate engine loaded from
+ * datapath_. Failing to obtain either helper only prints a warning.
+ *
+ * @return 0 on success or when blocks were already present, -1 if the
+ * thresholder is missing or empty, thresholding fails, or SegmentPage
+ * returns a negative value.
+ */
+int TessBaseAPI::FindLines() {
+  if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
+    tprintf("Please call SetImage before attempting recognition.\n");
+    return -1;
+  }
+  if (recognition_done_) {
+    ClearResults();
+  }
+  if (!block_list_->empty()) { // Blocks are already present; nothing to redo.
+    return 0;
+  }
+  if (tesseract_ == nullptr) {
+    tesseract_ = new Tesseract;
+#ifndef DISABLED_LEGACY_ENGINE
+    tesseract_->InitAdaptiveClassifier(nullptr);
+#endif
+  }
+  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
+    return -1;
+  }
+
+  tesseract_->PrepareForPageseg();
+
+#ifndef DISABLED_LEGACY_ENGINE
+  if (tesseract_->textord_equation_detect) {
+    if (equ_detect_ == nullptr && !datapath_.empty()) { // Created once, then reused.
+      equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr);
+    }
+    if (equ_detect_ == nullptr) {
+      tprintf("Warning: Could not set equation detector\n");
+    } else {
+      tesseract_->SetEquationDetect(equ_detect_);
+    }
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+  Tesseract *osd_tess = osd_tesseract_; // Passed to SegmentPage below; may stay nullptr.
+  OSResults osr;
+#ifndef DISABLED_LEGACY_ENGINE
+  if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) {
+    if (strcmp(language_.c_str(), "osd") == 0) {
+      osd_tess = tesseract_; // The language is "osd": reuse the main engine.
+    } else {
+      osd_tesseract_ = new Tesseract;
+      TessdataManager mgr(reader_);
+      if (datapath_.empty()) {
+        tprintf(
+            "Warning: Auto orientation and script detection requested,"
+            " but data path is undefined\n");
+        delete osd_tesseract_;
+        osd_tesseract_ = nullptr;
+      } else if (osd_tesseract_->init_tesseract(datapath_, "", "osd", OEM_TESSERACT_ONLY,
+                                                nullptr, 0, nullptr, nullptr, false, &mgr) == 0) {
+        osd_tess = osd_tesseract_;
+        osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution());
+      } else {
+        tprintf(
+            "Warning: Auto orientation and script detection requested,"
+            " but osd language failed to load\n");
+        delete osd_tesseract_;
+        osd_tesseract_ = nullptr;
+      }
+    }
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+  if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) {
+    return -1;
+  }
+
+  // If Devanagari is being recognized, we use different images for page seg
+  // and for OCR.
+  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
+  return 0;
+}
+
+/**
+ * Return average gradient of lines on page.
+ * Returns 0 if no engine has been initialized, instead of dereferencing a
+ * null pointer.
+ */
+float TessBaseAPI::GetGradient() {
+  if (tesseract_ == nullptr) {
+    return 0.0f;
+  }
+  return tesseract_->gradient();
+}
+
+/**
+ * Discards the page results and the paragraph models and leaves an empty
+ * block list behind, so that a new page can be processed.
+ */
+void TessBaseAPI::ClearResults() {
+  if (tesseract_) {
+    tesseract_->Clear();
+  }
+
+  delete page_res_;
+  page_res_ = nullptr;
+  recognition_done_ = false;
+
+  // Reuse the existing block list when there is one.
+  if (block_list_ != nullptr) {
+    block_list_->clear();
+  } else {
+    block_list_ = new BLOCK_LIST;
+  }
+
+  if (paragraph_models_ == nullptr) {
+    return;
+  }
+  // The vector owns the models it points to.
+  for (ParagraphModel *entry : *paragraph_models_) {
+    delete entry;
+  }
+  delete paragraph_models_;
+  paragraph_models_ = nullptr;
+}
+
+/**
+ * Returns a generous estimate of the length of the output text as UTF8.
+ * Two spacing marks are allowed for after each word (as paragraphs end with
+ * two newlines), plus one reject marker for each rejected character.
+ * If blob_count is not nullptr it receives the number of recognized blobs,
+ * counted with the same allowance of two per word.
+ * Returns 0 if there is no engine or no page result.
+ */
+int TessBaseAPI::TextLength(int *blob_count) const {
+  if (tesseract_ == nullptr || page_res_ == nullptr) {
+    return 0;
+  }
+
+  int text_len = 2;
+  int blob_total = 0;
+  PAGE_RES_IT word_it(page_res_);
+  // Walk every word of the page and add up what it contributes.
+  word_it.restart_page();
+  while (word_it.word() != nullptr) {
+    WERD_RES *current = word_it.word();
+    WERD_CHOICE *best = current->best_choice;
+    if (best != nullptr) {
+      blob_total += best->length() + 2;
+      text_len += best->unichar_string().length() + 2;
+      // Each rejected character takes one extra marker in the output.
+      for (int pos = 0; pos < current->reject_map.length(); ++pos) {
+        if (current->reject_map[pos].rejected()) {
+          ++text_len;
+        }
+      }
+    }
+    word_it.forward();
+  }
+  if (blob_count != nullptr) {
+    *blob_count = blob_total;
+  }
+  return text_len;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+/**
+ * Estimates the Orientation And Script of the image.
+ * Previous results are cleared and the image is thresholded if no binary
+ * image exists yet. If input_file_ is empty it is set to kInputFile.
+ * Returns true if the image was processed successfully. Returns false if no
+ * engine has been initialized, if no image is available for thresholding,
+ * or if thresholding fails.
+ */
+bool TessBaseAPI::DetectOS(OSResults *osr) {
+  if (tesseract_ == nullptr) {
+    return false;
+  }
+  ClearResults();
+  if (tesseract_->pix_binary() == nullptr) {
+    // Threshold() dereferences thresholder_, so apply the same precondition
+    // that FindLines() checks before thresholding.
+    if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
+      tprintf("Please call SetImage before attempting recognition.\n");
+      return false;
+    }
+    if (!Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
+      return false;
+    }
+  }
+
+  if (input_file_.empty()) {
+    input_file_ = kInputFile;
+  }
+  return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0;
+}
+#endif // #ifndef DISABLED_LEGACY_ENGINE
+
+/**
+ * Sets the min_orientation_margin parameter of the engine to margin.
+ * Does nothing if no engine has been initialized.
+ */
+void TessBaseAPI::set_min_orientation_margin(double margin) {
+  if (tesseract_ == nullptr) {
+    return;
+  }
+  tesseract_->min_orientation_margin.set_value(margin);
+}
+
+/**
+ * Return text orientation of each block as determined in an earlier page layout
+ * analysis operation. Orientation is returned as the number of ccw 90-degree
+ * rotations (in [0..3]) required to make the text in the block upright
+ * (readable). Note that this may not necessarily be the block orientation
+ * preferred for recognition (such as the case of vertical CJK text).
+ *
+ * Also returns whether the text in the block is believed to have vertical
+ * writing direction (when in an upright page orientation).
+ *
+ * The returned array is of length equal to the number of text blocks, which may
+ * be less than the total number of blocks. The ordering is intended to be
+ * consistent with GetTextLines().
+ *
+ * Any arrays passed in through *block_orientation and *vertical_writing are
+ * released with delete[] first. Both are left as nullptr if there are no
+ * text blocks; otherwise they are allocated with new[].
+ */
+void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) {
+  delete[] * block_orientation;
+  *block_orientation = nullptr;
+  delete[] * vertical_writing;
+  *vertical_writing = nullptr;
+  BLOCK_IT block_it(block_list_);
+
+  // First pass: count the text blocks to size the output arrays.
+  block_it.move_to_first();
+  int num_blocks = 0;
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
+    if (!block_it.data()->pdblk.poly_block()->IsText()) {
+      continue;
+    }
+    ++num_blocks;
+  }
+  if (!num_blocks) {
+    tprintf("WARNING: Found no blocks\n");
+    return;
+  }
+  *block_orientation = new int[num_blocks];
+  *vertical_writing = new bool[num_blocks];
+  // Second pass: fill in one entry per text block.
+  block_it.move_to_first();
+  int i = 0;
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
+    if (!block_it.data()->pdblk.poly_block()->IsText()) {
+      continue;
+    }
+    FCOORD re_rotation = block_it.data()->re_rotation();
+    float re_theta = re_rotation.angle();
+    FCOORD classify_rotation = block_it.data()->classify_rotation();
+    float classify_theta = classify_rotation.angle();
+    // Express the rotation in quarter turns.
+    double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI;
+    if (rot_theta < 0) {
+      rot_theta += 4;
+    }
+    // Rounding a value of 3.5 or more gives 4, which is a full turn and
+    // therefore the same as 0; wrap it to stay within [0..3].
+    int num_rotations = static_cast<int>(rot_theta + 0.5) % 4;
+    (*block_orientation)[i] = num_rotations;
+    // The classify_rotation is non-zero only if the text has vertical
+    // writing direction.
+    (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
+    ++i;
+  }
+}
+
+/**
+ * Runs paragraph detection block by block over the current results and
+ * appends the models produced to paragraph_models_, which is created on
+ * first use. The debug level is read from "paragraph_debug_level".
+ * Does nothing further if no result iterator can be obtained.
+ */
+void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
+  int debug_level = 0;
+  GetIntVariable("paragraph_debug_level", &debug_level);
+  if (paragraph_models_ == nullptr) {
+    paragraph_models_ = new std::vector<ParagraphModel *>;
+  }
+  // Own the iterator so that it is released on every exit path.
+  const std::unique_ptr<MutableIterator> result_it(GetMutableIterator());
+  if (result_it == nullptr) {
+    // Guard against a null iterator instead of dereferencing it below.
+    return;
+  }
+  do { // Detect paragraphs for this block
+    std::vector<ParagraphModel *> models;
+    ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it.get(), &models);
+    paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end());
+  } while (result_it->Next(RIL_BLOCK));
+}
+
+/**
+ * This method returns the string form of the specified unichar.
+ * Returns nullptr if no engine has been initialized, in line with
+ * GetDawg().
+ */
+const char *TessBaseAPI::GetUnichar(int unichar_id) const {
+  if (tesseract_ == nullptr) {
+    return nullptr;
+  }
+  return tesseract_->unicharset.id_to_unichar(unichar_id);
+}
+
+/**
+ * Return the pointer to the i-th dawg loaded into tesseract_ object.
+ * Returns nullptr if no engine has been initialized or if i is outside
+ * the range [0, NumDawgs()).
+ */
+const Dawg *TessBaseAPI::GetDawg(int i) const {
+  // A negative index is as invalid as one past the end.
+  if (tesseract_ == nullptr || i < 0 || i >= NumDawgs()) {
+    return nullptr;
+  }
+  return tesseract_->getDict().GetDawg(i);
+}
+
+/**
+ * Returns how many dawgs are loaded into the tesseract_ object, or 0 if no
+ * engine has been initialized.
+ */
+int TessBaseAPI::NumDawgs() const {
+  if (tesseract_ == nullptr) {
+    return 0;
+  }
+  return tesseract_->getDict().NumDawgs();
+}
+
+/**
+ * Escape a char string - replace <>&"' with HTML codes.
+ * Every other character is copied unchanged.
+ *
+ * @param text NUL-terminated input; nullptr is treated as an empty string.
+ * @return The escaped copy of text.
+ */
+std::string HOcrEscape(const char *text) {
+  std::string ret;
+  if (text == nullptr) {
+    return ret;
+  }
+  for (const char *ptr = text; *ptr != '\0'; ++ptr) {
+    switch (*ptr) {
+      case '<':
+        ret += "&lt;";
+        break;
+      case '>':
+        ret += "&gt;";
+        break;
+      case '&':
+        ret += "&amp;";
+        break;
+      case '"':
+        ret += "&quot;";
+        break;
+      case '\'':
+        ret += "&#39;";
+        break;
+      default:
+        ret += *ptr;
+        break;
+    }
+  }
+  return ret;
+}
+
+} // namespace tesseract
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children