Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/api/pagerenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/api/pagerenderer.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1132 @@ +// File: pagerenderer.cpp +// Description: PAGE XML rendering interface +// Author: Jan Kamlah + +// (C) Copyright 2024 +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "errcode.h" // for ASSERT_HOST +#include "helpers.h" // for copy_string +#include "tprintf.h" // for tprintf + +#include <tesseract/baseapi.h> +#include <tesseract/renderer.h> + +#include <ctime> +#include <iomanip> +#include <memory> +#include <regex> +#include <sstream> // for std::stringstream +#include <unordered_set> + +#include <allheaders.h> +#if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || \ + LIBLEPT_MAJOR_VERSION > 1 +# include <array_internal.h> +# include <pix_internal.h> +#endif + +namespace tesseract { + +/// +/// Slope and offset between two points +/// +static void GetSlopeAndOffset(float x0, float y0, float x1, float y1, float *m, + float *b) { + float slope; + + slope = ((y1 - y0) / (x1 - x0)); + *m = slope; + *b = y0 - slope * x0; +} + +/// +/// Write coordinates in the form of a points to a stream +/// +static void AddPointsToPAGE(Pta *pts, std::stringstream &str) { + int num_pts; + + str << "<Coords points=\""; + num_pts = ptaGetCount(pts); + for (int p = 0; p < num_pts; ++p) { + int x, y; + ptaGetIPt(pts, p, &x, &y); + if (p != 0) { + str << " "; + } + str << std::to_string(x) << "," << std::to_string(y); + } + str << "\"/>\n"; +} + +/// +/// Convert bbox information to top and bottom polygon +/// +static void AddPointToWordPolygon( + const ResultIterator *res_it, PageIteratorLevel level, Pta *word_top_pts, + Pta *word_bottom_pts, tesseract::WritingDirection writing_direction) { + int left, top, right, bottom; + + res_it->BoundingBox(level, &left, &top, &right, &bottom); + + if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) { + ptaAddPt(word_top_pts, left, top); + ptaAddPt(word_top_pts, right, top); + + ptaAddPt(word_bottom_pts, left, bottom); + ptaAddPt(word_bottom_pts, right, bottom); + + } else { + // Transform from ttb to ltr + ptaAddPt(word_top_pts, top, right); + ptaAddPt(word_top_pts, bottom, right); + + ptaAddPt(word_bottom_pts, top, left); + ptaAddPt(word_bottom_pts, bottom, left); + } +} + +/// +/// Transpose polygonline, destroy old and return new pts +/// +Pta *TransposePolygonline(Pta *pts) { + Pta *pts_transposed; + + pts_transposed = ptaTranspose(pts); + ptaDestroy(&pts); + return pts_transposed; +} + +/// +/// Reverse polygonline, destroy old and return new pts +/// +Pta *ReversePolygonline(Pta *pts, int type) { + Pta *pts_reversed; + + pts_reversed = ptaReverse(pts, type); + ptaDestroy(&pts); + return pts_reversed; +} + +/// +/// Destroy old and create new pts +/// +Pta *DestroyAndCreatePta(Pta *pts) { + ptaDestroy(&pts); + return ptaCreate(0); +} + +/// +/// Recalculate linepolygon +/// Create a hull for overlapping areas +/// +Pta *RecalcPolygonline(Pta *pts, bool upper) { + int num_pts, num_bin, index = 0; + int y, x0, y0, x1, y1; + float x_min, y_min, x_max, y_max; + NUMA *bin_line; + Pta *pts_recalc; + + ptaGetMinMax(pts, &x_min, &y_min, &x_max, &y_max); + num_bin = x_max - x_min; + bin_line = numaCreate(num_bin + 1); + + for (int p = 0; p <= num_bin; ++p) { + bin_line->array[p] = -1.; + } + + num_pts = ptaGetCount(pts); + + if (num_pts == 2) { + pts_recalc = ptaCopy(pts); + ptaDestroy(&pts); + return pts_recalc; + } + + do { + ptaGetIPt(pts, index, &x0, &y0); + ptaGetIPt(pts, index + 1, &x1, &y1); + for (int p = x0 - x_min; p <= x1 - x_min; ++p) { + if (!upper) { + if (bin_line->array[p] == -1. || y0 > bin_line->array[p]) { + bin_line->array[p] = y0; + } + } else { + if (bin_line->array[p] == -1. || y0 < bin_line->array[p]) { + bin_line->array[p] = y0; + } + } + } + index += 2; + } while (index < num_pts - 1); + + pts_recalc = ptaCreate(0); + + for (int p = 0; p <= num_bin; ++p) { + if (p == 0) { + y = bin_line->array[p]; + ptaAddPt(pts_recalc, x_min + p, y); + } else if (p == num_bin) { + ptaAddPt(pts_recalc, x_min + p, y); + break; + } else if (y != bin_line->array[p]) { + if (y != -1.) { + ptaAddPt(pts_recalc, x_min + p, y); + } + y = bin_line->array[p]; + if (y != -1.) { + ptaAddPt(pts_recalc, x_min + p, y); + } + } + } + + ptaDestroy(&pts); + return pts_recalc; +} + +/// +/// Create a rectangle hull around a single line +/// +Pta *PolygonToBoxCoords(Pta *pts) { + Pta *pts_box; + float x_min, y_min, x_max, y_max; + + pts_box = ptaCreate(0); + ptaGetMinMax(pts, &x_min, &y_min, &x_max, &y_max); + ptaAddPt(pts_box, x_min, y_min); + ptaAddPt(pts_box, x_max, y_min); + ptaAddPt(pts_box, x_max, y_max); + ptaAddPt(pts_box, x_min, y_max); + ptaDestroy(&pts); + return pts_box; +} + +/// +/// Create a rectangle polygon round the existing multiple lines +/// +static void UpdateBlockPoints(Pta *block_top_pts, Pta *block_bottom_pts, + Pta *line_top_pts, Pta *line_bottom_pts, int lcnt, + int last_word_in_cblock) { + int num_pts; + int x, y; + + // Create a hull around all lines + if (lcnt == 0 && last_word_in_cblock) { + ptaJoin(block_top_pts, line_top_pts, 0, -1); + ptaJoin(block_bottom_pts, line_bottom_pts, 0, -1); + } else if (lcnt == 0) { + ptaJoin(block_top_pts, line_top_pts, 0, -1); + num_pts = ptaGetCount(line_bottom_pts); + ptaGetIPt(line_bottom_pts, num_pts - 1, &x, &y); + ptaAddPt(block_top_pts, x, y); + ptaGetIPt(line_bottom_pts, 0, &x, &y); + ptaAddPt(block_bottom_pts, x, y); + } else if (last_word_in_cblock) { + ptaGetIPt(line_top_pts, 0, &x, &y); + ptaAddPt(block_bottom_pts, x, y); + ptaJoin(block_bottom_pts, line_bottom_pts, 0, -1); + num_pts = ptaGetCount(line_top_pts); + ptaGetIPt(line_top_pts, num_pts - 1, &x, &y); + ptaAddPt(block_top_pts, x, y); + } else { + ptaGetIPt(line_top_pts, 0, &x, &y); + ptaAddPt(block_bottom_pts, x, y); + ptaGetIPt(line_bottom_pts, 0, &x, &y); + ptaAddPt(block_bottom_pts, x, y); + num_pts = ptaGetCount(line_top_pts); + ptaGetIPt(line_top_pts, num_pts - 1, &x, &y); + ptaAddPt(block_top_pts, x, y); + num_pts = ptaGetCount(line_bottom_pts); + ptaGetIPt(line_bottom_pts, num_pts - 1, &x, &y); + ptaAddPt(block_top_pts, x, y); + }; +} + +/// +/// Simplify polygonlines (only expanding not shrinking) (Due to recalculation +/// currently not necessary) +/// +static void SimplifyLinePolygon(Pta *polyline, int tolerance, bool upper) { + int x0, y0, x1, y1, x2, y2, x3, y3, index = 1; + float m, b, y_min, y_max; + + while (index <= polyline->n - 2) { + ptaGetIPt(polyline, index - 1, &x0, &y0); + ptaGetIPt(polyline, index, &x1, &y1); + ptaGetIPt(polyline, index + 1, &x2, &y2); + if (index + 2 < polyline->n) { + // Delete two point indentations + ptaGetIPt(polyline, index + 2, &x3, &y3); + if (abs(x3 - x0) <= tolerance * 2) { + GetSlopeAndOffset(x0, y0, x3, y3, &m, &b); + + if (upper && (m * x1 + b) < y1 && (m * x2 + b) < y2) { + ptaRemovePt(polyline, index + 1); + ptaRemovePt(polyline, index); + continue; + } else if (!upper && (m * x1 + b) > y1 && (m * x2 + b) > y2) { + ptaRemovePt(polyline, index + 1); + ptaRemovePt(polyline, index); + continue; + } + } + } + // Delete one point indentations + if (abs(y0 - y1) <= tolerance && abs(y1 - y2) <= tolerance) { + GetSlopeAndOffset(x0, y0, x2, y2, &m, &b); + if (upper && (m * x1 + b) <= y1) { + ptaRemovePt(polyline, index); + continue; + } else if (!upper && (m * x1 + b) >= y1) { + ptaRemovePt(polyline, index); + continue; + } + } + // Delete near by points + if (x1 != x0 && abs(y1 - y0) < 4 && abs(x1 - x0) <= tolerance) { + if (upper) { + y_min = std::min(y0, y1); + GetSlopeAndOffset(x0, y_min, x2, y2, &m, &b); + if ((m * x1 + b) <= y1) { + polyline->y[index - 1] = std::min(y0, y1); + ptaRemovePt(polyline, index); + continue; + } + } else { + y_max = std::max(y0, y1); + GetSlopeAndOffset(x0, y_max, x2, y2, &m, &b); + if ((m * x1 + b) >= y1) { + polyline->y[index - 1] = y_max; + ptaRemovePt(polyline, index); + continue; + } + } + } + index++; + } +} + +/// +/// Directly write bounding box information as coordinates a stream +/// +static void AddBoxToPAGE(const ResultIterator *it, PageIteratorLevel level, + std::stringstream &page_str) { + int left, top, right, bottom; + + it->BoundingBox(level, &left, &top, &right, &bottom); + page_str << "<Coords points=\"" << left << "," << top << " " << right << "," + << top << " " << right << "," << bottom << " " << left << "," + << bottom << "\"/>\n"; +} + +/// +/// Join ltr and rtl polygon information +/// +static void AppendLinePolygon(Pta *pts_ltr, Pta *pts_rtl, Pta *ptss, + tesseract::WritingDirection writing_direction) { + // If writing direction is NOT right-to-left, handle the left-to-right case. + if (writing_direction != WRITING_DIRECTION_RIGHT_TO_LEFT) { + if (ptaGetCount(pts_rtl) != 0) { + ptaJoin(pts_ltr, pts_rtl, 0, -1); + DestroyAndCreatePta(pts_rtl); + } + ptaJoin(pts_ltr, ptss, 0, -1); + } else { + // For right-to-left, work with a copy of ptss initially. + PTA *ptsd = ptaCopy(ptss); + if (ptaGetCount(pts_rtl) != 0) { + ptaJoin(ptsd, pts_rtl, 0, -1); + } + ptaDestroy(&pts_rtl); + ptaCopy(ptsd); + } +} + +/// +/// Convert baseline to points and add to polygon +/// +static void AddBaselineToPTA(const ResultIterator *it, PageIteratorLevel level, + Pta *baseline_pts) { + int x1, y1, x2, y2; + + it->Baseline(level, &x1, &y1, &x2, &y2); + ptaAddPt(baseline_pts, x1, y1); + ptaAddPt(baseline_pts, x2, y2); +} + +/// +/// Directly write baseline information as baseline points a stream +/// +static void AddBaselinePtsToPAGE(Pta *baseline_pts, std::stringstream &str) { + int x, y, num_pts = baseline_pts->n; + + str << "<Baseline points=\""; + for (int p = 0; p < num_pts; ++p) { + ptaGetIPt(baseline_pts, p, &x, &y); + if (p != 0) { + str << " "; + } + str << std::to_string(x) << "," << std::to_string(y); + } + str << "\"/>\n"; +} + +/// +/// Sort baseline points ascending and deleting duplicates +/// +Pta *SortBaseline(Pta *baseline_pts, + tesseract::WritingDirection writing_direction) { + int num_pts, index = 0; + float x0, y0, x1, y1; + Pta *sorted_baseline_pts; + + sorted_baseline_pts = + ptaSort(baseline_pts, L_SORT_BY_X, L_SORT_INCREASING, nullptr); + + do { + ptaGetPt(sorted_baseline_pts, index, &x0, &y0); + ptaGetPt(sorted_baseline_pts, index + 1, &x1, &y1); + if (x0 >= x1) { + sorted_baseline_pts->y[index] = std::min(y0, y1); + ptaRemovePt(sorted_baseline_pts, index + 1); + } else { + index++; + } + num_pts = ptaGetCount(sorted_baseline_pts); + } while (index < num_pts - 1); + + ptaDestroy(&baseline_pts); + return sorted_baseline_pts; +} + +/// +/// Clip baseline to range of the exsitings polygon and simplifies the baseline +/// linepolygon +/// +Pta *ClipAndSimplifyBaseline(Pta *bottom_pts, Pta *baseline_pts, + tesseract::WritingDirection writing_direction) { + int num_pts; + float m, b, x0, y0, x1, y1; + float x_min, y_min, x_max, y_max; + Pta *baseline_clipped_pts; + + ptaGetMinMax(bottom_pts, &x_min, &y_min, &x_max, &y_max); + num_pts = ptaGetCount(baseline_pts); + baseline_clipped_pts = ptaCreate(0); + + // Clip Baseline + for (int p = 0; p < num_pts; ++p) { + ptaGetPt(baseline_pts, p, &x0, &y0); + if (x0 < x_min) { + if (p + 1 < num_pts) { + ptaGetPt(baseline_pts, p + 1, &x1, &y1); + if (x1 < x_min) { + continue; + } else { + GetSlopeAndOffset(x0, y0, x1, y1, &m, &b); + y0 = int(x_min * m + b); + x0 = x_min; + } + } + } else if (x0 > x_max) { + if (ptaGetCount(baseline_clipped_pts) > 0 && p > 0) { + ptaGetPt(baseline_pts, p - 1, &x1, &y1); + // See comment above + GetSlopeAndOffset(x1, y1, x0, y0, &m, &b); + y0 = int(x_max * m + b); + x0 = x_max; + ptaAddPt(baseline_clipped_pts, x0, y0); + break; + } + } + ptaAddPt(baseline_clipped_pts, x0, y0); + } + if (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM) { + SimplifyLinePolygon(baseline_clipped_pts, 3, 0); + } else { + SimplifyLinePolygon(baseline_clipped_pts, 3, 1); + } + SimplifyLinePolygon( + baseline_clipped_pts, 3, + writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM ? 0 : 1); + + // Check the number of points in baseline_clipped_pts after processing + int clipped_pts_count = ptaGetCount(baseline_clipped_pts); + + if (clipped_pts_count < 2) { + // If there's only one point in baseline_clipped_pts, duplicate it + ptaDestroy(&baseline_clipped_pts); // Clean up the created but unused Pta + baseline_clipped_pts = ptaCreate(0); + ptaAddPt(baseline_clipped_pts, x_min, y_min); + ptaAddPt(baseline_clipped_pts, x_max, y_min); + } + + return baseline_clipped_pts; +} + +/// +/// Fit the baseline points into the existings polygon +/// +Pta *FitBaselineIntoLinePolygon(Pta *bottom_pts, Pta *baseline_pts, + tesseract::WritingDirection writing_direction) { + int num_pts, num_bin, x0, y0, x1, y1; + float m, b; + float x_min, y_min, x_max, y_max; + float delta_median, delta_median_Q1, delta_median_Q3; + NUMA *bin_line, *poly_bl_delta; + Pta *baseline_recalc_pts, *baseline_clipped_pts; + + ptaGetMinMax(bottom_pts, &x_min, &y_min, &x_max, &y_max); + num_bin = x_max - x_min; + bin_line = numaCreate(num_bin + 1); + + for (int p = 0; p < num_bin + 1; ++p) { + bin_line->array[p] = -1.; + } + + num_pts = ptaGetCount(bottom_pts); + // Create an interpolated polygon with stepsize 1. + for (int index = 0; index < num_pts - 1; ++index) { + ptaGetIPt(bottom_pts, index, &x0, &y0); + ptaGetIPt(bottom_pts, index + 1, &x1, &y1); + if (x0 >= x1) { + continue; + } + if (y0 == y1) { + for (int p = x0 - x_min; p < x1 - x_min + 1; ++p) { + if (bin_line->array[p] == -1. || y0 > bin_line->array[p]) { + bin_line->array[p] = y0; + } + } + } else { + GetSlopeAndOffset(x0, y0, x1, y1, &m, &b); + for (int p = x0 - x_min; p < x1 - x_min + 1; ++p) { + if (bin_line->array[p] == -1. || + ((p + x_min) * m + b) > bin_line->array[p]) { + bin_line->array[p] = ((p + x_min) * m + b); + } + } + } + } + + num_pts = ptaGetCount(baseline_pts); + baseline_clipped_pts = ptaCreate(0); + poly_bl_delta = numaCreate(0); + + // Clip Baseline and create a set of deltas between baseline and polygon + for (int p = 0; p < num_pts; ++p) { + ptaGetIPt(baseline_pts, p, &x0, &y0); + + if (x0 < x_min) { + ptaGetIPt(baseline_pts, p + 1, &x1, &y1); + if (x1 < x_min) { + continue; + } else { + GetSlopeAndOffset(x0, y0, x1, y1, &m, &b); + y0 = int(x_min * m + b); + x0 = x_min; + } + } else if (x0 > x_max) { + if (ptaGetCount(baseline_clipped_pts) > 0) { + ptaGetIPt(baseline_pts, p - 1, &x1, &y1); + GetSlopeAndOffset(x1, y1, x0, y0, &m, &b); + y0 = int(x_max * m + b); + x0 = x_max; + int x_val = x0 - x_min; + numaAddNumber(poly_bl_delta, abs(bin_line->array[x_val] - y0)); + ptaAddPt(baseline_clipped_pts, x0, y0); + break; + } + } + int x_val = x0 - x_min; + numaAddNumber(poly_bl_delta, abs(bin_line->array[x_val] - y0)); + ptaAddPt(baseline_clipped_pts, x0, y0); + } + + ptaDestroy(&baseline_pts); + + // Calculate quartiles to find outliers + numaGetMedian(poly_bl_delta, &delta_median); + numaGetRankValue(poly_bl_delta, 0.25, nullptr, 0, &delta_median_Q1); + numaGetRankValue(poly_bl_delta, 0.75, nullptr, 0, &delta_median_Q3); + + // Fit baseline into the polygon + // Todo: Needs maybe some adjustments to suppress fitting to superscript + // glyphs + baseline_recalc_pts = ptaCreate(0); + num_pts = ptaGetCount(baseline_clipped_pts); + for (int p = 0; p < num_pts; ++p) { + ptaGetIPt(baseline_clipped_pts, p, &x0, &y0); + int x_val = x0 - x_min; + // Delete outliers with IQR + if (abs(y0 - bin_line->array[x_val]) > + 1.5 * delta_median_Q3 + delta_median && + p != 0 && p != num_pts - 1) { + continue; + } + if (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM) { + if (y0 < bin_line->array[x_val]) { + ptaAddPt(baseline_recalc_pts, x0, bin_line->array[x_val]); + } else { + ptaAddPt(baseline_recalc_pts, x0, y0); + } + } else { + if (y0 > bin_line->array[x_val]) { + ptaAddPt(baseline_recalc_pts, x0, bin_line->array[x_val]); + } else { + ptaAddPt(baseline_recalc_pts, x0, y0); + } + } + } + // Return recalculated baseline if this fails return the bottom line as + // baseline + ptaDestroy(&baseline_clipped_pts); + if (ptaGetCount(baseline_recalc_pts) < 2) { + ptaDestroy(&baseline_recalc_pts); + return ptaCopy(bottom_pts); + } else { + return baseline_recalc_pts; + } +} + +/// Convert writing direction to string representation +const char *WritingDirectionToStr(int wd) { + switch (wd) { + case 0: + return "left-to-right"; + case 1: + return "right-to-left"; + case 2: + return "top-to-bottom"; + default: + return "bottom-to-top"; + } +} +/// +/// Append the PAGE XML for the beginning of the document +/// +bool TessPAGERenderer::BeginDocumentHandler() { + // Delay the XML output because we need the name of the image file. + begin_document = true; + return true; +} + +/// +/// Append the PAGE XML for the layout of the image +/// +bool TessPAGERenderer::AddImageHandler(TessBaseAPI *api) { + if (begin_document) { + AppendString( + "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n" + "<PcGts " + "xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/" + "2019-07-15\" " + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " + "xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/" + "pagecontent/2019-07-15 " + "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/" + "pagecontent.xsd\">\n" + "\t<Metadata"); + + // If a URL is used to recognize an image add it as <Metadata + // externalRef="url"> + if (std::regex_search(api->GetInputName(), + std::regex("^(https?|ftp|ssh):"))) { + AppendString(" externalRef=\""); + AppendString(api->GetInputName()); + AppendString("\" "); + } + + AppendString( + ">\n" + "\t\t<Creator>Tesseract - "); + AppendString(TESSERACT_VERSION_STR); + // If gmtime conversion is problematic maybe l_getFormattedDate can be used + // here + // char *datestr = l_getFormattedDate(); + std::time_t now = std::time(nullptr); + std::tm *now_tm = std::gmtime(&now); + char mbstr[100]; + std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%dT%H:%M:%S", now_tm); + AppendString( + "</Creator>\n" + "\t\t<Created>"); + AppendString(mbstr); + AppendString("</Created>\n"); + AppendString("\t\t<LastChange>"); + AppendString(mbstr); + AppendString( + "</LastChange>\n" + "\t</Metadata>\n"); + begin_document = false; + } + + const std::unique_ptr<const char[]> text(api->GetPAGEText(imagenum())); + if (text == nullptr) { + return false; + } + + AppendString(text.get()); + + return true; +} + +/// +/// Append the PAGE XML for the end of the document +/// +bool TessPAGERenderer::EndDocumentHandler() { + AppendString("\t\t</Page>\n</PcGts>\n"); + return true; +} + +TessPAGERenderer::TessPAGERenderer(const char *outputbase) + : TessResultRenderer(outputbase, "page.xml"), begin_document(false) {} + +/// +/// Make an XML-formatted string with PAGE markup from the internal +/// data structures. +/// +char *TessBaseAPI::GetPAGEText(int page_number) { + return GetPAGEText(nullptr, page_number); +} + +/// +/// Make an XML-formatted string with PAGE markup from the internal +/// data structures. +/// +char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { + if (tesseract_ == nullptr || + (page_res_ == nullptr && Recognize(monitor) < 0)) { + return nullptr; + } + + int rcnt = 0, lcnt = 0, wcnt = 0; + + if (input_file_.empty()) { + SetInputName(nullptr); + } + + // Used variables + + std::stringstream reading_order_str; + std::stringstream region_content; + std::stringstream line_content; + std::stringstream word_content; + std::stringstream line_str; + std::stringstream line_inter_str; + std::stringstream word_str; + std::stringstream page_str; + + float x1, y1, x2, y2; + + tesseract::Orientation orientation_block = ORIENTATION_PAGE_UP; + tesseract::WritingDirection writing_direction_block = + WRITING_DIRECTION_LEFT_TO_RIGHT; + tesseract::TextlineOrder textline_order_block; + + Pta *block_top_pts = ptaCreate(0); + Pta *block_bottom_pts = ptaCreate(0); + Pta *line_top_ltr_pts = ptaCreate(0); + Pta *line_bottom_ltr_pts = ptaCreate(0); + Pta *line_top_rtl_pts = ptaCreate(0); + Pta *line_bottom_rtl_pts = ptaCreate(0); + Pta *word_top_pts = ptaCreate(0); + Pta *word_bottom_pts = ptaCreate(0); + Pta *word_baseline_pts = ptaCreate(0); + Pta *line_baseline_rtl_pts = ptaCreate(0); + Pta *line_baseline_ltr_pts = ptaCreate(0); + Pta *line_baseline_pts = ptaCreate(0); + + bool POLYGONFLAG; + GetBoolVariable("page_xml_polygon", &POLYGONFLAG); + int LEVELFLAG; + GetIntVariable("page_xml_level", &LEVELFLAG); + + if (LEVELFLAG != 0 && LEVELFLAG != 1) { + tprintf( + "For now, only line level and word level are available, and the level " + "is reset to line level.\n"); + LEVELFLAG = 0; + } + + // Use "C" locale (needed for int values larger than 999). + page_str.imbue(std::locale::classic()); + reading_order_str << "\t<Page " << "imageFilename=\"" << GetInputName(); + // AppendString(api->GetInputName()); + reading_order_str << "\" " << "imageWidth=\"" << rect_width_ << "\" " + << "imageHeight=\"" << rect_height_ << "\">\n"; + std::size_t ro_id = std::hash<std::string>{}(GetInputName()); + reading_order_str << "\t\t<ReadingOrder>\n" + << "\t\t\t<OrderedGroup id=\"ro" << ro_id + << "\" caption=\"Regions reading order\">\n"; + + std::unique_ptr<ResultIterator> res_it(GetIterator()); + + float block_conf = 0; + float line_conf = 0; + + while (!res_it->Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + + auto block_type = res_it->BlockType(); + + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: { + // Handle all kinds of images. + page_str << "\t\t<GraphicRegion id=\"r" << rcnt++ << "\">\n"; + page_str << "\t\t\t"; + AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str); + page_str << "\t\t</GraphicRegion>\n"; + res_it->Next(RIL_BLOCK); + continue; + } + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Handle horizontal and vertical lines. + page_str << "\t\t<SeparatorRegion id=\"r" << rcnt++ << "\">\n"; + page_str << "\t\t\t"; + AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str); + page_str << "\t\t</SeparatorRegion>\n"; + res_it->Next(RIL_BLOCK); + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + // Add Block to reading order + reading_order_str << "\t\t\t\t<RegionRefIndexed " << "index=\"" << rcnt + << "\" " << "regionRef=\"r" << rcnt << "\"/>\n"; + + float deskew_angle; + res_it->Orientation(&orientation_block, &writing_direction_block, + &textline_order_block, &deskew_angle); + block_conf = ((res_it->Confidence(RIL_BLOCK)) / 100.); + page_str << "\t\t<TextRegion id=\"r" << rcnt << "\" " << "custom=\"" + << "readingOrder {index:" << rcnt << ";} "; + if (writing_direction_block != WRITING_DIRECTION_LEFT_TO_RIGHT) { + page_str << "readingDirection {" + << WritingDirectionToStr(writing_direction_block) << ";} "; + } + page_str << "orientation {" << orientation_block << ";}\">\n"; + page_str << "\t\t\t"; + if ((!POLYGONFLAG || (orientation_block != ORIENTATION_PAGE_UP && + orientation_block != ORIENTATION_PAGE_DOWN)) && + LEVELFLAG == 0) { + AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str); + } + } + + // Writing direction changes at a per-word granularity + // tesseract::WritingDirection writing_direction_before; + auto writing_direction = writing_direction_block; + if (writing_direction_block != WRITING_DIRECTION_TOP_TO_BOTTOM) { + switch (res_it->WordDirection()) { + case DIR_LEFT_TO_RIGHT: + writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT; + break; + case DIR_RIGHT_TO_LEFT: + writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT; + break; + default: + break; + } + } + + bool ttb_flag = (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM); + // TODO: Rework polygon handling if line is skewed (90 or 180 degress), + // for now using LinePts + bool skewed_flag = (orientation_block != ORIENTATION_PAGE_UP && + orientation_block != ORIENTATION_PAGE_DOWN); + + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + // writing_direction_before = writing_direction; + line_conf = ((res_it->Confidence(RIL_TEXTLINE)) / 100.); + std::string textline = res_it->GetUTF8Text(RIL_TEXTLINE); + if (textline.back() == '\n') { + textline.erase(textline.length() - 1); + } + line_content << HOcrEscape(textline.c_str()); + line_str << "\t\t\t<TextLine id=\"r" << rcnt << "l" << lcnt << "\" "; + if (writing_direction != WRITING_DIRECTION_LEFT_TO_RIGHT && + writing_direction != writing_direction_block) { + line_str << "readingDirection=\"" + << WritingDirectionToStr(writing_direction) << "\" "; + } + line_str << "custom=\"" << "readingOrder {index:" << lcnt << ";}\">\n"; + // If level is linebased, get the line polygon and baseline + if (LEVELFLAG == 0 && (!POLYGONFLAG || skewed_flag)) { + AddPointToWordPolygon(res_it.get(), RIL_TEXTLINE, line_top_ltr_pts, + line_bottom_ltr_pts, writing_direction); + AddBaselineToPTA(res_it.get(), RIL_TEXTLINE, line_baseline_pts); + if (ttb_flag) { + line_baseline_pts = TransposePolygonline(line_baseline_pts); + } + } + } + + // Get information if word is last in line and if its last in the region + bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); + bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); + + float word_conf = ((res_it->Confidence(RIL_WORD)) / 100.); + + // Create word stream if word level output is active + if (LEVELFLAG > 0) { + word_str << "\t\t\t\t<Word id=\"r" << rcnt << "l" << lcnt << "w" << wcnt + << "\" readingDirection=\"" + << WritingDirectionToStr(writing_direction) << "\" " + << "custom=\"" << "readingOrder {index:" << wcnt << ";}\">\n"; + if ((!POLYGONFLAG || skewed_flag) || ttb_flag) { + AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts, + writing_direction); + } + } + + if (POLYGONFLAG && !skewed_flag && ttb_flag && LEVELFLAG == 0) { + AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts, + writing_direction); + } + + // Get the word baseline information + AddBaselineToPTA(res_it.get(), RIL_WORD, word_baseline_pts); + + // Get the word text content and polygon + do { + const std::unique_ptr<const char[]> grapheme( + res_it->GetUTF8Text(RIL_SYMBOL)); + if (grapheme && grapheme[0] != 0) { + word_content << HOcrEscape(grapheme.get()).c_str(); + if (POLYGONFLAG && !skewed_flag && !ttb_flag) { + AddPointToWordPolygon(res_it.get(), RIL_SYMBOL, word_top_pts, + word_bottom_pts, writing_direction); + } + } + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + + if (LEVELFLAG > 0 || (POLYGONFLAG && !skewed_flag)) { + // Sort wordpolygons + word_top_pts = RecalcPolygonline(word_top_pts, 1 - ttb_flag); + word_bottom_pts = RecalcPolygonline(word_bottom_pts, 0 + ttb_flag); + + // AppendLinePolygon + AppendLinePolygon(line_top_ltr_pts, line_top_rtl_pts, word_top_pts, + writing_direction); + AppendLinePolygon(line_bottom_ltr_pts, line_bottom_rtl_pts, + word_bottom_pts, writing_direction); + + // Word level polygon + word_bottom_pts = ReversePolygonline(word_bottom_pts, 1); + ptaJoin(word_top_pts, word_bottom_pts, 0, -1); + } + + // Reverse the word baseline direction for rtl + if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) { + word_baseline_pts = ReversePolygonline(word_baseline_pts, 1); + } + + // Write word information to the output + if (LEVELFLAG > 0) { + word_str << "\t\t\t\t\t"; + if (ttb_flag) { + word_top_pts = TransposePolygonline(word_top_pts); + } + AddPointsToPAGE(word_top_pts, word_str); + word_str << "\t\t\t\t\t"; + AddBaselinePtsToPAGE(word_baseline_pts, word_str); + word_str << "\t\t\t\t\t<TextEquiv index=\"1\" conf=\"" + << std::setprecision(4) << word_conf << "\">\n" + << "\t\t\t\t\t\t<Unicode>" << word_content.str() + << "</Unicode>\n" + << "\t\t\t\t\t</TextEquiv>\n" + << "\t\t\t\t</Word>\n"; + } + if (LEVELFLAG > 0 || (POLYGONFLAG && !skewed_flag)) { + // Add wordbaseline to linebaseline + if (ttb_flag) { + word_baseline_pts = TransposePolygonline(word_baseline_pts); + } + ptaJoin(line_baseline_pts, word_baseline_pts, 0, -1); + } + word_baseline_pts = DestroyAndCreatePta(word_baseline_pts); + + // Reset word pts arrays + word_top_pts = DestroyAndCreatePta(word_top_pts); + word_bottom_pts = DestroyAndCreatePta(word_bottom_pts); + + // Check why this combination of words is not working as expected! + // Write the word contents to the line +#if 0 + if (!last_word_in_line && writing_direction_before != writing_direction && + writing_direction < 2 && writing_direction_before < 2 && + res_it->WordDirection()) { + if (writing_direction_before == WRITING_DIRECTION_LEFT_TO_RIGHT) { + // line_content << "" << word_content.str(); + } else { + // line_content << "" << word_content.str(); + } + } else { + // line_content << word_content.str(); + } + // Check if WordIsNeutral + if (res_it->WordDirection()) { + writing_direction_before = writing_direction; + } +#endif + word_content.str(""); + wcnt++; + + // Write line information to the output + if (last_word_in_line) { + // Combine ltr and rtl lines + if (ptaGetCount(line_top_rtl_pts) != 0) { + ptaJoin(line_top_ltr_pts, line_top_rtl_pts, 0, -1); + line_top_rtl_pts = DestroyAndCreatePta(line_top_rtl_pts); + } + if (ptaGetCount(line_bottom_rtl_pts) != 0) { + ptaJoin(line_bottom_ltr_pts, line_bottom_rtl_pts, 0, -1); + line_bottom_rtl_pts = DestroyAndCreatePta(line_bottom_rtl_pts); + } + if ((POLYGONFLAG && !skewed_flag) || LEVELFLAG > 0) { + // Recalc Polygonlines + line_top_ltr_pts = RecalcPolygonline(line_top_ltr_pts, 1 - ttb_flag); + line_bottom_ltr_pts = + RecalcPolygonline(line_bottom_ltr_pts, 0 + ttb_flag); + + // Smooth the polygonline + SimplifyLinePolygon(line_top_ltr_pts, 5, 1 - ttb_flag); + SimplifyLinePolygon(line_bottom_ltr_pts, 5, 0 + ttb_flag); + + // Fit linepolygon matching the baselinepoints + line_baseline_pts = SortBaseline(line_baseline_pts, writing_direction); + // Fitting baseline into polygon is currently deactivated + // it tends to push the baseline directly under superscritpts + // but the baseline is always inside the polygon maybe it will be useful + // for something line_baseline_pts = + // FitBaselineIntoLinePolygon(line_bottom_ltr_pts, line_baseline_pts, + // writing_direction); and it only cut it to the length and simplifies + // the linepolyon + line_baseline_pts = ClipAndSimplifyBaseline( + line_bottom_ltr_pts, line_baseline_pts, writing_direction); + + // Update polygon of the block + UpdateBlockPoints(block_top_pts, block_bottom_pts, line_top_ltr_pts, + line_bottom_ltr_pts, lcnt, last_word_in_cblock); + } + // Line level polygon + line_bottom_ltr_pts = ReversePolygonline(line_bottom_ltr_pts, 1); + ptaJoin(line_top_ltr_pts, line_bottom_ltr_pts, 0, -1); + line_bottom_ltr_pts = DestroyAndCreatePta(line_bottom_ltr_pts); + + if (LEVELFLAG > 0 && !(POLYGONFLAG && !skewed_flag)) { + line_top_ltr_pts = PolygonToBoxCoords(line_top_ltr_pts); + } + + // Write level points + line_str << "\t\t\t\t"; + if (ttb_flag) { + line_top_ltr_pts = TransposePolygonline(line_top_ltr_pts); + } + AddPointsToPAGE(line_top_ltr_pts, line_str); + line_top_ltr_pts = DestroyAndCreatePta(line_top_ltr_pts); + + // Write Baseline + line_str << "\t\t\t\t"; + if (ttb_flag) { + line_baseline_pts = TransposePolygonline(line_baseline_pts); + } + AddBaselinePtsToPAGE(line_baseline_pts, line_str); + line_baseline_pts = DestroyAndCreatePta(line_baseline_pts); + + // Add word information if word level output is active + line_str << word_str.str(); + word_str.str(""); + // Write Line TextEquiv + line_str << "\t\t\t\t<TextEquiv index=\"1\" conf=\"" + << std::setprecision(4) << line_conf << "\">\n" + << "\t\t\t\t\t<Unicode>" << line_content.str() << "</Unicode>\n" + << "\t\t\t\t</TextEquiv>\n"; + line_str << "\t\t\t</TextLine>\n"; + region_content << line_content.str(); + line_content.str(""); + if (!last_word_in_cblock) { + region_content << '\n'; + } + lcnt++; + wcnt = 0; + } + + // Write region information to the output + if (last_word_in_cblock) { + if ((POLYGONFLAG && !skewed_flag) || LEVELFLAG > 0) { + page_str << "<Coords points=\""; + block_bottom_pts = ReversePolygonline(block_bottom_pts, 1); + ptaJoin(block_top_pts, block_bottom_pts, 0, -1); + if (ttb_flag) { + block_top_pts = TransposePolygonline(block_top_pts); + } + ptaGetMinMax(block_top_pts, &x1, &y1, &x2, &y2); + page_str << (l_uint32)x1 << "," << (l_uint32)y1; + page_str << " " << (l_uint32)x2 << "," << (l_uint32)y1; + page_str << " " << (l_uint32)x2 << "," << (l_uint32)y2; + page_str << " " << (l_uint32)x1 << "," << (l_uint32)y2; + page_str << "\"/>\n"; + block_top_pts = DestroyAndCreatePta(block_top_pts); + block_bottom_pts = DestroyAndCreatePta(block_bottom_pts); + } + page_str << line_str.str(); + line_str.str(""); + page_str << "\t\t\t<TextEquiv index=\"1\" conf=\"" << std::setprecision(4) + << block_conf << "\">\n" + << "\t\t\t\t<Unicode>" << region_content.str() << "</Unicode>\n" + << "\t\t\t</TextEquiv>\n"; + page_str << "\t\t</TextRegion>\n"; + region_content.str(""); + rcnt++; + lcnt = 0; + } + } + + // Destroy all point information + ptaDestroy(&block_top_pts); + ptaDestroy(&block_bottom_pts); + ptaDestroy(&line_top_ltr_pts); + ptaDestroy(&line_bottom_ltr_pts); + ptaDestroy(&line_top_rtl_pts); + ptaDestroy(&line_bottom_rtl_pts); + ptaDestroy(&word_top_pts); + ptaDestroy(&word_bottom_pts); + ptaDestroy(&word_baseline_pts); + ptaDestroy(&line_baseline_rtl_pts); + ptaDestroy(&line_baseline_ltr_pts); + ptaDestroy(&line_baseline_pts); + + reading_order_str << "\t\t\t</OrderedGroup>\n" + << "\t\t</ReadingOrder>\n"; + + reading_order_str << page_str.str(); + page_str.str(""); + const std::string &text = reading_order_str.str(); + reading_order_str.str(""); + + return copy_string(text); +} + +} // namespace tesseract
