Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/api/altorenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/api/altorenderer.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,261 @@ +// File: altorenderer.cpp +// Description: ALTO rendering interface +// Author: Jake Sebright + +// (C) Copyright 2018 +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "errcode.h" // for ASSERT_HOST +#include "helpers.h" // for copy_string +#include "tprintf.h" // for tprintf + +#include <tesseract/baseapi.h> +#include <tesseract/renderer.h> + +#include <memory> +#include <sstream> // for std::stringstream + +namespace tesseract { + +/// Add coordinates to specified TextBlock, TextLine or String bounding box. +/// Add word confidence if adding to a String bounding box. +/// +static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level, + std::stringstream &alto_str) { + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + + int hpos = left; + int vpos = top; + int height = bottom - top; + int width = right - left; + + alto_str << " HPOS=\"" << hpos << "\""; + alto_str << " VPOS=\"" << vpos << "\""; + alto_str << " WIDTH=\"" << width << "\""; + alto_str << " HEIGHT=\"" << height << "\""; + + if (level == RIL_WORD) { + int wc = it->Confidence(RIL_WORD); + alto_str << " WC=\"0." << wc << "\""; + } else { + alto_str << ">"; + } +} + +/// +/// Append the ALTO XML for the beginning of the document +/// +bool TessAltoRenderer::BeginDocumentHandler() { + // Delay the XML output because we need the name of the image file. + begin_document = true; + return true; +} + +/// +/// Append the ALTO XML for the layout of the image +/// +bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) { + if (begin_document) { + AppendString( + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" " + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " + "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# " + "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n" + "\t<Description>\n" + "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n" + "\t\t<sourceImageInformation>\n" + "\t\t\t<fileName>"); + + AppendString(api->GetInputName()); + + AppendString( + "</fileName>\n" + "\t\t</sourceImageInformation>\n" + "\t\t<OCRProcessing ID=\"OCR_0\">\n" + "\t\t\t<ocrProcessingStep>\n" + "\t\t\t\t<processingSoftware>\n" + "\t\t\t\t\t<softwareName>tesseract "); + AppendString(TessBaseAPI::Version()); + AppendString( + "</softwareName>\n" + "\t\t\t\t</processingSoftware>\n" + "\t\t\t</ocrProcessingStep>\n" + "\t\t</OCRProcessing>\n" + "\t</Description>\n" + "\t<Layout>\n"); + begin_document = false; + } + + const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum())); + if (text == nullptr) { + return false; + } + + AppendString(text.get()); + + return true; +} + +/// +/// Append the ALTO XML for the end of the document +/// +bool TessAltoRenderer::EndDocumentHandler() { + AppendString("\t</Layout>\n</alto>\n"); + + return true; +} + +TessAltoRenderer::TessAltoRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "xml"), + begin_document(false) {} + +/// +/// Make an XML-formatted string with ALTO markup from the internal +/// data structures. +/// +char *TessBaseAPI::GetAltoText(int page_number) { + return GetAltoText(nullptr, page_number); +} + +/// +/// Make an XML-formatted string with ALTO markup from the internal +/// data structures. +/// +char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { + if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) { + return nullptr; + } + + int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0; + + if (input_file_.empty()) { + SetInputName(nullptr); + } + + std::stringstream alto_str; + // Use "C" locale (needed for int values larger than 999). + alto_str.imbue(std::locale::classic()); + alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_ + << "\" PHYSICAL_IMG_NR=\"" << page_number << "\"" + << " ID=\"page_" << page_number << "\">\n" + << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\"" + << " WIDTH=\"" << rect_width_ << "\"" + << " HEIGHT=\"" << rect_height_ << "\">\n"; + + std::unique_ptr<ResultIterator> res_it(GetIterator()); + while (!res_it->Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + + int left, top, right, bottom; + auto block_type = res_it->BlockType(); + + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: { + // Handle all kinds of images. + // TODO: optionally add TYPE, for example TYPE="photo". + alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\""; + AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str); + alto_str << "</Illustration>\n"; + res_it->Next(RIL_BLOCK); + continue; + } + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Handle horizontal and vertical lines. + alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\""; + AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str); + alto_str << "</GraphicalElement >\n"; + res_it->Next(RIL_BLOCK); + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\""; + AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str); + alto_str << "\n"; + } + + if (res_it->IsAtBeginningOf(RIL_PARA)) { + alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\""; + AddBoxToAlto(res_it.get(), RIL_PARA, alto_str); + alto_str << "\n"; + } + + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\""; + AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str); + alto_str << "\n"; + } + + alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\""; + AddBoxToAlto(res_it.get(), RIL_WORD, alto_str); + alto_str << " CONTENT=\""; + + bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); + bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); + bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); + + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + + do { + const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL)); + if (grapheme && grapheme[0] != 0) { + alto_str << HOcrEscape(grapheme.get()).c_str(); + } + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + + alto_str << "\"/>"; + + wcnt++; + + if (last_word_in_line) { + alto_str << "\n\t\t\t\t\t\t</TextLine>\n"; + lcnt++; + } else { + int hpos = right; + int vpos = top; + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + int width = left - hpos; + alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos + << "\"/>\n"; + } + + if (last_word_in_tblock) { + alto_str << "\t\t\t\t\t</TextBlock>\n"; + tcnt++; + } + + if (last_word_in_cblock) { + alto_str << "\t\t\t\t</ComposedBlock>\n"; + bcnt++; + } + } + + alto_str << "\t\t\t</PrintSpace>\n" + << "\t\t</Page>\n"; + + return copy_string(alto_str.str()); +} + +} // namespace tesseract
