Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp comparison

comparison mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+/**********************************************************************
+* File:        hocrrenderer.cpp
+* Description: Simple API for calling tesseract.
+* Author:      Ray Smith (original code from baseapi.cpp)
+* Author:      Stefan Weil (moved to separate file and cleaned code)
+*
+* (C) Copyright 2006, Google Inc.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*
+**********************************************************************/
+#include <tesseract/baseapi.h> // for TessBaseAPI
+#include <locale>              // for std::locale::classic
+#include <memory>              // for std::unique_ptr
+#include <sstream>             // for std::stringstream
+#include <tesseract/renderer.h>
+#include "helpers.h"        // for copy_string
+#include "tesseractclass.h" // for Tesseract
+namespace tesseract {
+/**
+ * Returns the orientation that the iterator reports for its current position.
+ *
+ * The writing direction, textline order and deskew angle delivered by the
+ * same query are fetched but not used.
+ */
+static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
+  // Orientation() takes four output pointers, so placeholders are required
+  // even for the three values that are discarded here.
+  tesseract::Orientation page_orientation;
+  tesseract::WritingDirection unused_direction;
+  tesseract::TextlineOrder unused_order;
+  float unused_deskew;
+  it->Orientation(&page_orientation, &unused_direction, &unused_order, &unused_deskew);
+  return page_orientation;
+}
+/**
+ * Appends baseline information for the element at `level` to `hocr_str`.
+ *
+ * Upright text: the baseline reported by the iterator is re-expressed relative
+ * to the bottom-left corner of the element's bounding box and written as
+ * "; baseline <p1> <p0>", the coefficients of y = p1 x + p0, each rounded to
+ * three decimals.
+ *
+ * Text that is not upright: only "; textangle <degrees>" is written, because
+ * the hOCR spec is unclear on how baseline coefficients should be specified
+ * for rotated textlines.
+ *
+ * Nothing is written when the iterator cannot supply a baseline, or when both
+ * baseline points share the same x coordinate (no slope can be computed).
+ */
+static void AddBaselineCoordsTohOCR(const PageIterator *it,
+                                    PageIteratorLevel level,
+                                    std::stringstream &hocr_str) {
+  const tesseract::Orientation text_orientation = GetBlockTextOrientation(it);
+  if (text_orientation != ORIENTATION_PAGE_UP) {
+    const int text_angle = 360 - text_orientation * 90;
+    hocr_str << "; textangle " << text_angle;
+    return;
+  }
+
+  int box_left, box_top, box_right, box_bottom;
+  it->BoundingBox(level, &box_left, &box_top, &box_right, &box_bottom);
+
+  // Ask the iterator for the two end points of the baseline at this level.
+  int start_x, start_y, end_x, end_y;
+  const bool have_baseline = it->Baseline(level, &start_x, &start_y, &end_x, &end_y);
+  if (!have_baseline) {
+    return;
+  }
+
+  // The hOCR baseline property takes "the bottom left of the bounding box" as
+  // its origin, so shift both points accordingly.
+  const int rel_x1 = start_x - box_left;
+  const int rel_x2 = end_x - box_left;
+  const int rel_y1 = start_y - box_bottom;
+  const int rel_y2 = end_y - box_bottom;
+
+  // A vertical segment has no slope: the line y = p1 x + p0 cannot be fitted.
+  if (rel_x1 == rel_x2) {
+    return;
+  }
+
+  const double slope = (rel_y2 - rel_y1) / static_cast<double>(rel_x2 - rel_x1);
+  const double intercept = rel_y1 - slope * rel_x1;
+  hocr_str << "; baseline " << round(slope * 1000.0) / 1000.0 << " "
+           << round(intercept * 1000.0) / 1000.0;
+}
+/**
+ * Writes the title attribute of an hOCR element and closes its opening tag.
+ *
+ * The attribute always starts with "bbox left top right bottom" for the
+ * element at `level`. For RIL_TEXTLINE it additionally carries the baseline
+ * (or textangle) information and the row metrics x_size, x_descenders (the
+ * negated descenders value) and x_ascenders taken from RowAttributes().
+ */
+static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
+                         std::stringstream &hocr_str) {
+  int box_left, box_top, box_right, box_bottom;
+  it->BoundingBox(level, &box_left, &box_top, &box_right, &box_bottom);
+  // This title is delimited by double quotes while other attributes use
+  // single quotes; it may be too late to change that for consistency.
+  hocr_str << " title=\"bbox " << box_left << " " << box_top << " "
+           << box_right << " " << box_bottom;
+
+  // Only textlines get baseline coordinates and height measures.
+  if (level != RIL_TEXTLINE) {
+    hocr_str << "\">";
+    return;
+  }
+
+  AddBaselineCoordsTohOCR(it, level, hocr_str);
+
+  // Custom height measures of the row.
+  float line_height, line_descenders, line_ascenders;
+  it->RowAttributes(&line_height, &line_descenders, &line_ascenders);
+  // TODO(rays): Do we want to limit these to a single decimal place?
+  hocr_str << "; x_size " << line_height;
+  hocr_str << "; x_descenders " << -line_descenders;
+  hocr_str << "; x_ascenders " << line_ascenders;
+  hocr_str << "\">";
+}
+/**
+ * Makes an HTML-formatted string with hOCR markup from the internal data
+ * structures, without passing a monitor: forwards to
+ * GetHOCRText(ETEXT_DESC *, int) with a null monitor.
+ *
+ * page_number is 0-based but appears in the output as 1-based.
+ * The image name (input_file_) can be set with SetInputName before calling
+ * GetHOCRText.
+ * The returned string must be freed with the delete [] operator.
+ */
+char *TessBaseAPI::GetHOCRText(int page_number) {
+  ETEXT_DESC *const no_monitor = nullptr;
+  return GetHOCRText(no_monitor, page_number);
+}
+/**
+* Builds the hOCR markup for the current page as a single 'ocr_page' div and
+* returns it as a C string.
+*
+* monitor      Passed to Recognize(), which is only called when page_res_ is
+*              still null.
+* page_number  0-based page index. It is written unchanged as 'ppageno' and,
+*              incremented by one (page_id), used in every generated id.
+*
+* Returns nullptr when tesseract_ is null or when Recognize() reports a
+* negative result; otherwise the markup produced by copy_string().
+* Returned string must be freed with the delete [] operator.
+*
+* Image name/input_file_ can be set by SetInputName before calling
+* GetHOCRText; "unknown" is written when input_file_ is empty.
+*
+* Output options read inside this function:
+*  - hocr_font_info:   adds x_font and x_fsize to each word title.
+*  - hocr_char_boxes:  wraps each symbol in an 'ocrx_cinfo' span.
+*  - lstm_choice_mode: when non-zero, alternative choices are appended to each
+*    word. Mode 1 reads GetRawLSTMTimesteps() (or ChoiceIterator::Timesteps()
+*    when hocr_char_boxes is set); mode 2 reads GetBestLSTMSymbolChoices()
+*    (or walks a ChoiceIterator when hocr_char_boxes is set).
+*
+* Structure of the output: ocr_carea (block) > ocr_par (paragraph) >
+* line span > ocrx_word, plus ocr_photo and ocr_separator divs for image and
+* line blocks.
+*/
+char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
+if (tesseract_ == nullptr ||
+(page_res_ == nullptr && Recognize(monitor) < 0)) { // Recognize() runs only while page_res_ is still null
+return nullptr;
+}
+int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1; // id counters: line, block, paragraph, word, symbol, timestep, choice
+int page_id = page_number + 1; // hOCR uses 1-based page numbers.
+bool para_is_ltr = true;       // Default direction is LTR
+const char *paragraph_lang = nullptr;
+bool font_info = false;
+bool hocr_boxes = false;
+GetBoolVariable("hocr_font_info", &font_info);
+GetBoolVariable("hocr_char_boxes", &hocr_boxes);
+if (input_file_.empty()) {
+SetInputName(nullptr); // NOTE(review): presumably installs a default input name - confirm in SetInputName
+}
+std::stringstream hocr_str;
+// Use "C" locale (needed for double values x_size and x_descenders).
+hocr_str.imbue(std::locale::classic());
+// Use 8 digits for double values.
+hocr_str.precision(8);
+hocr_str << "  <div class='ocr_page'"
+<< " id='"
+<< "page_" << page_id << "'"
+<< " title='image \"";
+if (!input_file_.empty()) {
+hocr_str << HOcrEscape(input_file_.c_str());
+} else {
+hocr_str << "unknown";
+}
+hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
+<< rect_width_ << " " << rect_height_ << "; ppageno " << page_number // NOTE: width/height fill the last two bbox slots; ppageno stays 0-based
+<< "; scan_res " << GetSourceYResolution() << " "
+<< GetSourceYResolution() << "'>\n"; // NOTE: the Y resolution is written for both scan_res values
+std::unique_ptr<ResultIterator> res_it(GetIterator());
+while (!res_it->Empty(RIL_BLOCK)) { // each pass handles one word, one image/separator block, or skips an empty word position
+int left, top, right, bottom;
+auto block_type = res_it->BlockType();
+switch (block_type) {
+case PT_FLOWING_IMAGE:
+case PT_HEADING_IMAGE:
+case PT_PULLOUT_IMAGE: {
+// Handle all kinds of images.
+res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
+hocr_str << "   <div class='ocr_photo' id='block_" << page_id << '_'
+<< bcnt++ << "' title=\"bbox " << left << " " << top << " "
+<< right << " " << bottom << "\"></div>\n";
+res_it->Next(RIL_BLOCK);
+continue;
+}
+case PT_HORZ_LINE:
+case PT_VERT_LINE:
+// Handle horizontal and vertical lines.
+res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
+hocr_str << "   <div class='ocr_separator' id='block_" << page_id << '_'
+<< bcnt++ << "' title=\"bbox " << left << " " << top << " "
+<< right << " " << bottom << "\"></div>\n";
+res_it->Next(RIL_BLOCK);
+continue;
+case PT_NOISE:
+tprintf("TODO: Please report image which triggers the noise case.\n");
+ASSERT_HOST(false); // no break here: control reaches default if ASSERT_HOST ever returns
+default:
+break;
+}
+if (res_it->Empty(RIL_WORD)) { // nothing to print at this position; advance to the next word
+res_it->Next(RIL_WORD);
+continue;
+}
+// Open any new block/paragraph/textline.
+if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+para_is_ltr = true; // reset to default direction
+hocr_str << "   <div class='ocr_carea'"
+<< " id='"
+<< "block_" << page_id << "_" << bcnt << "'";
+AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
+}
+if (res_it->IsAtBeginningOf(RIL_PARA)) {
+hocr_str << "\n    <p class='ocr_par'";
+para_is_ltr = res_it->ParagraphIsLtr();
+if (!para_is_ltr) {
+hocr_str << " dir='rtl'";
+}
+hocr_str << " id='"
+<< "par_" << page_id << "_" << pcnt << "'";
+paragraph_lang = res_it->WordRecognitionLanguage(); // language of the paragraph's first word stands for the paragraph
+if (paragraph_lang) {
+hocr_str << " lang='" << paragraph_lang << "'";
+}
+AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
+}
+if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+hocr_str << "\n     <span class='";
+switch (block_type) { // the line's class depends on the type of the enclosing block
+case PT_HEADING_TEXT:
+hocr_str << "ocr_header";
+break;
+case PT_PULLOUT_TEXT:
+hocr_str << "ocr_textfloat";
+break;
+case PT_CAPTION_TEXT:
+hocr_str << "ocr_caption";
+break;
+case PT_FLOWING_IMAGE:
+case PT_HEADING_IMAGE:
+case PT_PULLOUT_IMAGE:
+ASSERT_HOST(false); // image blocks were already emitted by the first switch and skipped with continue
+break;
+default:
+hocr_str << "ocr_line";
+}
+hocr_str << "' id='"
+<< "line_" << page_id << "_" << lcnt << "'";
+AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
+}
+// Now, process the word...
+int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
+std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
+*rawTimestepMap = nullptr;
+std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
+if (lstm_choice_mode) { // both maps are fetched here, before the symbol loop below moves the iterator
+CTCMap = res_it->GetBestLSTMSymbolChoices();
+rawTimestepMap = res_it->GetRawLSTMTimesteps();
+}
+hocr_str << "\n      <span class='ocrx_word'"
+<< " id='"
+<< "word_" << page_id << "_" << wcnt << "'";
+bool bold, italic, underlined, monospace, serif, smallcaps;
+int pointsize, font_id;
+res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
+const char *font_name =
+res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
+&serif, &smallcaps, &pointsize, &font_id);
+hocr_str << " title='bbox " << left << " " << top << " " << right << " "
+<< bottom << "; x_wconf "
+<< static_cast<int>(res_it->Confidence(RIL_WORD)); // confidence is truncated to an integer
+if (font_info) {
+if (font_name) {
+hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
+}
+hocr_str << "; x_fsize " << pointsize;
+}
+hocr_str << "'";
+const char *lang = res_it->WordRecognitionLanguage();
+if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { // word-level lang only when it differs from the paragraph's
+hocr_str << " lang='" << lang << "'";
+}
+switch (res_it->WordDirection()) {
+// Only emit direction if different from current paragraph direction
+case DIR_LEFT_TO_RIGHT:
+if (!para_is_ltr) {
+hocr_str << " dir='ltr'";
+}
+break;
+case DIR_RIGHT_TO_LEFT:
+if (para_is_ltr) {
+hocr_str << " dir='rtl'";
+}
+break;
+case DIR_MIX:
+case DIR_NEUTRAL:
+default: // Do nothing.
+break;
+}
+hocr_str << ">";
+bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); // sampled now: the symbol loop below advances the iterator past this word
+bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
+bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
+if (bold) {
+hocr_str << "<strong>";
+}
+if (italic) {
+hocr_str << "<em>";
+}
+do { // one pass per symbol of the current word
+const std::unique_ptr<const char[]> grapheme(
+res_it->GetUTF8Text(RIL_SYMBOL));
+if (grapheme && grapheme[0] != 0) {
+if (hocr_boxes) {
+res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
+hocr_str << "\n       <span class='ocrx_cinfo' title='x_bboxes "
+<< left << " " << top << " " << right << " " << bottom
+<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
+}
+hocr_str << HOcrEscape(grapheme.get()).c_str();
+if (hocr_boxes) {
+hocr_str << "</span>";
+tesseract::ChoiceIterator ci(*res_it);
+if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
+std::vector<std::vector<std::pair<const char *, float>>> *symbol =
+ci.Timesteps();
+hocr_str << "\n        <span class='ocr_symbol'"
+<< " id='"
+<< "symbol_" << page_id << "_" << wcnt << "_" << scnt
+<< "'>";
+for (const auto &timestep : *symbol) {
+hocr_str << "\n         <span class='ocrx_cinfo'"
+<< " id='"
+<< "timestep" << page_id << "_" << wcnt << "_" << tcnt
+<< "'>";
+for (auto conf : timestep) {
+hocr_str << "\n          <span class='ocrx_cinfo'"
+<< " id='"
+<< "choice_" << page_id << "_" << wcnt << "_" << ccnt
+<< "'"
+<< " title='x_confs " << int(conf.second * 100) << "'>"
+<< HOcrEscape(conf.first).c_str() << "</span>";
+++ccnt;
+}
+hocr_str << "</span>";
+++tcnt;
+}
+hocr_str << "\n        </span>";
+++scnt;
+} else if (lstm_choice_mode == 2) {
+hocr_str << "\n        <span class='ocrx_cinfo'"
+<< " id='"
+<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
+<< "'>";
+do {
+const char *choice = ci.GetUTF8Text();
+float choiceconf = ci.Confidence();
+if (choice != nullptr) {
+hocr_str << "\n         <span class='ocrx_cinfo'"
+<< " id='"
+<< "choice_" << page_id << "_" << wcnt << "_" << ccnt
+<< "'"
+<< " title='x_confs " << choiceconf << "'>"
+<< HOcrEscape(choice).c_str() << "</span>";
+ccnt++;
+}
+} while (ci.Next());
+hocr_str << "\n        </span>";
+tcnt++;
+}
+}
+}
+res_it->Next(RIL_SYMBOL);
+} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); // stop once the iterator reports an empty block or sits on the next word
+if (italic) {
+hocr_str << "</em>";
+}
+if (bold) {
+hocr_str << "</strong>";
+}
+// If the lstm choice mode is required it is added here
+if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
+for (const auto &symbol : *rawTimestepMap) {
+hocr_str << "\n       <span class='ocr_symbol'"
+<< " id='"
+<< "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
+for (const auto &timestep : symbol) {
+hocr_str << "\n        <span class='ocrx_cinfo'"
+<< " id='"
+<< "timestep" << page_id << "_" << wcnt << "_" << tcnt
+<< "'>";
+for (auto &&conf : timestep) {
+hocr_str << "\n         <span class='ocrx_cinfo'"
+<< " id='"
+<< "choice_" << page_id << "_" << wcnt << "_" << ccnt
+<< "'"
+<< " title='x_confs " << int(conf.second * 100) << "'>"
+<< HOcrEscape(conf.first).c_str() << "</span>";
+++ccnt;
+}
+hocr_str << "</span>";
+++tcnt;
+}
+hocr_str << "</span>";
+++scnt;
+}
+} else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
+for (const auto &timestep : *CTCMap) {
+if (timestep.size() > 0) {
+hocr_str << "\n       <span class='ocrx_cinfo'"
+<< " id='"
+<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
+<< "'>";
+for (auto &j : timestep) {
+float conf = 100 - tesseract_->lstm_rating_coefficient * j.second; // turned into a confidence value and clamped to [0, 100] below
+if (conf < 0.0f) {
+conf = 0.0f;
+}
+if (conf > 100.0f) {
+conf = 100.0f;
+}
+hocr_str << "\n        <span class='ocrx_cinfo'"
+<< " id='"
+<< "choice_" << page_id << "_" << wcnt << "_" << ccnt
+<< "'"
+<< " title='x_confs " << conf << "'>"
+<< HOcrEscape(j.first).c_str() << "</span>";
+ccnt++;
+}
+hocr_str << "</span>";
+tcnt++;
+}
+}
+}
+// Close ocrx_word.
+if (hocr_boxes || lstm_choice_mode > 0) {
+hocr_str << "\n      ";
+}
+hocr_str << "</span>";
+tcnt = 1; // timestep and choice counters restart for every word; scnt is not reset
+ccnt = 1;
+wcnt++;
+// Close any ending block/paragraph/textline.
+if (last_word_in_line) {
+hocr_str << "\n     </span>";
+lcnt++;
+}
+if (last_word_in_para) {
+hocr_str << "\n    </p>\n";
+pcnt++;
+para_is_ltr = true; // back to default direction
+}
+if (last_word_in_block) {
+hocr_str << "   </div>\n";
+bcnt++;
+}
+}
+hocr_str << "  </div>\n";
+return copy_string(hocr_str.str()); // NOTE(review): copy_string (helpers.h) presumably allocates with new[] - confirm against the delete [] contract
+}
+/**********************************************************************
+ * HOcr Text Renderer interface implementation
+ **********************************************************************/
+/**
+ * Creates an hOCR renderer with font information switched off.
+ *
+ * Delegates to the (outputbase, font_info) constructor, which hands
+ * outputbase and the string "hocr" to TessResultRenderer.
+ */
+TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
+    : TessHOcrRenderer(outputbase, false) {}
+/**
+ * Creates an hOCR renderer, handing outputbase and the string "hocr" to
+ * TessResultRenderer.
+ *
+ * font_info is stored in font_info_; when it is true, BeginDocumentHandler()
+ * adds " ocrp_font ocrp_fsize" to the advertised ocr-capabilities.
+ */
+TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
+    : TessResultRenderer(outputbase, "hocr"), font_info_(font_info) {}
+/**
+ * Appends the opening part of the hOCR document: XML declaration, XHTML
+ * doctype, <html>/<head> with the title() text, the meta tags naming the OCR
+ * system and its capabilities, and the opening <body> tag.
+ *
+ * " ocrp_font ocrp_fsize" is added to the capabilities when font_info_ is set.
+ * Always returns true.
+ */
+bool TessHOcrRenderer::BeginDocumentHandler() {
+  // Everything that precedes the document title.
+  const char *const prologue =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
+      "    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
+      "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"
+      " <head>\n"
+      "  <title>";
+  // Meta tags up to, but not including, the end of the capabilities list.
+  const char *const meta_tags =
+      "</title>\n"
+      "  <meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n"
+      "  <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR "' />\n"
+      "  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line"
+      " ocrx_word ocrp_dir ocrp_lang ocrp_wconf";
+
+  AppendString(prologue);
+  AppendString(title());
+  AppendString(meta_tags);
+  if (font_info_) {
+    AppendString(" ocrp_font ocrp_fsize");
+  }
+  // Terminate the capabilities attribute, close the head and open the body.
+  AppendString("'/>\n </head>\n <body>\n");
+  return true;
+}
+/**
+ * Appends the closing </body> and </html> tags. Always returns true.
+ */
+bool TessHOcrRenderer::EndDocumentHandler() {
+  const char *const epilogue = " </body>\n</html>\n";
+  AppendString(epilogue);
+  return true;
+}
+/**
+ * Appends the hOCR markup of the page numbered imagenum(), obtained from
+ * api->GetHOCRText().
+ *
+ * Returns false, appending nothing, when GetHOCRText() yields nullptr;
+ * returns true otherwise. The markup buffer is released when this function
+ * returns.
+ */
+bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> page_markup(api->GetHOCRText(imagenum()));
+  const bool rendered = (page_markup != nullptr);
+  if (rendered) {
+    AppendString(page_markup.get());
+  }
+  return rendered;
+}
+} // namespace tesseract

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp @ 2:b50eed0cc0ef upstream