Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,524 @@ +/********************************************************************** + * File: hocrrenderer.cpp + * Description: Simple API for calling tesseract. + * Author: Ray Smith (original code from baseapi.cpp) + * Author: Stefan Weil (moved to separate file and cleaned code) + * + * (C) Copyright 2006, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include <tesseract/baseapi.h> // for TessBaseAPI +#include <locale> // for std::locale::classic +#include <memory> // for std::unique_ptr +#include <sstream> // for std::stringstream +#include <tesseract/renderer.h> +#include "helpers.h" // for copy_string +#include "tesseractclass.h" // for Tesseract + +namespace tesseract { + +/** + * Gets the block orientation at the current iterator position. + */ +static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) { + tesseract::Orientation orientation; + tesseract::WritingDirection writing_direction; + tesseract::TextlineOrder textline_order; + float deskew_angle; + it->Orientation(&orientation, &writing_direction, &textline_order, + &deskew_angle); + return orientation; +} + +/** + * Fits a line to the baseline at the given level, and appends its coefficients + * to the hOCR string. + * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for + * rotated textlines. For this reason, on textlines that are not upright, this + * method currently only inserts a 'textangle' property to indicate the rotation + * direction and does not add any baseline information to the hocr string. + */ +static void AddBaselineCoordsTohOCR(const PageIterator *it, + PageIteratorLevel level, + std::stringstream &hocr_str) { + tesseract::Orientation orientation = GetBlockTextOrientation(it); + if (orientation != ORIENTATION_PAGE_UP) { + hocr_str << "; textangle " << 360 - orientation * 90; + return; + } + + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + + // Try to get the baseline coordinates at this level. + int x1, y1, x2, y2; + if (!it->Baseline(level, &x1, &y1, &x2, &y2)) { + return; + } + // Following the description of this field of the hOCR spec, we convert the + // baseline coordinates so that "the bottom left of the bounding box is the + // origin". + x1 -= left; + x2 -= left; + y1 -= bottom; + y2 -= bottom; + + // Now fit a line through the points so we can extract coefficients for the + // equation: y = p1 x + p0 + if (x1 == x2) { + // Problem computing the polynomial coefficients. + return; + } + double p1 = (y2 - y1) / static_cast<double>(x2 - x1); + double p0 = y1 - p1 * x1; + + hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " + << round(p0 * 1000.0) / 1000.0; +} + +static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level, + std::stringstream &hocr_str) { + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + // This is the only place we use double quotes instead of single quotes, + // but it may too late to change for consistency + hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " + << bottom; + // Add baseline coordinates & heights for textlines only. + if (level == RIL_TEXTLINE) { + AddBaselineCoordsTohOCR(it, level, hocr_str); + // add custom height measures + float row_height, descenders, ascenders; // row attributes + it->RowAttributes(&row_height, &descenders, &ascenders); + // TODO(rays): Do we want to limit these to a single decimal place? + hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders + << "; x_ascenders " << ascenders; + } + hocr_str << "\">"; +} + +/** + * Make a HTML-formatted string with hOCR markup from the internal + * data structures. + * page_number is 0-based but will appear in the output as 1-based. + * Image name/input_file_ can be set by SetInputName before calling + * GetHOCRText + * STL removed from original patch submission and refactored by rays. + * Returned string must be freed with the delete [] operator. + */ +char *TessBaseAPI::GetHOCRText(int page_number) { + return GetHOCRText(nullptr, page_number); +} + +/** + * Make a HTML-formatted string with hOCR markup from the internal + * data structures. + * page_number is 0-based but will appear in the output as 1-based. + * Image name/input_file_ can be set by SetInputName before calling + * GetHOCRText + * STL removed from original patch submission and refactored by rays. + * Returned string must be freed with the delete [] operator. + */ +char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { + if (tesseract_ == nullptr || + (page_res_ == nullptr && Recognize(monitor) < 0)) { + return nullptr; + } + + int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1; + int page_id = page_number + 1; // hOCR uses 1-based page numbers. + bool para_is_ltr = true; // Default direction is LTR + const char *paragraph_lang = nullptr; + bool font_info = false; + bool hocr_boxes = false; + GetBoolVariable("hocr_font_info", &font_info); + GetBoolVariable("hocr_char_boxes", &hocr_boxes); + + if (input_file_.empty()) { + SetInputName(nullptr); + } + + std::stringstream hocr_str; + // Use "C" locale (needed for double values x_size and x_descenders). + hocr_str.imbue(std::locale::classic()); + // Use 8 digits for double values. + hocr_str.precision(8); + hocr_str << " <div class='ocr_page'" + << " id='" + << "page_" << page_id << "'" + << " title='image \""; + if (!input_file_.empty()) { + hocr_str << HOcrEscape(input_file_.c_str()); + } else { + hocr_str << "unknown"; + } + + hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " + << rect_width_ << " " << rect_height_ << "; ppageno " << page_number + << "; scan_res " << GetSourceYResolution() << " " + << GetSourceYResolution() << "'>\n"; + + std::unique_ptr<ResultIterator> res_it(GetIterator()); + while (!res_it->Empty(RIL_BLOCK)) { + int left, top, right, bottom; + auto block_type = res_it->BlockType(); + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: { + // Handle all kinds of images. + res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); + hocr_str << " <div class='ocr_photo' id='block_" << page_id << '_' + << bcnt++ << "' title=\"bbox " << left << " " << top << " " + << right << " " << bottom << "\"></div>\n"; + res_it->Next(RIL_BLOCK); + continue; + } + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Handle horizontal and vertical lines. + res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); + hocr_str << " <div class='ocr_separator' id='block_" << page_id << '_' + << bcnt++ << "' title=\"bbox " << left << " " << top << " " + << right << " " << bottom << "\"></div>\n"; + res_it->Next(RIL_BLOCK); + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + + // Open any new block/paragraph/textline. + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + para_is_ltr = true; // reset to default direction + hocr_str << " <div class='ocr_carea'" + << " id='" + << "block_" << page_id << "_" << bcnt << "'"; + AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str); + } + if (res_it->IsAtBeginningOf(RIL_PARA)) { + hocr_str << "\n <p class='ocr_par'"; + para_is_ltr = res_it->ParagraphIsLtr(); + if (!para_is_ltr) { + hocr_str << " dir='rtl'"; + } + hocr_str << " id='" + << "par_" << page_id << "_" << pcnt << "'"; + paragraph_lang = res_it->WordRecognitionLanguage(); + if (paragraph_lang) { + hocr_str << " lang='" << paragraph_lang << "'"; + } + AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str); + } + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + hocr_str << "\n <span class='"; + switch (block_type) { + case PT_HEADING_TEXT: + hocr_str << "ocr_header"; + break; + case PT_PULLOUT_TEXT: + hocr_str << "ocr_textfloat"; + break; + case PT_CAPTION_TEXT: + hocr_str << "ocr_caption"; + break; + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: + ASSERT_HOST(false); + break; + default: + hocr_str << "ocr_line"; + } + hocr_str << "' id='" + << "line_" << page_id << "_" << lcnt << "'"; + AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str); + } + + // Now, process the word... + int32_t lstm_choice_mode = tesseract_->lstm_choice_mode; + std::vector<std::vector<std::vector<std::pair<const char *, float>>>> + *rawTimestepMap = nullptr; + std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr; + if (lstm_choice_mode) { + CTCMap = res_it->GetBestLSTMSymbolChoices(); + rawTimestepMap = res_it->GetRawLSTMTimesteps(); + } + hocr_str << "\n <span class='ocrx_word'" + << " id='" + << "word_" << page_id << "_" << wcnt << "'"; + bool bold, italic, underlined, monospace, serif, smallcaps; + int pointsize, font_id; + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + const char *font_name = + res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, + &serif, &smallcaps, &pointsize, &font_id); + hocr_str << " title='bbox " << left << " " << top << " " << right << " " + << bottom << "; x_wconf " + << static_cast<int>(res_it->Confidence(RIL_WORD)); + if (font_info) { + if (font_name) { + hocr_str << "; x_font " << HOcrEscape(font_name).c_str(); + } + hocr_str << "; x_fsize " << pointsize; + } + hocr_str << "'"; + const char *lang = res_it->WordRecognitionLanguage(); + if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { + hocr_str << " lang='" << lang << "'"; + } + switch (res_it->WordDirection()) { + // Only emit direction if different from current paragraph direction + case DIR_LEFT_TO_RIGHT: + if (!para_is_ltr) { + hocr_str << " dir='ltr'"; + } + break; + case DIR_RIGHT_TO_LEFT: + if (para_is_ltr) { + hocr_str << " dir='rtl'"; + } + break; + case DIR_MIX: + case DIR_NEUTRAL: + default: // Do nothing. + break; + } + hocr_str << ">"; + bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); + bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); + bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); + if (bold) { + hocr_str << "<strong>"; + } + if (italic) { + hocr_str << "<em>"; + } + do { + const std::unique_ptr<const char[]> grapheme( + res_it->GetUTF8Text(RIL_SYMBOL)); + if (grapheme && grapheme[0] != 0) { + if (hocr_boxes) { + res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); + hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes " + << left << " " << top << " " << right << " " << bottom + << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>"; + } + hocr_str << HOcrEscape(grapheme.get()).c_str(); + if (hocr_boxes) { + hocr_str << "</span>"; + tesseract::ChoiceIterator ci(*res_it); + if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) { + std::vector<std::vector<std::pair<const char *, float>>> *symbol = + ci.Timesteps(); + hocr_str << "\n <span class='ocr_symbol'" + << " id='" + << "symbol_" << page_id << "_" << wcnt << "_" << scnt + << "'>"; + for (const auto ×tep : *symbol) { + hocr_str << "\n <span class='ocrx_cinfo'" + << " id='" + << "timestep" << page_id << "_" << wcnt << "_" << tcnt + << "'>"; + for (auto conf : timestep) { + hocr_str << "\n <span class='ocrx_cinfo'" + << " id='" + << "choice_" << page_id << "_" << wcnt << "_" << ccnt + << "'" + << " title='x_confs " << int(conf.second * 100) << "'>" + << HOcrEscape(conf.first).c_str() << "</span>"; + ++ccnt; + } + hocr_str << "</span>"; + ++tcnt; + } + hocr_str << "\n </span>"; + ++scnt; + } else if (lstm_choice_mode == 2) { + hocr_str << "\n <span class='ocrx_cinfo'" + << " id='" + << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt + << "'>"; + do { + const char *choice = ci.GetUTF8Text(); + float choiceconf = ci.Confidence(); + if (choice != nullptr) { + hocr_str << "\n <span class='ocrx_cinfo'" + << " id='" + << "choice_" << page_id << "_" << wcnt << "_" << ccnt + << "'" + << " title='x_confs " << choiceconf << "'>" + << HOcrEscape(choice).c_str() << "</span>"; + ccnt++; + } + } while (ci.Next()); + hocr_str << "\n </span>"; + tcnt++; + } + } + } + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + if (italic) { + hocr_str << "</em>"; + } + if (bold) { + hocr_str << "</strong>"; + } + // If the lstm choice mode is required it is added here + if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) { + for (const auto &symbol : *rawTimestepMap) { + hocr_str << "\n <span class='ocr_symbol'" + << " id='" + << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>"; + for (const auto ×tep : symbol) { + hocr_str << "\n <span class='ocrx_cinfo'" + << " id='" + << "timestep" << page_id << "_" << wcnt << "_" << tcnt + << "'>"; + for (auto &&conf : timestep) { + hocr_str << "\n <span class='ocrx_cinfo'" + << " id='" + << "choice_" << page_id << "_" << wcnt << "_" << ccnt + << "'" + << " title='x_confs " << int(conf.second * 100) << "'>" + << HOcrEscape(conf.first).c_str() << "</span>"; + ++ccnt; + } + hocr_str << "</span>"; + ++tcnt; + } + hocr_str << "</span>"; + ++scnt; + } + } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) { + for (const auto ×tep : *CTCMap) { + if (timestep.size() > 0) { + hocr_str << "\n <span class='ocrx_cinfo'" + << " id='" + << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt + << "'>"; + for (auto &j : timestep) { + float conf = 100 - tesseract_->lstm_rating_coefficient * j.second; + if (conf < 0.0f) { + conf = 0.0f; + } + if (conf > 100.0f) { + conf = 100.0f; + } + hocr_str << "\n <span class='ocrx_cinfo'" + << " id='" + << "choice_" << page_id << "_" << wcnt << "_" << ccnt + << "'" + << " title='x_confs " << conf << "'>" + << HOcrEscape(j.first).c_str() << "</span>"; + ccnt++; + } + hocr_str << "</span>"; + tcnt++; + } + } + } + // Close ocrx_word. + if (hocr_boxes || lstm_choice_mode > 0) { + hocr_str << "\n "; + } + hocr_str << "</span>"; + tcnt = 1; + ccnt = 1; + wcnt++; + // Close any ending block/paragraph/textline. + if (last_word_in_line) { + hocr_str << "\n </span>"; + lcnt++; + } + if (last_word_in_para) { + hocr_str << "\n </p>\n"; + pcnt++; + para_is_ltr = true; // back to default direction + } + if (last_word_in_block) { + hocr_str << " </div>\n"; + bcnt++; + } + } + hocr_str << " </div>\n"; + + return copy_string(hocr_str.str()); +} + +/********************************************************************** + * HOcr Text Renderer interface implementation + **********************************************************************/ +TessHOcrRenderer::TessHOcrRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "hocr") { + font_info_ = false; +} + +TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info) + : TessResultRenderer(outputbase, "hocr") { + font_info_ = font_info; +} + +bool TessHOcrRenderer::BeginDocumentHandler() { + AppendString( + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" + " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" " + "lang=\"en\">\n <head>\n <title>"); + AppendString(title()); + AppendString( + "</title>\n" + " <meta http-equiv=\"Content-Type\" content=\"text/html;" + "charset=utf-8\"/>\n" + " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR + "' />\n" + " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par" + " ocr_line ocrx_word ocrp_dir ocrp_lang ocrp_wconf"); + if (font_info_) { + AppendString(" ocrp_font ocrp_fsize"); + } + AppendString( + "'/>\n" + " </head>\n" + " <body>\n"); + + return true; +} + +bool TessHOcrRenderer::EndDocumentHandler() { + AppendString(" </body>\n</html>\n"); + + return true; +} + +bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) { + const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum())); + if (hocr == nullptr) { + return false; + } + + AppendString(hocr.get()); + + return true; +} + +} // namespace tesseract
