Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp @ 46:7ee69f120f19 default tip
>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:17:30 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
/**********************************************************************
 * File:        hocrrenderer.cpp
 * Description: Simple API for calling tesseract.
 * Author:      Ray Smith (original code from baseapi.cpp)
 * Author:      Stefan Weil (moved to separate file and cleaned code)
 *
 * (C) Copyright 2006, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <tesseract/baseapi.h> // for TessBaseAPI
#include <cmath>               // for std::round
#include <cstdint>             // for int32_t
#include <cstring>             // for std::strcmp
#include <locale>              // for std::locale::classic
#include <memory>              // for std::unique_ptr
#include <sstream>             // for std::stringstream
#include <utility>             // for std::pair
#include <vector>              // for std::vector
#include <tesseract/renderer.h>
#include "helpers.h"        // for copy_string
#include "tesseractclass.h" // for Tesseract

namespace tesseract {

/**
 * Gets the block orientation at the current iterator position.
 *
 * Queries the iterator's Orientation() and returns only the orientation
 * value; writing direction, textline order and deskew angle are discarded.
 */
static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
  tesseract::Orientation orientation;
  tesseract::WritingDirection writing_direction;
  tesseract::TextlineOrder textline_order;
  float deskew_angle;
  it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
  return orientation;
}

/**
 * Fits a line to the baseline at the given level, and appends its coefficients
 * to the hOCR string.
 * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
 * rotated textlines. For this reason, on textlines that are not upright, this
 * method currently only inserts a 'textangle' property to indicate the rotation
 * direction and does not add any baseline information to the hocr string.
 *
 * Nothing is appended when the iterator reports no baseline or when the two
 * baseline points share the same x coordinate.
 */
static void AddBaselineCoordsTohOCR(const PageIterator *it, PageIteratorLevel level,
                                    std::stringstream &hocr_str) {
  tesseract::Orientation orientation = GetBlockTextOrientation(it);
  if (orientation != ORIENTATION_PAGE_UP) {
    hocr_str << "; textangle " << 360 - orientation * 90;
    return;
  }

  // Zero-initialised so the offsets below are well defined even if
  // BoundingBox() leaves its outputs untouched.
  int left = 0, top = 0, right = 0, bottom = 0;
  it->BoundingBox(level, &left, &top, &right, &bottom);

  // Try to get the baseline coordinates at this level.
  int x1 = 0, y1 = 0, x2 = 0, y2 = 0;
  if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
    return;
  }
  // Following the description of this field of the hOCR spec, we convert the
  // baseline coordinates so that "the bottom left of the bounding box is the
  // origin".
  x1 -= left;
  x2 -= left;
  y1 -= bottom;
  y2 -= bottom;

  // Now fit a line through the points so we can extract coefficients for the
  // equation:  y = p1 x + p0
  if (x1 == x2) {
    // Problem computing the polynomial coefficients.
    return;
  }
  double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
  double p0 = y1 - p1 * x1;

  // Both coefficients are rounded to three decimal places.
  hocr_str << "; baseline " << std::round(p1 * 1000.0) / 1000.0 << " "
           << std::round(p0 * 1000.0) / 1000.0;
}

/**
 * Appends a title attribute holding the bounding box of the element at the
 * given level and then closes the currently open tag with '>'.
 * For RIL_TEXTLINE the baseline (or textangle) and the row attributes
 * x_size, x_descenders and x_ascenders are appended as well.
 */
static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
                         std::stringstream &hocr_str) {
  // Zero-initialised so that defined values are written even if BoundingBox()
  // leaves its outputs untouched.
  int left = 0, top = 0, right = 0, bottom = 0;
  it->BoundingBox(level, &left, &top, &right, &bottom);
  // This is the only place we use double quotes instead of single quotes,
  // but it may be too late to change for consistency
  hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " << bottom;
  // Add baseline coordinates & heights for textlines only.
  if (level == RIL_TEXTLINE) {
    AddBaselineCoordsTohOCR(it, level, hocr_str);
    // add custom height measures
    float row_height, descenders, ascenders; // row attributes
    it->RowAttributes(&row_height, &descenders, &ascenders);
    // TODO(rays): Do we want to limit these to a single decimal place?
    hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders << "; x_ascenders "
             << ascenders;
  }
  hocr_str << "\">";
}

/**
 * Make a HTML-formatted string with hOCR markup from the internal
 * data structures.
 * page_number is 0-based but will appear in the output as 1-based.
 * Image name/input_file_ can be set by SetInputName before calling
 * GetHOCRText
 * STL removed from original patch submission and refactored by rays.
 * Returned string must be freed with the delete [] operator.
 *
 * This overload forwards to GetHOCRText(ETEXT_DESC *, int) with a null
 * monitor.
 */
char *TessBaseAPI::GetHOCRText(int page_number) {
  return GetHOCRText(nullptr, page_number);
}

/**
 * Make a HTML-formatted string with hOCR markup from the internal
 * data structures.
 * page_number is 0-based but will appear in the output as 1-based.
 * Image name/input_file_ can be set by SetInputName before calling
 * GetHOCRText
 * STL removed from original patch submission and refactored by rays.
 * Returned string must be freed with the delete [] operator.
 *
 * @param monitor     passed to Recognize() when no page results exist yet.
 * @param page_number 0-based page index; element ids use page_number + 1,
 *                    while the ppageno property carries page_number itself.
 * @return the hOCR markup for the page, or nullptr if tesseract_ is null,
 *         Recognize() reports an error, or no result iterator is available.
 */
char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
    return nullptr;
  }

  // Running counters used to build element ids: line, block, paragraph, word,
  // symbol, timestep and choice. tcnt and ccnt restart at 1 for every word.
  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
  bool para_is_ltr = true;       // Default direction is LTR
  const char *paragraph_lang = nullptr;
  bool font_info = false;
  bool hocr_boxes = false;
  GetBoolVariable("hocr_font_info", &font_info);
  GetBoolVariable("hocr_char_boxes", &hocr_boxes);

  if (input_file_.empty()) {
    SetInputName(nullptr);
  }

  std::stringstream hocr_str;
  // Use "C" locale (needed for double values x_size and x_descenders).
  hocr_str.imbue(std::locale::classic());
  // Use 8 digits for double values.
  hocr_str.precision(8);
  hocr_str << " <div class='ocr_page'"
           << " id='"
           << "page_" << page_id << "'"
           << " title='image \"";
  if (!input_file_.empty()) {
    hocr_str << HOcrEscape(input_file_.c_str());
  } else {
    hocr_str << "unknown";
  }

  // NOTE(review): the third and fourth bbox values written here are
  // rect_width_ and rect_height_; confirm that this is the intended
  // right/bottom corner when rect_left_/rect_top_ are non-zero.
  // The Y resolution is written for both scan_res values.
  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " << rect_width_ << " "
           << rect_height_ << "; ppageno " << page_number << "; scan_res "
           << GetSourceYResolution() << " " << GetSourceYResolution() << "'>\n";

  std::unique_ptr<ResultIterator> res_it(GetIterator());
  if (res_it == nullptr) {
    // Every statement below dereferences the iterator, so fail the same way
    // as the early exit at the top instead of dereferencing a null pointer.
    return nullptr;
  }

  while (!res_it->Empty(RIL_BLOCK)) {
    // Zero-initialised so that defined values are written even if a
    // BoundingBox() call leaves its outputs untouched.
    int left = 0, top = 0, right = 0, bottom = 0;
    auto block_type = res_it->BlockType();

    switch (block_type) {
      case PT_FLOWING_IMAGE:
      case PT_HEADING_IMAGE:
      case PT_PULLOUT_IMAGE: {
        // Handle all kinds of images.
        res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
        hocr_str << " <div class='ocr_photo' id='block_" << page_id << '_' << bcnt++
                 << "' title=\"bbox " << left << " " << top << " " << right << " " << bottom
                 << "\"></div>\n";
        res_it->Next(RIL_BLOCK);
        continue;
      }
      case PT_HORZ_LINE:
      case PT_VERT_LINE:
        // Handle horizontal and vertical lines.
        res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
        hocr_str << " <div class='ocr_separator' id='block_" << page_id << '_' << bcnt++
                 << "' title=\"bbox " << left << " " << top << " " << right << " " << bottom
                 << "\"></div>\n";
        res_it->Next(RIL_BLOCK);
        continue;
      case PT_NOISE:
        tprintf("TODO: Please report image which triggers the noise case.\n");
        ASSERT_HOST(false);
        break;
      default:
        break;
    }

    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }

    // Open any new block/paragraph/textline.
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      para_is_ltr = true; // reset to default direction
      hocr_str << " <div class='ocr_carea'"
               << " id='"
               << "block_" << page_id << "_" << bcnt << "'";
      AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
    }
    if (res_it->IsAtBeginningOf(RIL_PARA)) {
      hocr_str << "\n <p class='ocr_par'";
      para_is_ltr = res_it->ParagraphIsLtr();
      if (!para_is_ltr) {
        hocr_str << " dir='rtl'";
      }
      hocr_str << " id='"
               << "par_" << page_id << "_" << pcnt << "'";
      // The language of the paragraph's first word is used as the paragraph
      // language; words that differ get their own lang attribute below.
      paragraph_lang = res_it->WordRecognitionLanguage();
      if (paragraph_lang) {
        hocr_str << " lang='" << paragraph_lang << "'";
      }
      AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
    }
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      hocr_str << "\n <span class='";
      switch (block_type) {
        case PT_HEADING_TEXT:
          hocr_str << "ocr_header";
          break;
        case PT_PULLOUT_TEXT:
          hocr_str << "ocr_textfloat";
          break;
        case PT_CAPTION_TEXT:
          hocr_str << "ocr_caption";
          break;
        case PT_FLOWING_IMAGE:
        case PT_HEADING_IMAGE:
        case PT_PULLOUT_IMAGE:
          // Image blocks were already consumed by the switch above.
          ASSERT_HOST(false);
          break;
        default:
          hocr_str << "ocr_line";
      }
      hocr_str << "' id='"
               << "line_" << page_id << "_" << lcnt << "'";
      AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
    }

    // Now, process the word...
    int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
    std::vector<std::vector<std::vector<std::pair<const char *, float>>>> *rawTimestepMap =
        nullptr;
    std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
    if (lstm_choice_mode) {
      CTCMap = res_it->GetBestLSTMSymbolChoices();
      rawTimestepMap = res_it->GetRawLSTMTimesteps();
    }
    hocr_str << "\n <span class='ocrx_word'"
             << " id='"
             << "word_" << page_id << "_" << wcnt << "'";
    bool bold, italic, underlined, monospace, serif, smallcaps;
    int pointsize, font_id;
    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
    const char *font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
                                                       &serif, &smallcaps, &pointsize, &font_id);
    hocr_str << " title='bbox " << left << " " << top << " " << right << " " << bottom
             << "; x_wconf " << static_cast<int>(res_it->Confidence(RIL_WORD));
    if (font_info) {
      if (font_name) {
        hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
      }
      hocr_str << "; x_fsize " << pointsize;
    }
    hocr_str << "'";
    // Only emit the word language if it differs from the paragraph language.
    const char *lang = res_it->WordRecognitionLanguage();
    if (lang && (!paragraph_lang || std::strcmp(lang, paragraph_lang))) {
      hocr_str << " lang='" << lang << "'";
    }
    switch (res_it->WordDirection()) {
      // Only emit direction if different from current paragraph direction
      case DIR_LEFT_TO_RIGHT:
        if (!para_is_ltr) {
          hocr_str << " dir='ltr'";
        }
        break;
      case DIR_RIGHT_TO_LEFT:
        if (para_is_ltr) {
          hocr_str << " dir='rtl'";
        }
        break;
      case DIR_MIX:
      case DIR_NEUTRAL:
      default: // Do nothing.
        break;
    }
    hocr_str << ">";

    // These must be captured before the symbol loop advances the iterator
    // past the current word.
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    if (bold) {
      hocr_str << "<strong>";
    }
    if (italic) {
      hocr_str << "<em>";
    }

    // Emit the word symbol by symbol until the iterator reaches the next word
    // or runs out of blocks.
    do {
      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != 0) {
        if (hocr_boxes) {
          res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
          hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes " << left << " " << top << " "
                   << right << " " << bottom << "; x_conf " << res_it->Confidence(RIL_SYMBOL)
                   << "'>";
        }
        hocr_str << HOcrEscape(grapheme.get()).c_str();
        if (hocr_boxes) {
          hocr_str << "</span>";
          tesseract::ChoiceIterator ci(*res_it);
          if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
            std::vector<std::vector<std::pair<const char *, float>>> *symbol = ci.Timesteps();
            hocr_str << "\n <span class='ocr_symbol'"
                     << " id='"
                     << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
            for (const auto &timestep : *symbol) {
              hocr_str << "\n <span class='ocrx_cinfo'"
                       << " id='"
                       << "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
              for (const auto &conf : timestep) {
                hocr_str << "\n <span class='ocrx_cinfo'"
                         << " id='"
                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
                         << " title='x_confs " << static_cast<int>(conf.second * 100) << "'>"
                         << HOcrEscape(conf.first).c_str() << "</span>";
                ++ccnt;
              }
              hocr_str << "</span>";
              ++tcnt;
            }
            hocr_str << "\n </span>";
            ++scnt;
          } else if (lstm_choice_mode == 2) {
            hocr_str << "\n <span class='ocrx_cinfo'"
                     << " id='"
                     << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
            do {
              const char *choice = ci.GetUTF8Text();
              float choiceconf = ci.Confidence();
              if (choice != nullptr) {
                hocr_str << "\n <span class='ocrx_cinfo'"
                         << " id='"
                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
                         << " title='x_confs " << choiceconf << "'>"
                         << HOcrEscape(choice).c_str() << "</span>";
                ccnt++;
              }
            } while (ci.Next());
            hocr_str << "\n </span>";
            tcnt++;
          }
        }
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));

    if (italic) {
      hocr_str << "</em>";
    }
    if (bold) {
      hocr_str << "</strong>";
    }

    // If the lstm choice mode is required it is added here
    if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
      for (const auto &symbol : *rawTimestepMap) {
        hocr_str << "\n <span class='ocr_symbol'"
                 << " id='"
                 << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
        for (const auto &timestep : symbol) {
          hocr_str << "\n <span class='ocrx_cinfo'"
                   << " id='"
                   << "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
          for (auto &&conf : timestep) {
            hocr_str << "\n <span class='ocrx_cinfo'"
                     << " id='"
                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
                     << " title='x_confs " << static_cast<int>(conf.second * 100) << "'>"
                     << HOcrEscape(conf.first).c_str() << "</span>";
            ++ccnt;
          }
          hocr_str << "</span>";
          ++tcnt;
        }
        hocr_str << "</span>";
        ++scnt;
      }
    } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
      for (const auto &timestep : *CTCMap) {
        if (timestep.size() > 0) {
          hocr_str << "\n <span class='ocrx_cinfo'"
                   << " id='"
                   << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
          for (auto &j : timestep) {
            // Derive the confidence from the rating and clamp it to [0, 100].
            float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
            if (conf < 0.0f) {
              conf = 0.0f;
            }
            if (conf > 100.0f) {
              conf = 100.0f;
            }
            hocr_str << "\n <span class='ocrx_cinfo'"
                     << " id='"
                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
                     << " title='x_confs " << conf << "'>" << HOcrEscape(j.first).c_str()
                     << "</span>";
            ccnt++;
          }
          hocr_str << "</span>";
          tcnt++;
        }
      }
    }

    // Close ocrx_word.
    if (hocr_boxes || lstm_choice_mode > 0) {
      hocr_str << "\n ";
    }
    hocr_str << "</span>";
    tcnt = 1;
    ccnt = 1;
    wcnt++;

    // Close any ending block/paragraph/textline.
    if (last_word_in_line) {
      hocr_str << "\n </span>";
      lcnt++;
    }
    if (last_word_in_para) {
      hocr_str << "\n </p>\n";
      pcnt++;
      para_is_ltr = true; // back to default direction
    }
    if (last_word_in_block) {
      hocr_str << " </div>\n";
      bcnt++;
    }
  }
  // Close ocr_page.
  hocr_str << " </div>\n";

  return copy_string(hocr_str.str());
}

/**********************************************************************
 * HOcr Text Renderer interface implementation
 **********************************************************************/

/**
 * Constructs a renderer for outputbase with font information disabled.
 * "hocr" is handed to the base class as its second argument (presumably the
 * output file extension; see TessResultRenderer).
 */
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "hocr") {
  font_info_ = false;
}

/**
 * Constructs a renderer for outputbase. font_info controls whether the
 * document header advertises the ocrp_font and ocrp_fsize capabilities.
 */
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr") {
  font_info_ = font_info;
}

/**
 * Writes the XHTML prologue, the document title, the ocr-system and
 * ocr-capabilities meta tags and the opening body tag. Always returns true.
 */
bool TessHOcrRenderer::BeginDocumentHandler() {
  AppendString(
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
      " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
      "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
      "lang=\"en\">\n <head>\n <title>");
  AppendString(title());
  AppendString(
      "</title>\n"
      " <meta http-equiv=\"Content-Type\" content=\"text/html;"
      "charset=utf-8\"/>\n"
      " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
      "' />\n"
      " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
      " ocr_line ocrx_word ocrp_dir ocrp_lang ocrp_wconf");
  if (font_info_) {
    AppendString(" ocrp_font ocrp_fsize");
  }
  AppendString(
      "'/>\n"
      " </head>\n"
      " <body>\n");

  return true;
}

/**
 * Writes the closing body and html tags. Always returns true.
 */
bool TessHOcrRenderer::EndDocumentHandler() {
  AppendString(" </body>\n</html>\n");

  return true;
}

/**
 * Appends the hOCR markup of the current image (imagenum()) to the output.
 * Returns false if the API could not produce hOCR text. The returned buffer
 * is released with delete[] by the unique_ptr.
 */
bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));
  if (hocr == nullptr) {
    return false;
  }

  AppendString(hocr.get());

  return true;
}

} // namespace tesseract
