view mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp @ 46:7ee69f120f19 default tip

>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Oct 2025 17:17:30 +0200
parents b50eed0cc0ef
children
line wrap: on
line source

/**********************************************************************
 * File:        hocrrenderer.cpp
 * Description: Simple API for calling tesseract.
 * Author:      Ray Smith (original code from baseapi.cpp)
 * Author:      Stefan Weil (moved to separate file and cleaned code)
 *
 * (C) Copyright 2006, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <tesseract/baseapi.h> // for TessBaseAPI
#include <cmath>               // for std::round
#include <cstring>             // for std::strcmp
#include <locale>              // for std::locale::classic
#include <memory>              // for std::unique_ptr
#include <sstream>             // for std::stringstream
#include <tesseract/renderer.h>
#include "helpers.h"        // for copy_string
#include "tesseractclass.h" // for Tesseract

namespace tesseract {

/**
 * Returns the text orientation reported by the iterator at its current
 * position. PageIterator::Orientation also fills in a writing direction, a
 * textline order and a deskew angle; those values are discarded here.
 */
static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
  tesseract::Orientation block_orientation;
  // Mandatory out-parameters of Orientation() that this helper does not use.
  tesseract::WritingDirection ignored_direction;
  tesseract::TextlineOrder ignored_order;
  float ignored_deskew;
  it->Orientation(&block_orientation, &ignored_direction, &ignored_order,
                  &ignored_deskew);
  return block_orientation;
}

/**
 * Fits a line to the baseline at the given level, and appends its coefficients
 * to the hOCR string.
 * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
 * rotated textlines. For this reason, on textlines that are not upright, this
 * method currently only inserts a 'textangle' property to indicate the rotation
 * direction and does not add any baseline information to the hocr string.
 */
static void AddBaselineCoordsTohOCR(const PageIterator *it,
                                    PageIteratorLevel level,
                                    std::stringstream &hocr_str) {
  tesseract::Orientation orientation = GetBlockTextOrientation(it);
  if (orientation != ORIENTATION_PAGE_UP) {
    hocr_str << "; textangle " << 360 - orientation * 90;
    return;
  }

  int left, top, right, bottom;
  it->BoundingBox(level, &left, &top, &right, &bottom);

  // Try to get the baseline coordinates at this level.
  int x1, y1, x2, y2;
  if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
    return;
  }
  // Following the description of this field of the hOCR spec, we convert the
  // baseline coordinates so that "the bottom left of the bounding box is the
  // origin".
  x1 -= left;
  x2 -= left;
  y1 -= bottom;
  y2 -= bottom;

  // Now fit a line through the points so we can extract coefficients for the
  // equation:  y = p1 x + p0
  if (x1 == x2) {
    // Problem computing the polynomial coefficients.
    return;
  }
  double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
  double p0 = y1 - p1 * x1;

  hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
           << round(p0 * 1000.0) / 1000.0;
}

/**
 * Appends a title attribute holding the bounding box of the element at the
 * given level and then closes the opening tag with '>'.
 * For RIL_TEXTLINE the attribute additionally carries the baseline (or
 * textangle) information and the x_size / x_descenders / x_ascenders row
 * measures.
 */
static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
                         std::stringstream &hocr_str) {
  int x0, y0, x1, y1;
  it->BoundingBox(level, &x0, &y0, &x1, &y1);
  // This is the only place where double quotes are used instead of single
  // quotes, but it may be too late to change that for consistency.
  hocr_str << " title=\"bbox " << x0;
  hocr_str << ' ' << y0;
  hocr_str << ' ' << x1;
  hocr_str << ' ' << y1;

  // Baseline coordinates and height measures are emitted for textlines only.
  const bool is_textline = (level == RIL_TEXTLINE);
  if (is_textline) {
    AddBaselineCoordsTohOCR(it, level, hocr_str);
    // Custom height measures taken from the row attributes.
    float height, descender_size, ascender_size;
    it->RowAttributes(&height, &descender_size, &ascender_size);
    // TODO(rays): Do we want to limit these to a single decimal place?
    hocr_str << "; x_size " << height;
    hocr_str << "; x_descenders " << -descender_size;
    hocr_str << "; x_ascenders " << ascender_size;
  }

  // Terminate the attribute value and the opening tag.
  hocr_str << "\">";
}

/**
 * Make a HTML-formatted string with hOCR markup from the internal
 * data structures.
 * page_number is 0-based but will appear in the output as 1-based.
 * Image name/input_file_ can be set by SetInputName before calling
 * GetHOCRText
 * STL removed from original patch submission and refactored by rays.
 * Returned string must be freed with the delete [] operator.
 */
char *TessBaseAPI::GetHOCRText(int page_number) {
  return GetHOCRText(nullptr, page_number);
}

/**
 * Make a HTML-formatted string with hOCR markup from the internal
 * data structures.
 *
 * The result is a single <div class='ocr_page'> element. Inside it, image
 * blocks become ocr_photo, line blocks become ocr_separator, and text is
 * written as nested ocr_carea / ocr_par / line / ocrx_word elements.
 * Per-symbol boxes are added when the "hocr_char_boxes" variable is set, font
 * name and size when "hocr_font_info" is set, and LSTM alternatives when
 * tesseract_->lstm_choice_mode is non-zero.
 *
 * monitor is handed to Recognize() if no recognition result exists yet.
 * page_number is 0-based but will appear in the element ids as 1-based
 * (ppageno keeps the 0-based value).
 * Image name/input_file_ can be set by SetInputName before calling
 * GetHOCRText.
 *
 * Returns nullptr if there is no engine, if recognition fails, or if no
 * result iterator is available.
 * Returned string must be freed with the delete [] operator.
 */
char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
  // Recognize on demand; bail out when there is no engine or it fails.
  if (tesseract_ == nullptr ||
      (page_res_ == nullptr && Recognize(monitor) < 0)) {
    return nullptr;
  }

  // 1-based counters used to build element ids: line, block, paragraph, word,
  // symbol, timestep and choice. tcnt and ccnt restart with every word, the
  // others keep counting across the whole page.
  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
  bool para_is_ltr = true;       // Default direction is LTR
  const char *paragraph_lang = nullptr;
  bool font_info = false;
  bool hocr_boxes = false;
  GetBoolVariable("hocr_font_info", &font_info);
  GetBoolVariable("hocr_char_boxes", &hocr_boxes);

  if (input_file_.empty()) {
    SetInputName(nullptr);
  }

  std::stringstream hocr_str;
  // Use "C" locale (needed for double values x_size and x_descenders).
  hocr_str.imbue(std::locale::classic());
  // Use 8 digits for double values.
  hocr_str.precision(8);
  hocr_str << "  <div class='ocr_page'"
           << " id='"
           << "page_" << page_id << "'"
           << " title='image \"";
  if (!input_file_.empty()) {
    hocr_str << HOcrEscape(input_file_.c_str());
  } else {
    hocr_str << "unknown";
  }

  // hOCR defines bbox as "x0 y0 x1 y1", so the right and bottom edges are
  // derived from the rectangle origin plus its size. Writing the bare width
  // and height is only correct while the rectangle starts at (0, 0).
  // NOTE(review): scan_res repeats the Y resolution for both axes; no X
  // resolution accessor is used anywhere in this file.
  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
           << (rect_left_ + rect_width_) << " " << (rect_top_ + rect_height_)
           << "; ppageno " << page_number
           << "; scan_res " << GetSourceYResolution() << " "
           << GetSourceYResolution() << "'>\n";

  std::unique_ptr<ResultIterator> res_it(GetIterator());
  if (res_it == nullptr) {
    // Without an iterator there is nothing to walk; report failure the same
    // way as above instead of dereferencing a null pointer.
    return nullptr;
  }
  // Each pass of this loop either emits one non-text block or one word.
  while (!res_it->Empty(RIL_BLOCK)) {
    int left, top, right, bottom;
    auto block_type = res_it->BlockType();
    switch (block_type) {
      case PT_FLOWING_IMAGE:
      case PT_HEADING_IMAGE:
      case PT_PULLOUT_IMAGE: {
        // Handle all kinds of images.
        // NOTE(review): the box is queried at RIL_TEXTLINE level even though
        // this is a non-text block - confirm that this is intended.
        res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
        hocr_str << "   <div class='ocr_photo' id='block_" << page_id << '_'
                 << bcnt++ << "' title=\"bbox " << left << " " << top << " "
                 << right << " " << bottom << "\"></div>\n";
        res_it->Next(RIL_BLOCK);
        continue;
      }
      case PT_HORZ_LINE:
      case PT_VERT_LINE:
        // Handle horizontal and vertical lines.
        res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
        hocr_str << "   <div class='ocr_separator' id='block_" << page_id << '_'
                 << bcnt++ << "' title=\"bbox " << left << " " << top << " "
                 << right << " " << bottom << "\"></div>\n";
        res_it->Next(RIL_BLOCK);
        continue;
      case PT_NOISE:
        tprintf("TODO: Please report image which triggers the noise case.\n");
        ASSERT_HOST(false);
        // Explicit break (as in the switch further down) instead of silently
        // falling through into the default label.
        break;
      default:
        break;
    }

    // Skip positions that hold no word.
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }

    // Open any new block/paragraph/textline.
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      para_is_ltr = true; // reset to default direction
      hocr_str << "   <div class='ocr_carea'"
               << " id='"
               << "block_" << page_id << "_" << bcnt << "'";
      AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
    }
    if (res_it->IsAtBeginningOf(RIL_PARA)) {
      hocr_str << "\n    <p class='ocr_par'";
      para_is_ltr = res_it->ParagraphIsLtr();
      if (!para_is_ltr) {
        hocr_str << " dir='rtl'";
      }
      hocr_str << " id='"
               << "par_" << page_id << "_" << pcnt << "'";
      // The language of the first word is used as the paragraph language;
      // words only get their own lang attribute when they differ from it.
      paragraph_lang = res_it->WordRecognitionLanguage();
      if (paragraph_lang) {
        hocr_str << " lang='" << paragraph_lang << "'";
      }
      AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
    }
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      hocr_str << "\n     <span class='";
      // The class of the line element depends on the type of its block.
      switch (block_type) {
        case PT_HEADING_TEXT:
          hocr_str << "ocr_header";
          break;
        case PT_PULLOUT_TEXT:
          hocr_str << "ocr_textfloat";
          break;
        case PT_CAPTION_TEXT:
          hocr_str << "ocr_caption";
          break;
        case PT_FLOWING_IMAGE:
        case PT_HEADING_IMAGE:
        case PT_PULLOUT_IMAGE:
          // Image blocks were already consumed by the first switch.
          ASSERT_HOST(false);
          break;
        default:
          hocr_str << "ocr_line";
      }
      hocr_str << "' id='"
               << "line_" << page_id << "_" << lcnt << "'";
      AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
    }

    // Now, process the word...
    int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
    // Word-level LSTM data; only consumed below when hocr_boxes is off.
    std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
        *rawTimestepMap = nullptr;
    std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
    if (lstm_choice_mode) {
      CTCMap = res_it->GetBestLSTMSymbolChoices();
      rawTimestepMap = res_it->GetRawLSTMTimesteps();
    }
    hocr_str << "\n      <span class='ocrx_word'"
             << " id='"
             << "word_" << page_id << "_" << wcnt << "'";
    bool bold, italic, underlined, monospace, serif, smallcaps;
    int pointsize, font_id;
    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
    const char *font_name =
        res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
                                   &serif, &smallcaps, &pointsize, &font_id);
    hocr_str << " title='bbox " << left << " " << top << " " << right << " "
             << bottom << "; x_wconf "
             << static_cast<int>(res_it->Confidence(RIL_WORD));
    if (font_info) {
      if (font_name) {
        hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
      }
      hocr_str << "; x_fsize " << pointsize;
    }
    hocr_str << "'";
    // Only emit the word language if it differs from the paragraph language.
    const char *lang = res_it->WordRecognitionLanguage();
    if (lang && (!paragraph_lang || std::strcmp(lang, paragraph_lang))) {
      hocr_str << " lang='" << lang << "'";
    }
    switch (res_it->WordDirection()) {
      // Only emit direction if different from current paragraph direction
      case DIR_LEFT_TO_RIGHT:
        if (!para_is_ltr) {
          hocr_str << " dir='ltr'";
        }
        break;
      case DIR_RIGHT_TO_LEFT:
        if (para_is_ltr) {
          hocr_str << " dir='rtl'";
        }
        break;
      case DIR_MIX:
      case DIR_NEUTRAL:
      default: // Do nothing.
        break;
    }
    hocr_str << ">";
    // Sample the "last element" flags now: the symbol loop below advances the
    // iterator past this word.
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    if (bold) {
      hocr_str << "<strong>";
    }
    if (italic) {
      hocr_str << "<em>";
    }
    // Emit the word symbol by symbol until the iterator reaches the next word
    // or runs out of blocks.
    do {
      const std::unique_ptr<const char[]> grapheme(
          res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != 0) {
        if (hocr_boxes) {
          res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
          hocr_str << "\n       <span class='ocrx_cinfo' title='x_bboxes "
                   << left << " " << top << " " << right << " " << bottom
                   << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
        }
        hocr_str << HOcrEscape(grapheme.get()).c_str();
        if (hocr_boxes) {
          hocr_str << "</span>";
          tesseract::ChoiceIterator ci(*res_it);
          if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
            // Mode 1: list every timestep of this symbol with its choices.
            std::vector<std::vector<std::pair<const char *, float>>> *symbol =
                ci.Timesteps();
            hocr_str << "\n        <span class='ocr_symbol'"
                     << " id='"
                     << "symbol_" << page_id << "_" << wcnt << "_" << scnt
                     << "'>";
            for (const auto &timestep : *symbol) {
              // NOTE(review): unlike the other ids, "timestep" has no
              // trailing underscore; left as is to keep the output stable.
              hocr_str << "\n         <span class='ocrx_cinfo'"
                       << " id='"
                       << "timestep" << page_id << "_" << wcnt << "_" << tcnt
                       << "'>";
              for (const auto &conf : timestep) {
                hocr_str << "\n          <span class='ocrx_cinfo'"
                         << " id='"
                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt
                         << "'"
                         << " title='x_confs "
                         << static_cast<int>(conf.second * 100) << "'>"
                         << HOcrEscape(conf.first).c_str() << "</span>";
                ++ccnt;
              }
              hocr_str << "</span>";
              ++tcnt;
            }
            hocr_str << "\n        </span>";
            ++scnt;
          } else if (lstm_choice_mode == 2) {
            // Mode 2: list the alternative choices for this symbol.
            hocr_str << "\n        <span class='ocrx_cinfo'"
                     << " id='"
                     << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
                     << "'>";
            do {
              const char *choice = ci.GetUTF8Text();
              float choiceconf = ci.Confidence();
              if (choice != nullptr) {
                hocr_str << "\n         <span class='ocrx_cinfo'"
                         << " id='"
                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt
                         << "'"
                         << " title='x_confs " << choiceconf << "'>"
                         << HOcrEscape(choice).c_str() << "</span>";
                ccnt++;
              }
            } while (ci.Next());
            hocr_str << "\n        </span>";
            tcnt++;
          }
        }
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
    if (italic) {
      hocr_str << "</em>";
    }
    if (bold) {
      hocr_str << "</strong>";
    }
    // If the lstm choice mode is required it is added here
    // (word-level variant, used when no per-symbol boxes were written).
    if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
      for (const auto &symbol : *rawTimestepMap) {
        hocr_str << "\n       <span class='ocr_symbol'"
                 << " id='"
                 << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
        for (const auto &timestep : symbol) {
          hocr_str << "\n        <span class='ocrx_cinfo'"
                   << " id='"
                   << "timestep" << page_id << "_" << wcnt << "_" << tcnt
                   << "'>";
          for (const auto &conf : timestep) {
            hocr_str << "\n         <span class='ocrx_cinfo'"
                     << " id='"
                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt
                     << "'"
                     << " title='x_confs "
                     << static_cast<int>(conf.second * 100) << "'>"
                     << HOcrEscape(conf.first).c_str() << "</span>";
            ++ccnt;
          }
          hocr_str << "</span>";
          ++tcnt;
        }
        hocr_str << "</span>";
        ++scnt;
      }
    } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
      for (const auto &timestep : *CTCMap) {
        if (timestep.size() > 0) {
          hocr_str << "\n       <span class='ocrx_cinfo'"
                   << " id='"
                   << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
                   << "'>";
          for (auto &j : timestep) {
            // Turn the rating into a confidence and clamp it to [0, 100].
            float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
            if (conf < 0.0f) {
              conf = 0.0f;
            }
            if (conf > 100.0f) {
              conf = 100.0f;
            }
            hocr_str << "\n        <span class='ocrx_cinfo'"
                     << " id='"
                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt
                     << "'"
                     << " title='x_confs " << conf << "'>"
                     << HOcrEscape(j.first).c_str() << "</span>";
            ccnt++;
          }
          hocr_str << "</span>";
          tcnt++;
        }
      }
    }
    // Close ocrx_word.
    if (hocr_boxes || lstm_choice_mode > 0) {
      hocr_str << "\n      ";
    }
    hocr_str << "</span>";
    // Timestep and choice numbering restarts with every word.
    tcnt = 1;
    ccnt = 1;
    wcnt++;
    // Close any ending block/paragraph/textline.
    if (last_word_in_line) {
      hocr_str << "\n     </span>";
      lcnt++;
    }
    if (last_word_in_para) {
      hocr_str << "\n    </p>\n";
      pcnt++;
      para_is_ltr = true; // back to default direction
    }
    if (last_word_in_block) {
      hocr_str << "   </div>\n";
      bcnt++;
    }
  }
  // Close ocr_page.
  hocr_str << "  </div>\n";

  return copy_string(hocr_str.str());
}

/**********************************************************************
 * HOcr Text Renderer interface implementation
 **********************************************************************/
// Creates an hOCR renderer with font information disabled, by delegating to
// the two-argument constructor.
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
    : TessHOcrRenderer(outputbase, false) {}

// Creates an hOCR renderer writing to outputbase with the "hocr" extension.
// font_info selects whether the document header advertises the ocrp_font and
// ocrp_fsize capabilities (see BeginDocumentHandler).
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr"), font_info_(font_info) {}

bool TessHOcrRenderer::BeginDocumentHandler() {
  AppendString(
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
      "    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
      "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
      "lang=\"en\">\n <head>\n  <title>");
  AppendString(title());
  AppendString(
      "</title>\n"
      "  <meta http-equiv=\"Content-Type\" content=\"text/html;"
      "charset=utf-8\"/>\n"
      "  <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
      "' />\n"
      "  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
      " ocr_line ocrx_word ocrp_dir ocrp_lang ocrp_wconf");
  if (font_info_) {
    AppendString(" ocrp_font ocrp_fsize");
  }
  AppendString(
      "'/>\n"
      " </head>\n"
      " <body>\n");

  return true;
}

// Writes the closing </body> and </html> tags. Always returns true.
bool TessHOcrRenderer::EndDocumentHandler() {
  static constexpr char kDocumentTail[] = " </body>\n</html>\n";
  AppendString(kDocumentTail);
  return true;
}

// Appends the hOCR markup of the current image (imagenum()) to the output.
// Returns false, appending nothing, when the API yields no markup.
bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
  // GetHOCRText returns a buffer that must be released with delete [];
  // the array form of unique_ptr takes care of that.
  const std::unique_ptr<const char[]> page_markup(
      api->GetHOCRText(imagenum()));
  const bool have_markup = (page_markup != nullptr);
  if (have_markup) {
    AppendString(page_markup.get());
  }
  return have_markup;
}

} // namespace tesseract