Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/tesseract/src/training/pango/stringrenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/pango/stringrenderer.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,898 @@
+/**********************************************************************
+ * File:        stringrenderer.cpp
+ * Description: Class for rendering UTF-8 text to an image, and retrieving
+ *              bounding boxes around each grapheme cluster.
+ * Author:      Ranjith Unnikrishnan
+ *
+ * (C) Copyright 2013, Google Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+
+#include "stringrenderer.h"
+
+#include <allheaders.h> // from leptonica
+#include "boxchar.h"
+#include "helpers.h" // for TRand
+#include "ligature_table.h"
+#include "normstrngs.h"
+#include "tlog.h"
+
+#include <tesseract/unichar.h>
+
+#include "pango/pango-font.h"
+#include "pango/pango-glyph-item.h"
+#include "unicode/uchar.h" // from libicu
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <utility>
+#include <vector>
+
+// Expands to nothing in this file. It is written as a statement at the top of
+// the scopes below that create or draw Pango layouts.
+#define DISABLE_HEAP_LEAK_CHECK
+
+namespace tesseract {
+
+// Resolution that the StringRenderer constructor passes to set_resolution().
+static const int kDefaultOutputResolution = 300;
+
+// Word joiner (U+2060) inserted after letters in ngram mode, as per
+// recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
+// hyphens and other non-alpha characters.
+// InsertWordJoiners() appends this string; ComputeClusterBoxes() drops any
+// cluster whose text equals it.
+static const char *kWordJoinerUTF8 = "\u2060";
+
+// Reports whether the ICU general category of code point 'ch' is one of the
+// three mark categories: non-spacing, enclosing or combining spacing.
+static bool IsCombiner(int ch) {
+  switch (u_charType(ch)) {
+    case U_NON_SPACING_MARK:
+    case U_ENCLOSING_MARK:
+    case U_COMBINING_SPACING_MARK:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns the UTF-8 bytes that UNICHAR produces for the code point 'ch32'.
+static std::string EncodeAsUTF8(const char32 ch32) {
+  UNICHAR encoded(ch32);
+  const char *bytes = encoded.utf8();
+  const int num_bytes = encoded.utf8_len();
+  return std::string(bytes, num_bytes);
+}
+
+// Returns true with probability 'prob'. The exact endpoints 1.0 and 0.0 are
+// answered directly, without drawing a number from 'rand'.
+static bool RandBool(const double prob, TRand *rand) {
+  const bool always = (prob == 1.0);
+  const bool never = (prob == 0.0);
+  if (always || never) {
+    return always;
+  }
+  const double draw = rand->UnsignedRand(1.0);
+  return draw < prob;
+}
+
+// Copies the pixels of a CAIRO_FORMAT_ARGB32 image surface into a newly created
+// 32 bpp pix and returns it.
+// Returns nullptr if the surface has another format, if its pixel data is not
+// available, or if the pix cannot be allocated.
+static Image CairoARGB32ToPixFormat(cairo_surface_t *surface) {
+  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
+    printf("Unexpected surface format %d\n", cairo_image_surface_get_format(surface));
+    return nullptr;
+  }
+  // Cairo requires a flush before the pixel data of an image surface is read,
+  // so that all pending drawing operations are finished.
+  cairo_surface_flush(surface);
+  const unsigned char *src = cairo_image_surface_get_data(surface);
+  if (src == nullptr) {
+    return nullptr;
+  }
+  const int width = cairo_image_surface_get_width(surface);
+  const int height = cairo_image_surface_get_height(surface);
+  const int byte_stride = cairo_image_surface_get_stride(surface);
+  Image pix = pixCreate(width, height, 32);
+  if (!pix) {
+    // Allocation failed; there is no buffer to copy into.
+    return nullptr;
+  }
+
+  // Each row is written one byte past its start. The final row copies one byte
+  // less, presumably so that the shifted write stays inside the pix buffer.
+  for (int i = 0; i < height; ++i) {
+    memcpy(reinterpret_cast<unsigned char *>(pixGetData(pix) + i * pixGetWpl(pix)) + 1,
+           src + i * byte_stride, byte_stride - ((i == height - 1) ? 1 : 0));
+  }
+  return pix;
+}
+
+// Constructs a renderer for pages of page_width x page_height using the font
+// described by 'font_desc'. Margins start at 50, underlining is off, uncovered
+// characters are dropped, and no Cairo/Pango objects exist yet.
+// resolution_ is not in the initializer list: set_resolution() assigns it in
+// the body before set_font() reads it.
+StringRenderer::StringRenderer(const std::string &font_desc, int page_width, int page_height)
+    : font_(font_desc)
+    , page_width_(page_width)
+    , page_height_(page_height)
+    , h_margin_(50)
+    , v_margin_(50)
+    , pen_color_{0.0, 0.0, 0.0}
+    , char_spacing_(0)
+    , leading_(0)
+    , vertical_text_(false)
+    , gravity_hint_strong_(false)
+    , render_fullwidth_latin_(false)
+    , underline_start_prob_(0)
+    , underline_continuation_prob_(0)
+    , underline_style_(PANGO_UNDERLINE_SINGLE)
+    , drop_uncovered_chars_(true)
+    , strip_unrenderable_words_(false)
+    , add_ligatures_(false)
+    , output_word_boxes_(false)
+    , surface_(nullptr)
+    , cr_(nullptr)
+    , layout_(nullptr)
+    , start_box_(0)
+    , page_(0)
+    , box_padding_(0)
+    , page_boxes_(nullptr)
+    , total_chars_(0)
+    , font_index_(0)
+    , last_offset_(0) {
+  set_resolution(kDefaultOutputResolution);
+  set_font(font_desc);
+}
+
+// Parses 'desc' into font_ and re-applies the current resolution to it.
+// Returns the result of the parse.
+bool StringRenderer::set_font(const std::string &desc) {
+  const bool parsed_ok = font_.ParseFontDescriptionName(desc);
+  font_.set_resolution(resolution_);
+  return parsed_ok;
+}
+
+// Stores 'resolution' in both the font and the renderer.
+void StringRenderer::set_resolution(const int resolution) {
+  font_.set_resolution(resolution);
+  resolution_ = resolution;
+}
+
+// Stores 'frac', limited to the range [0, 1], as the probability of starting
+// an underline at a word.
+void StringRenderer::set_underline_start_prob(const double frac) {
+  double limited = frac;
+  if (limited < 0.0) {
+    limited = 0.0;
+  }
+  if (1.0 < limited) {
+    limited = 1.0;
+  }
+  underline_start_prob_ = limited;
+}
+
+// Stores 'frac', limited to the range [0, 1], as the probability of extending
+// an open underline to the next word.
+void StringRenderer::set_underline_continuation_prob(const double frac) {
+  double limited = frac;
+  if (limited < 0.0) {
+    limited = 0.0;
+  }
+  if (1.0 < limited) {
+    limited = 1.0;
+  }
+  underline_continuation_prob_ = limited;
+}
+
+// Deletes all stored BoxChars and page boxes, then releases any remaining
+// Cairo/Pango objects.
+StringRenderer::~StringRenderer() {
+  ClearBoxes();
+  FreePangoCairo();
+}
+
+// Discards any previous Cairo/Pango state, then creates a page-sized ARGB32
+// surface, a Cairo context on it and a Pango layout, and finally applies the
+// layout properties.
+void StringRenderer::InitPangoCairo() {
+  FreePangoCairo();
+  surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_, page_height_);
+  cr_ = cairo_create(surface_);
+  {
+    DISABLE_HEAP_LEAK_CHECK;
+    layout_ = pango_cairo_create_layout(cr_);
+  }
+
+  // For vertical text the context's base gravity is set to EAST (and the
+  // gravity hint to STRONG when requested); the layout is then told that its
+  // context changed.
+  if (vertical_text_) {
+    PangoContext *context = pango_layout_get_context(layout_);
+    pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
+    if (gravity_hint_strong_) {
+      pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
+    }
+    pango_layout_context_changed(layout_);
+  }
+
+  SetLayoutProperties();
+}
+
+// Applies the renderer's settings to layout_: font description, resolution,
+// wrap width and mode, letter spacing, ligature font features and line spacing.
+void StringRenderer::SetLayoutProperties() {
+  std::string font_desc = font_.DescriptionName();
+  // Specify the font via a description name
+  PangoFontDescription *desc = pango_font_description_from_string(font_desc.c_str());
+  // Assign the font description to the layout
+  pango_layout_set_font_description(layout_, desc);
+  pango_font_description_free(desc); // free the description
+  pango_cairo_context_set_resolution(pango_layout_get_context(layout_), resolution_);
+
+  // The text area is the page minus the margins on both sides.
+  int max_width = page_width_ - 2 * h_margin_;
+  int max_height = page_height_ - 2 * v_margin_;
+  tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
+  if (vertical_text_) {
+    // For vertical text the wrap width is taken from the page height.
+    using std::swap;
+    swap(max_width, max_height);
+  }
+  pango_layout_set_width(layout_, max_width * PANGO_SCALE);
+  // Ultra-wide Thai strings need to wrap at char level.
+  pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR);
+
+  // Adjust character spacing
+  PangoAttrList *attr_list = pango_attr_list_new();
+  if (char_spacing_) {
+    PangoAttribute *spacing_attr = pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE);
+    // Cover the whole text: from byte 0 to the largest possible index.
+    spacing_attr->start_index = 0;
+    spacing_attr->end_index = static_cast<guint>(-1);
+    pango_attr_list_change(attr_list, spacing_attr);
+  }
+
+  if (add_ligatures_) {
+    set_features("liga, clig, dlig, hlig");
+    PangoAttribute *feature_attr = pango_attr_font_features_new(features_.c_str());
+    pango_attr_list_change(attr_list, feature_attr);
+  }
+
+  // Hand the list to the layout, then drop the local reference to it.
+  pango_layout_set_attributes(layout_, attr_list);
+  pango_attr_list_unref(attr_list);
+  // Adjust line spacing
+  if (leading_) {
+    pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
+  }
+}
+
+// Releases the layout, the Cairo context and the surface, in that order, where
+// they exist, and leaves all three members set to nullptr.
+void StringRenderer::FreePangoCairo() {
+  if (layout_ != nullptr) {
+    g_object_unref(layout_);
+  }
+  layout_ = nullptr;
+
+  if (cr_ != nullptr) {
+    cairo_destroy(cr_);
+  }
+  cr_ = nullptr;
+
+  if (surface_ != nullptr) {
+    cairo_surface_destroy(surface_);
+  }
+  surface_ = nullptr;
+}
+
+// Randomly underlines runs of whitespace-separated words of 'page_text' by
+// inserting underline attributes into the layout's attribute list. A run
+// starts at a word with probability underline_start_prob_ and is extended to
+// each following word with probability underline_continuation_prob_.
+// Does nothing when underline_start_prob_ is zero.
+void StringRenderer::SetWordUnderlineAttributes(const std::string &page_text) {
+  if (underline_start_prob_ == 0) {
+    return;
+  }
+  PangoAttrList *attrs = pango_layout_get_attributes(layout_);
+
+  const char *bytes = page_text.c_str();
+  const size_t total_len = page_text.length();
+  TRand rng;
+  // Non-null exactly while an underline run is open and not yet inserted.
+  PangoAttribute *open_run = nullptr;
+
+  size_t pos = 0;
+  while (pos < total_len) {
+    // Skip the whitespace in front of the next word.
+    pos += SpanUTF8Whitespace(bytes + pos);
+    if (pos == total_len) {
+      break;
+    }
+
+    const int word_begin = pos;
+    const int word_size = SpanUTF8NotWhitespace(bytes + pos);
+    pos += word_size;
+
+    if (open_run != nullptr) {
+      if (RandBool(underline_continuation_prob_, &rng)) {
+        // Stretch the open run over this word.
+        open_run->end_index = word_begin + word_size;
+      } else {
+        // Close the run where it currently ends (the previous word).
+        pango_attr_list_insert(attrs, open_run);
+        open_run = nullptr;
+      }
+    }
+    if (open_run == nullptr && RandBool(underline_start_prob_, &rng)) {
+      // Open a new run covering just this word.
+      open_run = pango_attr_underline_new(underline_style_);
+      open_run->start_index = word_begin;
+      open_run->end_index = word_begin + word_size;
+    }
+  }
+
+  // A run still open at the end of the page extends to the end of the text.
+  if (open_run != nullptr) {
+    open_run->end_index = total_len;
+    pango_attr_list_insert(attrs, open_run);
+  }
+}
+
+// Sets at most kMaxUnicodeBufLength code points of 'text' on the layout and
+// returns the UTF-8 byte offset at which the first page ends: the start of the
+// first line whose bottom lies more than the available height below the top of
+// the first line, or the length of the text that was laid out if every line
+// fits. Returns 0 for empty input.
+int StringRenderer::FindFirstPageBreakOffset(const char *text, int text_length) {
+  if (text_length == 0) {
+    return 0;
+  }
+  // Available extent in the direction in which lines are stacked.
+  const int max_layout_height =
+      vertical_text_ ? (page_width_ - 2 * h_margin_) : (page_height_ - 2 * v_margin_);
+
+  // Advance over the code points that will be handed to Pango.
+  const int kMaxUnicodeBufLength = 15000;
+  UNICHAR::const_iterator cursor = UNICHAR::begin(text, text_length);
+  const UNICHAR::const_iterator text_end = UNICHAR::end(text, text_length);
+  int num_chars = 0;
+  while (num_chars < kMaxUnicodeBufLength && cursor != text_end) {
+    ++cursor;
+    ++num_chars;
+  }
+  const int buf_length = cursor.utf8_data() - text;
+  tlog(1, "len = %d  buf_len = %d\n", text_length, buf_length);
+  pango_layout_set_text(layout_, text, buf_length);
+
+  PangoLayoutIter *line_iter = nullptr;
+  { // Fontconfig caches some info here that is not freed before exit.
+    DISABLE_HEAP_LEAK_CHECK;
+    line_iter = pango_layout_get_iter(layout_);
+  }
+
+  int break_offset = buf_length;
+  int page_top = 0;
+  bool page_top_known = false;
+  do {
+    // Extents of the current line, converted to pixels.
+    PangoRectangle line_rect;
+    pango_layout_iter_get_line_extents(line_iter, &line_rect, nullptr);
+    pango_extents_to_pixels(&line_rect, nullptr);
+    if (!page_top_known) {
+      // The first line defines the top of the page.
+      page_top = line_rect.y;
+      page_top_known = true;
+    }
+    const int used_height = line_rect.y + line_rect.height - page_top;
+    if (used_height > max_layout_height) {
+      break_offset = pango_layout_iter_get_line_readonly(line_iter)->start_index;
+      tlog(1, "Found offset = %d\n", break_offset);
+      break;
+    }
+  } while (pango_layout_iter_next_line(line_iter));
+  pango_layout_iter_free(line_iter);
+  return break_offset;
+}
+
+// Returns a reference to the renderer's own list of BoxChars; the pointers
+// remain owned by the renderer (ClearBoxes() deletes them).
+const std::vector<BoxChar *> &StringRenderer::GetBoxes() const {
+  return boxchars_;
+}
+
+// Returns the renderer's page_boxes_ pointer without copying it. It is nullptr
+// until ComputeClusterBoxes() has recorded a page box, and ClearBoxes()
+// destroys it.
+Boxa *StringRenderer::GetPageBoxes() const {
+  return page_boxes_;
+}
+
+// Forwards to BoxChar::RotateBoxes with 'rotation', the integer halves of the
+// page width and height, and the index range start_box_ .. boxchars_.size().
+// NOTE(review): presumably this rotates the boxes of the most recently rendered
+// page about the page centre - confirm against BoxChar::RotateBoxes.
+void StringRenderer::RotatePageBoxes(float rotation) {
+  BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2, start_box_, boxchars_.size(),
+                       &boxchars_);
+}
+
+// Deletes every stored BoxChar, empties the list and destroys page_boxes_.
+void StringRenderer::ClearBoxes() {
+  for (size_t i = 0; i < boxchars_.size(); ++i) {
+    delete boxchars_[i];
+  }
+  boxchars_.clear();
+  boxaDestroy(&page_boxes_);
+}
+
+// Passes boxchars_ through BoxChar::PrepareToWrite (which receives the list by
+// pointer) and returns the string built by BoxChar::GetTesseractBoxStr for
+// page_height_.
+std::string StringRenderer::GetBoxesStr() {
+  BoxChar::PrepareToWrite(&boxchars_);
+  return BoxChar::GetTesseractBoxStr(page_height_, boxchars_);
+}
+
+// Passes boxchars_ through BoxChar::PrepareToWrite (which receives the list by
+// pointer) and hands them, with 'filename' and page_height_, to
+// BoxChar::WriteTesseractBoxFile.
+void StringRenderer::WriteAllBoxes(const std::string &filename) {
+  BoxChar::PrepareToWrite(&boxchars_);
+  BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
+}
+
+// Fills 'cluster_text' with the text of every glyph cluster in the layout,
+// ordered by the cluster's starting byte (logical order). Whitespace clusters
+// become a single space, and ligatures are added when add_ligatures_ is set.
+// Returns true if at least one cluster was found.
+bool StringRenderer::GetClusterStrings(std::vector<std::string> *cluster_text) {
+  // Keyed by starting byte so that iterating the map yields logical order.
+  std::map<int, std::string> text_by_start;
+  PangoLayoutIter *run_iter = pango_layout_get_iter(layout_);
+  const char *layout_text = pango_layout_get_text(layout_);
+  do {
+    PangoLayoutRun *run = pango_layout_iter_get_run_readonly(run_iter);
+    if (run == nullptr) {
+      // End of line nullptr run marker
+      tlog(2, "Found end of line marker\n");
+      continue;
+    }
+    PangoGlyphItemIter glyph_iter;
+    gboolean more = pango_glyph_item_iter_init_start(&glyph_iter, run, layout_text);
+    while (more) {
+      const int first_byte = glyph_iter.start_index;
+      const int past_byte = glyph_iter.end_index;
+      std::string piece(layout_text + first_byte, past_byte - first_byte);
+      if (IsUTF8Whitespace(piece.c_str())) {
+        tlog(2, "Found whitespace\n");
+        piece = " ";
+      }
+      tlog(2, "start_byte=%d end_byte=%d : '%s'\n", first_byte, past_byte, piece.c_str());
+      if (add_ligatures_) {
+        // Make sure the output box files have ligatured text in case the font
+        // decided to use an unmapped glyph.
+        piece = LigatureTable::Get()->AddLigatures(piece, nullptr);
+      }
+      text_by_start[first_byte] = std::move(piece);
+      more = pango_glyph_item_iter_next_cluster(&glyph_iter);
+    }
+  } while (pango_layout_iter_next_run(run_iter));
+  pango_layout_iter_free(run_iter);
+
+  cluster_text->clear();
+  for (const auto &entry : text_by_start) {
+    cluster_text->push_back(entry.second);
+  }
+  return !cluster_text->empty();
+}
+
+// Merges an array of BoxChars into words based on the identification of
+// BoxChars containing the space character as inter-word separators.
+//
+// Sometime two adjacent characters in the sequence may be detected as lying on
+// different lines based on their spatial positions. This may be the result of a
+// newline character at end of the last word on a line in the source text, or of
+// a discretionary line-break created by Pango at intra-word locations like
+// hyphens. When this is detected the word is split at that location into
+// multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
+// its bounding box.
+//
+// Ownership: every input pointer is either moved into the result or deleted
+// after being merged into the preceding word, and its slot in the input vector
+// is set to nullptr. On return '*boxchars' holds the merged list.
+static void MergeBoxCharsToWords(std::vector<BoxChar *> *boxchars) {
+  std::vector<BoxChar *> result;
+  bool started_word = false;
+  for (auto &boxchar : *boxchars) {
+    // Spaces and box-less entries are passed through and end the current word.
+    if (boxchar->ch() == " " || boxchar->box() == nullptr) {
+      result.push_back(boxchar);
+      boxchar = nullptr;
+      started_word = false;
+      continue;
+    }
+
+    if (!started_word) {
+      // Begin new word
+      started_word = true;
+      result.push_back(boxchar);
+      boxchar = nullptr;
+    } else {
+      BoxChar *last_boxchar = result.back();
+      // Compute bounding box union
+      const Box *box = boxchar->box();
+      Box *last_box = last_boxchar->mutable_box();
+      int left = std::min(last_box->x, box->x);
+      int right = std::max(last_box->x + last_box->w, box->x + box->w);
+      int top = std::min(last_box->y, box->y);
+      int bottom = std::max(last_box->y + last_box->h, box->y + box->h);
+      // Conclude that the word was broken to span multiple lines based on the
+      // size of the merged bounding box in relation to those of the individual
+      // characters seen so far.
+      if (right - left > last_box->w + 5 * box->w) {
+        tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
+        // Insert a fake interword space and start a new word with the current
+        // boxchar.
+        result.push_back(new BoxChar(" ", 1));
+        result.push_back(boxchar);
+        boxchar = nullptr;
+        continue;
+      }
+      // Append to last word
+      last_boxchar->mutable_ch()->append(boxchar->ch());
+      last_box->x = left;
+      last_box->w = right - left;
+      last_box->y = top;
+      last_box->h = bottom - top;
+      // The character now lives in the merged word; free its own BoxChar.
+      delete boxchar;
+      boxchar = nullptr;
+    }
+  }
+  // Hand the merged list back; the old vector (now all nullptr) is discarded.
+  boxchars->swap(result);
+}
+
+// Builds one BoxChar per cluster of the current layout and appends the page's
+// BoxChars to boxchars_, then adds the page's overall bounding box to
+// page_boxes_.
+//
+// Steps:
+//   1. Collect the start byte of every cluster and derive each cluster's end
+//      byte from the next start in sorted order.
+//   2. Create a BoxChar per cluster: a box-less " " for whitespace or empty
+//      extents, otherwise the cluster text with its (optionally padded) box.
+//   3. Replace the cluster text with the text reported by GetClusterStrings().
+//   4. Drop word-joiner clusters, move the boxes to page coordinates, optionally
+//      convert fullwidth Latin back to Basic Latin and merge boxes into words.
+//   5. Append the result to boxchars_ and record the page bounding box.
+void StringRenderer::ComputeClusterBoxes() {
+  const char *text = pango_layout_get_text(layout_);
+  PangoLayoutIter *cluster_iter = pango_layout_get_iter(layout_);
+
+  // Do a first pass to store cluster start indexes.
+  std::vector<int> cluster_start_indices;
+  do {
+    cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
+    tlog(3, "Added %d\n", cluster_start_indices.back());
+  } while (pango_layout_iter_next_cluster(cluster_iter));
+  pango_layout_iter_free(cluster_iter);
+  cluster_start_indices.push_back(strlen(text));
+  tlog(3, "Added last index %d\n", cluster_start_indices.back());
+  // Sort the indices and create a map from start to end indices.
+  std::sort(cluster_start_indices.begin(), cluster_start_indices.end());
+  std::map<int, int> cluster_start_to_end_index;
+  for (size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) {
+    cluster_start_to_end_index[cluster_start_indices[i]] = cluster_start_indices[i + 1];
+  }
+
+  // Iterate again to compute cluster boxes and their text with the obtained
+  // cluster extent information.
+  cluster_iter = pango_layout_get_iter(layout_);
+  // Store BoxChars* sorted by their byte start positions
+  std::map<int, BoxChar *> start_byte_to_box;
+  do {
+    PangoRectangle cluster_rect;
+    pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr);
+    pango_extents_to_pixels(&cluster_rect, nullptr);
+    const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
+    const int end_byte_index = cluster_start_to_end_index[start_byte_index];
+    std::string cluster_text =
+        std::string(text + start_byte_index, end_byte_index - start_byte_index);
+    if (!cluster_text.empty() && cluster_text[0] == '\n') {
+      tlog(2, "Skipping newlines at start of text.\n");
+      continue;
+    }
+    if (!cluster_rect.width || !cluster_rect.height || IsUTF8Whitespace(cluster_text.c_str())) {
+      tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n", cluster_rect.width,
+           cluster_rect.height, cluster_text.c_str());
+      // Whitespace is represented by a " " BoxChar that has no box.
+      auto *boxchar = new BoxChar(" ", 1);
+      boxchar->set_page(page_);
+      start_byte_to_box[start_byte_index] = boxchar;
+      continue;
+    }
+    // Prepare a boxchar for addition at this byte position.
+    tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n", cluster_rect.x, cluster_rect.y,
+         cluster_rect.width, cluster_rect.height, start_byte_index, end_byte_index,
+         cluster_text.c_str());
+    ASSERT_HOST_MSG(cluster_rect.width, "cluster_text:%s  start_byte_index:%d\n",
+                    cluster_text.c_str(), start_byte_index);
+    ASSERT_HOST_MSG(cluster_rect.height, "cluster_text:%s  start_byte_index:%d\n",
+                    cluster_text.c_str(), start_byte_index);
+    if (box_padding_) {
+      // Grow the box by the padding on every side, clamping the origin at 0.
+      cluster_rect.x = std::max(0, cluster_rect.x - box_padding_);
+      cluster_rect.width += 2 * box_padding_;
+      cluster_rect.y = std::max(0, cluster_rect.y - box_padding_);
+      cluster_rect.height += 2 * box_padding_;
+    }
+    if (add_ligatures_) {
+      // Make sure the output box files have ligatured text in case the font
+      // decided to use an unmapped glyph.
+      cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr);
+    }
+    auto *boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
+    boxchar->set_page(page_);
+    boxchar->AddBox(cluster_rect.x, cluster_rect.y, cluster_rect.width, cluster_rect.height);
+    start_byte_to_box[start_byte_index] = boxchar;
+  } while (pango_layout_iter_next_cluster(cluster_iter));
+  pango_layout_iter_free(cluster_iter);
+
+  // There is a subtle bug in the cluster text reported by the PangoLayoutIter
+  // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
+  // around this, we use text reported using the PangoGlyphIter which is
+  // accurate.
+  // TODO(ranjith): Revisit whether this is still needed in newer versions of
+  // pango.
+  std::vector<std::string> cluster_text;
+  if (GetClusterStrings(&cluster_text)) {
+    ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
+    int ind = 0;
+    for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it, ++ind) {
+      it->second->mutable_ch()->swap(cluster_text[ind]);
+    }
+  }
+
+  // Append to the boxchars list in byte order.
+  std::vector<BoxChar *> page_boxchars;
+  page_boxchars.reserve(start_byte_to_box.size());
+  for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it) {
+    if (it->second->ch() == kWordJoinerUTF8) {
+      // Skip word joiner characters (U+2060) here. The deleted pointer stays in
+      // start_byte_to_box, so the map must not be dereferenced after this loop.
+      delete it->second;
+    } else {
+      page_boxchars.push_back(it->second);
+    }
+  }
+  CorrectBoxPositionsToLayout(&page_boxchars);
+
+  if (render_fullwidth_latin_) {
+    // Iterate the surviving boxchars only: start_byte_to_box may still hold
+    // pointers to word-joiner boxchars that were deleted above.
+    for (BoxChar *page_boxchar : page_boxchars) {
+      // Convert fullwidth Latin characters to their halfwidth forms.
+      std::string half(ConvertFullwidthLatinToBasicLatin(page_boxchar->ch()));
+      page_boxchar->mutable_ch()->swap(half);
+    }
+  }
+
+  // Merge the character boxes into word boxes if we are rendering n-grams.
+  if (output_word_boxes_) {
+    MergeBoxCharsToWords(&page_boxchars);
+  }
+
+  boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
+
+  // Compute the page bounding box
+  Box *page_box = nullptr;
+  Boxa *all_boxes = nullptr;
+  for (auto &page_boxchar : page_boxchars) {
+    if (page_boxchar->box() == nullptr) {
+      continue;
+    }
+    if (all_boxes == nullptr) {
+      all_boxes = boxaCreate(0);
+    }
+    boxaAddBox(all_boxes, page_boxchar->mutable_box(), L_CLONE);
+  }
+  if (all_boxes != nullptr) {
+    boxaGetExtent(all_boxes, nullptr, nullptr, &page_box);
+    boxaDestroy(&all_boxes);
+    if (page_boxes_ == nullptr) {
+      page_boxes_ = boxaCreate(0);
+    }
+    boxaAddBox(page_boxes_, page_box, L_INSERT);
+  }
+}
+
+// Moves 'boxchars' from layout coordinates to page coordinates. Horizontal text
+// is translated by the margins. Vertical text is translated to
+// (page_width_ - h_margin_, v_margin_) and then passed to BoxChar::RotateBoxes
+// with the negated rotation of the layout's base gravity and that same point.
+void StringRenderer::CorrectBoxPositionsToLayout(std::vector<BoxChar *> *boxchars) {
+  if (!vertical_text_) {
+    BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);
+    return;
+  }
+  const PangoGravity gravity = pango_context_get_base_gravity(pango_layout_get_context(layout_));
+  const double rotation = -pango_gravity_to_rotation(gravity);
+  const int origin_x = page_width_ - h_margin_;
+  BoxChar::TranslateBoxes(origin_x, v_margin_, boxchars);
+  BoxChar::RotateBoxes(rotation, origin_x, v_margin_, 0, boxchars->size(), boxchars);
+}
+
+// Removes from '*utf8_text' every whitespace-delimited word that font_ cannot
+// render, keeping all whitespace. Prints the dropped words if there are any
+// and returns how many words were dropped.
+int StringRenderer::StripUnrenderableWords(std::string *utf8_text) const {
+  const char *src = utf8_text->c_str();
+  std::string kept;
+  std::string dropped_words;
+  int dropped = 0;
+  for (size_t pos = 0; pos < utf8_text->length();) {
+    // Whitespace is always copied through.
+    const int gap_len = SpanUTF8Whitespace(src + pos);
+    kept.append(src + pos, gap_len);
+    pos += gap_len;
+    if (pos == utf8_text->length()) {
+      break;
+    }
+
+    const char *word = src + pos;
+    const int word_len = SpanUTF8NotWhitespace(word);
+    if (!font_.CanRenderString(word, word_len)) {
+      ++dropped;
+      dropped_words.append(word, word_len);
+      dropped_words.append(" ");
+    } else {
+      kept.append(word, word_len);
+    }
+    pos += word_len;
+  }
+  utf8_text->swap(kept);
+
+  if (dropped > 0) {
+    tprintf("Stripped %d unrenderable word(s): '%s'\n", dropped, dropped_words.c_str());
+  }
+  return dropped;
+}
+
+// Renders as RenderToImage() and, if an image was produced, stores its 8 bpp
+// conversion in '*pix'. '*pix' is left untouched when nothing was rendered.
+// Returns the offset reported by RenderToImage().
+int StringRenderer::RenderToGrayscaleImage(const char *text, int text_length, Image *pix) {
+  Image rendered = nullptr;
+  const int offset = RenderToImage(text, text_length, &rendered);
+  if (!rendered) {
+    return offset;
+  }
+  *pix = pixConvertTo8(rendered, false);
+  rendered.destroy();
+  return offset;
+}
+
+// Renders as RenderToImage() and stores in '*pix' the result converted to
+// 8 bpp and thresholded to binary at 'threshold'. When nothing was rendered,
+// '*pix' receives the empty image. Returns the offset reported by
+// RenderToImage().
+int StringRenderer::RenderToBinaryImage(const char *text, int text_length, int threshold,
+                                        Image *pix) {
+  Image rendered = nullptr;
+  const int offset = RenderToImage(text, text_length, &rendered);
+  if (!rendered) {
+    *pix = rendered;
+    return offset;
+  }
+  Image gray = pixConvertTo8(rendered, false);
+  rendered.destroy();
+  *pix = pixThresholdToBinary(gray, threshold);
+  gray.destroy();
+  return offset;
+}
+
+// Returns a copy of 'text' with a word joiner (WJ) appended after every
+// character that is neither a space nor a newline, unless that character is
+// the last one or is followed by a space or by a combiner.
+/* static */
+std::string StringRenderer::InsertWordJoiners(const std::string &text) {
+  std::string joined;
+  const UNICHAR::const_iterator text_end = UNICHAR::end(text.c_str(), text.length());
+  UNICHAR::const_iterator cur = UNICHAR::begin(text.c_str(), text.length());
+  for (; cur != text_end; ++cur) {
+    // Copy the current symbol through unchanged.
+    joined.append(cur.utf8_data(), cur.utf8_len());
+    // Look at the symbol that follows it.
+    UNICHAR::const_iterator peek = cur;
+    ++peek;
+    const bool followed_by_boundary = (peek == text_end || *peek == ' ');
+    const bool followed_by_combiner = (peek == text_end) ? false : IsCombiner(*peek);
+    if (*cur == ' ' || *cur == '\n') {
+      continue;
+    }
+    if (followed_by_boundary || followed_by_combiner) {
+      continue;
+    }
+    joined += kWordJoinerUTF8;
+  }
+  return joined;
+}
+
+// Returns a copy of 'str' in which every printable, non-space 7-bit ASCII
+// character is replaced by its fullwidth form; everything else is copied
+// unchanged.
+std::string StringRenderer::ConvertBasicLatinToFullwidthLatin(const std::string &str) {
+  std::string widened;
+  const UNICHAR::const_iterator str_end = UNICHAR::end(str.c_str(), str.length());
+  UNICHAR::const_iterator cur = UNICHAR::begin(str.c_str(), str.length());
+  for (; cur != str_end; ++cur) {
+    const bool convertible =
+        IsInterchangeValid7BitAscii(*cur) && isprint(*cur) && !isspace(*cur);
+    if (!convertible) {
+      widened.append(cur.utf8_data(), cur.utf8_len());
+      continue;
+    }
+    // The fullwidth form is 0xFEE0 above the 7-bit ASCII code point.
+    const char32 full_char = *cur + 0xFEE0;
+    widened.append(EncodeAsUTF8(full_char));
+  }
+  return widened;
+}
+
+// Returns a copy of 'str' in which each character is replaced by its halfwidth
+// form, but only when that form is printable, non-space 7-bit ASCII; all other
+// characters are copied unchanged.
+std::string StringRenderer::ConvertFullwidthLatinToBasicLatin(const std::string &str) {
+  std::string narrowed;
+  UNICHAR::const_iterator str_end = UNICHAR::end(str.c_str(), str.length());
+  UNICHAR::const_iterator cur = UNICHAR::begin(str.c_str(), str.length());
+  for (; cur != str_end; ++cur) {
+    const char32 half_char = FullwidthToHalfwidth(*cur);
+    const bool convertible =
+        IsInterchangeValid7BitAscii(half_char) && isprint(half_char) && !isspace(half_char);
+    if (!convertible) {
+      narrowed.append(cur.utf8_data(), cur.utf8_len());
+      continue;
+    }
+    narrowed.append(EncodeAsUTF8(half_char));
+  }
+  return narrowed;
+}
+
+// Renders the first page of 'text' and computes its boxes.
+// If 'pix' is non-null, any image it already holds is destroyed and the
+// rendered page is stored in it; if 'pix' is null only the boxes are computed.
+// Returns offset to end of text substring rendered in this method, or 0 if
+// nothing was rendered (in which case no boxes are added and the page counter
+// is not advanced).
+int StringRenderer::RenderToImage(const char *text, int text_length, Image *pix) {
+  if (pix && *pix) {
+    pix->destroy();
+  }
+  InitPangoCairo();
+
+  const int page_offset = FindFirstPageBreakOffset(text, text_length);
+  if (!page_offset) {
+    return 0;
+  }
+  // Remember where this page's boxes begin within boxchars_.
+  start_box_ = boxchars_.size();
+
+  if (!vertical_text_) {
+    // Translate by the specified margin
+    cairo_translate(cr_, h_margin_, v_margin_);
+  } else {
+    // Vertical text rendering is achieved by a two-step process of first
+    // performing regular horizontal layout with character orientation set to
+    // EAST, and then translating and rotating the layout before rendering onto
+    // the desired image surface. The settings required for the former step are
+    // done within InitPangoCairo().
+    //
+    // Translate to the top-right margin of page
+    cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
+    // Rotate the layout
+    double rotation = -pango_gravity_to_rotation(
+        pango_context_get_base_gravity(pango_layout_get_context(layout_)));
+    tlog(2, "Rotating by %f radians\n", rotation);
+    cairo_rotate(cr_, rotation);
+    pango_cairo_update_layout(cr_, layout_);
+  }
+  // The text of this page only; the transformations below may change its
+  // length, but the returned offset refers to the input text.
+  std::string page_text(text, page_offset);
+  if (render_fullwidth_latin_) {
+    // Convert Basic Latin to their fullwidth forms.
+    page_text = ConvertBasicLatinToFullwidthLatin(page_text);
+  }
+  if (strip_unrenderable_words_) {
+    StripUnrenderableWords(&page_text);
+  }
+  if (drop_uncovered_chars_ && !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
+    int num_dropped = font_.DropUncoveredChars(&page_text);
+    if (num_dropped) {
+      tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
+    }
+  }
+  if (add_ligatures_) {
+    // Add ligatures wherever possible, including custom ligatures.
+    page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
+  }
+  if (underline_start_prob_ > 0) {
+    SetWordUnderlineAttributes(page_text);
+  }
+
+  // Replace the text set by FindFirstPageBreakOffset() with the final page text.
+  pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
+
+  if (pix) {
+    // Set a white background for the target image surface.
+    cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white
+    // Fill the surface with the active colour (if you don't do this, you will
+    // be given a surface with a transparent background to draw on)
+    cairo_paint(cr_);
+    // Set the ink color to the configured pen color
+    cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
+    // If the target surface or transformation properties of the cairo instance
+    // have changed, update the pango layout to reflect this
+    pango_cairo_update_layout(cr_, layout_);
+    {
+      DISABLE_HEAP_LEAK_CHECK; // for Fontconfig
+      // Draw the pango layout onto the cairo surface
+      pango_cairo_show_layout(cr_, layout_);
+    }
+    *pix = CairoARGB32ToPixFormat(surface_);
+  }
+  ComputeClusterBoxes();
+  FreePangoCairo();
+  // Update internal state variables.
+  ++page_;
+  return page_offset;
+}
+
+// Render a string to an image, returning it as an 8 bit pix.  Behaves as
+// RenderString, except that it ignores the font set at construction and works
+// through all the fonts, returning 0 until they are exhausted, at which point
+// it returns the value it should have returned all along, but no pix this time.
+// Fonts that don't contain a given proportion of the characters in the string
+// get skipped.
+// Fonts that work each get rendered and the font name gets added
+// to the image.
+// NOTE that no boxes are produced by this function.
+//
+// Once the fonts are exhausted the return value is the last non-zero offset
+// produced by a successful render, or -1 if there was none. If 'font_used' is
+// non-null it is cleared on entry and receives the name of the font rendered
+// by this call when that render produced a non-zero offset.
+//
+// Example usage: To render a null terminated char-array "txt"
+//
+// int offset = 0;
+// do {
+//   Image pix;
+//   offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset,
+//                                            strlen(txt + offset), nullptr,
+//                                            &pix);
+//   ...
+// } while (offset < strlen(txt));
+//
+int StringRenderer::RenderAllFontsToImage(double min_coverage, const char *text, int text_length,
+                                          std::string *font_used, Image *image) {
+  *image = nullptr;
+  // Select a suitable font to render the title with.
+  const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
+  std::string title_font;
+  if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate), &title_font, nullptr)) {
+    tprintf("WARNING: Could not find a font to render image title with!\n");
+    title_font = "Arial";
+  }
+  title_font += " 8";
+  tlog(1, "Selected title font: %s\n", title_font.c_str());
+  if (font_used) {
+    font_used->clear();
+  }
+
+  std::string orig_font = font_.DescriptionName();
+  // The character histogram is built on the first call of a cycle and reused
+  // until the font list has been exhausted.
+  if (char_map_.empty()) {
+    total_chars_ = 0;
+    // Fill the hash table and use that for computing which fonts to use.
+    for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
+         it != UNICHAR::end(text, text_length); ++it) {
+      ++total_chars_;
+      ++char_map_[*it];
+    }
+    tprintf("Total chars = %d\n", total_chars_);
+  }
+  const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts();
+
+  // Resume from the font after the one handled by the previous call.
+  for (size_t i = font_index_; i < all_fonts.size(); ++i) {
+    ++font_index_;
+    int raw_score = 0;
+    int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr);
+    if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
+      set_font(all_fonts[i]);
+      int offset = RenderToBinaryImage(text, text_length, 128, image);
+      ClearBoxes(); // Get rid of them as they are garbage.
+      const int kMaxTitleLength = 1024;
+      char title[kMaxTitleLength];
+      snprintf(title, kMaxTitleLength, kTitleTemplate, all_fonts[i].c_str(), ok_chars,
+               100.0 * ok_chars / total_chars_, raw_score, 100.0 * raw_score / char_map_.size());
+      tprintf("%s\n", title);
+      // This is a good font! Store the offset to return once we've tried all
+      // the fonts.
+      if (offset) {
+        last_offset_ = offset;
+        if (font_used) {
+          *font_used = all_fonts[i];
+        }
+      }
+      // Add the font to the image.
+      set_font(title_font);
+      // Render the title with a reduced vertical margin, then restore the exact
+      // original margin. Dividing by 8 and multiplying back would lose the
+      // remainder of the integer division (e.g. 50 -> 6 -> 48).
+      const auto saved_v_margin = v_margin_;
+      v_margin_ /= 8;
+      Image title_image = nullptr;
+      RenderToBinaryImage(title, strlen(title), 128, &title_image);
+      *image |= title_image;
+      title_image.destroy();
+
+      v_margin_ = saved_v_margin;
+      set_font(orig_font);
+      // We return the real offset only after cycling through the list of fonts.
+      return 0;
+    } else {
+      tprintf("Font %s failed with %d hits = %.2f%%\n", all_fonts[i].c_str(), ok_chars,
+              100.0 * ok_chars / total_chars_);
+    }
+  }
+  // All fonts have been tried: reset the cycle state for the next text.
+  font_index_ = 0;
+  char_map_.clear();
+  return last_offset_ == 0 ? -1 : last_offset_;
+}
+
+} // namespace tesseract
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children