diff mupdf-source/thirdparty/tesseract/src/training/pango/stringrenderer.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/pango/stringrenderer.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,229 @@
+/**********************************************************************
+ * File:        stringrenderer.h
+ * Description: Class for rendering UTF-8 text to an image, and retrieving
+ *              bounding boxes around each grapheme cluster.
+ *
+ *              Instances are created using a font description string
+ *              (eg. "Arial Italic 12"; see pango_font_info.h for the format)
+ *              and the page dimensions. Other renderer properties such as
+ *              spacing, ligaturization, as well as preprocessing behavior such
+ *              as removal of unrenderable words and a special n-gram mode may
+ *              be set using respective set_* methods.
+ *
+ * Author:      Ranjith Unnikrishnan
+ *
+ * (C) Copyright 2013, Google Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_TRAINING_STRINGRENDERER_H_
+#define TESSERACT_TRAINING_STRINGRENDERER_H_
+
+#include "export.h"
+
+#include "pango/pango-layout.h"
+#include "pango/pangocairo.h"
+#include "pango_font_info.h"
+
+#include "image.h"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+struct Boxa;
+struct Pix;
+
+namespace tesseract {
+
+class BoxChar;
+
+// Renders UTF-8 text to an image and records bounding boxes (BoxChar) for the
+// rendered grapheme clusters. An instance is configured through the set_*
+// methods below and is non-copyable (copy construction and copy assignment
+// are deleted).
+class TESS_PANGO_TRAINING_API StringRenderer {
+public:
+  // Creates a renderer from a font description string (eg. "Arial Italic 12")
+  // and the page dimensions.
+  // NOTE(review): page_width/page_height are presumably in pixels - confirm
+  // against the implementation.
+  StringRenderer(const std::string &font_desc, int page_width, int page_height);
+  ~StringRenderer();
+
+  // Renders the text with the chosen font and returns the byte offset up to
+  // which the text could be rendered so as to fit the specified page
+  // dimensions.
+  // text_length is the length of text; presumably in bytes, given that the
+  // return value is a byte offset - confirm against the implementation.
+  int RenderToImage(const char *text, int text_length, Image *pix);
+  // NOTE(review): judging by the names, the two variants below additionally
+  // convert the output to grayscale / threshold it to a binary image; verify
+  // in the implementation file.
+  int RenderToGrayscaleImage(const char *text, int text_length, Image *pix);
+  int RenderToBinaryImage(const char *text, int text_length, int threshold, Image *pix);
+  // Renders a line of text with all available fonts that were able to render
+  // at least min_coverage fraction of the input text. Use 1.0 to require that
+  // a font be able to render all the text.
+  int RenderAllFontsToImage(double min_coverage, const char *text, int text_length,
+                            std::string *font_used, Image *pix);
+
+  // Selects the font from a description string.
+  // NOTE(review): the bool result presumably reports whether the description
+  // could be applied - confirm against the implementation.
+  bool set_font(const std::string &desc);
+  // Char spacing is in PIXELS!!!!.
+  void set_char_spacing(int char_spacing) {
+    char_spacing_ = char_spacing;
+  }
+  void set_leading(int leading) {
+    leading_ = leading;
+  }
+  void set_resolution(const int resolution);
+  void set_vertical_text(bool vertical_text) {
+    vertical_text_ = vertical_text;
+  }
+  void set_gravity_hint_strong(bool gravity_hint_strong) {
+    gravity_hint_strong_ = gravity_hint_strong;
+  }
+  void set_render_fullwidth_latin(bool render_fullwidth_latin) {
+    render_fullwidth_latin_ = render_fullwidth_latin;
+  }
+  // Sets the probability (value in [0, 1]) of starting to render a word with an
+  // underline. This implementation considers words to be space-delimited
+  // sequences of characters.
+  void set_underline_start_prob(const double frac);
+  // Set the probability (value in [0, 1]) of continuing a started underline to
+  // the next word.
+  void set_underline_continuation_prob(const double frac);
+  void set_underline_style(const PangoUnderline style) {
+    underline_style_ = style;
+  }
+  // Stores a copy of the given features string. A null pointer is treated as
+  // an empty string, because assigning a null const char * to a std::string
+  // is undefined behavior.
+  void set_features(const char *features) {
+    features_ = features != nullptr ? features : "";
+  }
+  void set_page(int page) {
+    page_ = page;
+  }
+  void set_box_padding(int val) {
+    box_padding_ = val;
+  }
+  void set_drop_uncovered_chars(bool val) {
+    drop_uncovered_chars_ = val;
+  }
+  void set_strip_unrenderable_words(bool val) {
+    strip_unrenderable_words_ = val;
+  }
+  void set_output_word_boxes(bool val) {
+    output_word_boxes_ = val;
+  }
+  // Before rendering the string, replace latin characters with their optional
+  // ligatured forms (such as "fi", "ffi" etc.) if the font_ covers those
+  // unicodes.
+  void set_add_ligatures(bool add_ligatures) {
+    add_ligatures_ = add_ligatures;
+  }
+  // Set the rgb value of the text ink. Values range in [0, 1.0]
+  // The values are stored as given; this setter does not clamp them.
+  void set_pen_color(double r, double g, double b) {
+    pen_color_[0] = r;
+    pen_color_[1] = g;
+    pen_color_[2] = b;
+  }
+  void set_h_margin(const int h_margin) {
+    h_margin_ = h_margin;
+  }
+  void set_v_margin(const int v_margin) {
+    v_margin_ = v_margin;
+  }
+  // Read-only accessors for the current font and the page margins.
+  const PangoFontInfo &font() const {
+    return font_;
+  }
+  int h_margin() const {
+    return h_margin_;
+  }
+  int v_margin() const {
+    return v_margin_;
+  }
+
+  // Get the boxchars of all clusters rendered thus far (or since the last call
+  // to ClearBoxes()).
+  const std::vector<BoxChar *> &GetBoxes() const;
+  // Get the rendered page bounding boxes of all pages created thus far (or
+  // since last call to ClearBoxes()).
+  Boxa *GetPageBoxes() const;
+
+  // Rotate the boxes on the most recent page by the given rotation.
+  void RotatePageBoxes(float rotation);
+  // Delete all boxes.
+  void ClearBoxes();
+  // Returns the boxes in a boxfile string.
+  std::string GetBoxesStr();
+  // Writes the boxes to a boxfile.
+  void WriteAllBoxes(const std::string &filename);
+  // Removes space-delimited words from the string that are not renderable by
+  // the current font and returns the count of such words.
+  int StripUnrenderableWords(std::string *utf8_text) const;
+
+  // Insert a Word Joiner symbol (U+2060) between adjacent characters, excluding
+  // spaces and combining types, in each word before rendering to ensure words
+  // are not broken across lines. The output boxchars will not contain the
+  // joiner.
+  static std::string InsertWordJoiners(const std::string &text);
+
+  // Helper functions to convert fullwidth Latin and halfwidth Basic Latin.
+  static std::string ConvertBasicLatinToFullwidthLatin(const std::string &text);
+  static std::string ConvertFullwidthLatinToBasicLatin(const std::string &text);
+
+protected:
+  // Init and free local renderer objects.
+  void InitPangoCairo();
+  void FreePangoCairo();
+  // Set rendering properties.
+  void SetLayoutProperties();
+  void SetWordUnderlineAttributes(const std::string &page_text);
+  // Compute bounding boxes around grapheme clusters.
+  void ComputeClusterBoxes();
+  void CorrectBoxPositionsToLayout(std::vector<BoxChar *> *boxchars);
+  bool GetClusterStrings(std::vector<std::string> *cluster_text);
+  int FindFirstPageBreakOffset(const char *text, int text_length);
+
+  // NOTE: none of the members below has an in-class initializer; they are
+  // expected to be initialized by the constructor, which is defined outside
+  // this header.
+  PangoFontInfo font_;
+  // Page properties
+  int page_width_, page_height_, h_margin_, v_margin_;
+  // Text rendering properties
+  double pen_color_[3]; // Ink color as r, g, b (see set_pen_color()).
+  int char_spacing_;
+  int leading_, resolution_;
+  bool vertical_text_;
+  bool gravity_hint_strong_;
+  bool render_fullwidth_latin_;
+  double underline_start_prob_;
+  double underline_continuation_prob_;
+  PangoUnderline underline_style_;
+  std::string features_;
+  // Text filtering options
+  bool drop_uncovered_chars_;
+  bool strip_unrenderable_words_;
+  bool add_ligatures_;
+  bool output_word_boxes_;
+  // Pango and cairo specific objects
+  cairo_surface_t *surface_;
+  cairo_t *cr_;
+  PangoLayout *layout_;
+  // Internal state of current page number, updated on successive calls to
+  // RenderToImage()
+  int start_box_;
+  int page_;
+  // Boxes and associated text for all pages rendered with RenderToImage() since
+  // the last call to ClearBoxes().
+  std::vector<BoxChar *> boxchars_;
+  int box_padding_;
+  // Bounding boxes for pages since the last call to ClearBoxes().
+  Boxa *page_boxes_;
+
+  // Objects cached for subsequent calls to RenderAllFontsToImage()
+  std::unordered_map<char32, int64_t> char_map_; // Time-saving char histogram.
+  int total_chars_;                              // Number in the string to be rendered.
+  unsigned int font_index_;                      // Index of next font to use in font list.
+  int last_offset_;                              // Offset returned from last successful rendering
+
+private:
+  // Copying is disabled.
+  StringRenderer(const StringRenderer &) = delete;
+  void operator=(const StringRenderer &) = delete;
+};
+} // namespace tesseract
+
+#endif // TESSERACT_TRAINING_STRINGRENDERER_H_