diff mupdf-source/thirdparty/tesseract/src/training/pango/pango_font_info.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/pango/pango_font_info.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,209 @@
+/**********************************************************************
+ * File:        pango_font_info.h
+ * Description: Font-related objects and helper functions
+ * Author:      Ranjith Unnikrishnan
+ * Created:     Mon Nov 18 2013
+ *
+ * (C) Copyright 2013, Google Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_
+#define TESSERACT_TRAINING_PANGO_FONT_INFO_H_
+
+#include "export.h"
+
+#include "commandlineflags.h"
+
+#include "pango/pango-font.h"
+#include "pango/pango.h"
+#include "pango/pangocairo.h"
+
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+using char32 = signed int; // Character type; key type of the ch_map tables taken by FontUtils.
+
+namespace tesseract {
+
+// Data holder class for a font, intended to avoid having to work with Pango or
+// FontConfig-specific objects directly.
+class TESS_PANGO_TRAINING_API PangoFontInfo {
+public:
+  enum FontTypeEnum {
+    UNKNOWN,
+    SERIF,
+    SANS_SERIF,
+    DECORATIVE,
+  };
+  PangoFontInfo();
+  ~PangoFontInfo();
+  // Initializes the instance by parsing a font description name, a string of
+  // the format:
+  //   "FamilyName [FaceName] [PointSize]"
+  // where a missing FaceName implies the default regular face.
+  // e.g. "Arial Italic 12", "Verdana"
+  //
+  // FaceName is a combination of:
+  //   [StyleName] [Variant] [Weight] [Stretch]
+  // with (all optional) Pango-defined values of:
+  // StyleName: Oblique, Italic
+  // Variant  : Small-Caps
+  // Weight   : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy
+  // Stretch  : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed,
+  //            Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded.
+  explicit PangoFontInfo(const std::string &name);
+  bool ParseFontDescriptionName(const std::string &name);
+
+  // Returns true if the font has codepoint coverage for the specified text.
+  bool CoversUTF8Text(const char *utf8_text, int byte_length) const;
+  // Modifies utf8_text to remove the Unicode code points that are not covered
+  // by the font. Returns the number of characters dropped.
+  int DropUncoveredChars(std::string *utf8_text) const;
+
+  // Returns true if the entire string can be rendered by the font with full
+  // character coverage and no unknown glyph or dotted-circle glyph
+  // substitutions on encountering a badly formed unicode sequence.
+  // If true, the individual graphemes are returned in graphemes. Any whitespace
+  // characters in the original string are also included in the list.
+  bool CanRenderString(const char *utf8_word, int len, std::vector<std::string> *graphemes) const;
+  bool CanRenderString(const char *utf8_word, int len) const;
+
+  // Retrieves the x_bearing and x_advance for the given utf8 character in the
+  // font. Returns false if the glyph for the character could not be found in
+  // the font.
+  // Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html
+  bool GetSpacingProperties(const std::string &utf8_char, int *x_bearing, int *x_advance) const;
+
+  // If not already initialized, initializes FontConfig by setting its
+  // environment variable and creating a fonts.conf file that points to the
+  // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
+  static void SoftInitFontConfig();
+  // Re-initializes font config, whether or not already initialized.
+  // If already initialized, any existing cache is deleted, just to be sure.
+  static void HardInitFontConfig(const char *fonts_dir, const char *cache_dir);
+
+  // Accessors
+  std::string DescriptionName() const;
+  // Font family name, e.g. "Arial".
+  const std::string &family_name() const {
+    return family_name_;
+  }
+  // Size in points (1/72"), rounded to the nearest integer.
+  int font_size() const {
+    return font_size_;
+  }
+  FontTypeEnum font_type() const {
+    return font_type_;
+  }
+
+  int resolution() const {
+    return resolution_;
+  }
+  void set_resolution(const int resolution) {
+    resolution_ = resolution;
+  }
+
+private:
+  friend class FontUtils;
+  void Clear();
+  bool ParseFontDescription(const PangoFontDescription *desc);
+  // Returns the PangoFont structure corresponding to the closest available font
+  // in the font map.
+  PangoFont *ToPangoFont() const;
+
+  // Font properties set automatically by parsing the font description name.
+  std::string family_name_;
+  int font_size_;
+  FontTypeEnum font_type_;
+  // The Pango description that was used to initialize the instance.
+  PangoFontDescription *desc_;
+  // Default output resolution to assume for GetSpacingProperties() and any
+  // other methods that return pixel values.
+  int resolution_;
+  // Fontconfig operates through an environment variable, so it intrinsically
+  // cannot be thread-friendly, but you can serialize multiple independent
+  // font configurations by calling HardInitFontConfig(fonts_dir, cache_dir).
+  // These hold the last initialized values set by HardInitFontConfig or
+  // the first call to SoftInitFontConfig.
+  // Directory to be scanned for font files.
+  static std::string fonts_dir_;
+  // Directory to store the cache of font information. (Can be the same as
+  // fonts_dir_)
+  static std::string cache_dir_;
+
+private:
+  PangoFontInfo(const PangoFontInfo &) = delete;
+  void operator=(const PangoFontInfo &) = delete;
+};
+
+// Static utility methods for querying font availability and font-selection
+// based on codepoint coverage.
+class TESS_PANGO_TRAINING_API FontUtils {
+public:
+  // Returns true if the font of the given description name is available in the
+  // target directory specified by --fonts_dir
+  static bool IsAvailableFont(const char *font_desc) {
+    return IsAvailableFont(font_desc, nullptr);
+  }
+  // Returns true if the font of the given description name is available in the
+  // target directory specified by --fonts_dir. If false is returned, and
+  // best_match is not nullptr, the closest matching font is returned there.
+  static bool IsAvailableFont(const char *font_desc, std::string *best_match);
+  // Returns the description names of the available fonts.
+  static const std::vector<std::string> &ListAvailableFonts();
+
+  // Picks a font among the available fonts that covers and can render the given
+  // word, and returns the font description name and the decomposition of the
+  // word into graphemes. Returns false if no suitable font was found.
+  static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name,
+                         std::vector<std::string> *graphemes);
+
+  // Picks a font among all_fonts that covers and can render the given word,
+  // and returns the font description name and the decomposition of the word
+  // into graphemes. Returns false if no suitable font was found.
+  static bool SelectFont(const char *utf8_word, const int utf8_len,
+                         const std::vector<std::string> &all_fonts, std::string *font_name,
+                         std::vector<std::string> *graphemes);
+
+  // NOTE: The following utilities were written to be backward compatible with
+  // StringRender.
+
+  // BestFonts returns a font name and a bit vector of the characters it
+  // can render for the fonts that score within some fraction of the best
+  // font on the characters in the given hash map.
+  // In the flags vector, each flag is set according to whether the
+  // corresponding character (in order of iterating ch_map) can be rendered.
+  // The return string is a list of the acceptable fonts that were used.
+  static std::string BestFonts(const std::unordered_map<char32, int64_t> &ch_map,
+                               std::vector<std::pair<const char *, std::vector<bool>>> *font_flag);
+
+  // FontScore returns the weighted renderability score of the given
+  // hash map character table in the given font. The unweighted score
+  // is also returned in raw_score.
+  // The values in the bool vector ch_flags correspond to whether the
+  // corresponding character (in order of iterating ch_map) can be rendered.
+  static int FontScore(const std::unordered_map<char32, int64_t> &ch_map,
+                       const std::string &fontname, int *raw_score, std::vector<bool> *ch_flags);
+
+  // PangoFontInfo is reinitialized, so clear the static list of fonts.
+  static void ReInit();
+  static void PangoFontTypeInfo();
+
+private:
+  static std::vector<std::string> available_fonts_; // cached font list; see ReInit()
+};
+} // namespace tesseract
+
+#endif // TESSERACT_TRAINING_PANGO_FONT_INFO_H_