diff mupdf-source/thirdparty/tesseract/src/ccstruct/fontinfo.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/fontinfo.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,204 @@
+///////////////////////////////////////////////////////////////////////
+// File:        fontinfo.h
+// Description: Font information classes abstracted from intproto.h/cpp.
+// Author:      rays@google.com (Ray Smith)
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_FONTINFO_H_
+#define TESSERACT_CCSTRUCT_FONTINFO_H_
+
+#include "errcode.h"
+
+#include <tesseract/unichar.h>
+#include "genericvector.h"
+
+#include <cstdint> // for uint16_t, uint32_t
+#include <cstdio>  // for FILE
+#include <vector>
+
+namespace tesseract {
+
+template <typename T>
+class UnicityTable;
+
+// Simple struct to hold a font and a score. The scores come from the low-level
+// integer matcher, so they are in the uint16_t range. Fonts are an index to
+// fontinfo_table.
+// These get copied around a lot, so best to keep them small.
+struct ScoredFont {
+  ScoredFont() : fontinfo_id(-1), score(0) {}
+  ScoredFont(int font_id, uint16_t classifier_score)
+      : fontinfo_id(font_id), score(classifier_score) {}
+
+  // Index into fontinfo table, but inside the classifier, may be a shapetable
+  // index.
+  int32_t fontinfo_id;
+  // Raw score from the low-level classifier.
+  uint16_t score;
+};
+
+// Struct for information about spacing between characters in a particular font.
+struct FontSpacingInfo {
+  int16_t x_gap_before;
+  int16_t x_gap_after;
+  std::vector<UNICHAR_ID> kerned_unichar_ids;
+  std::vector<int16_t> kerned_x_gaps;
+};
+
+/*
+ * font_properties contains properties about boldness, italicness, fixed pitch,
+ * serif, fraktur
+ */
+struct FontInfo {
+  FontInfo() : name(nullptr), properties(0), universal_id(0), spacing_vec(nullptr) {}
+  ~FontInfo() = default;
+
+  bool operator==(const FontInfo &rhs) const {
+    return strcmp(name, rhs.name) == 0;
+  }
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE *fp) const;
+  // Reads from the given file. Returns false in case of error.
+  // If swap is true, assumes a big/little-endian swap is needed.
+  bool DeSerialize(TFile *fp);
+
+  // Reserves unicharset_size spots in spacing_vec.
+  void init_spacing(int unicharset_size) {
+    spacing_vec = new std::vector<FontSpacingInfo *>(unicharset_size);
+  }
+  // Adds the given pointer to FontSpacingInfo to spacing_vec member
+  // (FontInfo class takes ownership of the pointer).
+  // Note: init_spacing should be called before calling this function.
+  void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
+    ASSERT_HOST(static_cast<size_t>(uch_id) < spacing_vec->size());
+    (*spacing_vec)[uch_id] = spacing_info;
+  }
+
+  // Returns the pointer to FontSpacingInfo for the given UNICHAR_ID.
+  const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
+    return (spacing_vec == nullptr || spacing_vec->size() <= static_cast<size_t>(uch_id)) ? nullptr
+                                                                     : (*spacing_vec)[uch_id];
+  }
+
+  // Fills spacing with the value of the x gap expected between the two given
+  // UNICHAR_IDs. Returns true on success.
+  bool get_spacing(UNICHAR_ID prev_uch_id, UNICHAR_ID uch_id, int *spacing) const {
+    const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
+    const FontSpacingInfo *fsi = this->get_spacing(uch_id);
+    if (prev_fsi == nullptr || fsi == nullptr) {
+      return false;
+    }
+    size_t i = 0;
+    for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
+      if (prev_fsi->kerned_unichar_ids[i] == uch_id) {
+        break;
+      }
+    }
+    if (i < prev_fsi->kerned_unichar_ids.size()) {
+      *spacing = prev_fsi->kerned_x_gaps[i];
+    } else {
+      *spacing = prev_fsi->x_gap_after + fsi->x_gap_before;
+    }
+    return true;
+  }
+
+  bool is_italic() const {
+    return properties & 1;
+  }
+  bool is_bold() const {
+    return (properties & 2) != 0;
+  }
+  bool is_fixed_pitch() const {
+    return (properties & 4) != 0;
+  }
+  bool is_serif() const {
+    return (properties & 8) != 0;
+  }
+  bool is_fraktur() const {
+    return (properties & 16) != 0;
+  }
+
+  char *name;
+  uint32_t properties;
+  // The universal_id is a field reserved for the initialization process
+  // to assign a unique id number to all fonts loaded for the current
+  // combination of languages. This id will then be returned by
+  // ResultIterator::WordFontAttributes.
+  int32_t universal_id;
+  // Horizontal spacing between characters (indexed by UNICHAR_ID).
+  std::vector<FontSpacingInfo *> *spacing_vec;
+};
+
+// Every class (character) owns a FontSet that represents all the fonts that can
+// render this character.
+// Since almost all the characters from the same script share the same set of
+// fonts, the sets are shared over multiple classes (see
+// Classify::fontset_table_). Thus, a class only store an id to a set.
+// Because some fonts cannot render just one character of a set, there are a
+// lot of FontSet that differ only by one font. Rather than storing directly
+// the FontInfo in the FontSet structure, it's better to share FontInfos among
+// FontSets (Classify::fontinfo_table_).
+using FontSet = std::vector<int>;
+
+// Class that adds a bit of functionality on top of GenericVector to
+// implement a table of FontInfo that replaces UniCityTable<FontInfo>.
+// TODO(rays) change all references once all existing traineddata files
+// are replaced.
+class FontInfoTable : public GenericVector<FontInfo> {
+public:
+  TESS_API // when you remove inheritance from GenericVector, move this on
+  // class level
+  FontInfoTable();
+  TESS_API
+  ~FontInfoTable();
+
+  // Writes to the given file. Returns false in case of error.
+  TESS_API
+  bool Serialize(FILE *fp) const;
+  // Reads from the given file. Returns false in case of error.
+  // If swap is true, assumes a big/little-endian swap is needed.
+  TESS_API
+  bool DeSerialize(TFile *fp);
+
+  // Returns true if the given set of fonts includes one with the same
+  // properties as font_id.
+  TESS_API
+  bool SetContainsFontProperties(int font_id, const std::vector<ScoredFont> &font_set) const;
+  // Returns true if the given set of fonts includes multiple properties.
+  TESS_API
+  bool SetContainsMultipleFontProperties(const std::vector<ScoredFont> &font_set) const;
+
+  // Moves any non-empty FontSpacingInfo entries from other to this.
+  TESS_API
+  void MoveSpacingInfoFrom(FontInfoTable *other);
+  // Moves this to the target unicity table.
+  TESS_API
+  void MoveTo(UnicityTable<FontInfo> *target);
+};
+
+// Deletion callbacks for GenericVector.
+void FontInfoDeleteCallback(FontInfo f);
+
+// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
+bool read_info(TFile *f, FontInfo *fi);
+bool write_info(FILE *f, const FontInfo &fi);
+bool read_spacing_info(TFile *f, FontInfo *fi);
+bool write_spacing_info(FILE *f, const FontInfo &fi);
+bool write_set(FILE *f, const FontSet &fs);
+
+} // namespace tesseract.
+
+#endif /* THIRD_PARTY_TESSERACT_CCSTRUCT_FONTINFO_H_ */