Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/fontinfo.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: fontinfo.h | |
| 3 // Description: Font information classes abstracted from intproto.h/cpp. | |
| 4 // Author: rays@google.com (Ray Smith) | |
| 5 // | |
| 6 // (C) Copyright 2011, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_CCSTRUCT_FONTINFO_H_ | |
| 20 #define TESSERACT_CCSTRUCT_FONTINFO_H_ | |
| 21 | |
| 22 #include "errcode.h" | |
| 23 | |
| 24 #include <tesseract/unichar.h> | |
| 25 #include "genericvector.h" | |
| 26 | |
| 27 #include <cstdint> // for uint16_t, uint32_t | |
| 28 #include <cstdio> // for FILE | |
| 29 #include <vector> | |
| 30 | |
| 31 namespace tesseract { | |
| 32 | |
| 33 template <typename T> | |
| 34 class UnicityTable; | |
| 35 | |
| 36 // Small value type pairing a font with a score. The scores come from the low-level | |
| 37 // integer matcher, so they fit in the uint16_t range. The font is held as an index | |
| 38 // (fontinfo_id) into fontinfo_table. | |
| 39 // These get copied around a lot, so best to keep them small. A default-constructed ScoredFont has fontinfo_id -1 and score 0. | |
| 40 struct ScoredFont { | |
| 41 ScoredFont() : fontinfo_id(-1), score(0) {} | |
| 42 ScoredFont(int font_id, uint16_t classifier_score) | |
| 43 : fontinfo_id(font_id), score(classifier_score) {} | |
| 44 | |
| 45 // Index into the fontinfo table; inside the classifier it may instead be a | |
| 46 // shapetable index. | |
| 47 int32_t fontinfo_id; | |
| 48 // Raw score from the low-level classifier. | |
| 49 uint16_t score; | |
| 50 }; | |
| 51 | |
| 52 // Spacing information for one character in a particular font: the x gaps before and after it, plus two parallel vectors in which kerned_x_gaps[i] is the gap used when the following character is kerned_unichar_ids[i] (see FontInfo::get_spacing). | |
| 53 struct FontSpacingInfo { | |
| 54 int16_t x_gap_before; | |
| 55 int16_t x_gap_after; | |
| 56 std::vector<UNICHAR_ID> kerned_unichar_ids; | |
| 57 std::vector<int16_t> kerned_x_gaps; | |
| 58 }; | |
| 59 | |
| 60 /* | |
| 61 * Describes one font. properties is a bit field: 1 = italic, 2 = bold, 4 = fixed pitch, | |
| 62 * 8 = serif, 16 = fraktur. operator== compares only name, via strcmp, so both names must be non-null when comparing (the default constructor sets name to nullptr). The destructor is defaulted, so it frees neither name nor spacing_vec. | |
| 63 */ | |
| 64 struct FontInfo { | |
| 65 FontInfo() : name(nullptr), properties(0), universal_id(0), spacing_vec(nullptr) {} | |
| 66 ~FontInfo() = default; | |
| 67 | |
| 68 bool operator==(const FontInfo &rhs) const { | |
| 69 return strcmp(name, rhs.name) == 0; | |
| 70 } | |
| 71 | |
| 72 // Writes to the given file. Returns false in case of error. | |
| 73 bool Serialize(FILE *fp) const; | |
| 74 // Reads from the given file. Returns false in case of error. | |
| 75 // NOTE(review): this signature has no swap flag; any byte-order handling is presumably left to TFile - confirm. | |
| 76 bool DeSerialize(TFile *fp); | |
| 77 | |
| 78 // Points spacing_vec at a newly allocated vector of unicharset_size null entries. A previously allocated vector, if any, is not freed here. | |
| 79 void init_spacing(int unicharset_size) { | |
| 80 spacing_vec = new std::vector<FontSpacingInfo *>(unicharset_size); | |
| 81 } | |
| 82 // Stores the given pointer to FontSpacingInfo in spacing_vec at index uch_id | |
| 83 // (FontInfo is documented as taking ownership of the pointer; the defaulted destructor does not delete it - TODO confirm where it is freed). | |
| 84 // Note: init_spacing must be called before this function; asserts that uch_id is within spacing_vec. | |
| 85 void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) { | |
| 86 ASSERT_HOST(static_cast<size_t>(uch_id) < spacing_vec->size()); | |
| 87 (*spacing_vec)[uch_id] = spacing_info; | |
| 88 } | |
| 89 | |
| 90 // Returns the pointer to FontSpacingInfo stored for the given UNICHAR_ID, or nullptr if spacing_vec is unset or uch_id lies outside it. | |
| 91 const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const { | |
| 92 return (spacing_vec == nullptr || spacing_vec->size() <= static_cast<size_t>(uch_id)) ? nullptr | |
| 93 : (*spacing_vec)[uch_id]; | |
| 94 } | |
| 95 | |
| 96 // Sets *spacing to the x gap expected between prev_uch_id and uch_id: the kerned gap if prev_uch_id lists uch_id in kerned_unichar_ids, otherwise prev's x_gap_after plus uch_id's x_gap_before. | |
| 97 // Returns true on success; returns false and leaves *spacing untouched if either id has no spacing info. | |
| 98 bool get_spacing(UNICHAR_ID prev_uch_id, UNICHAR_ID uch_id, int *spacing) const { | |
| 99 const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id); | |
| 100 const FontSpacingInfo *fsi = this->get_spacing(uch_id); | |
| 101 if (prev_fsi == nullptr || fsi == nullptr) { | |
| 102 return false; | |
| 103 } | |
| 104 size_t i = 0; | |
| 105 for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) { | |
| 106 if (prev_fsi->kerned_unichar_ids[i] == uch_id) { | |
| 107 break; | |
| 108 } | |
| 109 } | |
| 110 if (i < prev_fsi->kerned_unichar_ids.size()) { | |
| 111 *spacing = prev_fsi->kerned_x_gaps[i]; | |
| 112 } else { | |
| 113 *spacing = prev_fsi->x_gap_after + fsi->x_gap_before; | |
| 114 } | |
| 115 return true; | |
| 116 } | |
| 117 | |
| 118 bool is_italic() const { | |
| 119 return properties & 1; | |
| 120 } | |
| 121 bool is_bold() const { | |
| 122 return (properties & 2) != 0; | |
| 123 } | |
| 124 bool is_fixed_pitch() const { | |
| 125 return (properties & 4) != 0; | |
| 126 } | |
| 127 bool is_serif() const { | |
| 128 return (properties & 8) != 0; | |
| 129 } | |
| 130 bool is_fraktur() const { | |
| 131 return (properties & 16) != 0; | |
| 132 } | |
| 133 | |
| 134 char *name; | |
| 135 uint32_t properties; | |
| 136 // The universal_id is a field reserved for the initialization process | |
| 137 // to assign a unique id number to all fonts loaded for the current | |
| 138 // combination of languages. This id will then be returned by | |
| 139 // ResultIterator::WordFontAttributes. | |
| 140 int32_t universal_id; | |
| 141 // Horizontal spacing between characters (indexed by UNICHAR_ID); nullptr after default construction. | |
| 142 std::vector<FontSpacingInfo *> *spacing_vec; | |
| 143 }; | |
| 144 | |
| 145 // Every class (character) owns a FontSet that represents all the fonts that can | |
| 146 // render this character. | |
| 147 // Since almost all the characters from the same script share the same set of | |
| 148 // fonts, the sets are shared over multiple classes (see | |
| 149 // Classify::fontset_table_). Thus, a class only stores the id of a set. | |
| 150 // Because some fonts cannot render just one character of a set, there are a | |
| 151 // lot of FontSets that differ by only one font. Rather than storing the FontInfo | |
| 152 // directly in the FontSet structure, it's better to share FontInfos among | |
| 153 // FontSets (Classify::fontinfo_table_), so a FontSet is just a vector of int (presumably indices into that table - confirm). | |
| 154 using FontSet = std::vector<int>; | |
| 155 | |
| 156 // Class that adds a bit of functionality on top of GenericVector to | |
| 157 // implement a table of FontInfo that replaces UnicityTable<FontInfo>. | |
| 158 // TODO(rays) change all references once all existing traineddata files | |
| 159 // are replaced. | |
| 160 class FontInfoTable : public GenericVector<FontInfo> { | |
| 161 public: | |
| 162 TESS_API // when inheritance from GenericVector is removed, move this to | |
| 163 // class level | |
| 164 FontInfoTable(); | |
| 165 TESS_API | |
| 166 ~FontInfoTable(); | |
| 167 | |
| 168 // Writes to the given file. Returns false in case of error. | |
| 169 TESS_API | |
| 170 bool Serialize(FILE *fp) const; | |
| 171 // Reads from the given file. Returns false in case of error. | |
| 172 // NOTE(review): this signature has no swap flag; any byte-order handling is presumably left to TFile - confirm. | |
| 173 TESS_API | |
| 174 bool DeSerialize(TFile *fp); | |
| 175 | |
| 176 // Returns true if the given set of fonts includes one with the same | |
| 177 // properties as font_id. | |
| 178 TESS_API | |
| 179 bool SetContainsFontProperties(int font_id, const std::vector<ScoredFont> &font_set) const; | |
| 180 // Returns true if the given set of fonts includes multiple properties. | |
| 181 TESS_API | |
| 182 bool SetContainsMultipleFontProperties(const std::vector<ScoredFont> &font_set) const; | |
| 183 | |
| 184 // Moves any non-empty FontSpacingInfo entries from other to this. | |
| 185 TESS_API | |
| 186 void MoveSpacingInfoFrom(FontInfoTable *other); | |
| 187 // Moves this to the target unicity table. | |
| 188 TESS_API | |
| 189 void MoveTo(UnicityTable<FontInfo> *target); | |
| 190 }; | |
| 191 | |
| 192 // Deletion callback for GenericVector; receives the FontInfo by value. | |
| 193 void FontInfoDeleteCallback(FontInfo f); | |
| 194 | |
| 195 // Callbacks used by UnicityTable to read/write FontInfo/FontSet structures. Readers take a TFile* and a mutable FontInfo*; writers take a FILE* and a const reference. NOTE(review): presumably they return false on error like Serialize/DeSerialize - confirm. | |
| 196 bool read_info(TFile *f, FontInfo *fi); | |
| 197 bool write_info(FILE *f, const FontInfo &fi); | |
| 198 bool read_spacing_info(TFile *f, FontInfo *fi); | |
| 199 bool write_spacing_info(FILE *f, const FontInfo &fi); | |
| 200 bool write_set(FILE *f, const FontSet &fs); | |
| 201 | |
| 202 } // namespace tesseract. | |
| 203 | |
| 204 #endif /* TESSERACT_CCSTRUCT_FONTINFO_H_ */ | |
