Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/classify/shapetable.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 /////////////////////////////////////////////////////////////////////// | |
| 4 // File: shapetable.h | |
| 5 // Description: Class to map a classifier shape index to unicharset | |
| 6 // indices and font indices. | |
| 7 // Author: Ray Smith | |
| 8 // | |
| 9 // (C) Copyright 2010, Google Inc. | |
| 10 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 11 // you may not use this file except in compliance with the License. | |
| 12 // You may obtain a copy of the License at | |
| 13 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 14 // Unless required by applicable law or agreed to in writing, software | |
| 15 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 17 // See the License for the specific language governing permissions and | |
| 18 // limitations under the License. | |
| 19 // | |
| 20 /////////////////////////////////////////////////////////////////////// | |
| 21 | |
| 22 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_ | |
| 23 #define TESSERACT_CLASSIFY_SHAPETABLE_H_ | |
| 24 | |
| 25 #include "bitvector.h" | |
| 26 #include "fontinfo.h" | |
| 27 #include "genericheap.h" | |
| 28 #include "intmatcher.h" | |
| 29 #include "tesserrstream.h" // for tesserr | |
| 30 | |
| 31 namespace tesseract { | |
| 32 | |
| 33 class UNICHARSET; | |
| 34 class ShapeTable; | |
| 35 | |
| 36 // Simple struct to hold a single classifier unichar selection, a corresponding | |
| 37 // rating, and a list of appropriate fonts. | |
| 38 struct UnicharRating { | |
| 39 UnicharRating() : unichar_id(0), rating(0.0f), adapted(false), config(0), feature_misses(0) {} | |
| 40 UnicharRating(int u, float r) | |
| 41 : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {} | |
| 42 | |
| 43 // Print debug info. | |
| 44 void Print() const { | |
| 45 tesserr << "Unichar-id=" << unichar_id << ", rating=" << rating | |
| 46 << ", adapted=" << adapted << ", config=" << config | |
| 47 << ", misses=" << feature_misses << ", " | |
| 48 << fonts.size() << " fonts\n"; | |
| 49 } | |
| 50 | |
| 51 // Helper function to get the index of the first result with the required | |
| 52 // unichar_id. If the results are sorted by rating, this will also be the | |
| 53 // best result with the required unichar_id. | |
| 54 // Returns -1 if the unichar_id is not found | |
| 55 static int FirstResultWithUnichar(const std::vector<UnicharRating> &results, | |
| 56 UNICHAR_ID unichar_id); | |
| 57 | |
| 58 // Index into some UNICHARSET table indicates the class of the answer. | |
| 59 UNICHAR_ID unichar_id; | |
| 60 // Rating from classifier with 1.0 perfect and 0.0 impossible. | |
| 61 // Call it a probability if you must. | |
| 62 float rating; | |
| 63 // True if this result is from the adaptive classifier. | |
| 64 bool adapted; | |
| 65 // Index of best matching font configuration of result. | |
| 66 uint8_t config; | |
| 67 // Number of features that were total misses - were liked by no classes. | |
| 68 uint16_t feature_misses; | |
| 69 // Unsorted collection of fontinfo ids and scores. Note that a raw result | |
| 70 // from the IntegerMatch will contain config ids, that require transforming | |
| 71 // to fontinfo ids via fontsets and (possibly) shapetable. | |
| 72 std::vector<ScoredFont> fonts; | |
| 73 }; | |
| 74 | |
| 75 // Classifier result from a low-level classification is an index into some | |
| 76 // ShapeTable and a rating. | |
| 77 struct ShapeRating { | |
| 78 ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f), joined(false), broken(false) {} | |
| 79 ShapeRating(int s, float r) | |
| 80 : shape_id(s), rating(r), raw(1.0f), font(0.0f), joined(false), broken(false) {} | |
| 81 | |
| 82 // Helper function to get the index of the first result with the required | |
| 83 // unichar_id. If the results are sorted by rating, this will also be the | |
| 84 // best result with the required unichar_id. | |
| 85 // Returns -1 if the unichar_id is not found | |
| 86 static int FirstResultWithUnichar(const std::vector<ShapeRating> &results, | |
| 87 const ShapeTable &shape_table, UNICHAR_ID unichar_id); | |
| 88 | |
| 89 // Index into some shape table indicates the class of the answer. | |
| 90 int shape_id; | |
| 91 // Rating from classifier with 1.0 perfect and 0.0 impossible. | |
| 92 // Call it a probability if you must. | |
| 93 float rating; | |
| 94 // Subsidiary rating that a classifier may use internally. | |
| 95 float raw; | |
| 96 // Subsidiary rating that a classifier may use internally. | |
| 97 float font; | |
| 98 // Flag indicating that the input may be joined. | |
| 99 bool joined; | |
| 100 // Flag indicating that the input may be broken (a fragment). | |
| 101 bool broken; | |
| 102 }; | |
| 103 | |
| 104 // Simple struct to hold an entry for a heap-based priority queue of | |
| 105 // ShapeRating. | |
| 106 struct ShapeQueueEntry { | |
| 107 ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {} | |
| 108 ShapeQueueEntry(const ShapeRating &rating, int level0) : result(rating), level(level0) {} | |
| 109 | |
| 110 // Sort by decreasing rating and decreasing level for equal rating. | |
| 111 bool operator<(const ShapeQueueEntry &other) const { | |
| 112 if (result.rating > other.result.rating) { | |
| 113 return true; | |
| 114 } | |
| 115 if (result.rating == other.result.rating) { | |
| 116 return level > other.level; | |
| 117 } | |
| 118 return false; | |
| 119 } | |
| 120 | |
| 121 // Output from classifier. | |
| 122 ShapeRating result; | |
| 123 // Which level in the tree did this come from? | |
| 124 int level; | |
| 125 }; | |
| 126 using ShapeQueue = GenericHeap<ShapeQueueEntry>; | |
| 127 | |
| 128 // Simple struct to hold a set of fonts associated with a single unichar-id. | |
| 129 // A vector of UnicharAndFonts makes a shape. | |
| 130 struct UnicharAndFonts { | |
| 131 UnicharAndFonts() : unichar_id(0) {} | |
| 132 UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) { | |
| 133 font_ids.push_back(font_id); | |
| 134 } | |
| 135 | |
| 136 // Writes to the given file. Returns false in case of error. | |
| 137 bool Serialize(FILE *fp) const; | |
| 138 // Reads from the given file. Returns false in case of error. | |
| 139 bool DeSerialize(TFile *fp); | |
| 140 | |
| 141 // Sort function to sort a pair of UnicharAndFonts by unichar_id. | |
| 142 static int SortByUnicharId(const void *v1, const void *v2); | |
| 143 static bool StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2); | |
| 144 | |
| 145 std::vector<int32_t> font_ids; | |
| 146 int32_t unichar_id; | |
| 147 }; | |
| 148 | |
| 149 // A Shape is a collection of unichar-ids and a list of fonts associated with | |
| 150 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is | |
| 151 // a classifiable unit, and represents a group of characters or parts of | |
| 152 // characters that have a similar or identical shape. Shapes/ShapeTables may | |
| 153 // be organized hierarchically from identical shapes at the leaves to vaguely | |
| 154 // similar shapes near the root. | |
| 155 class TESS_API Shape { | |
| 156 public: | |
| 157 Shape() : destination_index_(-1) {} | |
| 158 | |
| 159 // Writes to the given file. Returns false in case of error. | |
| 160 bool Serialize(FILE *fp) const; | |
| 161 // Reads from the given file. Returns false in case of error. | |
| 162 bool DeSerialize(TFile *fp); | |
| 163 | |
| 164 int destination_index() const { | |
| 165 return destination_index_; | |
| 166 } | |
| 167 void set_destination_index(int index) { | |
| 168 destination_index_ = index; | |
| 169 } | |
| 170 int size() const { | |
| 171 return unichars_.size(); | |
| 172 } | |
| 173 // Returns a UnicharAndFonts entry for the given index, which must be | |
| 174 // in the range [0, size()). | |
| 175 const UnicharAndFonts &operator[](int index) const { | |
| 176 return unichars_[index]; | |
| 177 } | |
| 178 // Sets the unichar_id of the given index to the new unichar_id. | |
| 179 void SetUnicharId(int index, int unichar_id) { | |
| 180 unichars_[index].unichar_id = unichar_id; | |
| 181 } | |
| 182 // Adds a font_id for the given unichar_id. If the unichar_id is not | |
| 183 // in the shape, it is added. | |
| 184 void AddToShape(int unichar_id, int font_id); | |
| 185 // Adds everything in other to this. | |
| 186 void AddShape(const Shape &other); | |
| 187 // Returns true if the shape contains the given unichar_id, font_id pair. | |
| 188 bool ContainsUnicharAndFont(int unichar_id, int font_id) const; | |
| 189 // Returns true if the shape contains the given unichar_id, ignoring font. | |
| 190 bool ContainsUnichar(int unichar_id) const; | |
| 191 // Returns true if the shape contains the given font, ignoring unichar_id. | |
| 192 bool ContainsFont(int font_id) const; | |
| 193 // Returns true if the shape contains the given font properties, ignoring | |
| 194 // unichar_id. | |
| 195 bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const; | |
| 196 // Returns true if the shape contains multiple different font properties, | |
| 197 // ignoring unichar_id. | |
| 198 bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const; | |
| 199 // Returns true if this shape is equal to other (ignoring order of unichars | |
| 200 // and fonts). | |
| 201 bool operator==(const Shape &other) const; | |
| 202 // Returns true if this is a subset (including equal) of other. | |
| 203 bool IsSubsetOf(const Shape &other) const; | |
| 204 // Returns true if the lists of unichar ids are the same in this and other, | |
| 205 // ignoring fonts. | |
| 206 // NOT const, as it will sort the unichars on demand. | |
| 207 bool IsEqualUnichars(Shape *other); | |
| 208 | |
| 209 private: | |
| 210 // Sorts the unichars_ vector by unichar. | |
| 211 void SortUnichars(); | |
| 212 | |
| 213 // Flag indicates that the unichars are sorted, allowing faster set | |
| 214 // operations with another shape. | |
| 215 bool unichars_sorted_ = false; | |
| 216 // If this Shape is part of a ShapeTable the destination_index_ is the index | |
| 217 // of some other shape in the ShapeTable with which this shape is merged. | |
| 218 int destination_index_ = 0; | |
| 219 // Array of unichars, each with a set of fonts. Each unichar has at most | |
| 220 // one entry in the vector. | |
| 221 std::vector<UnicharAndFonts> unichars_; | |
| 222 }; | |
| 223 | |
| 224 // ShapeTable is a class to encapsulate the triple indirection that is | |
| 225 // used here. | |
| 226 // ShapeTable is a vector of shapes. | |
| 227 // Each shape is a vector of UnicharAndFonts representing the set of unichars | |
| 228 // that the shape represents. | |
| 229 // Each UnicharAndFonts also lists the fonts of the unichar_id that were | |
| 230 // mapped to the shape during training. | |
| 231 class TESS_API ShapeTable { | |
| 232 public: | |
| 233 ShapeTable(); | |
| 234 // The UNICHARSET reference supplied here, or in set_unicharset below must | |
| 235 // exist for the entire life of the ShapeTable. It is used only by DebugStr. | |
| 236 explicit ShapeTable(const UNICHARSET &unicharset); | |
| 237 ~ShapeTable() { | |
| 238 for (auto data : shape_table_) { | |
| 239 delete data; | |
| 240 } | |
| 241 } | |
| 242 | |
| 243 // Writes to the given file. Returns false in case of error. | |
| 244 bool Serialize(FILE *fp) const; | |
| 245 // Reads from the given file. Returns false in case of error. | |
| 246 bool DeSerialize(TFile *fp); | |
| 247 | |
| 248 // Accessors. | |
| 249 unsigned NumShapes() const { | |
| 250 return shape_table_.size(); | |
| 251 } | |
| 252 const UNICHARSET &unicharset() const { | |
| 253 return *unicharset_; | |
| 254 } | |
| 255 // Returns the number of fonts used in this ShapeTable, computing it if | |
| 256 // necessary. | |
| 257 int NumFonts() const; | |
| 258 // Shapetable takes a pointer to the UNICHARSET, so it must persist for the | |
| 259 // entire life of the ShapeTable. | |
| 260 void set_unicharset(const UNICHARSET &unicharset) { | |
| 261 unicharset_ = &unicharset; | |
| 262 } | |
| 263 // Re-indexes the class_ids in the shapetable according to the given map. | |
| 264 // Useful in conjunction with set_unicharset. | |
| 265 void ReMapClassIds(const std::vector<int> &unicharset_map); | |
| 266 // Returns a string listing the classes/fonts in a shape. | |
| 267 std::string DebugStr(unsigned shape_id) const; | |
| 268 // Returns a debug string summarizing the table. | |
| 269 std::string SummaryStr() const; | |
| 270 | |
| 271 // Adds a new shape starting with the given unichar_id and font_id. | |
| 272 // Returns the assigned index. | |
| 273 unsigned AddShape(int unichar_id, int font_id); | |
| 274 // Adds a copy of the given shape unless it is already present. | |
| 275 // Returns the assigned index or index of existing shape if already present. | |
| 276 unsigned AddShape(const Shape &other); | |
| 277 // Removes the shape given by the shape index. All indices above are changed! | |
| 278 void DeleteShape(unsigned shape_id); | |
| 279 // Adds a font_id to the given existing shape index for the given | |
| 280 // unichar_id. If the unichar_id is not in the shape, it is added. | |
| 281 void AddToShape(unsigned shape_id, int unichar_id, int font_id); | |
| 282 // Adds the given shape to the existing shape with the given index. | |
| 283 void AddShapeToShape(unsigned shape_id, const Shape &other); | |
| 284 // Returns the id of the shape that contains the given unichar and font. | |
| 285 // If not found, returns -1. | |
| 286 // If font_id < 0, the font_id is ignored and the first shape that matches | |
| 287 // the unichar_id is returned. | |
| 288 int FindShape(int unichar_id, int font_id) const; | |
| 289 // Returns the first unichar_id and font_id in the given shape. | |
| 290 void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const; | |
| 291 | |
| 292 // Accessors for the Shape with the given shape_id. | |
| 293 const Shape &GetShape(unsigned shape_id) const { | |
| 294 return *shape_table_[shape_id]; | |
| 295 } | |
| 296 Shape *MutableShape(unsigned shape_id) { | |
| 297 return shape_table_[shape_id]; | |
| 298 } | |
| 299 | |
| 300 // Expands all the classes/fonts in the shape individually to build | |
| 301 // a ShapeTable. | |
| 302 int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes); | |
| 303 | |
| 304 // Returns true if the shapes are already merged. | |
| 305 bool AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const; | |
| 306 // Returns true if any shape contains multiple unichars. | |
| 307 bool AnyMultipleUnichars() const; | |
| 308 // Returns the maximum number of unichars over all shapes. | |
| 309 int MaxNumUnichars() const; | |
| 310 // Merges shapes with a common unichar over the [start, end) interval. | |
| 311 // Assumes single unichar per shape. | |
| 312 void ForceFontMerges(unsigned start, unsigned end); | |
| 313 // Returns the number of unichars in the master shape. | |
| 314 unsigned MasterUnicharCount(unsigned shape_id) const; | |
| 315 // Returns the sum of the font counts in the master shape. | |
| 316 int MasterFontCount(unsigned shape_id) const; | |
| 317 // Returns the number of unichars that would result from merging the shapes. | |
| 318 int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const; | |
| 319 // Merges two shape_ids, leaving shape_id2 marked as merged. | |
| 320 void MergeShapes(unsigned shape_id1, unsigned shape_id2); | |
| 321 // Swaps two shape_ids. | |
| 322 void SwapShapes(unsigned shape_id1, unsigned shape_id2); | |
| 323 // Appends the master shapes from other to this. | |
| 324 // Used to create a clean ShapeTable from a merged one, or to create a | |
| 325 // copy of a ShapeTable. | |
| 326 // If not nullptr, shape_map is set to map other shape_ids to this's | |
| 327 // shape_ids. | |
| 328 void AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map); | |
| 329 // Returns the number of master shapes remaining after merging. | |
| 330 int NumMasterShapes() const; | |
| 331 // Returns the destination of this shape, (if merged), taking into account | |
| 332 // the fact that the destination may itself have been merged. | |
| 333 // For a non-merged shape, returns the input shape_id. | |
| 334 unsigned MasterDestinationIndex(unsigned shape_id) const; | |
| 335 | |
| 336 // Returns false if the unichars in neither shape is a subset of the other.. | |
| 337 bool SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const; | |
| 338 // Returns false if the unichars in neither shape is a subset of the other.. | |
| 339 bool MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const; | |
| 340 // Returns true if the unichar sets are equal between the shapes. | |
| 341 bool EqualUnichars(unsigned shape_id1, unsigned shape_id2) const; | |
| 342 bool MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const; | |
| 343 // Returns true if there is a common unichar between the shapes. | |
| 344 bool CommonUnichars(unsigned shape_id1, unsigned shape_id2) const; | |
| 345 // Returns true if there is a common font id between the shapes. | |
| 346 bool CommonFont(unsigned shape_id1, unsigned shape_id2) const; | |
| 347 | |
| 348 // Adds the unichars of the given shape_id to the vector of results. Any | |
| 349 // unichar_id that is already present just has the fonts added to the | |
| 350 // font set for that result without adding a new entry in the vector. | |
| 351 // NOTE: it is assumed that the results are given to this function in order | |
| 352 // of decreasing rating. | |
| 353 // The unichar_map vector indicates the index of the results entry containing | |
| 354 // each unichar, or -1 if the unichar is not yet included in results. | |
| 355 void AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map, | |
| 356 std::vector<UnicharRating> *results) const; | |
| 357 | |
| 358 private: | |
| 359 // Adds the given unichar_id to the results if needed, updating unichar_map | |
| 360 // and returning the index of unichar in results. | |
| 361 int AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map, | |
| 362 std::vector<UnicharRating> *results) const; | |
| 363 | |
| 364 // Pointer to a provided unicharset used only by the Debugstr member. | |
| 365 const UNICHARSET *unicharset_; | |
| 366 // Vector of pointers to the Shapes in this ShapeTable. | |
| 367 std::vector<Shape *> shape_table_; | |
| 368 | |
| 369 // Cached data calculated on demand. | |
| 370 mutable int num_fonts_; | |
| 371 }; | |
| 372 | |
| 373 } // namespace tesseract. | |
| 374 | |
| 375 #endif // TESSERACT_CLASSIFY_SHAPETABLE_H_ |
