comparison mupdf-source/thirdparty/tesseract/src/classify/shapetable.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 ///////////////////////////////////////////////////////////////////////
4 // File: shapetable.h
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 //
9 // (C) Copyright 2010, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
20 ///////////////////////////////////////////////////////////////////////
21
22 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
23 #define TESSERACT_CLASSIFY_SHAPETABLE_H_
24
25 #include "bitvector.h"
26 #include "fontinfo.h"
27 #include "genericheap.h"
28 #include "intmatcher.h"
29 #include "tesserrstream.h" // for tesserr
30
31 namespace tesseract {
32
33 class UNICHARSET;
34 class ShapeTable;
35
36 // Simple struct to hold a single classifier unichar selection, a corresponding
37 // rating, and a list of appropriate fonts.
38 struct UnicharRating {
39 UnicharRating() : unichar_id(0), rating(0.0f), adapted(false), config(0), feature_misses(0) {}
40 UnicharRating(int u, float r)
41 : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {}
42
43 // Print debug info.
44 void Print() const {
45 tesserr << "Unichar-id=" << unichar_id << ", rating=" << rating
46 << ", adapted=" << adapted << ", config=" << config
47 << ", misses=" << feature_misses << ", "
48 << fonts.size() << " fonts\n";
49 }
50
51 // Helper function to get the index of the first result with the required
52 // unichar_id. If the results are sorted by rating, this will also be the
53 // best result with the required unichar_id.
54 // Returns -1 if the unichar_id is not found
55 static int FirstResultWithUnichar(const std::vector<UnicharRating> &results,
56 UNICHAR_ID unichar_id);
57
58 // Index into some UNICHARSET table indicates the class of the answer.
59 UNICHAR_ID unichar_id;
60 // Rating from classifier with 1.0 perfect and 0.0 impossible.
61 // Call it a probability if you must.
62 float rating;
63 // True if this result is from the adaptive classifier.
64 bool adapted;
65 // Index of best matching font configuration of result.
66 uint8_t config;
67 // Number of features that were total misses - were liked by no classes.
68 uint16_t feature_misses;
69 // Unsorted collection of fontinfo ids and scores. Note that a raw result
70 // from the IntegerMatch will contain config ids, that require transforming
71 // to fontinfo ids via fontsets and (possibly) shapetable.
72 std::vector<ScoredFont> fonts;
73 };
74
75 // Classifier result from a low-level classification is an index into some
76 // ShapeTable and a rating.
77 struct ShapeRating {
78 ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f), joined(false), broken(false) {}
79 ShapeRating(int s, float r)
80 : shape_id(s), rating(r), raw(1.0f), font(0.0f), joined(false), broken(false) {}
81
82 // Helper function to get the index of the first result with the required
83 // unichar_id. If the results are sorted by rating, this will also be the
84 // best result with the required unichar_id.
85 // Returns -1 if the unichar_id is not found
86 static int FirstResultWithUnichar(const std::vector<ShapeRating> &results,
87 const ShapeTable &shape_table, UNICHAR_ID unichar_id);
88
89 // Index into some shape table indicates the class of the answer.
90 int shape_id;
91 // Rating from classifier with 1.0 perfect and 0.0 impossible.
92 // Call it a probability if you must.
93 float rating;
94 // Subsidiary rating that a classifier may use internally.
95 float raw;
96 // Subsidiary rating that a classifier may use internally.
97 float font;
98 // Flag indicating that the input may be joined.
99 bool joined;
100 // Flag indicating that the input may be broken (a fragment).
101 bool broken;
102 };
103
104 // Simple struct to hold an entry for a heap-based priority queue of
105 // ShapeRating.
106 struct ShapeQueueEntry {
107 ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {}
108 ShapeQueueEntry(const ShapeRating &rating, int level0) : result(rating), level(level0) {}
109
110 // Sort by decreasing rating and decreasing level for equal rating.
111 bool operator<(const ShapeQueueEntry &other) const {
112 if (result.rating > other.result.rating) {
113 return true;
114 }
115 if (result.rating == other.result.rating) {
116 return level > other.level;
117 }
118 return false;
119 }
120
121 // Output from classifier.
122 ShapeRating result;
123 // Which level in the tree did this come from?
124 int level;
125 };
126 using ShapeQueue = GenericHeap<ShapeQueueEntry>;
127
128 // Simple struct to hold a set of fonts associated with a single unichar-id.
129 // A vector of UnicharAndFonts makes a shape.
130 struct UnicharAndFonts {
131 UnicharAndFonts() : unichar_id(0) {}
132 UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
133 font_ids.push_back(font_id);
134 }
135
136 // Writes to the given file. Returns false in case of error.
137 bool Serialize(FILE *fp) const;
138 // Reads from the given file. Returns false in case of error.
139 bool DeSerialize(TFile *fp);
140
141 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
142 static int SortByUnicharId(const void *v1, const void *v2);
143 static bool StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2);
144
145 std::vector<int32_t> font_ids;
146 int32_t unichar_id;
147 };
148
149 // A Shape is a collection of unichar-ids and a list of fonts associated with
150 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
151 // a classifiable unit, and represents a group of characters or parts of
152 // characters that have a similar or identical shape. Shapes/ShapeTables may
153 // be organized hierarchically from identical shapes at the leaves to vaguely
154 // similar shapes near the root.
155 class TESS_API Shape {
156 public:
157 Shape() : destination_index_(-1) {}
158
159 // Writes to the given file. Returns false in case of error.
160 bool Serialize(FILE *fp) const;
161 // Reads from the given file. Returns false in case of error.
162 bool DeSerialize(TFile *fp);
163
164 int destination_index() const {
165 return destination_index_;
166 }
167 void set_destination_index(int index) {
168 destination_index_ = index;
169 }
170 int size() const {
171 return unichars_.size();
172 }
173 // Returns a UnicharAndFonts entry for the given index, which must be
174 // in the range [0, size()).
175 const UnicharAndFonts &operator[](int index) const {
176 return unichars_[index];
177 }
178 // Sets the unichar_id of the given index to the new unichar_id.
179 void SetUnicharId(int index, int unichar_id) {
180 unichars_[index].unichar_id = unichar_id;
181 }
182 // Adds a font_id for the given unichar_id. If the unichar_id is not
183 // in the shape, it is added.
184 void AddToShape(int unichar_id, int font_id);
185 // Adds everything in other to this.
186 void AddShape(const Shape &other);
187 // Returns true if the shape contains the given unichar_id, font_id pair.
188 bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
189 // Returns true if the shape contains the given unichar_id, ignoring font.
190 bool ContainsUnichar(int unichar_id) const;
191 // Returns true if the shape contains the given font, ignoring unichar_id.
192 bool ContainsFont(int font_id) const;
193 // Returns true if the shape contains the given font properties, ignoring
194 // unichar_id.
195 bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const;
196 // Returns true if the shape contains multiple different font properties,
197 // ignoring unichar_id.
198 bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const;
199 // Returns true if this shape is equal to other (ignoring order of unichars
200 // and fonts).
201 bool operator==(const Shape &other) const;
202 // Returns true if this is a subset (including equal) of other.
203 bool IsSubsetOf(const Shape &other) const;
204 // Returns true if the lists of unichar ids are the same in this and other,
205 // ignoring fonts.
206 // NOT const, as it will sort the unichars on demand.
207 bool IsEqualUnichars(Shape *other);
208
209 private:
210 // Sorts the unichars_ vector by unichar.
211 void SortUnichars();
212
213 // Flag indicates that the unichars are sorted, allowing faster set
214 // operations with another shape.
215 bool unichars_sorted_ = false;
216 // If this Shape is part of a ShapeTable the destination_index_ is the index
217 // of some other shape in the ShapeTable with which this shape is merged.
218 int destination_index_ = 0;
219 // Array of unichars, each with a set of fonts. Each unichar has at most
220 // one entry in the vector.
221 std::vector<UnicharAndFonts> unichars_;
222 };
223
224 // ShapeTable is a class to encapsulate the triple indirection that is
225 // used here.
226 // ShapeTable is a vector of shapes.
227 // Each shape is a vector of UnicharAndFonts representing the set of unichars
228 // that the shape represents.
229 // Each UnicharAndFonts also lists the fonts of the unichar_id that were
230 // mapped to the shape during training.
231 class TESS_API ShapeTable {
232 public:
233 ShapeTable();
234 // The UNICHARSET reference supplied here, or in set_unicharset below must
235 // exist for the entire life of the ShapeTable. It is used only by DebugStr.
236 explicit ShapeTable(const UNICHARSET &unicharset);
237 ~ShapeTable() {
238 for (auto data : shape_table_) {
239 delete data;
240 }
241 }
242
243 // Writes to the given file. Returns false in case of error.
244 bool Serialize(FILE *fp) const;
245 // Reads from the given file. Returns false in case of error.
246 bool DeSerialize(TFile *fp);
247
248 // Accessors.
249 unsigned NumShapes() const {
250 return shape_table_.size();
251 }
252 const UNICHARSET &unicharset() const {
253 return *unicharset_;
254 }
255 // Returns the number of fonts used in this ShapeTable, computing it if
256 // necessary.
257 int NumFonts() const;
258 // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
259 // entire life of the ShapeTable.
260 void set_unicharset(const UNICHARSET &unicharset) {
261 unicharset_ = &unicharset;
262 }
263 // Re-indexes the class_ids in the shapetable according to the given map.
264 // Useful in conjunction with set_unicharset.
265 void ReMapClassIds(const std::vector<int> &unicharset_map);
266 // Returns a string listing the classes/fonts in a shape.
267 std::string DebugStr(unsigned shape_id) const;
268 // Returns a debug string summarizing the table.
269 std::string SummaryStr() const;
270
271 // Adds a new shape starting with the given unichar_id and font_id.
272 // Returns the assigned index.
273 unsigned AddShape(int unichar_id, int font_id);
274 // Adds a copy of the given shape unless it is already present.
275 // Returns the assigned index or index of existing shape if already present.
276 unsigned AddShape(const Shape &other);
277 // Removes the shape given by the shape index. All indices above are changed!
278 void DeleteShape(unsigned shape_id);
279 // Adds a font_id to the given existing shape index for the given
280 // unichar_id. If the unichar_id is not in the shape, it is added.
281 void AddToShape(unsigned shape_id, int unichar_id, int font_id);
282 // Adds the given shape to the existing shape with the given index.
283 void AddShapeToShape(unsigned shape_id, const Shape &other);
284 // Returns the id of the shape that contains the given unichar and font.
285 // If not found, returns -1.
286 // If font_id < 0, the font_id is ignored and the first shape that matches
287 // the unichar_id is returned.
288 int FindShape(int unichar_id, int font_id) const;
289 // Returns the first unichar_id and font_id in the given shape.
290 void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const;
291
292 // Accessors for the Shape with the given shape_id.
293 const Shape &GetShape(unsigned shape_id) const {
294 return *shape_table_[shape_id];
295 }
296 Shape *MutableShape(unsigned shape_id) {
297 return shape_table_[shape_id];
298 }
299
300 // Expands all the classes/fonts in the shape individually to build
301 // a ShapeTable.
302 int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes);
303
304 // Returns true if the shapes are already merged.
305 bool AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const;
306 // Returns true if any shape contains multiple unichars.
307 bool AnyMultipleUnichars() const;
308 // Returns the maximum number of unichars over all shapes.
309 int MaxNumUnichars() const;
310 // Merges shapes with a common unichar over the [start, end) interval.
311 // Assumes single unichar per shape.
312 void ForceFontMerges(unsigned start, unsigned end);
313 // Returns the number of unichars in the master shape.
314 unsigned MasterUnicharCount(unsigned shape_id) const;
315 // Returns the sum of the font counts in the master shape.
316 int MasterFontCount(unsigned shape_id) const;
317 // Returns the number of unichars that would result from merging the shapes.
318 int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const;
319 // Merges two shape_ids, leaving shape_id2 marked as merged.
320 void MergeShapes(unsigned shape_id1, unsigned shape_id2);
321 // Swaps two shape_ids.
322 void SwapShapes(unsigned shape_id1, unsigned shape_id2);
323 // Appends the master shapes from other to this.
324 // Used to create a clean ShapeTable from a merged one, or to create a
325 // copy of a ShapeTable.
326 // If not nullptr, shape_map is set to map other shape_ids to this's
327 // shape_ids.
328 void AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map);
329 // Returns the number of master shapes remaining after merging.
330 int NumMasterShapes() const;
331 // Returns the destination of this shape, (if merged), taking into account
332 // the fact that the destination may itself have been merged.
333 // For a non-merged shape, returns the input shape_id.
334 unsigned MasterDestinationIndex(unsigned shape_id) const;
335
336 // Returns false if the unichars in neither shape is a subset of the other..
337 bool SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const;
338 // Returns false if the unichars in neither shape is a subset of the other..
339 bool MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const;
340 // Returns true if the unichar sets are equal between the shapes.
341 bool EqualUnichars(unsigned shape_id1, unsigned shape_id2) const;
342 bool MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const;
343 // Returns true if there is a common unichar between the shapes.
344 bool CommonUnichars(unsigned shape_id1, unsigned shape_id2) const;
345 // Returns true if there is a common font id between the shapes.
346 bool CommonFont(unsigned shape_id1, unsigned shape_id2) const;
347
348 // Adds the unichars of the given shape_id to the vector of results. Any
349 // unichar_id that is already present just has the fonts added to the
350 // font set for that result without adding a new entry in the vector.
351 // NOTE: it is assumed that the results are given to this function in order
352 // of decreasing rating.
353 // The unichar_map vector indicates the index of the results entry containing
354 // each unichar, or -1 if the unichar is not yet included in results.
355 void AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map,
356 std::vector<UnicharRating> *results) const;
357
358 private:
359 // Adds the given unichar_id to the results if needed, updating unichar_map
360 // and returning the index of unichar in results.
361 int AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map,
362 std::vector<UnicharRating> *results) const;
363
364 // Pointer to a provided unicharset used only by the Debugstr member.
365 const UNICHARSET *unicharset_;
366 // Vector of pointers to the Shapes in this ShapeTable.
367 std::vector<Shape *> shape_table_;
368
369 // Cached data calculated on demand.
370 mutable int num_fonts_;
371 };
372
373 } // namespace tesseract.
374
375 #endif // TESSERACT_CLASSIFY_SHAPETABLE_H_