comparison mupdf-source/thirdparty/tesseract/src/classify/trainingsample.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14 ///////////////////////////////////////////////////////////////////////
15
16 #ifndef TESSERACT_TRAINING_TRAININGSAMPLE_H_
17 #define TESSERACT_TRAINING_TRAININGSAMPLE_H_
18
19 #include "elst.h"
20 #include "featdefs.h"
21 #include "intfx.h"
22 #include "intmatcher.h"
23 #include "matrix.h"
24 #include "mf.h"
25 #include "mfdefs.h"
26 #include "picofeat.h"
27 #include "shapetable.h"
28 #include "unicharset.h"
29
30 struct Pix;
31
32 namespace tesseract {
33
34 class IntFeatureMap;
35 class IntFeatureSpace;
36 class ShapeTable;
37
// Number of elements of cn_feature_.
static constexpr int kNumCNParams = 4;
// Number of ways to shift the features when randomizing.
static constexpr int kSampleYShiftSize = 5;
// Number of ways to scale the features when randomizing.
static constexpr int kSampleScaleSize = 3;
// Total number of different ways to manipulate the features when randomizing.
// The first and last combinations are removed to avoid an excessive
// top movement (first) and an identity transformation (last).
// WARNING: To avoid patterned duplication of samples, be sure to keep
// kSampleRandomSize prime!
// Eg with current values (kSampleYShiftSize = 5 and kSampleScaleSize = 3)
// kSampleRandomSize is 13, which is prime.
static constexpr int kSampleRandomSize = kSampleYShiftSize * kSampleScaleSize - 2;

// Compile-time primality test by trial division, used only to enforce the
// invariant documented above. Written as a single return statement so that
// it is a valid C++11 constexpr function. The comparison is phrased as
// "divisor > value / divisor" rather than "divisor * divisor > value" so the
// check cannot overflow int. Recursion depth grows with sqrt(value), so this
// is intended for small constants only.
constexpr bool IsPrimeSampleConstant(int value, int divisor = 2) {
  return value < 2                   ? false
         : divisor > value / divisor ? true
         : value % divisor == 0      ? false
                                     : IsPrimeSampleConstant(value, divisor + 1);
}
// Replaces the former "ASSERT_IS_PRIME(kSampleRandomSize)" reminder comment
// with a real check: changing kSampleYShiftSize or kSampleScaleSize to values
// that make kSampleRandomSize composite now fails the build.
static_assert(IsPrimeSampleConstant(kSampleRandomSize),
              "kSampleRandomSize must be prime to avoid patterned duplication of samples");
53
54 class TESS_API TrainingSample : public ELIST_LINK {
55 public:
56 TrainingSample()
57 : class_id_(INVALID_UNICHAR_ID)
58 , font_id_(0)
59 , page_num_(0)
60 , num_features_(0)
61 , num_micro_features_(0)
62 , outline_length_(0)
63 , features_(nullptr)
64 , micro_features_(nullptr)
65 , weight_(1.0)
66 , max_dist_(0.0)
67 , sample_index_(0)
68 , features_are_indexed_(false)
69 , features_are_mapped_(false)
70 , is_error_(false) {}
71 ~TrainingSample();
72
73 // Saves the given features into a TrainingSample. The features are copied,
74 // so may be deleted afterwards. Delete the return value after use.
75 static TrainingSample *CopyFromFeatures(const INT_FX_RESULT_STRUCT &fx_info,
76 const TBOX &bounding_box,
77 const INT_FEATURE_STRUCT *features, int num_features);
78 // Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining.
79 FEATURE_STRUCT *GetCNFeature() const;
80 // Constructs and returns a copy "randomized" by the method given by
81 // the randomizer index. If index is out of [0, kSampleRandomSize) then
82 // an exact copy is returned.
83 TrainingSample *RandomizedCopy(int index) const;
84 // Constructs and returns an exact copy.
85 TrainingSample *Copy() const;
86
87 // WARNING! Serialize/DeSerialize do not save/restore the "cache" data
88 // members, which is mostly the mapped features, and the weight.
89 // It is assumed these can all be reconstructed from what is saved.
90 // Writes to the given file. Returns false in case of error.
91 bool Serialize(FILE *fp) const;
92 // Creates from the given file. Returns nullptr in case of error.
93 // If swap is true, assumes a big/little-endian swap is needed.
94 static TrainingSample *DeSerializeCreate(bool swap, FILE *fp);
95 // Reads from the given file. Returns false in case of error.
96 // If swap is true, assumes a big/little-endian swap is needed.
97 bool DeSerialize(bool swap, FILE *fp);
98
99 // Extracts the needed information from the CHAR_DESC_STRUCT.
100 void ExtractCharDesc(int feature_type, int micro_type, int cn_type, int geo_type,
101 CHAR_DESC_STRUCT *char_desc);
102
103 // Sets the mapped_features_ from the features_ using the provided
104 // feature_space to the indexed versions of the features.
105 void IndexFeatures(const IntFeatureSpace &feature_space);
106
107 // Returns a pix representing the sample. (Int features only.)
108 Image RenderToPix(const UNICHARSET *unicharset) const;
109 // Displays the features in the given window with the given color.
110 void DisplayFeatures(ScrollView::Color color, ScrollView *window) const;
111
112 // Returns a pix of the original sample image. The pix is padded all round
113 // by padding wherever possible.
114 // The returned Pix must be pixDestroyed after use.
115 // If the input page_pix is nullptr, nullptr is returned.
116 Image GetSamplePix(int padding, Image page_pix) const;
117
118 // Accessors.
119 UNICHAR_ID class_id() const {
120 return class_id_;
121 }
122 void set_class_id(int id) {
123 class_id_ = id;
124 }
125 int font_id() const {
126 return font_id_;
127 }
128 void set_font_id(int id) {
129 font_id_ = id;
130 }
131 int page_num() const {
132 return page_num_;
133 }
134 void set_page_num(int page) {
135 page_num_ = page;
136 }
137 const TBOX &bounding_box() const {
138 return bounding_box_;
139 }
140 void set_bounding_box(const TBOX &box) {
141 bounding_box_ = box;
142 }
143 uint32_t num_features() const {
144 return num_features_;
145 }
146 const INT_FEATURE_STRUCT *features() const {
147 return features_;
148 }
149 uint32_t num_micro_features() const {
150 return num_micro_features_;
151 }
152 const MicroFeature *micro_features() const {
153 return micro_features_;
154 }
155 int outline_length() const {
156 return outline_length_;
157 }
158 float cn_feature(int index) const {
159 return cn_feature_[index];
160 }
161 int geo_feature(int index) const {
162 return geo_feature_[index];
163 }
164 double weight() const {
165 return weight_;
166 }
167 void set_weight(double value) {
168 weight_ = value;
169 }
170 double max_dist() const {
171 return max_dist_;
172 }
173 void set_max_dist(double value) {
174 max_dist_ = value;
175 }
176 int sample_index() const {
177 return sample_index_;
178 }
179 void set_sample_index(int value) {
180 sample_index_ = value;
181 }
182 bool features_are_mapped() const {
183 return features_are_mapped_;
184 }
185 const std::vector<int> &mapped_features() const {
186 ASSERT_HOST(features_are_mapped_);
187 return mapped_features_;
188 }
189 const std::vector<int> &indexed_features() const {
190 ASSERT_HOST(features_are_indexed_);
191 return mapped_features_;
192 }
193 bool is_error() const {
194 return is_error_;
195 }
196 void set_is_error(bool value) {
197 is_error_ = value;
198 }
199
200 private:
201 // Unichar id that this sample represents. There obviously must be a
202 // reference UNICHARSET somewhere. Usually in TrainingSampleSet.
203 UNICHAR_ID class_id_;
204 // Font id in which this sample was printed. Refers to a fontinfo_table_ in
205 // MasterTrainer.
206 int font_id_;
207 // Number of page that the sample came from.
208 int page_num_;
209 // Bounding box of sample in original image.
210 TBOX bounding_box_;
211 // Number of INT_FEATURE_STRUCT in features_ array.
212 uint32_t num_features_;
213 // Number of MicroFeature in micro_features_ array.
214 uint32_t num_micro_features_;
215 // Total length of outline in the baseline normalized coordinate space.
216 // See comment in WERD_RES class definition for a discussion of coordinate
217 // spaces.
218 int outline_length_;
219 // Array of features.
220 INT_FEATURE_STRUCT *features_;
221 // Array of features.
222 MicroFeature *micro_features_;
223 // The one and only CN feature. Indexed by NORM_PARAM_NAME enum.
224 float cn_feature_[kNumCNParams];
225 // The one and only geometric feature. (Aims at replacing cn_feature_).
226 // Indexed by GeoParams enum in picofeat.h
227 int geo_feature_[GeoCount];
228
229 // Non-serialized cache data.
230 // Weight used for boosting training.
231 double weight_;
232 // Maximum distance to other samples of same class/font used in computing
233 // the canonical sample.
234 double max_dist_;
235 // Global index of this sample.
236 int sample_index_;
237
238 public:
239 // both are used in training tools
240 // hide after refactoring
241
242 // Indexed/mapped features, as indicated by the bools below.
243 std::vector<int> mapped_features_;
244 bool features_are_indexed_;
245 bool features_are_mapped_;
246
247 private:
248 // True if the last classification was an error by the current definition.
249 bool is_error_;
250
251 // Randomizing factors.
252 static const int kYShiftValues[kSampleYShiftSize];
253 static const double kScaleValues[kSampleScaleSize];
254 };
255
// Invokes the ELISTIZEH macro for TrainingSample, which derives from
// ELIST_LINK. NOTE(review): presumably this declares the list and iterator
// types for TrainingSample (macro expected to come from elst.h) - confirm
// against that header.
256 ELISTIZEH(TrainingSample)
257
258 } // namespace tesseract
259
260 #endif // TESSERACT_TRAINING_TRAININGSAMPLE_H_