comparison mupdf-source/thirdparty/tesseract/src/classify/picofeat.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************************
2 ** Filename: picofeat.c
3 ** Purpose: Definition of pico-features.
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 ******************************************************************************/
17
18 #include "picofeat.h"
19
20 #include "classify.h"
21 #include "featdefs.h"
22 #include "fpoint.h"
23 #include "mfoutline.h"
24 #include "ocrfeatures.h"
25 #include "params.h"
26 #include "trainingsample.h"
27
28 #include <cmath>
29 #include <cstdio>
30
31 namespace tesseract {
32
33 /*---------------------------------------------------------------------------
34 Variables
35 ----------------------------------------------------------------------------*/
36
37 double_VAR(classify_pico_feature_length, 0.05, "Pico Feature Length");
38
39 /*---------------------------------------------------------------------------
40 Private Function Prototypes
41 ----------------------------------------------------------------------------*/
42 void ConvertSegmentToPicoFeat(FPOINT *Start, FPOINT *End, FEATURE_SET FeatureSet);
43
44 void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet);
45
46 void NormalizePicoX(FEATURE_SET FeatureSet);
47
48 /*----------------------------------------------------------------------------
49 Public Code
50 ----------------------------------------------------------------------------*/
51 /*---------------------------------------------------------------------------*/
52 /**
53 * Operation: Dummy for now.
54 *
55 * Globals:
56 * - classify_norm_method normalization method currently specified
57 * @param Blob blob to extract pico-features from
58 * @return Pico-features for Blob.
59 */
60 FEATURE_SET Classify::ExtractPicoFeatures(TBLOB *Blob) {
61 auto FeatureSet = new FEATURE_SET_STRUCT(MAX_PICO_FEATURES);
62 auto Outlines = ConvertBlob(Blob);
63 float XScale, YScale;
64 NormalizeOutlines(Outlines, &XScale, &YScale);
65 auto RemainingOutlines = Outlines;
66 iterate(RemainingOutlines) {
67 auto Outline = static_cast<MFOUTLINE>(RemainingOutlines->first_node());
68 ConvertToPicoFeatures2(Outline, FeatureSet);
69 }
70 if (classify_norm_method == baseline) {
71 NormalizePicoX(FeatureSet);
72 }
73 FreeOutlines(Outlines);
74 return (FeatureSet);
75
76 } /* ExtractPicoFeatures */
77
78 /*----------------------------------------------------------------------------
79 Private Code
80 ----------------------------------------------------------------------------*/
81 /*---------------------------------------------------------------------------*/
82 /**
83 * This routine converts an entire segment of an outline
84 * into a set of pico features which are added to
85 * FeatureSet. The length of the segment is rounded to the
86 * nearest whole number of pico-features. The pico-features
87 * are spaced evenly over the entire segment.
88 * Results are placed in FeatureSet.
89 * Globals:
90 * - classify_pico_feature_length length of a single pico-feature
91 * @param Start starting point of pico-feature
92 * @param End ending point of pico-feature
93 * @param FeatureSet set to add pico-feature to
94 */
95 void ConvertSegmentToPicoFeat(FPOINT *Start, FPOINT *End, FEATURE_SET FeatureSet) {
96 float Angle;
97 float Length;
98 int NumFeatures;
99 FPOINT Center;
100 FPOINT Delta;
101 int i;
102
103 Angle = NormalizedAngleFrom(Start, End, 1.0);
104 Length = DistanceBetween(*Start, *End);
105 NumFeatures = static_cast<int>(floor(Length / classify_pico_feature_length + 0.5));
106 if (NumFeatures < 1) {
107 NumFeatures = 1;
108 }
109
110 /* compute vector for one pico feature */
111 Delta.x = XDelta(*Start, *End) / NumFeatures;
112 Delta.y = YDelta(*Start, *End) / NumFeatures;
113
114 /* compute position of first pico feature */
115 Center.x = Start->x + Delta.x / 2.0;
116 Center.y = Start->y + Delta.y / 2.0;
117
118 /* compute each pico feature in segment and add to feature set */
119 for (i = 0; i < NumFeatures; i++) {
120 auto Feature = new FEATURE_STRUCT(&PicoFeatDesc);
121 Feature->Params[PicoFeatDir] = Angle;
122 Feature->Params[PicoFeatX] = Center.x;
123 Feature->Params[PicoFeatY] = Center.y;
124 AddFeature(FeatureSet, Feature);
125
126 Center.x += Delta.x;
127 Center.y += Delta.y;
128 }
129 } /* ConvertSegmentToPicoFeat */
130
131 /*---------------------------------------------------------------------------*/
132 /**
133 * This routine steps through the specified outline and cuts it
134 * up into pieces of equal length. These pieces become the
135 * desired pico-features. Each segment in the outline
136 * is converted into an integral number of pico-features.
137 * Results are returned in FeatureSet.
138 *
139 * Globals:
140 * - classify_pico_feature_length length of features to be extracted
141 * @param Outline outline to extract micro-features from
142 * @param FeatureSet set of features to add pico-features to
143 */
144 void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet) {
145 MFOUTLINE Next;
146 MFOUTLINE First;
147 MFOUTLINE Current;
148
149 if (DegenerateOutline(Outline)) {
150 return;
151 }
152
153 First = Outline;
154 Current = First;
155 Next = NextPointAfter(Current);
156 do {
157 /* note that an edge is hidden if the ending point of the edge is
158 marked as hidden. This situation happens because the order of
159 the outlines is reversed when they are converted from the old
160 format. In the old format, a hidden edge is marked by the
161 starting point for that edge. */
162 if (!(PointAt(Next)->Hidden)) {
163 ConvertSegmentToPicoFeat(&(PointAt(Current)->Point), &(PointAt(Next)->Point), FeatureSet);
164 }
165
166 Current = Next;
167 Next = NextPointAfter(Current);
168 } while (Current != First);
169
170 } /* ConvertToPicoFeatures2 */
171
172 /*---------------------------------------------------------------------------*/
173 /**
174 * This routine computes the average x position over all
175 * of the pico-features in FeatureSet and then renormalizes
176 * the pico-features to force this average to be the x origin
177 * (i.e. x=0).
178 * FeatureSet is changed.
179 * @param FeatureSet pico-features to be normalized
180 */
181 void NormalizePicoX(FEATURE_SET FeatureSet) {
182 int i;
183 FEATURE Feature;
184 float Origin = 0.0;
185
186 for (i = 0; i < FeatureSet->NumFeatures; i++) {
187 Feature = FeatureSet->Features[i];
188 Origin += Feature->Params[PicoFeatX];
189 }
190 Origin /= FeatureSet->NumFeatures;
191
192 for (i = 0; i < FeatureSet->NumFeatures; i++) {
193 Feature = FeatureSet->Features[i];
194 Feature->Params[PicoFeatX] -= Origin;
195 }
196 } /* NormalizePicoX */
197
198 /*---------------------------------------------------------------------------*/
199 /**
200 * @param blob blob to extract features from
201 * @param fx_info
202 * @return Integer character-normalized features for blob.
203 */
204 FEATURE_SET Classify::ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) {
205 INT_FX_RESULT_STRUCT local_fx_info(fx_info);
206 std::vector<INT_FEATURE_STRUCT> bl_features;
207 tesseract::TrainingSample *sample =
208 tesseract::BlobToTrainingSample(blob, false, &local_fx_info, &bl_features);
209 if (sample == nullptr) {
210 return nullptr;
211 }
212
213 uint32_t num_features = sample->num_features();
214 const INT_FEATURE_STRUCT *features = sample->features();
215 auto feature_set = new FEATURE_SET_STRUCT(num_features);
216 for (uint32_t f = 0; f < num_features; ++f) {
217 auto feature = new FEATURE_STRUCT(&IntFeatDesc);
218 feature->Params[IntX] = features[f].X;
219 feature->Params[IntY] = features[f].Y;
220 feature->Params[IntDir] = features[f].Theta;
221 AddFeature(feature_set, feature);
222 }
223 delete sample;
224
225 return feature_set;
226 } /* ExtractIntCNFeatures */
227
228 /*---------------------------------------------------------------------------*/
229 /**
230 * @param blob blob to extract features from
231 * @param fx_info
232 * @return Geometric (top/bottom/width) features for blob.
233 */
234 FEATURE_SET Classify::ExtractIntGeoFeatures(const TBLOB &blob,
235 const INT_FX_RESULT_STRUCT &fx_info) {
236 INT_FX_RESULT_STRUCT local_fx_info(fx_info);
237 std::vector<INT_FEATURE_STRUCT> bl_features;
238 tesseract::TrainingSample *sample =
239 tesseract::BlobToTrainingSample(blob, false, &local_fx_info, &bl_features);
240 if (sample == nullptr) {
241 return nullptr;
242 }
243
244 auto feature_set = new FEATURE_SET_STRUCT(1);
245 auto feature = new FEATURE_STRUCT(&IntFeatDesc);
246
247 feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
248 feature->Params[GeoTop] = sample->geo_feature(GeoTop);
249 feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
250 AddFeature(feature_set, feature);
251 delete sample;
252
253 return feature_set;
254 } /* ExtractIntGeoFeatures */
255
256 } // namespace tesseract.