comparison mupdf-source/thirdparty/tesseract/src/classify/normmatch.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************************
2 ** Filename: normmatch.cpp
3 ** Purpose: Simple matcher based on character normalization features.
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 ******************************************************************************/
17 /*----------------------------------------------------------------------------
18 Include Files and Type Defines
19 ----------------------------------------------------------------------------*/
20 #include "normmatch.h"
21
22 #include "classify.h"
23 #include "clusttool.h"
24 #include "helpers.h"
25 #include "normfeat.h"
26 #include "params.h"
27 #include "unicharset.h"
28
29 #include <cmath>
30 #include <cstdio>
31 #include <sstream> // for std::istringstream
32
33 namespace tesseract {
34
35 struct NORM_PROTOS {
36 NORM_PROTOS(size_t n) : NumProtos(n), Protos(n) {
37 }
38 int NumParams = 0;
39 int NumProtos;
40 PARAM_DESC *ParamDesc = nullptr;
41 std::vector<LIST> Protos;
42 };
43
44 /*----------------------------------------------------------------------------
45 Private Code
46 ----------------------------------------------------------------------------*/
47
48 /**
49 * @name NormEvidenceOf
50 *
51 * Return the new type of evidence number corresponding to this
52 * normalization adjustment. The equation that represents the transform is:
53 * 1 / (1 + (NormAdj / midpoint) ^ curl)
54 */
55 static float NormEvidenceOf(float NormAdj) {
56 NormAdj /= static_cast<float>(classify_norm_adj_midpoint);
57
58 if (classify_norm_adj_curl == 3) {
59 NormAdj = NormAdj * NormAdj * NormAdj;
60 } else if (classify_norm_adj_curl == 2) {
61 NormAdj = NormAdj * NormAdj;
62 } else {
63 NormAdj = std::pow(NormAdj, static_cast<float>(classify_norm_adj_curl));
64 }
65 return (1 / (1 + NormAdj));
66 }
67
68 /*----------------------------------------------------------------------------
69 Variables
70 ----------------------------------------------------------------------------*/
71
72 /** control knobs used to control the normalization adjustment process */
73 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
74 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
75 /** Weight of width variance against height and vertical position. */
76 const float kWidthErrorWeighting = 0.125f;
77
78 /*----------------------------------------------------------------------------
79 Public Code
80 ----------------------------------------------------------------------------*/
81 /**
82 * This routine compares Features against each character
83 * normalization proto for ClassId and returns the match
84 * rating of the best match.
85 * @param ClassId id of class to match against
86 * @param feature character normalization feature
87 * @param DebugMatch controls dump of debug info
88 *
89 * Globals:
90 * #NormProtos character normalization prototypes
91 *
92 * @return Best match rating for Feature against protos of ClassId.
93 */
94 float Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) {
95 if (ClassId >= NormProtos->NumProtos) {
96 ClassId = NO_CLASS;
97 }
98
99 /* handle requests for classification as noise */
100 if (ClassId == NO_CLASS) {
101 /* kludge - clean up constants and make into control knobs later */
102 float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f +
103 feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f +
104 feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f);
105 return (1 - NormEvidenceOf(Match));
106 }
107
108 if (DebugMatch) {
109 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
110 }
111
112 LIST Protos = NormProtos->Protos[ClassId];
113 if (Protos == nullptr) {
114 // Avoid FP overflow in NormEvidenceOf.
115 return 1.0f;
116 }
117
118 float BestMatch = FLT_MAX;
119 iterate(Protos) {
120 auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node());
121 float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
122 float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
123 if (DebugMatch) {
124 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormY], Delta,
125 Proto->Weight.Elliptical[CharNormY], Match);
126 }
127 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
128 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
129 if (DebugMatch) {
130 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormRx], Delta,
131 Proto->Weight.Elliptical[CharNormRx], Match);
132 }
133 // Ry is width! See intfx.cpp.
134 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
135 if (DebugMatch) {
136 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", Proto->Mean[CharNormRy], Delta,
137 Proto->Weight.Elliptical[CharNormRy]);
138 }
139 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
140 Delta *= kWidthErrorWeighting;
141 Match += Delta;
142 if (DebugMatch) {
143 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", Match,
144 Match / classify_norm_adj_midpoint, NormEvidenceOf(Match),
145 256 * (1 - NormEvidenceOf(Match)));
146 }
147
148 if (Match < BestMatch) {
149 BestMatch = Match;
150 }
151 }
152 return 1 - NormEvidenceOf(BestMatch);
153 } /* ComputeNormMatch */
154
155 void Classify::FreeNormProtos() {
156 if (NormProtos != nullptr) {
157 for (int i = 0; i < NormProtos->NumProtos; i++) {
158 FreeProtoList(&NormProtos->Protos[i]);
159 }
160 delete[] NormProtos->ParamDesc;
161 delete NormProtos;
162 NormProtos = nullptr;
163 }
164 }
165
166 /**
167 * This routine allocates a new data structure to hold
168 * a set of character normalization protos. It then fills in
169 * the data structure by reading from the specified File.
170 * @param fp open text file to read normalization protos from
171 * Globals: none
172 * @return Character normalization protos.
173 */
174 NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
175 char unichar[2 * UNICHAR_LEN + 1];
176 UNICHAR_ID unichar_id;
177 LIST Protos;
178 int NumProtos;
179
180 /* allocate and initialization data structure */
181 auto NormProtos = new NORM_PROTOS(unicharset.size());
182
183 /* read file header and save in data structure */
184 NormProtos->NumParams = ReadSampleSize(fp);
185 NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams);
186
187 /* read protos for each class into a separate list */
188 const int kMaxLineSize = 100;
189 char line[kMaxLineSize];
190 while (fp->FGets(line, kMaxLineSize) != nullptr) {
191 std::istringstream stream(line);
192 stream.imbue(std::locale::classic());
193 stream >> unichar >> NumProtos;
194 if (stream.fail()) {
195 continue;
196 }
197 if (unicharset.contains_unichar(unichar)) {
198 unichar_id = unicharset.unichar_to_id(unichar);
199 Protos = NormProtos->Protos[unichar_id];
200 for (int i = 0; i < NumProtos; i++) {
201 Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
202 }
203 NormProtos->Protos[unichar_id] = Protos;
204 } else {
205 tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar);
206 for (int i = 0; i < NumProtos; i++) {
207 FreePrototype(ReadPrototype(fp, NormProtos->NumParams));
208 }
209 }
210 }
211 return NormProtos;
212 } /* ReadNormProtos */
213
214 } // namespace tesseract