comparison mupdf-source/thirdparty/tesseract/src/training/cntraining.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revisment: Christy Russon
6 **
7 ** (c) Copyright Hewlett-Packard Company, 1988.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 ******************************************************************************/
18
19 /*----------------------------------------------------------------------------
20 Include Files and Type Defines
21 ----------------------------------------------------------------------------*/
22 #include <tesseract/unichar.h>
23 #include <cmath>
24 #include <cstdio>
25 #include <cstring>
26 #include "cluster.h"
27 #include "clusttool.h"
28 #include "commontraining.h"
29 #include "featdefs.h"
30 #include "ocrfeatures.h"
31 #include "oldlist.h"
32
33 #define PROGRAM_FEATURE_TYPE "cn"
34
35 using namespace tesseract;
36
37 /*----------------------------------------------------------------------------
38 Private Function Prototypes
39 ----------------------------------------------------------------------------*/
40
41 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
42 const FEATURE_DESC_STRUCT *feature_desc);
43
44 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
45 bool WriteInsigProtos);
46
47 /*----------------------------------------------------------------------------
48 Global Data Definitions and Declarations
49 ----------------------------------------------------------------------------*/
50 /* global variable to hold configuration parameters to control clustering */
51 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
52 static const CLUSTERCONFIG CNConfig = {elliptical, 0.025, 0.05, 0.8, 1e-3, 0};
53
54 /*----------------------------------------------------------------------------
55 Public Code
56 ----------------------------------------------------------------------------*/
57
58 /**
59 * This program reads in a text file consisting of feature
60 * samples from a training page in the following format:
61 * @verbatim
62 FontName CharName NumberOfFeatureTypes(N)
63 FeatureTypeName1 NumberOfFeatures(M)
64 Feature1
65 ...
66 FeatureM
67 FeatureTypeName2 NumberOfFeatures(M)
68 Feature1
69 ...
70 FeatureM
71 ...
72 FeatureTypeNameN NumberOfFeatures(M)
73 Feature1
74 ...
75 FeatureM
76 FontName CharName ...
77 @endverbatim
78 * It then appends these samples into a separate file for each
79 * character. The name of the file is
80 *
81 * DirectoryName/FontName/CharName.FeatureTypeName
82 *
83 * The DirectoryName can be specified via a command
84 * line argument. If not specified, it defaults to the
85 * current directory. The format of the resulting files is:
86 * @verbatim
87 NumberOfFeatures(M)
88 Feature1
89 ...
90 FeatureM
91 NumberOfFeatures(M)
92 ...
93 @endverbatim
94 * The output files each have a header which describes the
95 * type of feature which the file contains. This header is
96 * in the format required by the clusterer. A command line
97 * argument can also be used to specify that only the first
98 * N samples of each class should be used.
99 * @param argc number of command line arguments
100 * @param argv array of command line arguments
101 * @return 0 on success
102 */
103 int main(int argc, char *argv[]) {
104 tesseract::CheckSharedLibraryVersion();
105
106 // Set the global Config parameters before parsing the command line.
107 Config = CNConfig;
108
109 LIST CharList = NIL_LIST;
110 CLUSTERER *Clusterer = nullptr;
111 LIST ProtoList = NIL_LIST;
112 LIST NormProtoList = NIL_LIST;
113 LIST pCharList;
114 LABELEDLIST CharSample;
115 FEATURE_DEFS_STRUCT FeatureDefs;
116 InitFeatureDefs(&FeatureDefs);
117
118 ParseArguments(&argc, &argv);
119 #if !defined(NDEBUG)
120 int num_fonts = 0;
121 #endif
122 for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) {
123 printf("Reading %s ...\n", PageName);
124 FILE *TrainingPage = fopen(PageName, "rb");
125 ASSERT_HOST(TrainingPage);
126 if (TrainingPage) {
127 ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage, &CharList);
128 fclose(TrainingPage);
129 #if !defined(NDEBUG)
130 ++num_fonts;
131 #endif
132 }
133 }
134 printf("Clustering ...\n");
135 // To allow an individual font to form a separate cluster,
136 // reduce the min samples:
137 // Config.MinSamples = 0.5 / num_fonts;
138 pCharList = CharList;
139 // The norm protos will count the source protos, so we keep them here in
140 // freeable_protos, so they can be freed later.
141 std::vector<LIST> freeable_protos;
142 iterate(pCharList) {
143 // Cluster
144 CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node());
145 Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
146 if (Clusterer == nullptr) { // To avoid a SIGSEGV
147 fprintf(stderr, "Error: nullptr clusterer!\n");
148 return EXIT_FAILURE;
149 }
150 float SavedMinSamples = Config.MinSamples;
151 // To disable the tendency to produce a single cluster for all fonts,
152 // make MagicSamples an impossible to achieve number:
153 // Config.MagicSamples = CharSample->SampleCount * 10;
154 Config.MagicSamples = CharSample->SampleCount;
155 while (Config.MinSamples > 0.001) {
156 ProtoList = ClusterSamples(Clusterer, &Config);
157 if (NumberOfProtos(ProtoList, true, false) > 0) {
158 break;
159 } else {
160 Config.MinSamples *= 0.95;
161 printf(
162 "0 significant protos for %s."
163 " Retrying clustering with MinSamples = %f%%\n",
164 CharSample->Label.c_str(), Config.MinSamples);
165 }
166 }
167 Config.MinSamples = SavedMinSamples;
168 AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
169 freeable_protos.push_back(ProtoList);
170 FreeClusterer(Clusterer);
171 }
172 FreeTrainingSamples(CharList);
173 int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
174 WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]);
175 FreeNormProtoList(NormProtoList);
176 for (auto &freeable_proto : freeable_protos) {
177 FreeProtoList(&freeable_proto);
178 }
179 printf("\n");
180 return EXIT_SUCCESS;
181 } // main
182
183 /*----------------------------------------------------------------------------
184 Private Code
185 ----------------------------------------------------------------------------*/
186
187 /*----------------------------------------------------------------------------*/
188 /**
189 * This routine writes the specified samples into files which
190 * are organized according to the font name and character name
191 * of the samples.
192 * @param Directory directory to place sample files into
193 * @param LabeledProtoList List of labeled protos
194 * @param feature_desc Description of the features
195 */
196 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
197 const FEATURE_DESC_STRUCT *feature_desc) {
198 FILE *File;
199 LABELEDLIST LabeledProto;
200 int N;
201
202 std::string Filename = "";
203 if (Directory != nullptr && Directory[0] != '\0') {
204 Filename += Directory;
205 Filename += "/";
206 }
207 Filename += "normproto";
208 printf("\nWriting %s ...", Filename.c_str());
209 File = fopen(Filename.c_str(), "wb");
210 ASSERT_HOST(File);
211 fprintf(File, "%0d\n", feature_desc->NumParams);
212 WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
213 iterate(LabeledProtoList) {
214 LabeledProto = reinterpret_cast<LABELEDLIST>(LabeledProtoList->first_node());
215 N = NumberOfProtos(LabeledProto->List, true, false);
216 if (N < 1) {
217 printf(
218 "\nError! Not enough protos for %s: %d protos"
219 " (%d significant protos"
220 ", %d insignificant protos)\n",
221 LabeledProto->Label.c_str(), N, NumberOfProtos(LabeledProto->List, true, false),
222 NumberOfProtos(LabeledProto->List, false, true));
223 exit(1);
224 }
225 fprintf(File, "\n%s %d\n", LabeledProto->Label.c_str(), N);
226 WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
227 }
228 fclose(File);
229
230 } // WriteNormProtos
231
232 /*-------------------------------------------------------------------------*/
233
234 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
235 bool WriteInsigProtos) {
236 PROTOTYPE *Proto;
237
238 // write prototypes
239 iterate(ProtoList) {
240 Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
241 if ((Proto->Significant && WriteSigProtos) || (!Proto->Significant && WriteInsigProtos)) {
242 WritePrototype(File, N, Proto);
243 }
244 }
245 } // WriteProtos