comparison mupdf-source/thirdparty/tesseract/src/training/mftraining.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************************
2 ** Filename: mftraining.cpp
3 ** Purpose: Separates training pages into files for each character.
4 ** Strips from files only the features and their parameters of
5 ** the feature type mf.
6 ** Author: Dan Johnson
7 ** Revisment: Christy Russon
8 **
9 ** (c) Copyright Hewlett-Packard Company, 1988.
10 ** Licensed under the Apache License, Version 2.0 (the "License");
11 ** you may not use this file except in compliance with the License.
12 ** You may obtain a copy of the License at
13 ** http://www.apache.org/licenses/LICENSE-2.0
14 ** Unless required by applicable law or agreed to in writing, software
15 ** distributed under the License is distributed on an "AS IS" BASIS,
16 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 ** See the License for the specific language governing permissions and
18 ** limitations under the License.
19 ******************************************************************************/
20 /*----------------------------------------------------------------------------
21 Include Files and Type Defines
22 ----------------------------------------------------------------------------*/
23
24 #define _USE_MATH_DEFINES // for M_PI
25 #ifdef HAVE_CONFIG_H
26 # include "config_auto.h"
27 #endif
28
29 #include <cmath> // for M_PI
30 #include <cstdio>
31 #include <cstring>
32
33 #include "classify.h"
34 #include "cluster.h"
35 #include "clusttool.h"
36 #include "commontraining.h"
37 #include "featdefs.h"
38 #include "fontinfo.h"
39 #include "indexmapbidi.h"
40 #include "intproto.h"
41 #include "mastertrainer.h"
42 #include "mergenf.h"
43 #include "mf.h"
44 #include "ocrfeatures.h"
45 #include "oldlist.h"
46 #include "protos.h"
47 #include "shapetable.h"
48 #include "tprintf.h"
49 #include "unicity_table.h"
50
51 using namespace tesseract;
52
53 /*----------------------------------------------------------------------------
54 Public Code
55 -----------------------------------------------------------------------------*/
56 #ifndef GRAPHICS_DISABLED
57 static void DisplayProtoList(const char *ch, LIST protolist) {
58 auto window = std::make_unique<ScrollView>("Char samples", 50, 200, 520, 520, 260, 260, true);
59 LIST proto = protolist;
60 iterate(proto) {
61 auto *prototype = reinterpret_cast<PROTOTYPE *>(proto->first_node());
62 if (prototype->Significant) {
63 window->Pen(ScrollView::GREEN);
64 } else if (prototype->NumSamples == 0) {
65 window->Pen(ScrollView::BLUE);
66 } else if (prototype->Merged) {
67 window->Pen(ScrollView::MAGENTA);
68 } else {
69 window->Pen(ScrollView::RED);
70 }
71 float x = CenterX(prototype->Mean);
72 float y = CenterY(prototype->Mean);
73 double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
74 auto dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
75 auto dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
76 window->SetCursor((x - dx) * 256, (y - dy) * 256);
77 window->DrawTo((x + dx) * 256, (y + dy) * 256);
78 auto prototypeNumSamples = prototype->NumSamples;
79 if (prototype->Significant) {
80 tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototypeNumSamples);
81 } else if (prototype->NumSamples > 0 && !prototype->Merged) {
82 tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototypeNumSamples);
83 }
84 }
85 window->Update();
86 }
87 #endif // !GRAPHICS_DISABLED
88
89 // Helper to run clustering on a single config.
90 // Mostly copied from the old mftraining, but with renamed variables.
91 static LIST ClusterOneConfig(int shape_id, const char *class_label, LIST mf_classes,
92 const ShapeTable &shape_table, MasterTrainer *trainer) {
93 int num_samples;
94 CLUSTERER *clusterer =
95 trainer->SetupForClustering(shape_table, feature_defs, shape_id, &num_samples);
96 Config.MagicSamples = num_samples;
97 LIST proto_list = ClusterSamples(clusterer, &Config);
98 CleanUpUnusedData(proto_list);
99
100 // Merge protos where reasonable to make more of them significant by
101 // representing almost all samples of the class/font.
102 MergeInsignificantProtos(proto_list, class_label, clusterer, &Config);
103 #ifndef GRAPHICS_DISABLED
104 if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0) {
105 DisplayProtoList(FLAGS_test_ch.c_str(), proto_list);
106 }
107 #endif // !GRAPHICS_DISABLED
108 // Delete the protos that will not be used in the inttemp output file.
109 proto_list = RemoveInsignificantProtos(proto_list, true, false, clusterer->SampleSize);
110 FreeClusterer(clusterer);
111 MERGE_CLASS merge_class = FindClass(mf_classes, class_label);
112 if (merge_class == nullptr) {
113 merge_class = new MERGE_CLASS_NODE(class_label);
114 mf_classes = push(mf_classes, merge_class);
115 }
116 int config_id = AddConfigToClass(merge_class->Class);
117 merge_class->Class->font_set.push_back(shape_id);
118 LIST proto_it = proto_list;
119 iterate(proto_it) {
120 auto *prototype = reinterpret_cast<PROTOTYPE *>(proto_it->first_node());
121 // See if proto can be approximated by existing proto.
122 int p_id = FindClosestExistingProto(merge_class->Class, merge_class->NumMerged, prototype);
123 if (p_id == NO_PROTO) {
124 // Need to make a new proto, as it doesn't match anything.
125 p_id = AddProtoToClass(merge_class->Class);
126 MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype);
127 merge_class->NumMerged[p_id] = 1;
128 } else {
129 PROTO_STRUCT dummy_proto;
130 MakeNewFromOld(&dummy_proto, prototype);
131 // Merge with the similar proto.
132 ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto,
133 static_cast<float>(merge_class->NumMerged[p_id]), 1.0,
134 ProtoIn(merge_class->Class, p_id));
135 merge_class->NumMerged[p_id]++;
136 }
137 AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]);
138 }
139 FreeProtoList(&proto_list);
140 return mf_classes;
141 }
142
143 // Helper to setup the config map.
144 // Setup an index mapping from the shapes in the shape table to the classes
145 // that will be trained. In keeping with the original design, each shape
146 // with the same list of unichars becomes a different class and the configs
147 // represent the different combinations of fonts.
148 static void SetupConfigMap(ShapeTable *shape_table, IndexMapBiDi *config_map) {
149 int num_configs = shape_table->NumShapes();
150 config_map->Init(num_configs, true);
151 config_map->Setup();
152 for (int c1 = 0; c1 < num_configs; ++c1) {
153 // Only process ids that are not already merged.
154 if (config_map->SparseToCompact(c1) == c1) {
155 Shape *shape1 = shape_table->MutableShape(c1);
156 // Find all the subsequent shapes that are equal.
157 for (int c2 = c1 + 1; c2 < num_configs; ++c2) {
158 if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) {
159 config_map->Merge(c1, c2);
160 }
161 }
162 }
163 }
164 config_map->CompleteMerges();
165 }
166
167 /**
168 * This program reads in a text file consisting of feature
169 * samples from a training page in the following format:
170 * @verbatim
171 FontName UTF8-char-str xmin ymin xmax ymax page-number
172 NumberOfFeatureTypes(N)
173 FeatureTypeName1 NumberOfFeatures(M)
174 Feature1
175 ...
176 FeatureM
177 FeatureTypeName2 NumberOfFeatures(M)
178 Feature1
179 ...
180 FeatureM
181 ...
182 FeatureTypeNameN NumberOfFeatures(M)
183 Feature1
184 ...
185 FeatureM
186 FontName CharName ...
187 @endverbatim
188 * The result of this program is a binary inttemp file used by
189 * the OCR engine.
190 * @param argc number of command line arguments
191 * @param argv array of command line arguments
192 * @return 0 if no error occurred
193 */
194 int main(int argc, char **argv) {
195 tesseract::CheckSharedLibraryVersion();
196
197 ParseArguments(&argc, &argv);
198
199 ShapeTable *shape_table = nullptr;
200 std::string file_prefix;
201 // Load the training data.
202 auto trainer = tesseract::LoadTrainingData(argv + 1, false, &shape_table, file_prefix);
203 if (trainer == nullptr) {
204 return EXIT_FAILURE; // Failed.
205 }
206
207 // Setup an index mapping from the shapes in the shape table to the classes
208 // that will be trained. In keeping with the original design, each shape
209 // with the same list of unichars becomes a different class and the configs
210 // represent the different combinations of fonts.
211 IndexMapBiDi config_map;
212 SetupConfigMap(shape_table, &config_map);
213
214 WriteShapeTable(file_prefix, *shape_table);
215 // If the shape_table is flat, then either we didn't run shape clustering, or
216 // it did nothing, so we just output the trainer's unicharset.
217 // Otherwise shape_set will hold a fake unicharset with an entry for each
218 // shape in the shape table, and we will output that instead.
219 UNICHARSET shape_set;
220 const UNICHARSET *unicharset = &trainer->unicharset();
221 // If we ran shapeclustering (and it worked) then at least one shape will
222 // have multiple unichars, so we have to build a fake unicharset.
223 if (shape_table->AnyMultipleUnichars()) {
224 unicharset = &shape_set;
225 // Now build a fake unicharset for the compact shape space to keep the
226 // output modules happy that we are doing things correctly.
227 int num_shapes = config_map.CompactSize();
228 for (int s = 0; s < num_shapes; ++s) {
229 char shape_label[14];
230 snprintf(shape_label, sizeof(shape_label), "sh%04d", s);
231 shape_set.unichar_insert(shape_label);
232 }
233 }
234
235 // Now train each config separately.
236 int num_configs = shape_table->NumShapes();
237 LIST mf_classes = NIL_LIST;
238 for (int s = 0; s < num_configs; ++s) {
239 int unichar_id, font_id;
240 if (unicharset == &shape_set) {
241 // Using fake unichar_ids from the config_map/shape_set.
242 unichar_id = config_map.SparseToCompact(s);
243 } else {
244 // Get the real unichar_id from the shape table/unicharset.
245 shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
246 }
247 const char *class_label = unicharset->id_to_unichar(unichar_id);
248 mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer.get());
249 }
250 std::string inttemp_file = file_prefix;
251 inttemp_file += "inttemp";
252 std::string pffmtable_file = std::move(file_prefix);
253 pffmtable_file += "pffmtable";
254 CLASS_STRUCT *float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
255 // Now write the inttemp and pffmtable.
256 trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes,
257 inttemp_file.c_str(), pffmtable_file.c_str());
258 for (size_t c = 0; c < unicharset->size(); ++c) {
259 FreeClassFields(&float_classes[c]);
260 }
261 delete[] float_classes;
262 FreeLabeledClassList(mf_classes);
263 delete shape_table;
264 printf("Done!\n");
265 if (!FLAGS_test_ch.empty()) {
266 // If we are displaying debug window(s), wait for the user to look at them.
267 printf("Hit return to exit...\n");
268 while (getchar() != '\n') {
269 ;
270 }
271 }
272 return EXIT_SUCCESS;
273 } /* main */