Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/mftraining.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 ** Filename: mftraining.cpp | |
| 3 ** Purpose: Separates training pages into files for each character. | |
| 4 ** Strips from files only the features and their parameters of | |
| 5 ** the feature type mf. | |
| 6 ** Author: Dan Johnson | |
| 7 ** Revision: Christy Russon | |
| 8 ** | |
| 9 ** (c) Copyright Hewlett-Packard Company, 1988. | |
| 10 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 11 ** you may not use this file except in compliance with the License. | |
| 12 ** You may obtain a copy of the License at | |
| 13 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 14 ** Unless required by applicable law or agreed to in writing, software | |
| 15 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 16 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 17 ** See the License for the specific language governing permissions and | |
| 18 ** limitations under the License. | |
| 19 ******************************************************************************/ | |
| 20 /*---------------------------------------------------------------------------- | |
| 21 Include Files and Type Defines | |
| 22 ----------------------------------------------------------------------------*/ | |
| 23 | |
| 24 #define _USE_MATH_DEFINES // for M_PI | |
| 25 #ifdef HAVE_CONFIG_H | |
| 26 # include "config_auto.h" | |
| 27 #endif | |
| 28 | |
| 29 #include <cmath> // for M_PI | |
| 30 #include <cstdio> | |
| 31 #include <cstring> | |
| 32 | |
| 33 #include "classify.h" | |
| 34 #include "cluster.h" | |
| 35 #include "clusttool.h" | |
| 36 #include "commontraining.h" | |
| 37 #include "featdefs.h" | |
| 38 #include "fontinfo.h" | |
| 39 #include "indexmapbidi.h" | |
| 40 #include "intproto.h" | |
| 41 #include "mastertrainer.h" | |
| 42 #include "mergenf.h" | |
| 43 #include "mf.h" | |
| 44 #include "ocrfeatures.h" | |
| 45 #include "oldlist.h" | |
| 46 #include "protos.h" | |
| 47 #include "shapetable.h" | |
| 48 #include "tprintf.h" | |
| 49 #include "unicity_table.h" | |
| 50 | |
| 51 using namespace tesseract; | |
| 52 | |
| 53 /*---------------------------------------------------------------------------- | |
| 54 Public Code | |
| 55 -----------------------------------------------------------------------------*/ | |
| 56 #ifndef GRAPHICS_DISABLED | |
| 57 static void DisplayProtoList(const char *ch, LIST protolist) { | |
| 58 auto window = std::make_unique<ScrollView>("Char samples", 50, 200, 520, 520, 260, 260, true); | |
| 59 LIST proto = protolist; | |
| 60 iterate(proto) { | |
| 61 auto *prototype = reinterpret_cast<PROTOTYPE *>(proto->first_node()); | |
| 62 if (prototype->Significant) { | |
| 63 window->Pen(ScrollView::GREEN); | |
| 64 } else if (prototype->NumSamples == 0) { | |
| 65 window->Pen(ScrollView::BLUE); | |
| 66 } else if (prototype->Merged) { | |
| 67 window->Pen(ScrollView::MAGENTA); | |
| 68 } else { | |
| 69 window->Pen(ScrollView::RED); | |
| 70 } | |
| 71 float x = CenterX(prototype->Mean); | |
| 72 float y = CenterY(prototype->Mean); | |
| 73 double angle = OrientationOf(prototype->Mean) * 2 * M_PI; | |
| 74 auto dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2); | |
| 75 auto dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2); | |
| 76 window->SetCursor((x - dx) * 256, (y - dy) * 256); | |
| 77 window->DrawTo((x + dx) * 256, (y + dy) * 256); | |
| 78 auto prototypeNumSamples = prototype->NumSamples; | |
| 79 if (prototype->Significant) { | |
| 80 tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototypeNumSamples); | |
| 81 } else if (prototype->NumSamples > 0 && !prototype->Merged) { | |
| 82 tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototypeNumSamples); | |
| 83 } | |
| 84 } | |
| 85 window->Update(); | |
| 86 } | |
| 87 #endif // !GRAPHICS_DISABLED | |
| 88 | |
| 89 // Helper to run clustering on a single config. | |
| 90 // Mostly copied from the old mftraining, but with renamed variables. | |
| 91 static LIST ClusterOneConfig(int shape_id, const char *class_label, LIST mf_classes, | |
| 92 const ShapeTable &shape_table, MasterTrainer *trainer) { | |
| 93 int num_samples; | |
| 94 CLUSTERER *clusterer = | |
| 95 trainer->SetupForClustering(shape_table, feature_defs, shape_id, &num_samples); | |
| 96 Config.MagicSamples = num_samples; | |
| 97 LIST proto_list = ClusterSamples(clusterer, &Config); | |
| 98 CleanUpUnusedData(proto_list); | |
| 99 | |
| 100 // Merge protos where reasonable to make more of them significant by | |
| 101 // representing almost all samples of the class/font. | |
| 102 MergeInsignificantProtos(proto_list, class_label, clusterer, &Config); | |
| 103 #ifndef GRAPHICS_DISABLED | |
| 104 if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0) { | |
| 105 DisplayProtoList(FLAGS_test_ch.c_str(), proto_list); | |
| 106 } | |
| 107 #endif // !GRAPHICS_DISABLED | |
| 108 // Delete the protos that will not be used in the inttemp output file. | |
| 109 proto_list = RemoveInsignificantProtos(proto_list, true, false, clusterer->SampleSize); | |
| 110 FreeClusterer(clusterer); | |
| 111 MERGE_CLASS merge_class = FindClass(mf_classes, class_label); | |
| 112 if (merge_class == nullptr) { | |
| 113 merge_class = new MERGE_CLASS_NODE(class_label); | |
| 114 mf_classes = push(mf_classes, merge_class); | |
| 115 } | |
| 116 int config_id = AddConfigToClass(merge_class->Class); | |
| 117 merge_class->Class->font_set.push_back(shape_id); | |
| 118 LIST proto_it = proto_list; | |
| 119 iterate(proto_it) { | |
| 120 auto *prototype = reinterpret_cast<PROTOTYPE *>(proto_it->first_node()); | |
| 121 // See if proto can be approximated by existing proto. | |
| 122 int p_id = FindClosestExistingProto(merge_class->Class, merge_class->NumMerged, prototype); | |
| 123 if (p_id == NO_PROTO) { | |
| 124 // Need to make a new proto, as it doesn't match anything. | |
| 125 p_id = AddProtoToClass(merge_class->Class); | |
| 126 MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype); | |
| 127 merge_class->NumMerged[p_id] = 1; | |
| 128 } else { | |
| 129 PROTO_STRUCT dummy_proto; | |
| 130 MakeNewFromOld(&dummy_proto, prototype); | |
| 131 // Merge with the similar proto. | |
| 132 ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto, | |
| 133 static_cast<float>(merge_class->NumMerged[p_id]), 1.0, | |
| 134 ProtoIn(merge_class->Class, p_id)); | |
| 135 merge_class->NumMerged[p_id]++; | |
| 136 } | |
| 137 AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]); | |
| 138 } | |
| 139 FreeProtoList(&proto_list); | |
| 140 return mf_classes; | |
| 141 } | |
| 142 | |
| 143 // Helper to setup the config map. | |
| 144 // Setup an index mapping from the shapes in the shape table to the classes | |
| 145 // that will be trained. In keeping with the original design, each shape | |
| 146 // with the same list of unichars becomes a different class and the configs | |
| 147 // represent the different combinations of fonts. | |
| 148 static void SetupConfigMap(ShapeTable *shape_table, IndexMapBiDi *config_map) { | |
| 149 int num_configs = shape_table->NumShapes(); | |
| 150 config_map->Init(num_configs, true); | |
| 151 config_map->Setup(); | |
| 152 for (int c1 = 0; c1 < num_configs; ++c1) { | |
| 153 // Only process ids that are not already merged. | |
| 154 if (config_map->SparseToCompact(c1) == c1) { | |
| 155 Shape *shape1 = shape_table->MutableShape(c1); | |
| 156 // Find all the subsequent shapes that are equal. | |
| 157 for (int c2 = c1 + 1; c2 < num_configs; ++c2) { | |
| 158 if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) { | |
| 159 config_map->Merge(c1, c2); | |
| 160 } | |
| 161 } | |
| 162 } | |
| 163 } | |
| 164 config_map->CompleteMerges(); | |
| 165 } | |
| 166 | |
| 167 /** | |
| 168 * This program reads in a text file consisting of feature | |
| 169 * samples from a training page in the following format: | |
| 170 * @verbatim | |
| 171 FontName UTF8-char-str xmin ymin xmax ymax page-number | |
| 172 NumberOfFeatureTypes(N) | |
| 173 FeatureTypeName1 NumberOfFeatures(M) | |
| 174 Feature1 | |
| 175 ... | |
| 176 FeatureM | |
| 177 FeatureTypeName2 NumberOfFeatures(M) | |
| 178 Feature1 | |
| 179 ... | |
| 180 FeatureM | |
| 181 ... | |
| 182 FeatureTypeNameN NumberOfFeatures(M) | |
| 183 Feature1 | |
| 184 ... | |
| 185 FeatureM | |
| 186 FontName CharName ... | |
| 187 @endverbatim | |
| 188 * The result of this program is a binary inttemp file used by | |
| 189 * the OCR engine. | |
| 190 * @param argc number of command line arguments | |
| 191 * @param argv array of command line arguments | |
| 192 * @return 0 if no error occurred | |
| 193 */ | |
| 194 int main(int argc, char **argv) { | |
| 195 tesseract::CheckSharedLibraryVersion(); | |
| 196 | |
| 197 ParseArguments(&argc, &argv); | |
| 198 | |
| 199 ShapeTable *shape_table = nullptr; | |
| 200 std::string file_prefix; | |
| 201 // Load the training data. | |
| 202 auto trainer = tesseract::LoadTrainingData(argv + 1, false, &shape_table, file_prefix); | |
| 203 if (trainer == nullptr) { | |
| 204 return EXIT_FAILURE; // Failed. | |
| 205 } | |
| 206 | |
| 207 // Setup an index mapping from the shapes in the shape table to the classes | |
| 208 // that will be trained. In keeping with the original design, each shape | |
| 209 // with the same list of unichars becomes a different class and the configs | |
| 210 // represent the different combinations of fonts. | |
| 211 IndexMapBiDi config_map; | |
| 212 SetupConfigMap(shape_table, &config_map); | |
| 213 | |
| 214 WriteShapeTable(file_prefix, *shape_table); | |
| 215 // If the shape_table is flat, then either we didn't run shape clustering, or | |
| 216 // it did nothing, so we just output the trainer's unicharset. | |
| 217 // Otherwise shape_set will hold a fake unicharset with an entry for each | |
| 218 // shape in the shape table, and we will output that instead. | |
| 219 UNICHARSET shape_set; | |
| 220 const UNICHARSET *unicharset = &trainer->unicharset(); | |
| 221 // If we ran shapeclustering (and it worked) then at least one shape will | |
| 222 // have multiple unichars, so we have to build a fake unicharset. | |
| 223 if (shape_table->AnyMultipleUnichars()) { | |
| 224 unicharset = &shape_set; | |
| 225 // Now build a fake unicharset for the compact shape space to keep the | |
| 226 // output modules happy that we are doing things correctly. | |
| 227 int num_shapes = config_map.CompactSize(); | |
| 228 for (int s = 0; s < num_shapes; ++s) { | |
| 229 char shape_label[14]; | |
| 230 snprintf(shape_label, sizeof(shape_label), "sh%04d", s); | |
| 231 shape_set.unichar_insert(shape_label); | |
| 232 } | |
| 233 } | |
| 234 | |
| 235 // Now train each config separately. | |
| 236 int num_configs = shape_table->NumShapes(); | |
| 237 LIST mf_classes = NIL_LIST; | |
| 238 for (int s = 0; s < num_configs; ++s) { | |
| 239 int unichar_id, font_id; | |
| 240 if (unicharset == &shape_set) { | |
| 241 // Using fake unichar_ids from the config_map/shape_set. | |
| 242 unichar_id = config_map.SparseToCompact(s); | |
| 243 } else { | |
| 244 // Get the real unichar_id from the shape table/unicharset. | |
| 245 shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id); | |
| 246 } | |
| 247 const char *class_label = unicharset->id_to_unichar(unichar_id); | |
| 248 mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer.get()); | |
| 249 } | |
| 250 std::string inttemp_file = file_prefix; | |
| 251 inttemp_file += "inttemp"; | |
| 252 std::string pffmtable_file = std::move(file_prefix); | |
| 253 pffmtable_file += "pffmtable"; | |
| 254 CLASS_STRUCT *float_classes = SetUpForFloat2Int(*unicharset, mf_classes); | |
| 255 // Now write the inttemp and pffmtable. | |
| 256 trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes, | |
| 257 inttemp_file.c_str(), pffmtable_file.c_str()); | |
| 258 for (size_t c = 0; c < unicharset->size(); ++c) { | |
| 259 FreeClassFields(&float_classes[c]); | |
| 260 } | |
| 261 delete[] float_classes; | |
| 262 FreeLabeledClassList(mf_classes); | |
| 263 delete shape_table; | |
| 264 printf("Done!\n"); | |
| 265 if (!FLAGS_test_ch.empty()) { | |
| 266 // If we are displaying debug window(s), wait for the user to look at them. | |
| 267 printf("Hit return to exit...\n"); | |
| 268 while (getchar() != '\n') { | |
| 269 ; | |
| 270 } | |
| 271 } | |
| 272 return EXIT_SUCCESS; | |
| 273 } /* main */ |
