PyMuPDF Mercurial repository (hgrepos / Python2 / PyMuPDF)
comparison view: mupdf-source/thirdparty/tesseract/src/training/common/commontraining.cpp @ 2:b50eed0cc0ef (upstream)
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer includes a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
Comparison between revisions 1:1d09e1dec1d9 and 2:b50eed0cc0ef:
```cpp
// Copyright 2008 Google Inc. All Rights Reserved.
// Author: scharron@google.com (Samuel Charron)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#define _USE_MATH_DEFINES // for M_PI

#include "commontraining.h"

#ifdef DISABLED_LEGACY_ENGINE

# include "params.h"
# include "tprintf.h"

namespace tesseract {

INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
STRING_PARAM_FLAG(D, "", "Directory to write output files to");
STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
STRING_PARAM_FLAG(X, "", "File listing font xheights");
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
STRING_PARAM_FLAG(fonts_dir, "",
                  "If empty it uses system default. Otherwise it overrides "
                  "system default font location");
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir");

/**
 * This routine parses the command line arguments that were
 * passed to the program and uses them to set relevant
 * training-related global parameters.
 *
 * Globals:
 * - Config current clustering parameters
 * @param argc number of command line arguments to parse
 * @param argv command line arguments
 * @note Exceptions: Illegal options terminate the program.
 */
void ParseArguments(int *argc, char ***argv) {
  std::string usage;
  if (*argc) {
    usage += (*argv)[0];
    usage += " -v | --version | ";
    usage += (*argv)[0];
  }
  usage += " [.tr files ...]";
  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
}

} // namespace tesseract.

#else

# include <allheaders.h>
# include "ccutil.h"
# include "classify.h"
# include "cluster.h"
# include "clusttool.h"
# include "featdefs.h"
# include "fontinfo.h"
# include "intfeaturespace.h"
# include "mastertrainer.h"
# include "mf.h"
# include "oldlist.h"
# include "params.h"
# include "shapetable.h"
# include "tessdatamanager.h"
# include "tprintf.h"
# include "unicity_table.h"

namespace tesseract {

// Global Variables.

// global variable to hold configuration parameters to control clustering
// -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
CLUSTERCONFIG Config = {elliptical, 0.625, 0.05, 1.0, 1e-6, 0};
FEATURE_DEFS_STRUCT feature_defs;
static CCUtil ccutil;

INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
static INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
static STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
STRING_PARAM_FLAG(D, "", "Directory to write output files to");
STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
STRING_PARAM_FLAG(X, "", "File listing font xheights");
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
STRING_PARAM_FLAG(fonts_dir, "", "");
STRING_PARAM_FLAG(fontconfig_tmpdir, "", "");
static DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
                         "Min number of samples per proto as % of total");
static DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
                         "Max percentage of samples in a cluster which have more"
                         " than 1 feature in that cluster");
static DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
                         "Desired independence between dimensions");
static DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
                         "Desired confidence in prototypes created");

/**
 * This routine parses the command line arguments that were
 * passed to the program and uses them to set relevant
 * training-related global parameters.
 *
 * Globals:
 * - Config current clustering parameters
 * @param argc number of command line arguments to parse
 * @param argv command line arguments
 */
void ParseArguments(int *argc, char ***argv) {
  std::string usage;
  if (*argc) {
    usage += (*argv)[0];
    usage += " -v | --version | ";
    usage += (*argv)[0];
  }
  usage += " [.tr files ...]";
  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
  // Set some global values based on the flags.
  Config.MinSamples =
      std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
  Config.MaxIllegal = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));
  Config.Independence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));
  Config.Confidence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));
  // Set additional parameters from config file if specified.
  if (!FLAGS_configfile.empty()) {
    tesseract::ParamUtils::ReadParamsFile(
        FLAGS_configfile.c_str(), tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY, ccutil.params());
  }
}

// Helper loads shape table from the given file.
ShapeTable *LoadShapeTable(const std::string &file_prefix) {
  ShapeTable *shape_table = nullptr;
  std::string shape_table_file = file_prefix;
  shape_table_file += kShapeTableFileSuffix;
  TFile shape_fp;
  if (shape_fp.Open(shape_table_file.c_str(), nullptr)) {
    shape_table = new ShapeTable;
    if (!shape_table->DeSerialize(&shape_fp)) {
      delete shape_table;
      shape_table = nullptr;
      tprintf("Error: Failed to read shape table %s\n", shape_table_file.c_str());
    } else {
      int num_shapes = shape_table->NumShapes();
      tprintf("Read shape table %s of %d shapes\n", shape_table_file.c_str(), num_shapes);
    }
  } else {
    tprintf("Warning: No shape table file present: %s\n", shape_table_file.c_str());
  }
  return shape_table;
}

// Helper to write the shape_table.
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table) {
  std::string shape_table_file = file_prefix;
  shape_table_file += kShapeTableFileSuffix;
  FILE *fp = fopen(shape_table_file.c_str(), "wb");
  if (fp != nullptr) {
    if (!shape_table.Serialize(fp)) {
      fprintf(stderr, "Error writing shape table: %s\n", shape_table_file.c_str());
    }
    fclose(fp);
  } else {
    fprintf(stderr, "Error creating shape table: %s\n", shape_table_file.c_str());
  }
}

/**
 * Creates a MasterTrainer and loads the training data into it:
 * Initializes feature_defs and IntegerFX.
 * Loads the shape_table if shape_table != nullptr.
 * Loads initial unicharset from -U command-line option.
 * If FLAGS_T is set, loads the majority of data from there, else:
 * - Loads font info from -F option.
 * - Loads xheights from -X option.
 * - Loads samples from .tr files in remaining command-line args.
 * - Deletes outliers and computes canonical samples.
 * - If FLAGS_output_trainer is set, saves the trainer for future use.
 *   TODO: Who uses that? There is currently no code which reads it.
 * Computes canonical and cloud features.
 * If shape_table is not nullptr, but failed to load, make a fake flat one,
 * as shape clustering was not run.
 */
std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
                                                ShapeTable **shape_table, std::string &file_prefix) {
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  file_prefix = "";
  if (!FLAGS_D.empty()) {
    file_prefix += FLAGS_D.c_str();
    file_prefix += "/";
  }
  // If we are shape clustering (nullptr shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != nullptr) {
    *shape_table = LoadShapeTable(file_prefix);
    if (*shape_table != nullptr) {
      shape_analysis = true;
    }
  } else {
    shape_analysis = true;
  }
  auto trainer = std::make_unique<MasterTrainer>(NM_CHAR_ANISOTROPIC, shape_analysis, replication,
                                                 FLAGS_debug_level);
  IntFeatureSpace fs;
  fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
  trainer->LoadUnicharset(FLAGS_U.c_str());
  // Get basic font information from font_properties.
  if (!FLAGS_F.empty()) {
    if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
      return {};
    }
  }
  if (!FLAGS_X.empty()) {
    if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
      return {};
    }
  }
  trainer->SetFeatureSpace(fs);
  // Load training data from .tr files in filelist (terminated by nullptr).
  for (const char *page_name = *filelist++; page_name != nullptr; page_name = *filelist++) {
    tprintf("Reading %s ...\n", page_name);
    trainer->ReadTrainingSamples(page_name, feature_defs, false);

    // If there is a file with [lang].[fontname].exp[num].fontinfo present,
    // read font spacing information in to fontinfo_table.
    int pagename_len = strlen(page_name);
    char *fontinfo_file_name = new char[pagename_len + 7];
    strncpy(fontinfo_file_name, page_name, pagename_len - 2);   // remove "tr"
    strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo");  // +"fontinfo"
    trainer->AddSpacingInfo(fontinfo_file_name);
    delete[] fontinfo_file_name;

    // Load the images into memory if required by the classifier.
    if (FLAGS_load_images) {
      std::string image_name = page_name;
      // Chop off the tr and replace with tif. Extension must be tif!
      image_name.resize(image_name.length() - 2);
      image_name += "tif";
      trainer->LoadPageImages(image_name.c_str());
    }
  }
  trainer->PostLoadCleanup();
  // Write the master trainer if required.
  if (!FLAGS_output_trainer.empty()) {
    FILE *fp = fopen(FLAGS_output_trainer.c_str(), "wb");
    if (fp == nullptr) {
      tprintf("Can't create saved trainer data!\n");
    } else {
      trainer->Serialize(fp);
      fclose(fp);
    }
  }
  trainer->PreTrainingSetup();
  if (!FLAGS_O.empty() && !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    return {};
  }

  if (shape_table != nullptr) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == nullptr) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n", (*shape_table)->SummaryStr().c_str());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}

/*---------------------------------------------------------------------------*/
/**
 * This routine searches through a list of labeled lists to find
 * a list with the specified label. If a matching labeled list
 * cannot be found, nullptr is returned.
 * @param List list to search
 * @param Label label to search for
 * @return Labeled list with the specified label or nullptr.
 * @note Globals: none
 */
LABELEDLIST FindList(LIST List, const std::string &Label) {
  LABELEDLIST LabeledList;

  iterate(List) {
    LabeledList = reinterpret_cast<LABELEDLIST>(List->first_node());
    if (LabeledList->Label == Label) {
      return (LabeledList);
    }
  }
  return (nullptr);

} /* FindList */

/*---------------------------------------------------------------------------*/
// TODO(rays) This is now used only by cntraining. Convert cntraining to use
// the new method or get rid of it entirely.
/**
 * This routine reads training samples from a file and
 * places them into a data structure which organizes the
 * samples by FontName and CharName. It then returns this
 * data structure.
 * @param file open text file to read samples from
 * @param feature_definitions
 * @param feature_name
 * @param max_samples
 * @param unicharset
 * @param training_samples
 */
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name,
                         int max_samples, UNICHARSET *unicharset, FILE *file,
                         LIST *training_samples) {
  char buffer[2048];
  char unichar[UNICHAR_LEN + 1];
  LABELEDLIST char_sample;
  FEATURE_SET feature_samples;
  uint32_t feature_type = ShortNameToFeatureType(feature_definitions, feature_name);

  // Zero out the font_sample_count for all the classes.
  LIST it = *training_samples;
  iterate(it) {
    char_sample = reinterpret_cast<LABELEDLIST>(it->first_node());
    char_sample->font_sample_count = 0;
  }

  while (fgets(buffer, 2048, file) != nullptr) {
    if (buffer[0] == '\n') {
      continue;
    }

    sscanf(buffer, "%*s %s", unichar);
    if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
      unicharset->unichar_insert(unichar);
      if (unicharset->size() > MAX_NUM_CLASSES) {
        tprintf(
            "Error: Size of unicharset in training is "
            "greater than MAX_NUM_CLASSES\n");
        exit(1);
      }
    }
    char_sample = FindList(*training_samples, unichar);
    if (char_sample == nullptr) {
      char_sample = new LABELEDLISTNODE(unichar);
      *training_samples = push(*training_samples, char_sample);
    }
    auto char_desc = ReadCharDescription(feature_definitions, file);
    feature_samples = char_desc->FeatureSets[feature_type];
    if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
      char_sample->List = push(char_sample->List, feature_samples);
      char_sample->SampleCount++;
      char_sample->font_sample_count++;
    } else {
      delete feature_samples;
    }
    for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
      if (feature_type != i) {
        delete char_desc->FeatureSets[i];
      }
      char_desc->FeatureSets[i] = nullptr;
    }
    delete char_desc;
  }
} // ReadTrainingSamples

/*---------------------------------------------------------------------------*/
/**
 * This routine deallocates all of the space allocated to
 * the specified list of training samples.
 * @param CharList list of all fonts in document
 */
void FreeTrainingSamples(LIST CharList) {
  LABELEDLIST char_sample;
  FEATURE_SET FeatureSet;
  LIST FeatureList;

  LIST nodes = CharList;
  iterate(CharList) { /* iterate through all of the fonts */
    char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());
    FeatureList = char_sample->List;
    iterate(FeatureList) { /* iterate through all of the classes */
      FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
      delete FeatureSet;
    }
    FreeLabeledList(char_sample);
  }
  destroy(nodes);
} /* FreeTrainingSamples */

/*---------------------------------------------------------------------------*/
/**
 * This routine deallocates all of the memory consumed by
 * a labeled list. It does not free any memory which may be
 * consumed by the items in the list.
 * @param LabeledList labeled list to be freed
 * @note Globals: none
 */
void FreeLabeledList(LABELEDLIST LabeledList) {
  destroy(LabeledList->List);
  delete LabeledList;
} /* FreeLabeledList */

/*---------------------------------------------------------------------------*/
/**
 * This routine reads samples from a LABELEDLIST and enters
 * those samples into a clusterer data structure. This
 * data structure is then returned to the caller.
 * @param char_sample: LABELEDLIST that holds all the feature information for a
 * @param FeatureDefs
 * @param program_feature_type
 * given character.
 * @return Pointer to new clusterer data structure.
 * @note Globals: None
 */
CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample,
                              const char *program_feature_type) {
  uint16_t N;
  CLUSTERER *Clusterer;
  LIST FeatureList = nullptr;
  FEATURE_SET FeatureSet = nullptr;

  int32_t desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);

  FeatureList = char_sample->List;
  uint32_t CharID = 0;
  std::vector<float> Sample;
  iterate(FeatureList) {
    FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
    for (int i = 0; i < FeatureSet->MaxNumFeatures; i++) {
      if (Sample.empty()) {
        Sample.resize(N);
      }
      for (int j = 0; j < N; j++) {
        Sample[j] = FeatureSet->Features[i]->Params[j];
      }
      MakeSample(Clusterer, &Sample[0], CharID);
    }
    CharID++;
  }
  return Clusterer;

} /* SetUpForClustering */

/*------------------------------------------------------------------------*/
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer,
                              CLUSTERCONFIG *clusterconfig) {
  PROTOTYPE *Prototype;
  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;

  LIST pProtoList = ProtoList;
  iterate(pProtoList) {
    Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
    if (Prototype->Significant || Prototype->Merged) {
      continue;
    }
    float best_dist = 0.125;
    PROTOTYPE *best_match = nullptr;
    // Find the nearest alive prototype.
    LIST list_it = ProtoList;
    iterate(list_it) {
      auto *test_p = reinterpret_cast<PROTOTYPE *>(list_it->first_node());
      if (test_p != Prototype && !test_p->Merged) {
        float dist = ComputeDistance(Clusterer->SampleSize, Clusterer->ParamDesc, &Prototype->Mean[0],
                                     &test_p->Mean[0]);
        if (dist < best_dist) {
          best_match = test_p;
          best_dist = dist;
        }
      }
    }
    if (best_match != nullptr && !best_match->Significant) {
      if (debug) {
        auto bestMatchNumSamples = best_match->NumSamples;
        auto prototypeNumSamples = Prototype->NumSamples;
        tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n", bestMatchNumSamples,
                prototypeNumSamples, best_match->Mean[0], best_match->Mean[1], Prototype->Mean[0],
                Prototype->Mean[1]);
      }
      best_match->NumSamples =
          MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, best_match->NumSamples,
                        Prototype->NumSamples, &best_match->Mean[0], &best_match->Mean[0], &Prototype->Mean[0]);
      Prototype->NumSamples = 0;
      Prototype->Merged = true;
    } else if (best_match != nullptr) {
      if (debug) {
        tprintf("Red proto at %g,%g matched a green one at %g,%g\n", Prototype->Mean[0],
                Prototype->Mean[1], best_match->Mean[0], best_match->Mean[1]);
      }
      Prototype->Merged = true;
    }
  }
  // Mark significant those that now have enough samples.
  int min_samples = static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
  pProtoList = ProtoList;
  iterate(pProtoList) {
    Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
    // Process insignificant protos that do not match a green one
    if (!Prototype->Significant && Prototype->NumSamples >= min_samples && !Prototype->Merged) {
      if (debug) {
        tprintf("Red proto at %g,%g becoming green\n", Prototype->Mean[0], Prototype->Mean[1]);
      }
      Prototype->Significant = true;
    }
  }
} /* MergeInsignificantProtos */

/*-----------------------------------------------------------------------------*/
void CleanUpUnusedData(LIST ProtoList) {
  PROTOTYPE *Prototype;

  iterate(ProtoList) {
    Prototype = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
    delete[] Prototype->Variance.Elliptical;
    Prototype->Variance.Elliptical = nullptr;
    delete[] Prototype->Magnitude.Elliptical;
    Prototype->Magnitude.Elliptical = nullptr;
    delete[] Prototype->Weight.Elliptical;
    Prototype->Weight.Elliptical = nullptr;
  }
}

/*------------------------------------------------------------------------*/
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)

{
  LIST NewProtoList = NIL_LIST;
  auto pProtoList = ProtoList;
  iterate(pProtoList) {
    auto Proto = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
    if ((Proto->Significant && KeepSigProtos) || (!Proto->Significant && KeepInsigProtos)) {
      auto NewProto = new PROTOTYPE;
      NewProto->Mean = Proto->Mean;
      NewProto->Significant = Proto->Significant;
      NewProto->Style = Proto->Style;
      NewProto->NumSamples = Proto->NumSamples;
      NewProto->Cluster = nullptr;
      NewProto->Distrib.clear();

      if (Proto->Variance.Elliptical != nullptr) {
        NewProto->Variance.Elliptical = new float[N];
        for (int i = 0; i < N; i++) {
          NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
        }
      } else {
        NewProto->Variance.Elliptical = nullptr;
      }
      //---------------------------------------------
      if (Proto->Magnitude.Elliptical != nullptr) {
        NewProto->Magnitude.Elliptical = new float[N];
        for (int i = 0; i < N; i++) {
          NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
        }
      } else {
        NewProto->Magnitude.Elliptical = nullptr;
      }
      //------------------------------------------------
      if (Proto->Weight.Elliptical != nullptr) {
        NewProto->Weight.Elliptical = new float[N];
        for (int i = 0; i < N; i++) {
          NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
        }
      } else {
        NewProto->Weight.Elliptical = nullptr;
      }

      NewProto->TotalMagnitude = Proto->TotalMagnitude;
      NewProto->LogMagnitude = Proto->LogMagnitude;
      NewProtoList = push_last(NewProtoList, NewProto);
    }
  }
  FreeProtoList(&ProtoList);
  return (NewProtoList);
} /* RemoveInsignificantProtos */

/*----------------------------------------------------------------------------*/
MERGE_CLASS FindClass(LIST List, const std::string &Label) {
  MERGE_CLASS MergeClass;

  iterate(List) {
    MergeClass = reinterpret_cast<MERGE_CLASS>(List->first_node());
    if (MergeClass->Label == Label) {
      return (MergeClass);
    }
  }
  return (nullptr);

} /* FindClass */

/*-----------------------------------------------------------------------------*/
/**
 * This routine deallocates all of the space allocated to
 * the specified list of training samples.
 * @param ClassList list of all fonts in document
 */
void FreeLabeledClassList(LIST ClassList) {
  MERGE_CLASS MergeClass;

  LIST nodes = ClassList;
  iterate(ClassList) /* iterate through all of the fonts */
  {
    MergeClass = reinterpret_cast<MERGE_CLASS>(ClassList->first_node());
    FreeClass(MergeClass->Class);
    delete MergeClass;
  }
  destroy(nodes);

} /* FreeLabeledClassList */

/* SetUpForFloat2Int */
CLASS_STRUCT *SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList) {
  MERGE_CLASS MergeClass;
  CLASS_TYPE Class;
  int NumProtos;
  int NumConfigs;
  int NumWords;
  int i, j;
  float Values[3];
  PROTO_STRUCT *NewProto;
  PROTO_STRUCT *OldProto;
  BIT_VECTOR NewConfig;
  BIT_VECTOR OldConfig;

  // printf("Float2Int ...\n");

  auto *float_classes = new CLASS_STRUCT[unicharset.size()];
  iterate(LabeledClassList) {
    UnicityTable<int> font_set;
    MergeClass = reinterpret_cast<MERGE_CLASS>(LabeledClassList->first_node());
    Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label.c_str())];
    NumProtos = MergeClass->Class->NumProtos;
    NumConfigs = MergeClass->Class->NumConfigs;
    font_set.move(&MergeClass->Class->font_set);
    Class->NumProtos = NumProtos;
    Class->MaxNumProtos = NumProtos;
    Class->Prototypes.resize(NumProtos);
    for (i = 0; i < NumProtos; i++) {
      NewProto = ProtoIn(Class, i);
      OldProto = ProtoIn(MergeClass->Class, i);
      Values[0] = OldProto->X;
      Values[1] = OldProto->Y;
      Values[2] = OldProto->Angle;
      Normalize(Values);
      NewProto->X = OldProto->X;
      NewProto->Y = OldProto->Y;
      NewProto->Length = OldProto->Length;
      NewProto->Angle = OldProto->Angle;
      NewProto->A = Values[0];
      NewProto->B = Values[1];
      NewProto->C = Values[2];
    }

    Class->NumConfigs = NumConfigs;
    Class->MaxNumConfigs = NumConfigs;
    Class->font_set.move(&font_set);
    Class->Configurations.resize(NumConfigs);
    NumWords = WordsInVectorOfSize(NumProtos);
    for (i = 0; i < NumConfigs; i++) {
      NewConfig = NewBitVector(NumProtos);
      OldConfig = MergeClass->Class->Configurations[i];
      for (j = 0; j < NumWords; j++) {
        NewConfig[j] = OldConfig[j];
      }
      Class->Configurations[i] = NewConfig;
    }
  }
  return float_classes;
} // SetUpForFloat2Int

/*--------------------------------------------------------------------------*/
void Normalize(float *Values) {
  float Slope;
  float Intercept;
  float Normalizer;

  Slope = tan(Values[2] * 2 * M_PI);
  Intercept = Values[1] - Slope * Values[0];
  Normalizer = 1 / sqrt(Slope * Slope + 1.0);

  Values[0] = Slope * Normalizer;
  Values[1] = -Normalizer;
  Values[2] = Intercept * Normalizer;
} // Normalize

/*-------------------------------------------------------------------------*/
void FreeNormProtoList(LIST CharList)

{
  LABELEDLIST char_sample;

  LIST nodes = CharList;
  iterate(CharList) /* iterate through all of the fonts */
  {
    char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());
    FreeLabeledList(char_sample);
  }
  destroy(nodes);

} // FreeNormProtoList

/*---------------------------------------------------------------------------*/
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName) {
  auto LabeledProtoList = new LABELEDLISTNODE(CharName.c_str());
  iterate(ProtoList) {
    auto Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
    LabeledProtoList->List = push(LabeledProtoList->List, Proto);
  }
  *NormProtoList = push(*NormProtoList, LabeledProtoList);
}

/*---------------------------------------------------------------------------*/
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos) {
  int N = 0;
  iterate(ProtoList) {
    auto *Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
    if ((Proto->Significant && CountSigProtos) || (!Proto->Significant && CountInsigProtos)) {
      N++;
    }
  }
  return (N);
}

} // namespace tesseract.

#endif // def DISABLED_LEGACY_ENGINE
```
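For context, here is a minimal sketch of how a legacy training tool (such as cntraining or mftraining) might drive this module: parse the training flags, then load the remaining `.tr` arguments into a `MasterTrainer`. This is not part of the file above; the flow is inferred from the flag definitions and function signatures, and the driver shown here is purely illustrative.

```cpp
// Illustrative sketch only, not part of commontraining.cpp. It assumes the
// functions above are available via commontraining.h in a non-disabled
// (legacy engine) build; names of locals are arbitrary.
#include <string>

#include "commontraining.h"

int main(int argc, char **argv) {
  // Consumes -D/-U/-F/-X/-O/... flags; the remaining argv entries are the
  // .tr files, and argv stays null-terminated after flag parsing.
  tesseract::ParseArguments(&argc, &argv);

  std::string file_prefix;
  tesseract::ShapeTable *shape_table = nullptr;
  // argv + 1 serves as the nullptr-terminated file list LoadTrainingData()
  // expects (argv[0] is the program name).
  auto trainer = tesseract::LoadTrainingData(argv + 1, /*replication=*/false,
                                             &shape_table, file_prefix);
  if (trainer == nullptr) {
    return 1;
  }
  // A real tool would now run clustering or shape analysis on the trainer.
  delete shape_table;
  return 0;
}
```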
