Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/mftraining.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/mftraining.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,273 @@ +/****************************************************************************** + ** Filename: mftraining.c + ** Purpose: Separates training pages into files for each character. + ** Strips from files only the features and their parameters of + ** the feature type mf. + ** Author: Dan Johnson + ** Revisment: Christy Russon + ** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+ ******************************************************************************/ +/*---------------------------------------------------------------------------- + Include Files and Type Defines +----------------------------------------------------------------------------*/ + +#define _USE_MATH_DEFINES // for M_PI +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include <cmath> // for M_PI +#include <cstdio> +#include <cstring> + +#include "classify.h" +#include "cluster.h" +#include "clusttool.h" +#include "commontraining.h" +#include "featdefs.h" +#include "fontinfo.h" +#include "indexmapbidi.h" +#include "intproto.h" +#include "mastertrainer.h" +#include "mergenf.h" +#include "mf.h" +#include "ocrfeatures.h" +#include "oldlist.h" +#include "protos.h" +#include "shapetable.h" +#include "tprintf.h" +#include "unicity_table.h" + +using namespace tesseract; + +/*---------------------------------------------------------------------------- + Public Code +-----------------------------------------------------------------------------*/ +#ifndef GRAPHICS_DISABLED +static void DisplayProtoList(const char *ch, LIST protolist) { + auto window = std::make_unique<ScrollView>("Char samples", 50, 200, 520, 520, 260, 260, true); + LIST proto = protolist; + iterate(proto) { + auto *prototype = reinterpret_cast<PROTOTYPE *>(proto->first_node()); + if (prototype->Significant) { + window->Pen(ScrollView::GREEN); + } else if (prototype->NumSamples == 0) { + window->Pen(ScrollView::BLUE); + } else if (prototype->Merged) { + window->Pen(ScrollView::MAGENTA); + } else { + window->Pen(ScrollView::RED); + } + float x = CenterX(prototype->Mean); + float y = CenterY(prototype->Mean); + double angle = OrientationOf(prototype->Mean) * 2 * M_PI; + auto dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2); + auto dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2); + window->SetCursor((x - dx) * 256, (y - dy) * 256); + window->DrawTo((x + dx) * 256, (y 
+ dy) * 256); + auto prototypeNumSamples = prototype->NumSamples; + if (prototype->Significant) { + tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototypeNumSamples); + } else if (prototype->NumSamples > 0 && !prototype->Merged) { + tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototypeNumSamples); + } + } + window->Update(); +} +#endif // !GRAPHICS_DISABLED + +// Helper to run clustering on a single config. +// Mostly copied from the old mftraining, but with renamed variables. +static LIST ClusterOneConfig(int shape_id, const char *class_label, LIST mf_classes, + const ShapeTable &shape_table, MasterTrainer *trainer) { + int num_samples; + CLUSTERER *clusterer = + trainer->SetupForClustering(shape_table, feature_defs, shape_id, &num_samples); + Config.MagicSamples = num_samples; + LIST proto_list = ClusterSamples(clusterer, &Config); + CleanUpUnusedData(proto_list); + + // Merge protos where reasonable to make more of them significant by + // representing almost all samples of the class/font. + MergeInsignificantProtos(proto_list, class_label, clusterer, &Config); +#ifndef GRAPHICS_DISABLED + if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0) { + DisplayProtoList(FLAGS_test_ch.c_str(), proto_list); + } +#endif // !GRAPHICS_DISABLED + // Delete the protos that will not be used in the inttemp output file. + proto_list = RemoveInsignificantProtos(proto_list, true, false, clusterer->SampleSize); + FreeClusterer(clusterer); + MERGE_CLASS merge_class = FindClass(mf_classes, class_label); + if (merge_class == nullptr) { + merge_class = new MERGE_CLASS_NODE(class_label); + mf_classes = push(mf_classes, merge_class); + } + int config_id = AddConfigToClass(merge_class->Class); + merge_class->Class->font_set.push_back(shape_id); + LIST proto_it = proto_list; + iterate(proto_it) { + auto *prototype = reinterpret_cast<PROTOTYPE *>(proto_it->first_node()); + // See if proto can be approximated by existing proto. 
+ int p_id = FindClosestExistingProto(merge_class->Class, merge_class->NumMerged, prototype); + if (p_id == NO_PROTO) { + // Need to make a new proto, as it doesn't match anything. + p_id = AddProtoToClass(merge_class->Class); + MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype); + merge_class->NumMerged[p_id] = 1; + } else { + PROTO_STRUCT dummy_proto; + MakeNewFromOld(&dummy_proto, prototype); + // Merge with the similar proto. + ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto, + static_cast<float>(merge_class->NumMerged[p_id]), 1.0, + ProtoIn(merge_class->Class, p_id)); + merge_class->NumMerged[p_id]++; + } + AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]); + } + FreeProtoList(&proto_list); + return mf_classes; +} + +// Helper to setup the config map. +// Setup an index mapping from the shapes in the shape table to the classes +// that will be trained. In keeping with the original design, each shape +// with the same list of unichars becomes a different class and the configs +// represent the different combinations of fonts. +static void SetupConfigMap(ShapeTable *shape_table, IndexMapBiDi *config_map) { + int num_configs = shape_table->NumShapes(); + config_map->Init(num_configs, true); + config_map->Setup(); + for (int c1 = 0; c1 < num_configs; ++c1) { + // Only process ids that are not already merged. + if (config_map->SparseToCompact(c1) == c1) { + Shape *shape1 = shape_table->MutableShape(c1); + // Find all the subsequent shapes that are equal. + for (int c2 = c1 + 1; c2 < num_configs; ++c2) { + if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) { + config_map->Merge(c1, c2); + } + } + } + } + config_map->CompleteMerges(); +} + +/** + * This program reads in a text file consisting of feature + * samples from a training page in the following format: + * @verbatim + FontName UTF8-char-str xmin ymin xmax ymax page-number + NumberOfFeatureTypes(N) + FeatureTypeName1 NumberOfFeatures(M) + Feature1 + ... 
+ FeatureM + FeatureTypeName2 NumberOfFeatures(M) + Feature1 + ... + FeatureM + ... + FeatureTypeNameN NumberOfFeatures(M) + Feature1 + ... + FeatureM + FontName CharName ... + @endverbatim + * The result of this program is a binary inttemp file used by + * the OCR engine. + * @param argc number of command line arguments + * @param argv array of command line arguments + * @return 0 if no error occurred + */ +int main(int argc, char **argv) { + tesseract::CheckSharedLibraryVersion(); + + ParseArguments(&argc, &argv); + + ShapeTable *shape_table = nullptr; + std::string file_prefix; + // Load the training data. + auto trainer = tesseract::LoadTrainingData(argv + 1, false, &shape_table, file_prefix); + if (trainer == nullptr) { + return EXIT_FAILURE; // Failed. + } + + // Setup an index mapping from the shapes in the shape table to the classes + // that will be trained. In keeping with the original design, each shape + // with the same list of unichars becomes a different class and the configs + // represent the different combinations of fonts. + IndexMapBiDi config_map; + SetupConfigMap(shape_table, &config_map); + + WriteShapeTable(file_prefix, *shape_table); + // If the shape_table is flat, then either we didn't run shape clustering, or + // it did nothing, so we just output the trainer's unicharset. + // Otherwise shape_set will hold a fake unicharset with an entry for each + // shape in the shape table, and we will output that instead. + UNICHARSET shape_set; + const UNICHARSET *unicharset = &trainer->unicharset(); + // If we ran shapeclustering (and it worked) then at least one shape will + // have multiple unichars, so we have to build a fake unicharset. + if (shape_table->AnyMultipleUnichars()) { + unicharset = &shape_set; + // Now build a fake unicharset for the compact shape space to keep the + // output modules happy that we are doing things correctly. 
+ int num_shapes = config_map.CompactSize(); + for (int s = 0; s < num_shapes; ++s) { + char shape_label[14]; + snprintf(shape_label, sizeof(shape_label), "sh%04d", s); + shape_set.unichar_insert(shape_label); + } + } + + // Now train each config separately. + int num_configs = shape_table->NumShapes(); + LIST mf_classes = NIL_LIST; + for (int s = 0; s < num_configs; ++s) { + int unichar_id, font_id; + if (unicharset == &shape_set) { + // Using fake unichar_ids from the config_map/shape_set. + unichar_id = config_map.SparseToCompact(s); + } else { + // Get the real unichar_id from the shape table/unicharset. + shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id); + } + const char *class_label = unicharset->id_to_unichar(unichar_id); + mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer.get()); + } + std::string inttemp_file = file_prefix; + inttemp_file += "inttemp"; + std::string pffmtable_file = std::move(file_prefix); + pffmtable_file += "pffmtable"; + CLASS_STRUCT *float_classes = SetUpForFloat2Int(*unicharset, mf_classes); + // Now write the inttemp and pffmtable. + trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes, + inttemp_file.c_str(), pffmtable_file.c_str()); + for (size_t c = 0; c < unicharset->size(); ++c) { + FreeClassFields(&float_classes[c]); + } + delete[] float_classes; + FreeLabeledClassList(mf_classes); + delete shape_table; + printf("Done!\n"); + if (!FLAGS_test_ch.empty()) { + // If we are displaying debug window(s), wait for the user to look at them. + printf("Hit return to exit...\n"); + while (getchar() != '\n') { + ; + } + } + return EXIT_SUCCESS; +} /* main */
