Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/training/cntraining.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line source
/******************************************************************************
 **  Filename:  cntraining.cpp
 **  Purpose:  Generates a normproto and pffmtable.
 **  Author:    Dan Johnson
 **  Revisment: Christy Russon
 **
 **  (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/

/*----------------------------------------------------------------------------
          Include Files and Type Defines
----------------------------------------------------------------------------*/
#include <tesseract/unichar.h>
#include <cmath>
#include <cstdint> // uint16_t
#include <cstdio>
#include <cstdlib> // exit, EXIT_SUCCESS, EXIT_FAILURE
#include <cstring>
#include <string> // std::string
#include <vector> // std::vector
#include "cluster.h"
#include "clusttool.h"
#include "commontraining.h"
#include "featdefs.h"
#include "ocrfeatures.h"
#include "oldlist.h"

#define PROGRAM_FEATURE_TYPE "cn"

using namespace tesseract;

/*----------------------------------------------------------------------------
          Private Function Prototypes
----------------------------------------------------------------------------*/

static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
                            const FEATURE_DESC_STRUCT *feature_desc);

static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
                        bool WriteInsigProtos);

/*----------------------------------------------------------------------------
          Global Data Definitions and Declarations
----------------------------------------------------------------------------*/

/* global variable to hold configuration parameters to control clustering */
//-M 0.025   -B 0.05   -I 0.8   -C 1e-3
static const CLUSTERCONFIG CNConfig = {elliptical, 0.025, 0.05, 0.8, 1e-3, 0};

/*----------------------------------------------------------------------------
              Public Code
----------------------------------------------------------------------------*/

/**
 * Clusters the "cn" features of every labeled character found in the given
 * training pages and writes the resulting prototypes to a `normproto` file.
 *
 * Processing steps, in order:
 *  - The global clustering Config is initialised from CNConfig and the
 *    command line is handed to ParseArguments(), which may modify argc/argv.
 *  - Every argument remaining after the program name is treated as the name
 *    of a training page; each page is opened in binary mode and passed to
 *    ReadTrainingSamples(), which accumulates samples in a list of labeled
 *    characters.
 *  - For each labeled character a clusterer is set up and ClusterSamples() is
 *    run. If no significant prototype is produced, Config.MinSamples is
 *    reduced by 5% and clustering is retried until either a significant
 *    prototype appears or MinSamples drops to 0.001 or below. MinSamples is
 *    restored to its previous value before the next character is processed.
 *  - The prototypes of all characters are written by WriteNormProtos() into
 *    the directory given by FLAGS_D.
 *
 * According to the original header comment, each training page is a text
 * file of feature samples in the following format (the format is handled
 * inside ReadTrainingSamples() and cannot be verified from this file):
 * @verbatim
   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...
   @endverbatim
 *
 * @param argc  number of command line arguments
 * @param argv  array of command line arguments
 * @return EXIT_SUCCESS on success, EXIT_FAILURE if no clusterer could be set
 *         up for one of the characters
 */
int main(int argc, char *argv[]) {
  tesseract::CheckSharedLibraryVersion();

  // Set the global Config parameters before parsing the command line.
  Config = CNConfig;

  LIST CharList = NIL_LIST;
  CLUSTERER *Clusterer = nullptr;
  LIST ProtoList = NIL_LIST;
  LIST NormProtoList = NIL_LIST;
  LIST pCharList;
  LABELEDLIST CharSample;
  FEATURE_DEFS_STRUCT FeatureDefs;
  InitFeatureDefs(&FeatureDefs);

  ParseArguments(&argc, &argv);
#if !defined(NDEBUG)
  // Only counted in debug builds; referenced by the commented-out MinSamples
  // tweak below.
  int num_fonts = 0;
#endif
  // Walk the remaining arguments until the terminating nullptr entry of argv.
  for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) {
    printf("Reading %s ...\n", PageName);
    FILE *TrainingPage = fopen(PageName, "rb");
    ASSERT_HOST(TrainingPage);
    if (TrainingPage) {
      // NOTE(review): the literal 100 is presumably a per-class sample limit;
      // confirm against ReadTrainingSamples().
      ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage,
                          &CharList);
      fclose(TrainingPage);
#if !defined(NDEBUG)
      ++num_fonts;
#endif
    }
  }
  printf("Clustering ...\n");
  // To allow an individual font to form a separate cluster,
  // reduce the min samples:
  // Config.MinSamples = 0.5 / num_fonts;
  pCharList = CharList;
  // The norm protos will count the source protos, so we keep them here in
  // freeable_protos, so they can be freed later.
  std::vector<LIST> freeable_protos;
  iterate(pCharList) {
    // Cluster
    CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node());
    Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
    if (Clusterer == nullptr) { // To avoid a SIGSEGV
      fprintf(stderr, "Error: nullptr clusterer!\n");
      return EXIT_FAILURE;
    }
    // MinSamples is lowered by the retry loop below and must be restored
    // afterwards so that the next character starts from the configured value.
    float SavedMinSamples = Config.MinSamples;
    // To disable the tendency to produce a single cluster for all fonts,
    // make MagicSamples an impossible to achieve number:
    // Config.MagicSamples = CharSample->SampleCount * 10;
    Config.MagicSamples = CharSample->SampleCount;
    while (Config.MinSamples > 0.001) {
      ProtoList = ClusterSamples(Clusterer, &Config);
      if (NumberOfProtos(ProtoList, true, false) > 0) {
        break;
      } else {
        Config.MinSamples *= 0.95;
        printf(
            "0 significant protos for %s."
            " Retrying clustering with MinSamples = %f%%\n",
            CharSample->Label.c_str(), Config.MinSamples);
      }
    }
    Config.MinSamples = SavedMinSamples;
    AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
    freeable_protos.push_back(ProtoList);
    FreeClusterer(Clusterer);
  }
  FreeTrainingSamples(CharList);
  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]);
  FreeNormProtoList(NormProtoList);
  for (auto &freeable_proto : freeable_protos) {
    FreeProtoList(&freeable_proto);
  }
  printf("\n");
  return EXIT_SUCCESS;
} // main

/*----------------------------------------------------------------------------
              Private Code
----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------*/
/**
 * Writes the file `normproto` containing the significant prototypes of every
 * labeled prototype list.
 *
 * The file name is "<Directory>/normproto", or just "normproto" when
 * Directory is null or empty. The file starts with the number of feature
 * parameters followed by the parameter descriptions (WriteParamDesc). Then,
 * for every entry of LabeledProtoList, a line "<label> <count>" is written,
 * where count is the number of significant prototypes, followed by those
 * prototypes.
 *
 * If any entry has no significant prototype, an error message is printed and
 * the program terminates via exit(1).
 *
 * @param Directory  directory to place the normproto file into (may be null
 *                   or empty)
 * @param LabeledProtoList  List of labeled protos
 * @param feature_desc  Description of the features
 */
static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
                            const FEATURE_DESC_STRUCT *feature_desc) {
  FILE *File;
  LABELEDLIST LabeledProto;
  int N;

  std::string Filename = "";
  if (Directory != nullptr && Directory[0] != '\0') {
    Filename += Directory;
    Filename += "/";
  }
  Filename += "normproto";
  printf("\nWriting %s ...", Filename.c_str());
  File = fopen(Filename.c_str(), "wb");
  ASSERT_HOST(File);
  fprintf(File, "%0d\n", feature_desc->NumParams);
  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
  iterate(LabeledProtoList) {
    LabeledProto = reinterpret_cast<LABELEDLIST>(LabeledProtoList->first_node());
    // Count of significant prototypes for this label.
    N = NumberOfProtos(LabeledProto->List, true, false);
    if (N < 1) {
      printf(
          "\nError! Not enough protos for %s: %d protos"
          " (%d significant protos"
          ", %d insignificant protos)\n",
          LabeledProto->Label.c_str(), N, NumberOfProtos(LabeledProto->List, true, false),
          NumberOfProtos(LabeledProto->List, false, true));
      exit(1);
    }
    fprintf(File, "\n%s %d\n", LabeledProto->Label.c_str(), N);
    WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
  }
  fclose(File);

} // WriteNormProtos

/*-------------------------------------------------------------------------*/

/**
 * Writes selected prototypes of a list to a file using WritePrototype().
 *
 * A prototype is written when it is significant and WriteSigProtos is true,
 * or when it is not significant and WriteInsigProtos is true.
 *
 * @param File  open file to write to
 * @param N  value forwarded to WritePrototype(); the caller in this file
 *           passes the number of feature parameters
 * @param ProtoList  list of PROTOTYPE entries to filter and write
 * @param WriteSigProtos  write prototypes whose Significant flag is set
 * @param WriteInsigProtos  write prototypes whose Significant flag is clear
 */
static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
                        bool WriteInsigProtos) {
  PROTOTYPE *Proto;

  // write prototypes
  iterate(ProtoList) {
    Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
    if ((Proto->Significant && WriteSigProtos) || (!Proto->Significant && WriteInsigProtos)) {
      WritePrototype(File, N, Proto);
    }
  }
} // WriteProtos
