Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/cntraining.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/cntraining.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,245 @@ +/****************************************************************************** + ** Filename: cntraining.cpp + ** Purpose: Generates a normproto and pffmtable. + ** Author: Dan Johnson + ** Revisment: Christy Russon + ** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + ******************************************************************************/ + +/*---------------------------------------------------------------------------- + Include Files and Type Defines +----------------------------------------------------------------------------*/ +#include <tesseract/unichar.h> +#include <cmath> +#include <cstdio> +#include <cstring> +#include "cluster.h" +#include "clusttool.h" +#include "commontraining.h" +#include "featdefs.h" +#include "ocrfeatures.h" +#include "oldlist.h" + +#define PROGRAM_FEATURE_TYPE "cn" + +using namespace tesseract; + +/*---------------------------------------------------------------------------- + Private Function Prototypes +----------------------------------------------------------------------------*/ + +static void WriteNormProtos(const char *Directory, LIST LabeledProtoList, + const FEATURE_DESC_STRUCT *feature_desc); + +static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos, + bool WriteInsigProtos); + +/*---------------------------------------------------------------------------- + Global Data Definitions and Declarations +----------------------------------------------------------------------------*/ +/* global variable to hold configuration parameters to control clustering */ +//-M 0.025 -B 0.05 -I 0.8 -C 1e-3 +static const CLUSTERCONFIG CNConfig = {elliptical, 0.025, 0.05, 0.8, 1e-3, 0}; + +/*---------------------------------------------------------------------------- + Public Code +----------------------------------------------------------------------------*/ + +/** +* This program reads in a text file consisting of feature +* samples from a training page in the following format: +* @verbatim + FontName CharName NumberOfFeatureTypes(N) + FeatureTypeName1 NumberOfFeatures(M) + Feature1 + ... + FeatureM + FeatureTypeName2 NumberOfFeatures(M) + Feature1 + ... + FeatureM + ... + FeatureTypeNameN NumberOfFeatures(M) + Feature1 + ... + FeatureM + FontName CharName ... +@endverbatim +* It then appends these samples into a separate file for each +* character. The name of the file is +* +* DirectoryName/FontName/CharName.FeatureTypeName +* +* The DirectoryName can be specified via a command +* line argument. If not specified, it defaults to the +* current directory. The format of the resulting files is: +* @verbatim + NumberOfFeatures(M) + Feature1 + ... + FeatureM + NumberOfFeatures(M) + ... +@endverbatim +* The output files each have a header which describes the +* type of feature which the file contains. This header is +* in the format required by the clusterer. A command line +* argument can also be used to specify that only the first +* N samples of each class should be used. +* @param argc number of command line arguments +* @param argv array of command line arguments +* @return 0 on success +*/ +int main(int argc, char *argv[]) { + tesseract::CheckSharedLibraryVersion(); + + // Set the global Config parameters before parsing the command line. + Config = CNConfig; + + LIST CharList = NIL_LIST; + CLUSTERER *Clusterer = nullptr; + LIST ProtoList = NIL_LIST; + LIST NormProtoList = NIL_LIST; + LIST pCharList; + LABELEDLIST CharSample; + FEATURE_DEFS_STRUCT FeatureDefs; + InitFeatureDefs(&FeatureDefs); + + ParseArguments(&argc, &argv); +#if !defined(NDEBUG) + int num_fonts = 0; +#endif + for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) { + printf("Reading %s ...\n", PageName); + FILE *TrainingPage = fopen(PageName, "rb"); + ASSERT_HOST(TrainingPage); + if (TrainingPage) { + ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage, &CharList); + fclose(TrainingPage); +#if !defined(NDEBUG) + ++num_fonts; +#endif + } + } + printf("Clustering ...\n"); + // To allow an individual font to form a separate cluster, + // reduce the min samples: + // Config.MinSamples = 0.5 / num_fonts; + pCharList = CharList; + // The norm protos will count the source protos, so we keep them here in + // freeable_protos, so they can be freed later. + std::vector<LIST> freeable_protos; + iterate(pCharList) { + // Cluster + CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node()); + Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); + if (Clusterer == nullptr) { // To avoid a SIGSEGV + fprintf(stderr, "Error: nullptr clusterer!\n"); + return EXIT_FAILURE; + } + float SavedMinSamples = Config.MinSamples; + // To disable the tendency to produce a single cluster for all fonts, + // make MagicSamples an impossible to achieve number: + // Config.MagicSamples = CharSample->SampleCount * 10; + Config.MagicSamples = CharSample->SampleCount; + while (Config.MinSamples > 0.001) { + ProtoList = ClusterSamples(Clusterer, &Config); + if (NumberOfProtos(ProtoList, true, false) > 0) { + break; + } else { + Config.MinSamples *= 0.95; + printf( + "0 significant protos for %s." + " Retrying clustering with MinSamples = %f%%\n", + CharSample->Label.c_str(), Config.MinSamples); + } + } + Config.MinSamples = SavedMinSamples; + AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); + freeable_protos.push_back(ProtoList); + FreeClusterer(Clusterer); + } + FreeTrainingSamples(CharList); + int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE); + WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]); + FreeNormProtoList(NormProtoList); + for (auto &freeable_proto : freeable_protos) { + FreeProtoList(&freeable_proto); + } + printf("\n"); + return EXIT_SUCCESS; +} // main + +/*---------------------------------------------------------------------------- + Private Code +----------------------------------------------------------------------------*/ + +/*----------------------------------------------------------------------------*/ +/** + * This routine writes the specified samples into files which + * are organized according to the font name and character name + * of the samples. + * @param Directory directory to place sample files into + * @param LabeledProtoList List of labeled protos + * @param feature_desc Description of the features + */ +static void WriteNormProtos(const char *Directory, LIST LabeledProtoList, + const FEATURE_DESC_STRUCT *feature_desc) { + FILE *File; + LABELEDLIST LabeledProto; + int N; + + std::string Filename = ""; + if (Directory != nullptr && Directory[0] != '\0') { + Filename += Directory; + Filename += "/"; + } + Filename += "normproto"; + printf("\nWriting %s ...", Filename.c_str()); + File = fopen(Filename.c_str(), "wb"); + ASSERT_HOST(File); + fprintf(File, "%0d\n", feature_desc->NumParams); + WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc); + iterate(LabeledProtoList) { + LabeledProto = reinterpret_cast<LABELEDLIST>(LabeledProtoList->first_node()); + N = NumberOfProtos(LabeledProto->List, true, false); + if (N < 1) { + printf( + "\nError! Not enough protos for %s: %d protos" + " (%d significant protos" + ", %d insignificant protos)\n", + LabeledProto->Label.c_str(), N, NumberOfProtos(LabeledProto->List, true, false), + NumberOfProtos(LabeledProto->List, false, true)); + exit(1); + } + fprintf(File, "\n%s %d\n", LabeledProto->Label.c_str(), N); + WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false); + } + fclose(File); + +} // WriteNormProtos + +/*-------------------------------------------------------------------------*/ + +static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos, + bool WriteInsigProtos) { + PROTOTYPE *Proto; + + // write prototypes + iterate(ProtoList) { + Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node()); + if ((Proto->Significant && WriteSigProtos) || (!Proto->Significant && WriteInsigProtos)) { + WritePrototype(File, N, Proto); + } + } +} // WriteProtos
