Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/cntraining.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 ** Filename: cntraining.cpp | |
| 3 ** Purpose: Generates a normproto and pffmtable. | |
| 4 ** Author: Dan Johnson | |
| 5 ** Revisment: Christy Russon | |
| 6 ** | |
| 7 ** (c) Copyright Hewlett-Packard Company, 1988. | |
| 8 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 ** you may not use this file except in compliance with the License. | |
| 10 ** You may obtain a copy of the License at | |
| 11 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 ** Unless required by applicable law or agreed to in writing, software | |
| 13 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 ** See the License for the specific language governing permissions and | |
| 16 ** limitations under the License. | |
| 17 ******************************************************************************/ | |
| 18 | |
| 19 /*---------------------------------------------------------------------------- | |
| 20 Include Files and Type Defines | |
| 21 ----------------------------------------------------------------------------*/ | |
| 22 #include <tesseract/unichar.h> | |
| 23 #include <cmath> | |
| 24 #include <cstdio> | |
| 25 #include <cstring> | |
| 26 #include "cluster.h" | |
| 27 #include "clusttool.h" | |
| 28 #include "commontraining.h" | |
| 29 #include "featdefs.h" | |
| 30 #include "ocrfeatures.h" | |
| 31 #include "oldlist.h" | |
| 32 | |
| 33 #define PROGRAM_FEATURE_TYPE "cn" | |
| 34 | |
| 35 using namespace tesseract; | |
| 36 | |
| 37 /*---------------------------------------------------------------------------- | |
| 38 Private Function Prototypes | |
| 39 ----------------------------------------------------------------------------*/ | |
| 40 | |
| 41 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList, | |
| 42 const FEATURE_DESC_STRUCT *feature_desc); | |
| 43 | |
| 44 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos, | |
| 45 bool WriteInsigProtos); | |
| 46 | |
| 47 /*---------------------------------------------------------------------------- | |
| 48 Global Data Definitions and Declarations | |
| 49 ----------------------------------------------------------------------------*/ | |
/* Default clustering parameters for this program. main() copies this value
   into the global Config before the command line is parsed. */
// Equivalent command line settings: -M 0.025 -B 0.05 -I 0.8 -C 1e-3
// NOTE(review): the initializer is positional; the member order of
// CLUSTERCONFIG is declared outside this file - confirm it before editing
// or reordering any of the values below.
static const CLUSTERCONFIG CNConfig = {elliptical, 0.025, 0.05, 0.8, 1e-3, 0};
| 53 | |
| 54 /*---------------------------------------------------------------------------- | |
| 55 Public Code | |
| 56 ----------------------------------------------------------------------------*/ | |
| 57 | |
| 58 /** | |
| 59 * This program reads in a text file consisting of feature | |
| 60 * samples from a training page in the following format: | |
| 61 * @verbatim | |
| 62 FontName CharName NumberOfFeatureTypes(N) | |
| 63 FeatureTypeName1 NumberOfFeatures(M) | |
| 64 Feature1 | |
| 65 ... | |
| 66 FeatureM | |
| 67 FeatureTypeName2 NumberOfFeatures(M) | |
| 68 Feature1 | |
| 69 ... | |
| 70 FeatureM | |
| 71 ... | |
| 72 FeatureTypeNameN NumberOfFeatures(M) | |
| 73 Feature1 | |
| 74 ... | |
| 75 FeatureM | |
| 76 FontName CharName ... | |
| 77 @endverbatim | |
| 78 * It then appends these samples into a separate file for each | |
| 79 * character. The name of the file is | |
| 80 * | |
| 81 * DirectoryName/FontName/CharName.FeatureTypeName | |
| 82 * | |
| 83 * The DirectoryName can be specified via a command | |
| 84 * line argument. If not specified, it defaults to the | |
| 85 * current directory. The format of the resulting files is: | |
| 86 * @verbatim | |
| 87 NumberOfFeatures(M) | |
| 88 Feature1 | |
| 89 ... | |
| 90 FeatureM | |
| 91 NumberOfFeatures(M) | |
| 92 ... | |
| 93 @endverbatim | |
| 94 * The output files each have a header which describes the | |
| 95 * type of feature which the file contains. This header is | |
| 96 * in the format required by the clusterer. A command line | |
| 97 * argument can also be used to specify that only the first | |
| 98 * N samples of each class should be used. | |
| 99 * @param argc number of command line arguments | |
| 100 * @param argv array of command line arguments | |
| 101 * @return 0 on success | |
| 102 */ | |
| 103 int main(int argc, char *argv[]) { | |
| 104 tesseract::CheckSharedLibraryVersion(); | |
| 105 | |
| 106 // Set the global Config parameters before parsing the command line. | |
| 107 Config = CNConfig; | |
| 108 | |
| 109 LIST CharList = NIL_LIST; | |
| 110 CLUSTERER *Clusterer = nullptr; | |
| 111 LIST ProtoList = NIL_LIST; | |
| 112 LIST NormProtoList = NIL_LIST; | |
| 113 LIST pCharList; | |
| 114 LABELEDLIST CharSample; | |
| 115 FEATURE_DEFS_STRUCT FeatureDefs; | |
| 116 InitFeatureDefs(&FeatureDefs); | |
| 117 | |
| 118 ParseArguments(&argc, &argv); | |
| 119 #if !defined(NDEBUG) | |
| 120 int num_fonts = 0; | |
| 121 #endif | |
| 122 for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) { | |
| 123 printf("Reading %s ...\n", PageName); | |
| 124 FILE *TrainingPage = fopen(PageName, "rb"); | |
| 125 ASSERT_HOST(TrainingPage); | |
| 126 if (TrainingPage) { | |
| 127 ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage, &CharList); | |
| 128 fclose(TrainingPage); | |
| 129 #if !defined(NDEBUG) | |
| 130 ++num_fonts; | |
| 131 #endif | |
| 132 } | |
| 133 } | |
| 134 printf("Clustering ...\n"); | |
| 135 // To allow an individual font to form a separate cluster, | |
| 136 // reduce the min samples: | |
| 137 // Config.MinSamples = 0.5 / num_fonts; | |
| 138 pCharList = CharList; | |
| 139 // The norm protos will count the source protos, so we keep them here in | |
| 140 // freeable_protos, so they can be freed later. | |
| 141 std::vector<LIST> freeable_protos; | |
| 142 iterate(pCharList) { | |
| 143 // Cluster | |
| 144 CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node()); | |
| 145 Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); | |
| 146 if (Clusterer == nullptr) { // To avoid a SIGSEGV | |
| 147 fprintf(stderr, "Error: nullptr clusterer!\n"); | |
| 148 return EXIT_FAILURE; | |
| 149 } | |
| 150 float SavedMinSamples = Config.MinSamples; | |
| 151 // To disable the tendency to produce a single cluster for all fonts, | |
| 152 // make MagicSamples an impossible to achieve number: | |
| 153 // Config.MagicSamples = CharSample->SampleCount * 10; | |
| 154 Config.MagicSamples = CharSample->SampleCount; | |
| 155 while (Config.MinSamples > 0.001) { | |
| 156 ProtoList = ClusterSamples(Clusterer, &Config); | |
| 157 if (NumberOfProtos(ProtoList, true, false) > 0) { | |
| 158 break; | |
| 159 } else { | |
| 160 Config.MinSamples *= 0.95; | |
| 161 printf( | |
| 162 "0 significant protos for %s." | |
| 163 " Retrying clustering with MinSamples = %f%%\n", | |
| 164 CharSample->Label.c_str(), Config.MinSamples); | |
| 165 } | |
| 166 } | |
| 167 Config.MinSamples = SavedMinSamples; | |
| 168 AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); | |
| 169 freeable_protos.push_back(ProtoList); | |
| 170 FreeClusterer(Clusterer); | |
| 171 } | |
| 172 FreeTrainingSamples(CharList); | |
| 173 int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE); | |
| 174 WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]); | |
| 175 FreeNormProtoList(NormProtoList); | |
| 176 for (auto &freeable_proto : freeable_protos) { | |
| 177 FreeProtoList(&freeable_proto); | |
| 178 } | |
| 179 printf("\n"); | |
| 180 return EXIT_SUCCESS; | |
| 181 } // main | |
| 182 | |
| 183 /*---------------------------------------------------------------------------- | |
| 184 Private Code | |
| 185 ----------------------------------------------------------------------------*/ | |
| 186 | |
| 187 /*----------------------------------------------------------------------------*/ | |
| 188 /** | |
| 189 * This routine writes the specified samples into files which | |
| 190 * are organized according to the font name and character name | |
| 191 * of the samples. | |
| 192 * @param Directory directory to place sample files into | |
| 193 * @param LabeledProtoList List of labeled protos | |
| 194 * @param feature_desc Description of the features | |
| 195 */ | |
| 196 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList, | |
| 197 const FEATURE_DESC_STRUCT *feature_desc) { | |
| 198 FILE *File; | |
| 199 LABELEDLIST LabeledProto; | |
| 200 int N; | |
| 201 | |
| 202 std::string Filename = ""; | |
| 203 if (Directory != nullptr && Directory[0] != '\0') { | |
| 204 Filename += Directory; | |
| 205 Filename += "/"; | |
| 206 } | |
| 207 Filename += "normproto"; | |
| 208 printf("\nWriting %s ...", Filename.c_str()); | |
| 209 File = fopen(Filename.c_str(), "wb"); | |
| 210 ASSERT_HOST(File); | |
| 211 fprintf(File, "%0d\n", feature_desc->NumParams); | |
| 212 WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc); | |
| 213 iterate(LabeledProtoList) { | |
| 214 LabeledProto = reinterpret_cast<LABELEDLIST>(LabeledProtoList->first_node()); | |
| 215 N = NumberOfProtos(LabeledProto->List, true, false); | |
| 216 if (N < 1) { | |
| 217 printf( | |
| 218 "\nError! Not enough protos for %s: %d protos" | |
| 219 " (%d significant protos" | |
| 220 ", %d insignificant protos)\n", | |
| 221 LabeledProto->Label.c_str(), N, NumberOfProtos(LabeledProto->List, true, false), | |
| 222 NumberOfProtos(LabeledProto->List, false, true)); | |
| 223 exit(1); | |
| 224 } | |
| 225 fprintf(File, "\n%s %d\n", LabeledProto->Label.c_str(), N); | |
| 226 WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false); | |
| 227 } | |
| 228 fclose(File); | |
| 229 | |
| 230 } // WriteNormProtos | |
| 231 | |
| 232 /*-------------------------------------------------------------------------*/ | |
| 233 | |
| 234 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos, | |
| 235 bool WriteInsigProtos) { | |
| 236 PROTOTYPE *Proto; | |
| 237 | |
| 238 // write prototypes | |
| 239 iterate(ProtoList) { | |
| 240 Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node()); | |
| 241 if ((Proto->Significant && WriteSigProtos) || (!Proto->Significant && WriteInsigProtos)) { | |
| 242 WritePrototype(File, N, Proto); | |
| 243 } | |
| 244 } | |
| 245 } // WriteProtos |
