Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/classify/normmatch.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 ** Filename: normmatch.c | |
| 3 ** Purpose: Simple matcher based on character normalization features. | |
| 4 ** Author: Dan Johnson | |
| 5 ** | |
| 6 ** (c) Copyright Hewlett-Packard Company, 1988. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 ******************************************************************************/ | |
| 17 /*---------------------------------------------------------------------------- | |
| 18 Include Files and Type Defines | |
| 19 ----------------------------------------------------------------------------*/ | |
| 20 #include "normmatch.h" | |
| 21 | |
| 22 #include "classify.h" | |
| 23 #include "clusttool.h" | |
| 24 #include "helpers.h" | |
| 25 #include "normfeat.h" | |
| 26 #include "params.h" | |
| 27 #include "unicharset.h" | |
| 28 | |
| 29 #include <cmath> | |
| 30 #include <cstdio> | |
| 31 #include <sstream> // for std::istringstream | |
| 32 | |
| 33 namespace tesseract { | |
| 34 | |
| 35 struct NORM_PROTOS { | |
| 36 NORM_PROTOS(size_t n) : NumProtos(n), Protos(n) { | |
| 37 } | |
| 38 int NumParams = 0; | |
| 39 int NumProtos; | |
| 40 PARAM_DESC *ParamDesc = nullptr; | |
| 41 std::vector<LIST> Protos; | |
| 42 }; | |
| 43 | |
| 44 /*---------------------------------------------------------------------------- | |
| 45 Private Code | |
| 46 ----------------------------------------------------------------------------*/ | |
| 47 | |
| 48 /** | |
| 49 * @name NormEvidenceOf | |
| 50 * | |
| 51 * Return the new type of evidence number corresponding to this | |
| 52 * normalization adjustment. The equation that represents the transform is: | |
| 53 * 1 / (1 + (NormAdj / midpoint) ^ curl) | |
| 54 */ | |
| 55 static float NormEvidenceOf(float NormAdj) { | |
| 56 NormAdj /= static_cast<float>(classify_norm_adj_midpoint); | |
| 57 | |
| 58 if (classify_norm_adj_curl == 3) { | |
| 59 NormAdj = NormAdj * NormAdj * NormAdj; | |
| 60 } else if (classify_norm_adj_curl == 2) { | |
| 61 NormAdj = NormAdj * NormAdj; | |
| 62 } else { | |
| 63 NormAdj = std::pow(NormAdj, static_cast<float>(classify_norm_adj_curl)); | |
| 64 } | |
| 65 return (1 / (1 + NormAdj)); | |
| 66 } | |
| 67 | |
| 68 /*---------------------------------------------------------------------------- | |
| 69 Variables | |
| 70 ----------------------------------------------------------------------------*/ | |
| 71 | |
| 72 /** control knobs used to control the normalization adjustment process */ | |
| 73 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ..."); | |
| 74 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ..."); | |
| 75 /** Weight of width variance against height and vertical position. */ | |
| 76 const float kWidthErrorWeighting = 0.125f; | |
| 77 | |
| 78 /*---------------------------------------------------------------------------- | |
| 79 Public Code | |
| 80 ----------------------------------------------------------------------------*/ | |
| 81 /** | |
| 82 * This routine compares Features against each character | |
| 83 * normalization proto for ClassId and returns the match | |
| 84 * rating of the best match. | |
| 85 * @param ClassId id of class to match against | |
| 86 * @param feature character normalization feature | |
| 87 * @param DebugMatch controls dump of debug info | |
| 88 * | |
| 89 * Globals: | |
| 90 * #NormProtos character normalization prototypes | |
| 91 * | |
| 92 * @return Best match rating for Feature against protos of ClassId. | |
| 93 */ | |
| 94 float Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) { | |
| 95 if (ClassId >= NormProtos->NumProtos) { | |
| 96 ClassId = NO_CLASS; | |
| 97 } | |
| 98 | |
| 99 /* handle requests for classification as noise */ | |
| 100 if (ClassId == NO_CLASS) { | |
| 101 /* kludge - clean up constants and make into control knobs later */ | |
| 102 float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f + | |
| 103 feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f + | |
| 104 feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f); | |
| 105 return (1 - NormEvidenceOf(Match)); | |
| 106 } | |
| 107 | |
| 108 if (DebugMatch) { | |
| 109 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId)); | |
| 110 } | |
| 111 | |
| 112 LIST Protos = NormProtos->Protos[ClassId]; | |
| 113 if (Protos == nullptr) { | |
| 114 // Avoid FP overflow in NormEvidenceOf. | |
| 115 return 1.0f; | |
| 116 } | |
| 117 | |
| 118 float BestMatch = FLT_MAX; | |
| 119 iterate(Protos) { | |
| 120 auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node()); | |
| 121 float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY]; | |
| 122 float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY]; | |
| 123 if (DebugMatch) { | |
| 124 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormY], Delta, | |
| 125 Proto->Weight.Elliptical[CharNormY], Match); | |
| 126 } | |
| 127 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx]; | |
| 128 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx]; | |
| 129 if (DebugMatch) { | |
| 130 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormRx], Delta, | |
| 131 Proto->Weight.Elliptical[CharNormRx], Match); | |
| 132 } | |
| 133 // Ry is width! See intfx.cpp. | |
| 134 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy]; | |
| 135 if (DebugMatch) { | |
| 136 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", Proto->Mean[CharNormRy], Delta, | |
| 137 Proto->Weight.Elliptical[CharNormRy]); | |
| 138 } | |
| 139 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy]; | |
| 140 Delta *= kWidthErrorWeighting; | |
| 141 Match += Delta; | |
| 142 if (DebugMatch) { | |
| 143 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", Match, | |
| 144 Match / classify_norm_adj_midpoint, NormEvidenceOf(Match), | |
| 145 256 * (1 - NormEvidenceOf(Match))); | |
| 146 } | |
| 147 | |
| 148 if (Match < BestMatch) { | |
| 149 BestMatch = Match; | |
| 150 } | |
| 151 } | |
| 152 return 1 - NormEvidenceOf(BestMatch); | |
| 153 } /* ComputeNormMatch */ | |
| 154 | |
| 155 void Classify::FreeNormProtos() { | |
| 156 if (NormProtos != nullptr) { | |
| 157 for (int i = 0; i < NormProtos->NumProtos; i++) { | |
| 158 FreeProtoList(&NormProtos->Protos[i]); | |
| 159 } | |
| 160 delete[] NormProtos->ParamDesc; | |
| 161 delete NormProtos; | |
| 162 NormProtos = nullptr; | |
| 163 } | |
| 164 } | |
| 165 | |
| 166 /** | |
| 167 * This routine allocates a new data structure to hold | |
| 168 * a set of character normalization protos. It then fills in | |
| 169 * the data structure by reading from the specified File. | |
| 170 * @param fp open text file to read normalization protos from | |
| 171 * Globals: none | |
| 172 * @return Character normalization protos. | |
| 173 */ | |
| 174 NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) { | |
| 175 char unichar[2 * UNICHAR_LEN + 1]; | |
| 176 UNICHAR_ID unichar_id; | |
| 177 LIST Protos; | |
| 178 int NumProtos; | |
| 179 | |
| 180 /* allocate and initialization data structure */ | |
| 181 auto NormProtos = new NORM_PROTOS(unicharset.size()); | |
| 182 | |
| 183 /* read file header and save in data structure */ | |
| 184 NormProtos->NumParams = ReadSampleSize(fp); | |
| 185 NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams); | |
| 186 | |
| 187 /* read protos for each class into a separate list */ | |
| 188 const int kMaxLineSize = 100; | |
| 189 char line[kMaxLineSize]; | |
| 190 while (fp->FGets(line, kMaxLineSize) != nullptr) { | |
| 191 std::istringstream stream(line); | |
| 192 stream.imbue(std::locale::classic()); | |
| 193 stream >> unichar >> NumProtos; | |
| 194 if (stream.fail()) { | |
| 195 continue; | |
| 196 } | |
| 197 if (unicharset.contains_unichar(unichar)) { | |
| 198 unichar_id = unicharset.unichar_to_id(unichar); | |
| 199 Protos = NormProtos->Protos[unichar_id]; | |
| 200 for (int i = 0; i < NumProtos; i++) { | |
| 201 Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams)); | |
| 202 } | |
| 203 NormProtos->Protos[unichar_id] = Protos; | |
| 204 } else { | |
| 205 tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar); | |
| 206 for (int i = 0; i < NumProtos; i++) { | |
| 207 FreePrototype(ReadPrototype(fp, NormProtos->NumParams)); | |
| 208 } | |
| 209 } | |
| 210 } | |
| 211 return NormProtos; | |
| 212 } /* ReadNormProtos */ | |
| 213 | |
| 214 } // namespace tesseract |
