Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/wordrec/params_model.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: params_model.cpp | |
| 3 // Description: Trained language model parameters. | |
| 4 // Author: David Eger | |
| 5 // | |
| 6 // (C) Copyright 2012, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #include "params_model.h" | |
| 20 | |
| 21 #include <cctype> | |
| 22 #include <cmath> | |
| 23 #include <cstdio> | |
| 24 | |
| 25 #include "bitvector.h" | |
| 26 #include "helpers.h" // for ClipToRange | |
| 27 #include "serialis.h" // for TFile | |
| 28 #include "tprintf.h" | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
// Divisor applied to the negated weighted feature sum in ComputeCost.
static constexpr float kScoreScaleFactor = 100.0f;
// Lower clipping bound for the cost returned by ComputeCost.
static constexpr float kMinFinalCost = 0.001f;
// Upper clipping bound for the cost returned by ComputeCost.
static constexpr float kMaxFinalCost = 100.0f;
| 38 | |
| 39 void ParamsModel::Print() { | |
| 40 for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) { | |
| 41 tprintf("ParamsModel for pass %d lang %s\n", p, lang_.c_str()); | |
| 42 for (unsigned i = 0; i < weights_vec_[p].size(); ++i) { | |
| 43 tprintf("%s = %g\n", kParamsTrainingFeatureTypeName[i], weights_vec_[p][i]); | |
| 44 } | |
| 45 } | |
| 46 } | |
| 47 | |
| 48 void ParamsModel::Copy(const ParamsModel &other_model) { | |
| 49 for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) { | |
| 50 weights_vec_[p] = other_model.weights_for_pass(static_cast<PassEnum>(p)); | |
| 51 } | |
| 52 } | |
| 53 | |
| 54 // Given a (modifiable) line, parse out a key / value pair. | |
| 55 // Return true on success. | |
| 56 bool ParamsModel::ParseLine(char *line, char **key, float *val) { | |
| 57 if (line[0] == '#') { | |
| 58 return false; | |
| 59 } | |
| 60 int end_of_key = 0; | |
| 61 while (line[end_of_key] && !(isascii(line[end_of_key]) && isspace(line[end_of_key]))) { | |
| 62 end_of_key++; | |
| 63 } | |
| 64 if (!line[end_of_key]) { | |
| 65 tprintf("ParamsModel::Incomplete line %s\n", line); | |
| 66 return false; | |
| 67 } | |
| 68 line[end_of_key++] = 0; | |
| 69 *key = line; | |
| 70 if (sscanf(line + end_of_key, " %f", val) != 1) { | |
| 71 return false; | |
| 72 } | |
| 73 return true; | |
| 74 } | |
| 75 | |
| 76 // Applies params model weights to the given features. | |
| 77 // Assumes that features is an array of size PTRAIN_NUM_FEATURE_TYPES. | |
| 78 // The cost is set to a number that can be multiplied by the outline length, | |
| 79 // as with the old ratings scheme. This enables words of different length | |
| 80 // and combinations of words to be compared meaningfully. | |
| 81 float ParamsModel::ComputeCost(const float features[]) const { | |
| 82 float unnorm_score = 0.0; | |
| 83 for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) { | |
| 84 unnorm_score += weights_vec_[pass_][f] * features[f]; | |
| 85 } | |
| 86 return ClipToRange(-unnorm_score / kScoreScaleFactor, kMinFinalCost, kMaxFinalCost); | |
| 87 } | |
| 88 | |
| 89 bool ParamsModel::Equivalent(const ParamsModel &that) const { | |
| 90 float epsilon = 0.0001f; | |
| 91 for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) { | |
| 92 if (weights_vec_[p].size() != that.weights_vec_[p].size()) { | |
| 93 return false; | |
| 94 } | |
| 95 for (unsigned i = 0; i < weights_vec_[p].size(); i++) { | |
| 96 if (weights_vec_[p][i] != that.weights_vec_[p][i] && | |
| 97 std::fabs(weights_vec_[p][i] - that.weights_vec_[p][i]) > epsilon) { | |
| 98 return false; | |
| 99 } | |
| 100 } | |
| 101 } | |
| 102 return true; | |
| 103 } | |
| 104 | |
| 105 bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) { | |
| 106 const int kMaxLineSize = 100; | |
| 107 char line[kMaxLineSize]; | |
| 108 BitVector present; | |
| 109 present.Init(PTRAIN_NUM_FEATURE_TYPES); | |
| 110 lang_ = lang; | |
| 111 // Load weights for passes with adaption on. | |
| 112 std::vector<float> &weights = weights_vec_[pass_]; | |
| 113 weights.clear(); | |
| 114 weights.resize(PTRAIN_NUM_FEATURE_TYPES, 0.0f); | |
| 115 | |
| 116 while (fp->FGets(line, kMaxLineSize) != nullptr) { | |
| 117 char *key = nullptr; | |
| 118 float value; | |
| 119 if (!ParseLine(line, &key, &value)) { | |
| 120 continue; | |
| 121 } | |
| 122 int idx = ParamsTrainingFeatureByName(key); | |
| 123 if (idx < 0) { | |
| 124 tprintf("ParamsModel::Unknown parameter %s\n", key); | |
| 125 continue; | |
| 126 } | |
| 127 if (!present[idx]) { | |
| 128 present.SetValue(idx, true); | |
| 129 } | |
| 130 weights[idx] = value; | |
| 131 } | |
| 132 bool complete = (present.NumSetBits() == PTRAIN_NUM_FEATURE_TYPES); | |
| 133 if (!complete) { | |
| 134 for (int i = 0; i < PTRAIN_NUM_FEATURE_TYPES; i++) { | |
| 135 if (!present[i]) { | |
| 136 tprintf("Missing field %s.\n", kParamsTrainingFeatureTypeName[i]); | |
| 137 } | |
| 138 } | |
| 139 lang_ = ""; | |
| 140 weights.clear(); | |
| 141 } | |
| 142 return complete; | |
| 143 } | |
| 144 | |
| 145 bool ParamsModel::SaveToFile(const char *full_path) const { | |
| 146 const std::vector<float> &weights = weights_vec_[pass_]; | |
| 147 if (weights.size() != PTRAIN_NUM_FEATURE_TYPES) { | |
| 148 tprintf("Refusing to save ParamsModel that has not been initialized.\n"); | |
| 149 return false; | |
| 150 } | |
| 151 FILE *fp = fopen(full_path, "wb"); | |
| 152 if (!fp) { | |
| 153 tprintf("Could not open %s for writing.\n", full_path); | |
| 154 return false; | |
| 155 } | |
| 156 bool all_good = true; | |
| 157 for (unsigned i = 0; i < weights.size(); i++) { | |
| 158 if (fprintf(fp, "%s %f\n", kParamsTrainingFeatureTypeName[i], weights[i]) < 0) { | |
| 159 all_good = false; | |
| 160 } | |
| 161 } | |
| 162 fclose(fp); | |
| 163 return all_good; | |
| 164 } | |
| 165 | |
| 166 } // namespace tesseract |
