Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/common/ctc.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: it no longer includes a version number after expansion.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: ctc.cpp | |
| 3 // Description: Slightly improved standard CTC to compute the targets. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2016, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 /////////////////////////////////////////////////////////////////////// | |
| 17 | |
| 18 #include "ctc.h" | |
| 19 | |
| 20 #include "matrix.h" | |
| 21 #include "network.h" | |
| 22 #include "networkio.h" | |
| 23 #include "scrollview.h" | |
| 24 | |
| 25 #include <algorithm> | |
| 26 #include <cfloat> // for FLT_MAX | |
| 27 #include <cmath> | |
| 28 #include <memory> | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
// Magic constants that keep CTC stable.
// Minimum probability limit for softmax input to ctc_loss; also used by
// NormalizeProbs as the per-class probability floor.
const float CTC::kMinProb_ = 1e-12;
// Maximum absolute argument to exp(). Presumably enforced by ClippedExp
// (declared in ctc.h) -- confirm against the header.
const double CTC::kMaxExpArg_ = 80.0;
// Minimum probability for total prob in time normalization (NormalizeSequence).
const double CTC::kMinTotalTimeProb_ = 1e-8;
// Minimum probability for total prob in final normalization (NormalizeProbs).
const double CTC::kMinTotalFinalProb_ = 1e-6;
| 41 | |
| 42 // Builds a target using CTC. Slightly improved as follows: | |
| 43 // Includes normalizations and clipping for stability. | |
| 44 // labels should be pre-padded with nulls everywhere. | |
| 45 // labels can be longer than the time sequence, but the total number of | |
| 46 // essential labels (non-null plus nulls between equal labels) must not exceed | |
| 47 // the number of timesteps in outputs. | |
| 48 // outputs is the output of the network, and should have already been | |
| 49 // normalized with NormalizeProbs. | |
| 50 // On return targets is filled with the computed targets. | |
| 51 // Returns false if there is insufficient time for the labels. | |
| 52 /* static */ | |
| 53 bool CTC::ComputeCTCTargets(const std::vector<int> &labels, int null_char, | |
| 54 const GENERIC_2D_ARRAY<float> &outputs, NetworkIO *targets) { | |
| 55 std::unique_ptr<CTC> ctc(new CTC(labels, null_char, outputs)); | |
| 56 if (!ctc->ComputeLabelLimits()) { | |
| 57 return false; // Not enough time. | |
| 58 } | |
| 59 // Generate simple targets purely from the truth labels by spreading them | |
| 60 // evenly over time. | |
| 61 GENERIC_2D_ARRAY<float> simple_targets; | |
| 62 ctc->ComputeSimpleTargets(&simple_targets); | |
| 63 // Add the simple targets as a starter bias to the network outputs. | |
| 64 float bias_fraction = ctc->CalculateBiasFraction(); | |
| 65 simple_targets *= bias_fraction; | |
| 66 ctc->outputs_ += simple_targets; | |
| 67 NormalizeProbs(&ctc->outputs_); | |
| 68 // Run regular CTC on the biased outputs. | |
| 69 // Run forward and backward | |
| 70 GENERIC_2D_ARRAY<double> log_alphas, log_betas; | |
| 71 ctc->Forward(&log_alphas); | |
| 72 ctc->Backward(&log_betas); | |
| 73 // Normalize and come out of log space with a clipped softmax over time. | |
| 74 log_alphas += log_betas; | |
| 75 ctc->NormalizeSequence(&log_alphas); | |
| 76 ctc->LabelsToClasses(log_alphas, targets); | |
| 77 NormalizeProbs(targets); | |
| 78 return true; | |
| 79 } | |
| 80 | |
// Constructor: stores the labels and network outputs (member types are
// declared in ctc.h) and caches the problem dimensions. Presumably only
// reachable via ComputeCTCTargets -- confirm access level in the header.
CTC::CTC(const std::vector<int> &labels, int null_char, const GENERIC_2D_ARRAY<float> &outputs)
    : labels_(labels), outputs_(outputs), null_char_(null_char) {
  num_timesteps_ = outputs.dim1();  // dim1 = one row per timestep.
  num_classes_ = outputs.dim2();    // dim2 = one column per output class.
  num_labels_ = labels_.size();
}
| 87 | |
// Computes vectors of min and max label index for each timestep, based on
// whether skippability of nulls makes it possible to complete a valid path.
// Returns false if the sequence cannot be completed in the available time.
bool CTC::ComputeLabelLimits() {
  min_labels_.clear();
  min_labels_.resize(num_timesteps_, 0);
  max_labels_.clear();
  max_labels_.resize(num_timesteps_, 0);
  // Backward walk: min_u is the lowest label index that can be "current" at
  // timestep t and still leave enough time to finish the sequence.
  int min_u = num_labels_ - 1;
  if (labels_[min_u] == null_char_) {
    // A trailing null is optional, so the last essential label precedes it.
    --min_u;
  }
  for (int t = num_timesteps_ - 1; t >= 0; --t) {
    min_labels_[t] = min_u;
    if (min_u > 0) {
      --min_u;
      // A null between two distinct labels is skippable, so one timestep can
      // step over both the null and the label before it.
      if (labels_[min_u] == null_char_ && min_u > 0 && labels_[min_u + 1] != labels_[min_u - 1]) {
        --min_u;
      }
    }
  }
  // Forward walk: max_u is the highest label index reachable by timestep t.
  // It starts at 1 if the sequence opens with an optional null, else 0.
  int max_u = labels_[0] == null_char_;
  for (int t = 0; t < num_timesteps_; ++t) {
    max_labels_[t] = max_u;
    if (max_labels_[t] < min_labels_[t]) {
      return false; // Not enough room.
    }
    if (max_u + 1 < num_labels_) {
      ++max_u;
      // Same null-skip rule as above, applied in the forward direction.
      if (labels_[max_u] == null_char_ && max_u + 1 < num_labels_ &&
          labels_[max_u + 1] != labels_[max_u - 1]) {
        ++max_u;
      }
    }
  }
  return true;
}
| 124 | |
// Computes targets based purely on the labels by spreading the labels evenly
// over the available timesteps. Each label becomes a triangular bump of
// probability: peak 1.0 at its mean timestep, falling linearly to zero at
// +/- its half width.
void CTC::ComputeSimpleTargets(GENERIC_2D_ARRAY<float> *targets) const {
  // Initialize all targets to zero.
  targets->Resize(num_timesteps_, num_classes_, 0.0f);
  std::vector<float> half_widths;
  std::vector<int> means;
  ComputeWidthsAndMeans(&half_widths, &means);
  for (int l = 0; l < num_labels_; ++l) {
    int label = labels_[l];
    float left_half_width = half_widths[l];
    float right_half_width = left_half_width;
    int mean = means[l];
    if (label == null_char_) {
      if (!NeededNull(l)) {
        // An optional null squeezed onto the same timestep as a neighboring
        // label is dropped rather than competing with that label.
        if ((l > 0 && mean == means[l - 1]) || (l + 1 < num_labels_ && mean == means[l + 1])) {
          continue; // Drop overlapping null.
        }
      }
      // Make sure that no space is left unoccupied and that non-nulls always
      // peak at 1 by stretching nulls to meet their neighbors.
      if (l > 0) {
        left_half_width = mean - means[l - 1];
      }
      if (l + 1 < num_labels_) {
        right_half_width = means[l + 1] - mean;
      }
    }
    if (mean >= 0 && mean < num_timesteps_) {
      targets->put(mean, label, 1.0f);
    }
    // Left slope of the bump; mean >= offset keeps the index non-negative.
    for (int offset = 1; offset < left_half_width && mean >= offset; ++offset) {
      float prob = 1.0f - offset / left_half_width;
      // Take the max so overlapping bumps of the same label don't overwrite
      // a higher value that was already written.
      if (mean - offset < num_timesteps_ && prob > targets->get(mean - offset, label)) {
        targets->put(mean - offset, label, prob);
      }
    }
    // Right slope, clipped at the end of the sequence.
    for (int offset = 1; offset < right_half_width && mean + offset < num_timesteps_; ++offset) {
      float prob = 1.0f - offset / right_half_width;
      if (mean + offset >= 0 && prob > targets->get(mean + offset, label)) {
        targets->put(mean + offset, label, prob);
      }
    }
  }
}
| 170 | |
| 171 // Computes mean positions and half widths of the simple targets by spreading | |
| 172 // the labels evenly over the available timesteps. | |
| 173 void CTC::ComputeWidthsAndMeans(std::vector<float> *half_widths, std::vector<int> *means) const { | |
| 174 // Count the number of labels of each type, in regexp terms, counts plus | |
| 175 // (non-null or necessary null, which must occur at least once) and star | |
| 176 // (optional null). | |
| 177 int num_plus = 0, num_star = 0; | |
| 178 for (int i = 0; i < num_labels_; ++i) { | |
| 179 if (labels_[i] != null_char_ || NeededNull(i)) { | |
| 180 ++num_plus; | |
| 181 } else { | |
| 182 ++num_star; | |
| 183 } | |
| 184 } | |
| 185 // Compute the size for each type. If there is enough space for everything | |
| 186 // to have size>=1, then all are equal, otherwise plus_size=1 and star gets | |
| 187 // whatever is left-over. | |
| 188 float plus_size = 1.0f, star_size = 0.0f; | |
| 189 float total_floating = num_plus + num_star; | |
| 190 if (total_floating <= num_timesteps_) { | |
| 191 plus_size = star_size = num_timesteps_ / total_floating; | |
| 192 } else if (num_star > 0) { | |
| 193 star_size = static_cast<float>(num_timesteps_ - num_plus) / num_star; | |
| 194 } | |
| 195 // Set the width and compute the mean of each. | |
| 196 float mean_pos = 0.0f; | |
| 197 for (int i = 0; i < num_labels_; ++i) { | |
| 198 float half_width; | |
| 199 if (labels_[i] != null_char_ || NeededNull(i)) { | |
| 200 half_width = plus_size / 2.0f; | |
| 201 } else { | |
| 202 half_width = star_size / 2.0f; | |
| 203 } | |
| 204 mean_pos += half_width; | |
| 205 means->push_back(static_cast<int>(mean_pos)); | |
| 206 mean_pos += half_width; | |
| 207 half_widths->push_back(half_width); | |
| 208 } | |
| 209 } | |
| 210 | |
| 211 // Helper returns the index of the highest probability label at timestep t. | |
| 212 static int BestLabel(const GENERIC_2D_ARRAY<float> &outputs, int t) { | |
| 213 int result = 0; | |
| 214 int num_classes = outputs.dim2(); | |
| 215 const float *outputs_t = outputs[t]; | |
| 216 for (int c = 1; c < num_classes; ++c) { | |
| 217 if (outputs_t[c] > outputs_t[result]) { | |
| 218 result = c; | |
| 219 } | |
| 220 } | |
| 221 return result; | |
| 222 } | |
| 223 | |
| 224 // Calculates and returns a suitable fraction of the simple targets to add | |
| 225 // to the network outputs. | |
| 226 float CTC::CalculateBiasFraction() { | |
| 227 // Compute output labels via basic decoding. | |
| 228 std::vector<int> output_labels; | |
| 229 for (int t = 0; t < num_timesteps_; ++t) { | |
| 230 int label = BestLabel(outputs_, t); | |
| 231 while (t + 1 < num_timesteps_ && BestLabel(outputs_, t + 1) == label) { | |
| 232 ++t; | |
| 233 } | |
| 234 if (label != null_char_) { | |
| 235 output_labels.push_back(label); | |
| 236 } | |
| 237 } | |
| 238 // Simple bag of labels error calculation. | |
| 239 std::vector<int> truth_counts(num_classes_); | |
| 240 std::vector<int> output_counts(num_classes_); | |
| 241 for (int l = 0; l < num_labels_; ++l) { | |
| 242 ++truth_counts[labels_[l]]; | |
| 243 } | |
| 244 for (auto l : output_labels) { | |
| 245 ++output_counts[l]; | |
| 246 } | |
| 247 // Count the number of true and false positive non-nulls and truth labels. | |
| 248 int true_pos = 0, false_pos = 0, total_labels = 0; | |
| 249 for (int c = 0; c < num_classes_; ++c) { | |
| 250 if (c == null_char_) { | |
| 251 continue; | |
| 252 } | |
| 253 int truth_count = truth_counts[c]; | |
| 254 int ocr_count = output_counts[c]; | |
| 255 if (truth_count > 0) { | |
| 256 total_labels += truth_count; | |
| 257 if (ocr_count > truth_count) { | |
| 258 true_pos += truth_count; | |
| 259 false_pos += ocr_count - truth_count; | |
| 260 } else { | |
| 261 true_pos += ocr_count; | |
| 262 } | |
| 263 } | |
| 264 // We don't need to count classes that don't exist in the truth as | |
| 265 // false positives, because they don't affect CTC at all. | |
| 266 } | |
| 267 if (total_labels == 0) { | |
| 268 return 0.0f; | |
| 269 } | |
| 270 return exp(std::max(true_pos - false_pos, 1) * std::log(kMinProb_) / total_labels); | |
| 271 } | |
| 272 | |
// Given ln(x) and ln(y), returns ln(x + y).
// Uses the identity ln(x + y) = max + log1p(exp(min - max)) so that the
// exponential's argument is always <= 0, which preserves precision.
static double LogSumExp(double ln_x, double ln_y) {
  const double hi = std::max(ln_x, ln_y);
  const double lo = std::min(ln_x, ln_y);
  return hi + log1p(exp(lo - hi));
}
| 283 | |
// Runs the forward CTC pass, filling in log_probs.
// log_probs(t, u) accumulates the log probability of reaching label index u
// at timestep t over all valid alignments of the prefix.
void CTC::Forward(GENERIC_2D_ARRAY<double> *log_probs) const {
  // -FLT_MAX marks cells on impossible paths; NormalizeSequence later treats
  // that sentinel as exactly zero probability.
  log_probs->Resize(num_timesteps_, num_labels_, -FLT_MAX);
  // Initialization: a path may start on the first label, or on the second if
  // the first is an optional null.
  log_probs->put(0, 0, log(outputs_(0, labels_[0])));
  if (labels_[0] == null_char_) {
    // NOTE(review): assumes num_labels_ >= 2 whenever labels_[0] is null --
    // presumably guaranteed by the null-padded labels contract; confirm.
    log_probs->put(0, 1, log(outputs_(0, labels_[1])));
  }
  for (int t = 1; t < num_timesteps_; ++t) {
    const float *outputs_t = outputs_[t];
    // Only label indices that can lie on a completable path (limits from
    // ComputeLabelLimits) need to be visited.
    for (int u = min_labels_[t]; u <= max_labels_[t]; ++u) {
      // Continuing the same label.
      double log_sum = log_probs->get(t - 1, u);
      // Change from previous label.
      if (u > 0) {
        log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 1));
      }
      // Skip the null if allowed: a null between two distinct labels.
      if (u >= 2 && labels_[u - 1] == null_char_ && labels_[u] != labels_[u - 2]) {
        log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 2));
      }
      // Add in the log prob of the current label.
      double label_prob = outputs_t[labels_[u]];
      log_sum += log(label_prob);
      log_probs->put(t, u, log_sum);
    }
  }
}
| 311 | |
// Runs the backward CTC pass, filling in log_probs.
// Mirror image of Forward: log_probs(t, u) accumulates the log probability
// of completing the label sequence from label index u at timestep t onwards.
void CTC::Backward(GENERIC_2D_ARRAY<double> *log_probs) const {
  log_probs->Resize(num_timesteps_, num_labels_, -FLT_MAX);
  // Initialization: valid paths end on the last label, or the one before it
  // if the last is an optional null. log(1) = 0.
  log_probs->put(num_timesteps_ - 1, num_labels_ - 1, 0.0);
  if (labels_[num_labels_ - 1] == null_char_) {
    log_probs->put(num_timesteps_ - 1, num_labels_ - 2, 0.0);
  }
  for (int t = num_timesteps_ - 2; t >= 0; --t) {
    const float *outputs_tp1 = outputs_[t + 1];
    for (int u = min_labels_[t]; u <= max_labels_[t]; ++u) {
      // Continuing the same label. Unlike Forward, the emission probability
      // used here is that of the NEXT timestep's label on each transition.
      double log_sum = log_probs->get(t + 1, u) + std::log(outputs_tp1[labels_[u]]);
      // Change from previous label.
      if (u + 1 < num_labels_) {
        double prev_prob = outputs_tp1[labels_[u + 1]];
        log_sum = LogSumExp(log_sum, log_probs->get(t + 1, u + 1) + log(prev_prob));
      }
      // Skip the null if allowed: a null between two distinct labels.
      if (u + 2 < num_labels_ && labels_[u + 1] == null_char_ && labels_[u] != labels_[u + 2]) {
        double skip_prob = outputs_tp1[labels_[u + 2]];
        log_sum = LogSumExp(log_sum, log_probs->get(t + 1, u + 2) + log(skip_prob));
      }
      log_probs->put(t, u, log_sum);
    }
  }
}
| 338 | |
| 339 // Normalizes and brings probs out of log space with a softmax over time. | |
| 340 void CTC::NormalizeSequence(GENERIC_2D_ARRAY<double> *probs) const { | |
| 341 double max_logprob = probs->Max(); | |
| 342 for (int u = 0; u < num_labels_; ++u) { | |
| 343 double total = 0.0; | |
| 344 for (int t = 0; t < num_timesteps_; ++t) { | |
| 345 // Separate impossible path from unlikely probs. | |
| 346 double prob = probs->get(t, u); | |
| 347 if (prob > -FLT_MAX) { | |
| 348 prob = ClippedExp(prob - max_logprob); | |
| 349 } else { | |
| 350 prob = 0.0; | |
| 351 } | |
| 352 total += prob; | |
| 353 probs->put(t, u, prob); | |
| 354 } | |
| 355 // Note that although this is a probability distribution over time and | |
| 356 // therefore should sum to 1, it is important to allow some labels to be | |
| 357 // all zero, (or at least tiny) as it is necessary to skip some blanks. | |
| 358 if (total < kMinTotalTimeProb_) { | |
| 359 total = kMinTotalTimeProb_; | |
| 360 } | |
| 361 for (int t = 0; t < num_timesteps_; ++t) { | |
| 362 probs->put(t, u, probs->get(t, u) / total); | |
| 363 } | |
| 364 } | |
| 365 } | |
| 366 | |
| 367 // For each timestep computes the max prob for each class over all | |
| 368 // instances of the class in the labels_, and sets the targets to | |
| 369 // the max observed prob. | |
| 370 void CTC::LabelsToClasses(const GENERIC_2D_ARRAY<double> &probs, NetworkIO *targets) const { | |
| 371 // For each timestep compute the max prob for each class over all | |
| 372 // instances of the class in the labels_. | |
| 373 for (int t = 0; t < num_timesteps_; ++t) { | |
| 374 float *targets_t = targets->f(t); | |
| 375 std::vector<double> class_probs(num_classes_); | |
| 376 for (int u = 0; u < num_labels_; ++u) { | |
| 377 double prob = probs(t, u); | |
| 378 // Note that although Graves specifies sum over all labels of the same | |
| 379 // class, we need to allow skipped blanks to go to zero, so they don't | |
| 380 // interfere with the non-blanks, so max is better than sum. | |
| 381 if (prob > class_probs[labels_[u]]) { | |
| 382 class_probs[labels_[u]] = prob; | |
| 383 } | |
| 384 // class_probs[labels_[u]] += prob; | |
| 385 } | |
| 386 int best_class = 0; | |
| 387 for (int c = 0; c < num_classes_; ++c) { | |
| 388 targets_t[c] = class_probs[c]; | |
| 389 if (class_probs[c] > class_probs[best_class]) { | |
| 390 best_class = c; | |
| 391 } | |
| 392 } | |
| 393 } | |
| 394 } | |
| 395 | |
// Normalizes the probabilities such that no target has a prob below min_prob,
// and, provided that the initial total is at least min_total_prob, then all
// probs will sum to 1, otherwise to sum/min_total_prob. The maximum output
// probability is thus 1 - (num_classes-1)*min_prob.
/* static */
void CTC::NormalizeProbs(GENERIC_2D_ARRAY<float> *probs) {
  int num_timesteps = probs->dim1();
  int num_classes = probs->dim2();
  // Each timestep (row) is normalized independently.
  for (int t = 0; t < num_timesteps; ++t) {
    float *probs_t = (*probs)[t];
    // Compute the total and clip that to prevent amplification of noise.
    double total = 0.0;
    for (int c = 0; c < num_classes; ++c) {
      total += probs_t[c];
    }
    if (total < kMinTotalFinalProb_) {
      total = kMinTotalFinalProb_;
    }
    // Compute the increased total as a result of clipping: every prob that
    // would fall below kMinProb_ after division gets raised to kMinProb_,
    // adding that much probability mass to the row.
    double increment = 0.0;
    for (int c = 0; c < num_classes; ++c) {
      double prob = probs_t[c] / total;
      if (prob < kMinProb_) {
        increment += kMinProb_ - prob;
      }
    }
    // Now normalize with clipping. Any additional clipping is negligible.
    // Enlarging the divisor by the added mass keeps the row sum near 1.
    total += increment;
    for (int c = 0; c < num_classes; ++c) {
      float prob = probs_t[c] / total;
      probs_t[c] = std::max(prob, kMinProb_);
    }
  }
}
| 430 | |
| 431 // Returns true if the label at index is a needed null. | |
| 432 bool CTC::NeededNull(int index) const { | |
| 433 return labels_[index] == null_char_ && index > 0 && index + 1 < num_labels_ && | |
| 434 labels_[index + 1] == labels_[index - 1]; | |
| 435 } | |
| 436 | |
| 437 } // namespace tesseract |
