Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/common/errorcounter.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 // | |
| 4 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 // you may not use this file except in compliance with the License. | |
| 6 // You may obtain a copy of the License at | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // Unless required by applicable law or agreed to in writing, software | |
| 9 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 11 // See the License for the specific language governing permissions and | |
| 12 // limitations under the License. | |
| 13 // | |
| 14 /////////////////////////////////////////////////////////////////////// | |
| 15 | |
| 16 #ifdef HAVE_CONFIG_H | |
| 17 # include "config_auto.h" | |
| 18 #endif | |
| 19 | |
| 20 #include "errorcounter.h" | |
| 21 | |
| 22 #include "fontinfo.h" | |
| 23 #include "sampleiterator.h" | |
| 24 #include "shapeclassifier.h" | |
| 25 #include "shapetable.h" | |
| 26 #include "tesserrstream.h" | |
| 27 #include "trainingsample.h" | |
| 28 #include "trainingsampleset.h" | |
| 29 #include "unicity_table.h" | |
| 30 | |
| 31 #include <algorithm> | |
| 32 #include <ctime> | |
| 33 | |
| 34 namespace tesseract { | |
| 35 | |
// Difference in result rating to be thought of as an "equal" choice.
// Results whose ratings are within this of each other share the same
// "epsilon rank" when counting top-n errors (see AccumulateErrors).
const double kRatingEpsilon = 1.0 / 32;
| 38 | |
// Tests a classifier, computing its error rate.
// See errorcounter.h for description of arguments.
// Iterates over the samples, calling the classifier in normal/silent mode.
// If the classifier makes a CT_UNICHAR_TOPN_ERR error, and the appropriate
// report_level is set (4 or greater), it will then call the classifier again
// with a debug flag and a keep_this argument to find out what is going on.
double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_level,
                                      CountTypes boosting_mode, const FontInfoTable &fontinfo_table,
                                      const std::vector<Image > &page_images, SampleIterator *it,
                                      double *unichar_error, double *scaled_error,
                                      std::string *fonts_report) {
  const int fontsize = it->sample_set()->NumFonts();
  // A fresh counter accumulates everything for this run; results are pulled
  // out of it at the end via ReportErrors and scaled_error_.
  ErrorCounter counter(classifier->GetUnicharset(), fontsize);
  std::vector<UnicharRating> results;

  clock_t total_time = 0;
  if (report_level > 1) {
    // Record the start time so total classification time can be reported.
    total_time = clock();
  }
  unsigned total_samples = 0;
  double unscaled_error = 0.0;
  // Set a number of samples on which to run the classify debug mode.
  int error_samples = report_level > 3 ? report_level * report_level : 0;
  // Iterate over all the samples, accumulating errors.
  for (it->Begin(); !it->AtEnd(); it->Next()) {
    TrainingSample *mutable_sample = it->MutableSample();
    int page_index = mutable_sample->page_num();
    // Samples with an out-of-range page index get a null page image.
    Image page_pix =
        0 <= page_index && page_index < page_images.size() ? page_images[page_index] : nullptr;
    // No debug, no keep this.
    classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID, &results);
    bool debug_it = false;
    int correct_id = mutable_sample->class_id();
    if (counter.unicharset_.has_special_codes() &&
        (correct_id == UNICHAR_SPACE || correct_id == UNICHAR_JOINED ||
         correct_id == UNICHAR_BROKEN)) {
      // This is junk so use the special counter.
      debug_it = counter.AccumulateJunk(report_level > 3, results, mutable_sample);
    } else {
      debug_it = counter.AccumulateErrors(report_level > 3, boosting_mode, fontinfo_table, results,
                                          mutable_sample);
    }
    if (debug_it && error_samples > 0) {
      // Running debug, keep the correct answer, and debug the classifier.
      tprintf("Error on sample %d: %s Classifier debug output:\n", it->GlobalSampleIndex(),
              it->sample_set()->SampleToString(*mutable_sample).c_str());
#ifndef GRAPHICS_DISABLED
      classifier->DebugDisplay(*mutable_sample, page_pix, correct_id);
#endif
      --error_samples;
    }
    ++total_samples;
  }
  // Create the appropriate error report.
  unscaled_error = counter.ReportErrors(report_level, boosting_mode, fontinfo_table, *it,
                                        unichar_error, fonts_report);
  if (scaled_error != nullptr) {
    *scaled_error = counter.scaled_error_;
  }
  if (report_level > 1 && total_samples > 0) {
    // It is useful to know the time in microseconds/char.
    total_time = 1000 * (clock() - total_time) / CLOCKS_PER_SEC;
    tesserr << "Errors computed in " << total_time << " ms at "
            << 1000 * total_time / total_samples << " μs/char\n";
  }
  return unscaled_error;
}
| 106 | |
// Tests a pair of classifiers, debugging errors of the new against the old.
// See errorcounter.h for description of arguments.
// Iterates over the samples, calling the classifiers in normal/silent mode.
// If the new_classifier makes a boosting_mode error that the old_classifier
// does not, it will then call the new_classifier again with a debug flag
// and a keep_this argument to find out what is going on.
void ErrorCounter::DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier,
                                  CountTypes boosting_mode, const FontInfoTable &fontinfo_table,
                                  const std::vector<Image > &page_images, SampleIterator *it) {
  int fontsize = it->sample_set()->NumFonts();
  ErrorCounter old_counter(old_classifier->GetUnicharset(), fontsize);
  ErrorCounter new_counter(new_classifier->GetUnicharset(), fontsize);
  std::vector<UnicharRating> results;

#if !defined(NDEBUG)
  int total_samples = 0;
#endif
  // Cap on the number of new errors that get a graphical debug display.
  int error_samples = 25;
  int total_new_errors = 0;
  // Iterate over all the samples, accumulating errors.
  for (it->Begin(); !it->AtEnd(); it->Next()) {
    TrainingSample *mutable_sample = it->MutableSample();
    int page_index = mutable_sample->page_num();
    // Samples with an out-of-range page index get a null page image.
    Image page_pix =
        0 <= page_index && page_index < page_images.size() ? page_images[page_index] : nullptr;
    // No debug, no keep this.
    old_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID,
                                          &results);
    int correct_id = mutable_sample->class_id();
    if (correct_id != 0 && !old_counter.AccumulateErrors(true, boosting_mode, fontinfo_table,
                                                         results, mutable_sample)) {
      // old classifier was correct, check the new one.
      new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID,
                                            &results);
      if (correct_id != 0 && new_counter.AccumulateErrors(true, boosting_mode, fontinfo_table,
                                                          results, mutable_sample)) {
        // A regression: old classifier right, new classifier wrong.
        tprintf("New Error on sample %d: Classifier debug output:\n", it->GlobalSampleIndex());
        ++total_new_errors;
        // Re-run with debug=1 and keep_this=correct_id to get diagnostics.
        new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 1, correct_id, &results);
        if (results.size() > 0 && error_samples > 0) {
#ifndef GRAPHICS_DISABLED
          new_classifier->DebugDisplay(*mutable_sample, page_pix, correct_id);
#endif
          --error_samples;
        }
      }
    }
#if !defined(NDEBUG)
    ++total_samples;
#endif
  }
  tprintf("Total new errors = %d\n", total_new_errors);
}
| 160 | |
| 161 // Constructor is private. Only anticipated use of ErrorCounter is via | |
| 162 // the static ComputeErrorRate. | |
| 163 ErrorCounter::ErrorCounter(const UNICHARSET &unicharset, int fontsize) | |
| 164 : scaled_error_(0.0) | |
| 165 , rating_epsilon_(kRatingEpsilon) | |
| 166 , unichar_counts_(unicharset.size(), unicharset.size(), 0) | |
| 167 , ok_score_hist_(0, 101) | |
| 168 , bad_score_hist_(0, 101) | |
| 169 , unicharset_(unicharset) { | |
| 170 Counts empty_counts; | |
| 171 font_counts_.clear(); | |
| 172 font_counts_.resize(fontsize, empty_counts); | |
| 173 multi_unichar_counts_.clear(); | |
| 174 multi_unichar_counts_.resize(unicharset.size(), 0); | |
| 175 } | |
| 176 | |
// Accumulates the errors from the classifier results on a single sample.
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting and shape_table is used to understand the relationship
// between unichar_ids and shape_ids in the results
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const FontInfoTable &font_table,
                                    const std::vector<UnicharRating> &results,
                                    TrainingSample *sample) {
  int num_results = results.size();
  // Index in results of the correct answer, or -1 if not present.
  int answer_actual_rank = -1;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else {
    // Find rank of correct unichar answer, using rating_epsilon_ to allow
    // different answers to score as equal. (Ignoring the font.)
    int epsilon_rank = 0;
    int answer_epsilon_rank = -1;
    int num_top_answers = 0;
    double prev_rating = results[0].rating;
    bool joined = false;
    bool broken = false;
    int res_index = 0;
    while (res_index < num_results) {
      // A drop of more than rating_epsilon_ from the previous rating starts
      // a new epsilon rank; anything closer ties with the current rank.
      if (results[res_index].rating < prev_rating - rating_epsilon_) {
        ++epsilon_rank;
        prev_rating = results[res_index].rating;
      }
      if (results[res_index].unichar_id == unichar_id && answer_epsilon_rank < 0) {
        // First (best) occurrence of the correct answer.
        answer_epsilon_rank = epsilon_rank;
        answer_actual_rank = res_index;
      }
      if (results[res_index].unichar_id == UNICHAR_JOINED && unicharset_.has_special_codes()) {
        joined = true;
      } else if (results[res_index].unichar_id == UNICHAR_BROKEN &&
                 unicharset_.has_special_codes()) {
        broken = true;
      } else if (epsilon_rank == 0) {
        // Count the number of distinct answers tied for top place.
        ++num_top_answers;
      }
      ++res_index;
    }
    if (answer_actual_rank != 0) {
      // Correct result is not absolute top.
      ++font_counts_[font_id].n[CT_UNICHAR_TOPTOP_ERR];
      if (boosting_mode == CT_UNICHAR_TOPTOP_ERR) {
        sample->set_is_error(true);
      }
    }
    if (answer_epsilon_rank == 0) {
      ++font_counts_[font_id].n[CT_UNICHAR_TOP_OK];
      // Unichar OK, but count if multiple unichars.
      if (num_top_answers > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
        ++multi_unichar_counts_[unichar_id];
      }
      // Check to see if any font in the top choice has attributes that match.
      // TODO(rays) It is easy to add counters for individual font attributes
      // here if we want them.
      if (font_table.SetContainsFontProperties(font_id, results[answer_actual_rank].fonts)) {
        // Font attributes were matched.
        // Check for multiple properties.
        if (font_table.SetContainsMultipleFontProperties(results[answer_actual_rank].fonts)) {
          ++font_counts_[font_id].n[CT_OK_MULTI_FONT];
        }
      } else {
        // Font attributes weren't matched.
        ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
      }
    } else {
      // This is a top unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) {
        sample->set_is_error(true);
      }
      // Count maps from unichar id to wrong unichar id.
      ++unichar_counts_(unichar_id, results[0].unichar_id);
      if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) {
          sample->set_is_error(true);
        }
      }
      if (answer_epsilon_rank < 0) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) {
          sample->set_is_error(true);
        }
        // Answer never appeared: charge it the worst observed rank so the
        // mean-rank statistic below stays meaningful.
        answer_epsilon_rank = epsilon_rank;
      }
    }
    // Compute mean number of return values and mean rank of correct answer.
    font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
    font_counts_[font_id].n[CT_RANK] += answer_epsilon_rank;
    if (joined) {
      ++font_counts_[font_id].n[CT_OK_JOINED];
    }
    if (broken) {
      ++font_counts_[font_id].n[CT_OK_BROKEN];
    }
  }
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
    if (debug) {
      tprintf("%d results for char %s font %d :", num_results,
              unicharset_.id_to_unichar(unichar_id), font_id);
      for (int i = 0; i < num_results; ++i) {
        tprintf(" %.3f : %s\n", results[i].rating,
                unicharset_.id_to_unichar(results[i].unichar_id));
      }
      // Signal the caller to run the classifier again in debug mode.
      return true;
    }
    int percent = 0;
    if (num_results > 0) {
      percent = IntCastRounded(results[0].rating * 100);
    }
    bad_score_hist_.add(percent, 1);
  } else {
    int percent = 0;
    if (answer_actual_rank >= 0) {
      percent = IntCastRounded(results[answer_actual_rank].rating * 100);
    }
    ok_score_hist_.add(percent, 1);
  }
  return false;
}
| 315 | |
| 316 // Accumulates counts for junk. Counts only whether the junk was correctly | |
| 317 // rejected or not. | |
| 318 bool ErrorCounter::AccumulateJunk(bool debug, const std::vector<UnicharRating> &results, | |
| 319 TrainingSample *sample) { | |
| 320 // For junk we accept no answer, or an explicit shape answer matching the | |
| 321 // class id of the sample. | |
| 322 const int num_results = results.size(); | |
| 323 const int font_id = sample->font_id(); | |
| 324 const int unichar_id = sample->class_id(); | |
| 325 int percent = 0; | |
| 326 if (num_results > 0) { | |
| 327 percent = IntCastRounded(results[0].rating * 100); | |
| 328 } | |
| 329 if (num_results > 0 && results[0].unichar_id != unichar_id) { | |
| 330 // This is a junk error. | |
| 331 ++font_counts_[font_id].n[CT_ACCEPTED_JUNK]; | |
| 332 sample->set_is_error(true); | |
| 333 // It counts as an error for boosting too so sum the weight. | |
| 334 scaled_error_ += sample->weight(); | |
| 335 bad_score_hist_.add(percent, 1); | |
| 336 return debug; | |
| 337 } else { | |
| 338 // Correctly rejected. | |
| 339 ++font_counts_[font_id].n[CT_REJECTED_JUNK]; | |
| 340 sample->set_is_error(false); | |
| 341 ok_score_hist_.add(percent, 1); | |
| 342 } | |
| 343 return false; | |
| 344 } | |
| 345 | |
| 346 // Creates a report of the error rate. The report_level controls the detail | |
| 347 // that is reported to stderr via tprintf: | |
| 348 // 0 -> no output. | |
| 349 // >=1 -> bottom-line error rate. | |
| 350 // >=3 -> font-level error rate. | |
| 351 // boosting_mode determines the return value. It selects which (un-weighted) | |
| 352 // error rate to return. | |
| 353 // The fontinfo_table from MasterTrainer provides the names of fonts. | |
| 354 // The it determines the current subset of the training samples. | |
| 355 // If not nullptr, the top-choice unichar error rate is saved in unichar_error. | |
| 356 // If not nullptr, the report string is saved in fonts_report. | |
| 357 // (Ignoring report_level). | |
| 358 double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode, | |
| 359 const FontInfoTable &fontinfo_table, const SampleIterator &it, | |
| 360 double *unichar_error, std::string *fonts_report) { | |
| 361 // Compute totals over all the fonts and report individual font results | |
| 362 // when required. | |
| 363 Counts totals; | |
| 364 int fontsize = font_counts_.size(); | |
| 365 for (int f = 0; f < fontsize; ++f) { | |
| 366 // Accumulate counts over fonts. | |
| 367 totals += font_counts_[f]; | |
| 368 std::string font_report; | |
| 369 if (ReportString(false, font_counts_[f], font_report)) { | |
| 370 if (fonts_report != nullptr) { | |
| 371 *fonts_report += fontinfo_table.at(f).name; | |
| 372 *fonts_report += ": "; | |
| 373 *fonts_report += font_report; | |
| 374 *fonts_report += "\n"; | |
| 375 } | |
| 376 if (report_level > 2) { | |
| 377 // Report individual font error rates. | |
| 378 tprintf("%s: %s\n", fontinfo_table.at(f).name, font_report.c_str()); | |
| 379 } | |
| 380 } | |
| 381 } | |
| 382 // Report the totals. | |
| 383 std::string total_report; | |
| 384 bool any_results = ReportString(true, totals, total_report); | |
| 385 if (fonts_report != nullptr && fonts_report->empty()) { | |
| 386 // Make sure we return something even if there were no samples. | |
| 387 *fonts_report = "NoSamplesFound: "; | |
| 388 *fonts_report += total_report; | |
| 389 *fonts_report += "\n"; | |
| 390 } | |
| 391 if (report_level > 0) { | |
| 392 // Report the totals. | |
| 393 std::string total_report; | |
| 394 if (any_results) { | |
| 395 tprintf("TOTAL Scaled Err=%.4g%%, %s\n", scaled_error_ * 100.0, total_report.c_str()); | |
| 396 } | |
| 397 // Report the worst substitution error only for now. | |
| 398 if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) { | |
| 399 int charsetsize = unicharset_.size(); | |
| 400 int worst_uni_id = 0; | |
| 401 int worst_result_id = 0; | |
| 402 int worst_err = 0; | |
| 403 for (int u = 0; u < charsetsize; ++u) { | |
| 404 for (int v = 0; v < charsetsize; ++v) { | |
| 405 if (unichar_counts_(u, v) > worst_err) { | |
| 406 worst_err = unichar_counts_(u, v); | |
| 407 worst_uni_id = u; | |
| 408 worst_result_id = v; | |
| 409 } | |
| 410 } | |
| 411 } | |
| 412 if (worst_err > 0) { | |
| 413 tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n", worst_uni_id, | |
| 414 unicharset_.id_to_unichar(worst_uni_id), unicharset_.id_to_unichar(worst_result_id), | |
| 415 worst_err, totals.n[CT_UNICHAR_TOP1_ERR], | |
| 416 100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]); | |
| 417 } | |
| 418 } | |
| 419 tprintf("Multi-unichar shape use:\n"); | |
| 420 for (int u = 0; u < multi_unichar_counts_.size(); ++u) { | |
| 421 if (multi_unichar_counts_[u] > 0) { | |
| 422 tprintf("%d multiple answers for unichar: %s\n", multi_unichar_counts_[u], | |
| 423 unicharset_.id_to_unichar(u)); | |
| 424 } | |
| 425 } | |
| 426 tprintf("OK Score histogram:\n"); | |
| 427 ok_score_hist_.print(); | |
| 428 tprintf("ERROR Score histogram:\n"); | |
| 429 bad_score_hist_.print(); | |
| 430 } | |
| 431 | |
| 432 double rates[CT_SIZE]; | |
| 433 if (!ComputeRates(totals, rates)) { | |
| 434 return 0.0; | |
| 435 } | |
| 436 // Set output values if asked for. | |
| 437 if (unichar_error != nullptr) { | |
| 438 *unichar_error = rates[CT_UNICHAR_TOP1_ERR]; | |
| 439 } | |
| 440 return rates[boosting_mode]; | |
| 441 } | |
| 442 | |
// Sets the report string to a combined human and machine-readable report
// string of the error rates.
// Returns false if there is no data, leaving report unchanged, unless
// even_if_empty is true.
bool ErrorCounter::ReportString(bool even_if_empty, const Counts &counts, std::string &report) {
  // Compute the error rates.
  double rates[CT_SIZE];
  if (!ComputeRates(counts, rates) && !even_if_empty) {
    return false;
  }
  // Using %.4g%%, the length of the output string should exactly match the
  // length of the format string, but in case of overflow, allow for +eddd
  // on each number.
  const int kMaxExtraLength = 5; // Length of +eddd.
  // Keep this format string and the snprintf in sync with the CountTypes enum.
  const char format_str[] =
      "Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], %.4g%%[T] "
      "Mult=%.4g%%, Jn=%.4g%%, Brk=%.4g%%, Rej=%.4g%%, "
      "FontAttr=%.4g%%, Multi=%.4g%%, "
      "Answers=%.3g, Rank=%.3g, "
      "OKjunk=%.4g%%, Badjunk=%.4g%%";
  // Worst-case buffer: format string length plus overflow allowance for each
  // of the CT_SIZE-1 formatted numbers, plus the terminating NUL.
  constexpr size_t max_str_len = sizeof(format_str) + kMaxExtraLength * (CT_SIZE - 1) + 1;
  char formatted_str[max_str_len];
  snprintf(formatted_str, max_str_len, format_str, rates[CT_UNICHAR_TOP1_ERR] * 100.0,
           rates[CT_UNICHAR_TOP2_ERR] * 100.0, rates[CT_UNICHAR_TOPN_ERR] * 100.0,
           rates[CT_UNICHAR_TOPTOP_ERR] * 100.0, rates[CT_OK_MULTI_UNICHAR] * 100.0,
           rates[CT_OK_JOINED] * 100.0, rates[CT_OK_BROKEN] * 100.0, rates[CT_REJECT] * 100.0,
           rates[CT_FONT_ATTR_ERR] * 100.0, rates[CT_OK_MULTI_FONT] * 100.0, rates[CT_NUM_RESULTS],
           rates[CT_RANK], 100.0 * rates[CT_REJECTED_JUNK], 100.0 * rates[CT_ACCEPTED_JUNK]);
  report = formatted_str;
  // Now append each field of counts with a tab in front so the result can
  // be loaded into a spreadsheet.
  for (int ct : counts.n) {
    report += "\t" + std::to_string(ct);
  }
  return true;
}
| 480 | |
| 481 // Computes the error rates and returns in rates which is an array of size | |
| 482 // CT_SIZE. Returns false if there is no data, leaving rates unchanged. | |
| 483 bool ErrorCounter::ComputeRates(const Counts &counts, double rates[CT_SIZE]) { | |
| 484 const int ok_samples = | |
| 485 counts.n[CT_UNICHAR_TOP_OK] + counts.n[CT_UNICHAR_TOP1_ERR] + counts.n[CT_REJECT]; | |
| 486 const int junk_samples = counts.n[CT_REJECTED_JUNK] + counts.n[CT_ACCEPTED_JUNK]; | |
| 487 // Compute rates for normal chars. | |
| 488 double denominator = static_cast<double>(std::max(ok_samples, 1)); | |
| 489 for (int ct = 0; ct <= CT_RANK; ++ct) { | |
| 490 rates[ct] = counts.n[ct] / denominator; | |
| 491 } | |
| 492 // Compute rates for junk. | |
| 493 denominator = static_cast<double>(std::max(junk_samples, 1)); | |
| 494 for (int ct = CT_REJECTED_JUNK; ct <= CT_ACCEPTED_JUNK; ++ct) { | |
| 495 rates[ct] = counts.n[ct] / denominator; | |
| 496 } | |
| 497 return ok_samples != 0 || junk_samples != 0; | |
| 498 } | |
| 499 | |
| 500 ErrorCounter::Counts::Counts() { | |
| 501 memset(n, 0, sizeof(n[0]) * CT_SIZE); | |
| 502 } | |
| 503 // Adds other into this for computing totals. | |
| 504 void ErrorCounter::Counts::operator+=(const Counts &other) { | |
| 505 for (int ct = 0; ct < CT_SIZE; ++ct) { | |
| 506 n[ct] += other.n[ct]; | |
| 507 } | |
| 508 } | |
| 509 | |
| 510 } // namespace tesseract. |
