Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/common/errorcounter.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/common/errorcounter.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,510 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "errorcounter.h" + +#include "fontinfo.h" +#include "sampleiterator.h" +#include "shapeclassifier.h" +#include "shapetable.h" +#include "tesserrstream.h" +#include "trainingsample.h" +#include "trainingsampleset.h" +#include "unicity_table.h" + +#include <algorithm> +#include <ctime> + +namespace tesseract { + +// Difference in result rating to be thought of as an "equal" choice. +const double kRatingEpsilon = 1.0 / 32; + +// Tests a classifier, computing its error rate. +// See errorcounter.h for description of arguments. +// Iterates over the samples, calling the classifier in normal/silent mode. +// If the classifier makes a CT_UNICHAR_TOPN_ERR error, and the appropriate +// report_level is set (4 or greater), it will then call the classifier again +// with a debug flag and a keep_this argument to find out what is going on. 
+double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_level, + CountTypes boosting_mode, const FontInfoTable &fontinfo_table, + const std::vector<Image > &page_images, SampleIterator *it, + double *unichar_error, double *scaled_error, + std::string *fonts_report) { + const int fontsize = it->sample_set()->NumFonts(); + ErrorCounter counter(classifier->GetUnicharset(), fontsize); + std::vector<UnicharRating> results; + + clock_t total_time = 0; + if (report_level > 1) { + total_time = clock(); + } + unsigned total_samples = 0; + double unscaled_error = 0.0; + // Set a number of samples on which to run the classify debug mode. + int error_samples = report_level > 3 ? report_level * report_level : 0; + // Iterate over all the samples, accumulating errors. + for (it->Begin(); !it->AtEnd(); it->Next()) { + TrainingSample *mutable_sample = it->MutableSample(); + int page_index = mutable_sample->page_num(); + Image page_pix = + 0 <= page_index && page_index < page_images.size() ? page_images[page_index] : nullptr; + // No debug, no keep this. + classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID, &results); + bool debug_it = false; + int correct_id = mutable_sample->class_id(); + if (counter.unicharset_.has_special_codes() && + (correct_id == UNICHAR_SPACE || correct_id == UNICHAR_JOINED || + correct_id == UNICHAR_BROKEN)) { + // This is junk so use the special counter. + debug_it = counter.AccumulateJunk(report_level > 3, results, mutable_sample); + } else { + debug_it = counter.AccumulateErrors(report_level > 3, boosting_mode, fontinfo_table, results, + mutable_sample); + } + if (debug_it && error_samples > 0) { + // Running debug, keep the correct answer, and debug the classifier. 
+ tprintf("Error on sample %d: %s Classifier debug output:\n", it->GlobalSampleIndex(), + it->sample_set()->SampleToString(*mutable_sample).c_str()); +#ifndef GRAPHICS_DISABLED + classifier->DebugDisplay(*mutable_sample, page_pix, correct_id); +#endif + --error_samples; + } + ++total_samples; + } + // Create the appropriate error report. + unscaled_error = counter.ReportErrors(report_level, boosting_mode, fontinfo_table, *it, + unichar_error, fonts_report); + if (scaled_error != nullptr) { + *scaled_error = counter.scaled_error_; + } + if (report_level > 1 && total_samples > 0) { + // It is useful to know the time in microseconds/char. + total_time = 1000 * (clock() - total_time) / CLOCKS_PER_SEC; + tesserr << "Errors computed in " << total_time << " ms at " + << 1000 * total_time / total_samples << " μs/char\n"; + } + return unscaled_error; +} + +// Tests a pair of classifiers, debugging errors of the new against the old. +// See errorcounter.h for description of arguments. +// Iterates over the samples, calling the classifiers in normal/silent mode. +// If the new_classifier makes a boosting_mode error that the old_classifier +// does not, it will then call the new_classifier again with a debug flag +// and a keep_this argument to find out what is going on. +void ErrorCounter::DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, + CountTypes boosting_mode, const FontInfoTable &fontinfo_table, + const std::vector<Image > &page_images, SampleIterator *it) { + int fontsize = it->sample_set()->NumFonts(); + ErrorCounter old_counter(old_classifier->GetUnicharset(), fontsize); + ErrorCounter new_counter(new_classifier->GetUnicharset(), fontsize); + std::vector<UnicharRating> results; + +#if !defined(NDEBUG) + int total_samples = 0; +#endif + int error_samples = 25; + int total_new_errors = 0; + // Iterate over all the samples, accumulating errors. 
+ for (it->Begin(); !it->AtEnd(); it->Next()) { + TrainingSample *mutable_sample = it->MutableSample(); + int page_index = mutable_sample->page_num(); + Image page_pix = + 0 <= page_index && page_index < page_images.size() ? page_images[page_index] : nullptr; + // No debug, no keep this. + old_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID, + &results); + int correct_id = mutable_sample->class_id(); + if (correct_id != 0 && !old_counter.AccumulateErrors(true, boosting_mode, fontinfo_table, + results, mutable_sample)) { + // old classifier was correct, check the new one. + new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID, + &results); + if (correct_id != 0 && new_counter.AccumulateErrors(true, boosting_mode, fontinfo_table, + results, mutable_sample)) { + tprintf("New Error on sample %d: Classifier debug output:\n", it->GlobalSampleIndex()); + ++total_new_errors; + new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 1, correct_id, &results); + if (results.size() > 0 && error_samples > 0) { +#ifndef GRAPHICS_DISABLED + new_classifier->DebugDisplay(*mutable_sample, page_pix, correct_id); +#endif + --error_samples; + } + } + } +#if !defined(NDEBUG) + ++total_samples; +#endif + } + tprintf("Total new errors = %d\n", total_new_errors); +} + +// Constructor is private. Only anticipated use of ErrorCounter is via +// the static ComputeErrorRate. +ErrorCounter::ErrorCounter(const UNICHARSET &unicharset, int fontsize) + : scaled_error_(0.0) + , rating_epsilon_(kRatingEpsilon) + , unichar_counts_(unicharset.size(), unicharset.size(), 0) + , ok_score_hist_(0, 101) + , bad_score_hist_(0, 101) + , unicharset_(unicharset) { + Counts empty_counts; + font_counts_.clear(); + font_counts_.resize(fontsize, empty_counts); + multi_unichar_counts_.clear(); + multi_unichar_counts_.resize(unicharset.size(), 0); +} + +// Accumulates the errors from the classifier results on a single sample. 
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting and shape_table is used to understand the relationship
// between unichar_ids and shape_ids in the results
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const FontInfoTable &font_table,
                                    const std::vector<UnicharRating> &results,
                                    TrainingSample *sample) {
  int num_results = results.size();
  // answer_actual_rank: absolute index of the correct answer in results,
  // or -1 if the correct unichar never appeared.
  int answer_actual_rank = -1;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else {
    // Find rank of correct unichar answer, using rating_epsilon_ to allow
    // different answers to score as equal. (Ignoring the font.)
    int epsilon_rank = 0;
    int answer_epsilon_rank = -1;
    int num_top_answers = 0;
    double prev_rating = results[0].rating;
    bool joined = false;
    bool broken = false;
    int res_index = 0;
    while (res_index < num_results) {
      // A drop in rating of more than rating_epsilon_ starts a new epsilon
      // rank; answers within epsilon of each other share a rank.
      if (results[res_index].rating < prev_rating - rating_epsilon_) {
        ++epsilon_rank;
        prev_rating = results[res_index].rating;
      }
      if (results[res_index].unichar_id == unichar_id && answer_epsilon_rank < 0) {
        answer_epsilon_rank = epsilon_rank;
        answer_actual_rank = res_index;
      }
      // JOINED/BROKEN results are tracked separately and do not count as
      // competing top answers.
      if (results[res_index].unichar_id == UNICHAR_JOINED && unicharset_.has_special_codes()) {
        joined = true;
      } else if (results[res_index].unichar_id == UNICHAR_BROKEN &&
                 unicharset_.has_special_codes()) {
        broken = true;
      } else if (epsilon_rank == 0) {
        ++num_top_answers;
      }
      ++res_index;
    }
    if (answer_actual_rank != 0) {
      // Correct result is not absolute top. (Also true when the correct
      // answer is missing entirely, i.e. answer_actual_rank == -1.)
      ++font_counts_[font_id].n[CT_UNICHAR_TOPTOP_ERR];
      if (boosting_mode == CT_UNICHAR_TOPTOP_ERR) {
        sample->set_is_error(true);
      }
    }
    if (answer_epsilon_rank == 0) {
      ++font_counts_[font_id].n[CT_UNICHAR_TOP_OK];
      // Unichar OK, but count if multiple unichars.
      if (num_top_answers > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
        ++multi_unichar_counts_[unichar_id];
      }
      // Check to see if any font in the top choice has attributes that match.
      // TODO(rays) It is easy to add counters for individual font attributes
      // here if we want them.
      if (font_table.SetContainsFontProperties(font_id, results[answer_actual_rank].fonts)) {
        // Font attributes were matched.
        // Check for multiple properties.
        if (font_table.SetContainsMultipleFontProperties(results[answer_actual_rank].fonts)) {
          ++font_counts_[font_id].n[CT_OK_MULTI_FONT];
        }
      } else {
        // Font attributes weren't matched.
        ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
      }
    } else {
      // This is a top unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) {
        sample->set_is_error(true);
      }
      // Count maps from unichar id to wrong unichar id.
      ++unichar_counts_(unichar_id, results[0].unichar_id);
      if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) {
          sample->set_is_error(true);
        }
      }
      if (answer_epsilon_rank < 0) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) {
          sample->set_is_error(true);
        }
        // Charge the worst (last) epsilon rank for the mean-rank statistic
        // when the correct answer never appeared.
        answer_epsilon_rank = epsilon_rank;
      }
    }
    // Compute mean number of return values and mean rank of correct answer.
    font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
    font_counts_[font_id].n[CT_RANK] += answer_epsilon_rank;
    if (joined) {
      ++font_counts_[font_id].n[CT_OK_JOINED];
    }
    if (broken) {
      ++font_counts_[font_id].n[CT_OK_BROKEN];
    }
  }
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
    if (debug) {
      tprintf("%d results for char %s font %d :", num_results,
              unicharset_.id_to_unichar(unichar_id), font_id);
      for (int i = 0; i < num_results; ++i) {
        tprintf(" %.3f : %s\n", results[i].rating,
                unicharset_.id_to_unichar(results[i].unichar_id));
      }
      // Early return: caller may run the classifier again in debug mode.
      return true;
    }
    int percent = 0;
    if (num_results > 0) {
      percent = IntCastRounded(results[0].rating * 100);
    }
    bad_score_hist_.add(percent, 1);
  } else {
    int percent = 0;
    if (answer_actual_rank >= 0) {
      percent = IntCastRounded(results[answer_actual_rank].rating * 100);
    }
    ok_score_hist_.add(percent, 1);
  }
  return false;
}

// Accumulates counts for junk. Counts only whether the junk was correctly
// rejected or not.
+bool ErrorCounter::AccumulateJunk(bool debug, const std::vector<UnicharRating> &results, + TrainingSample *sample) { + // For junk we accept no answer, or an explicit shape answer matching the + // class id of the sample. + const int num_results = results.size(); + const int font_id = sample->font_id(); + const int unichar_id = sample->class_id(); + int percent = 0; + if (num_results > 0) { + percent = IntCastRounded(results[0].rating * 100); + } + if (num_results > 0 && results[0].unichar_id != unichar_id) { + // This is a junk error. + ++font_counts_[font_id].n[CT_ACCEPTED_JUNK]; + sample->set_is_error(true); + // It counts as an error for boosting too so sum the weight. + scaled_error_ += sample->weight(); + bad_score_hist_.add(percent, 1); + return debug; + } else { + // Correctly rejected. + ++font_counts_[font_id].n[CT_REJECTED_JUNK]; + sample->set_is_error(false); + ok_score_hist_.add(percent, 1); + } + return false; +} + +// Creates a report of the error rate. The report_level controls the detail +// that is reported to stderr via tprintf: +// 0 -> no output. +// >=1 -> bottom-line error rate. +// >=3 -> font-level error rate. +// boosting_mode determines the return value. It selects which (un-weighted) +// error rate to return. +// The fontinfo_table from MasterTrainer provides the names of fonts. +// The it determines the current subset of the training samples. +// If not nullptr, the top-choice unichar error rate is saved in unichar_error. +// If not nullptr, the report string is saved in fonts_report. +// (Ignoring report_level). +double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode, + const FontInfoTable &fontinfo_table, const SampleIterator &it, + double *unichar_error, std::string *fonts_report) { + // Compute totals over all the fonts and report individual font results + // when required. + Counts totals; + int fontsize = font_counts_.size(); + for (int f = 0; f < fontsize; ++f) { + // Accumulate counts over fonts. 
+ totals += font_counts_[f]; + std::string font_report; + if (ReportString(false, font_counts_[f], font_report)) { + if (fonts_report != nullptr) { + *fonts_report += fontinfo_table.at(f).name; + *fonts_report += ": "; + *fonts_report += font_report; + *fonts_report += "\n"; + } + if (report_level > 2) { + // Report individual font error rates. + tprintf("%s: %s\n", fontinfo_table.at(f).name, font_report.c_str()); + } + } + } + // Report the totals. + std::string total_report; + bool any_results = ReportString(true, totals, total_report); + if (fonts_report != nullptr && fonts_report->empty()) { + // Make sure we return something even if there were no samples. + *fonts_report = "NoSamplesFound: "; + *fonts_report += total_report; + *fonts_report += "\n"; + } + if (report_level > 0) { + // Report the totals. + std::string total_report; + if (any_results) { + tprintf("TOTAL Scaled Err=%.4g%%, %s\n", scaled_error_ * 100.0, total_report.c_str()); + } + // Report the worst substitution error only for now. 
+ if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) { + int charsetsize = unicharset_.size(); + int worst_uni_id = 0; + int worst_result_id = 0; + int worst_err = 0; + for (int u = 0; u < charsetsize; ++u) { + for (int v = 0; v < charsetsize; ++v) { + if (unichar_counts_(u, v) > worst_err) { + worst_err = unichar_counts_(u, v); + worst_uni_id = u; + worst_result_id = v; + } + } + } + if (worst_err > 0) { + tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n", worst_uni_id, + unicharset_.id_to_unichar(worst_uni_id), unicharset_.id_to_unichar(worst_result_id), + worst_err, totals.n[CT_UNICHAR_TOP1_ERR], + 100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]); + } + } + tprintf("Multi-unichar shape use:\n"); + for (int u = 0; u < multi_unichar_counts_.size(); ++u) { + if (multi_unichar_counts_[u] > 0) { + tprintf("%d multiple answers for unichar: %s\n", multi_unichar_counts_[u], + unicharset_.id_to_unichar(u)); + } + } + tprintf("OK Score histogram:\n"); + ok_score_hist_.print(); + tprintf("ERROR Score histogram:\n"); + bad_score_hist_.print(); + } + + double rates[CT_SIZE]; + if (!ComputeRates(totals, rates)) { + return 0.0; + } + // Set output values if asked for. + if (unichar_error != nullptr) { + *unichar_error = rates[CT_UNICHAR_TOP1_ERR]; + } + return rates[boosting_mode]; +} + +// Sets the report string to a combined human and machine-readable report +// string of the error rates. +// Returns false if there is no data, leaving report unchanged, unless +// even_if_empty is true. +bool ErrorCounter::ReportString(bool even_if_empty, const Counts &counts, std::string &report) { + // Compute the error rates. + double rates[CT_SIZE]; + if (!ComputeRates(counts, rates) && !even_if_empty) { + return false; + } + // Using %.4g%%, the length of the output string should exactly match the + // length of the format string, but in case of overflow, allow for +eddd + // on each number. + const int kMaxExtraLength = 5; // Length of +eddd. 
+ // Keep this format string and the snprintf in sync with the CountTypes enum. + const char format_str[] = + "Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], %.4g%%[T] " + "Mult=%.4g%%, Jn=%.4g%%, Brk=%.4g%%, Rej=%.4g%%, " + "FontAttr=%.4g%%, Multi=%.4g%%, " + "Answers=%.3g, Rank=%.3g, " + "OKjunk=%.4g%%, Badjunk=%.4g%%"; + constexpr size_t max_str_len = sizeof(format_str) + kMaxExtraLength * (CT_SIZE - 1) + 1; + char formatted_str[max_str_len]; + snprintf(formatted_str, max_str_len, format_str, rates[CT_UNICHAR_TOP1_ERR] * 100.0, + rates[CT_UNICHAR_TOP2_ERR] * 100.0, rates[CT_UNICHAR_TOPN_ERR] * 100.0, + rates[CT_UNICHAR_TOPTOP_ERR] * 100.0, rates[CT_OK_MULTI_UNICHAR] * 100.0, + rates[CT_OK_JOINED] * 100.0, rates[CT_OK_BROKEN] * 100.0, rates[CT_REJECT] * 100.0, + rates[CT_FONT_ATTR_ERR] * 100.0, rates[CT_OK_MULTI_FONT] * 100.0, rates[CT_NUM_RESULTS], + rates[CT_RANK], 100.0 * rates[CT_REJECTED_JUNK], 100.0 * rates[CT_ACCEPTED_JUNK]); + report = formatted_str; + // Now append each field of counts with a tab in front so the result can + // be loaded into a spreadsheet. + for (int ct : counts.n) { + report += "\t" + std::to_string(ct); + } + return true; +} + +// Computes the error rates and returns in rates which is an array of size +// CT_SIZE. Returns false if there is no data, leaving rates unchanged. +bool ErrorCounter::ComputeRates(const Counts &counts, double rates[CT_SIZE]) { + const int ok_samples = + counts.n[CT_UNICHAR_TOP_OK] + counts.n[CT_UNICHAR_TOP1_ERR] + counts.n[CT_REJECT]; + const int junk_samples = counts.n[CT_REJECTED_JUNK] + counts.n[CT_ACCEPTED_JUNK]; + // Compute rates for normal chars. + double denominator = static_cast<double>(std::max(ok_samples, 1)); + for (int ct = 0; ct <= CT_RANK; ++ct) { + rates[ct] = counts.n[ct] / denominator; + } + // Compute rates for junk. 
+ denominator = static_cast<double>(std::max(junk_samples, 1)); + for (int ct = CT_REJECTED_JUNK; ct <= CT_ACCEPTED_JUNK; ++ct) { + rates[ct] = counts.n[ct] / denominator; + } + return ok_samples != 0 || junk_samples != 0; +} + +ErrorCounter::Counts::Counts() { + memset(n, 0, sizeof(n[0]) * CT_SIZE); +} +// Adds other into this for computing totals. +void ErrorCounter::Counts::operator+=(const Counts &other) { + for (int ct = 0; ct < CT_SIZE; ++ct) { + n[ct] += other.n[ct]; + } +} + +} // namespace tesseract.
