Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/common/errorcounter.h @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 // | |
| 4 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 // you may not use this file except in compliance with the License. | |
| 6 // You may obtain a copy of the License at | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // Unless required by applicable law or agreed to in writing, software | |
| 9 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 11 // See the License for the specific language governing permissions and | |
| 12 // limitations under the License. | |
| 13 // | |
| 14 /////////////////////////////////////////////////////////////////////// | |
| 15 | |
| 16 #ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ | |
| 17 #define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ | |
| 18 | |
| 19 #include "matrix.h" | |
| 20 #include "statistc.h" | |
| 21 | |
| 22 struct Pix; | |
| 23 | |
| 24 namespace tesseract { | |
| 25 | |
| 26 template <typename T> | |
| 27 class UnicityTable; | |
| 28 struct FontInfo; | |
| 29 class FontInfoTable; | |
| 30 class SampleIterator; | |
| 31 class ShapeClassifier; | |
| 32 class TrainingSample; | |
| 33 struct UnicharRating; | |
| 34 | |
| 35 // Enumeration of the different types of error count. | |
| 36 // Error counts work as follows: | |
| 37 // | |
| 38 // Ground truth is a valid unichar-id / font-id pair: | |
| 39 // Number of classifier answers? | |
| 40 // 0 >0 | |
| 41 // CT_REJECT unichar-id matches top shape? | |
| 42 // __________ yes! no | |
| 43 // CT_UNICHAR_TOP_OK CT_UNICHAR_TOP1_ERR | |
| 44 // Top shape-id has multiple unichars? 2nd shape unichar id matches? | |
| 45 // yes! no yes! no | |
| 46 // CT_OK_MULTI_UNICHAR | _____ CT_UNICHAR_TOP2_ERR | |
| 47 // Font attributes match? Any unichar-id matches? | |
| 48 // yes! no yes! no | |
| 49 // CT_FONT_ATTR_OK CT_FONT_ATTR_ERR ______ CT_UNICHAR_TOPN_ERR | |
| 50 // | __________________ _________________ | |
| 51 // Top shape-id has multiple font attrs? | |
| 52 // yes! no | |
| 53 // CT_OK_MULTI_FONT | |
| 54 // _____________________________ | |
| 55 // | |
| 56 // Note that multiple counts may be activated for a single sample! | |
| 57 // | |
| 58 // Ground truth is for a fragment/n-gram that is NOT in the unicharset. | |
| 59 // This is called junk and is expected to be rejected: | |
| 60 // Number of classifier answers? | |
| 61 // 0 >0 | |
| 62 // CT_REJECTED_JUNK CT_ACCEPTED_JUNK | |
| 63 // | |
| 64 // Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores | |
| 65 // the mean rank of the correct result, counting from 0, and with an error | |
| 66 // receiving the number of answers as the correct rank. | |
| 67 // | |
| 68 // Keep in sync with the ReportString function. | |
| 69 enum CountTypes { | |
| 70 CT_UNICHAR_TOP_OK, // Top shape contains correct unichar id. | |
| 71 // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of | |
| 72 // kRatingEpsilon from the first result in each group. The real top choice | |
| 73 // is measured using TOPTOP. | |
| 74 CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id. | |
| 75 CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id. | |
| 76 CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id. | |
| 77 CT_UNICHAR_TOPTOP_ERR, // Very top choice not correct. | |
| 78 CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others. | |
| 79 CT_OK_JOINED, // Top shape id is correct but marked joined. | |
| 80 CT_OK_BROKEN, // Top shape id is correct but marked broken. | |
| 81 CT_REJECT, // Classifier hates this. | |
| 82 CT_FONT_ATTR_ERR, // Top unichar OK, but font attributes incorrect. | |
| 83 CT_OK_MULTI_FONT, // CT_FONT_ATTR_OK but there are multiple font attrs. | |
| 84 CT_NUM_RESULTS, // Number of answers produced. | |
| 85 CT_RANK, // Rank of correct answer. | |
| 86 CT_REJECTED_JUNK, // Junk that was correctly rejected. | |
| 87 CT_ACCEPTED_JUNK, // Junk that was incorrectly classified otherwise. | |
| 88 | |
| 89 CT_SIZE // Number of types for array sizing. | |
| 90 }; | |
| 91 | |
| 92 // Class to encapsulate all the functionality and sub-structures required | |
| 93 // to count errors for an isolated character classifier (ShapeClassifier). | |
| 94 class ErrorCounter { | |
| 95 public: | |
| 96 // Computes and returns the unweighted boosting_mode error rate of the given | |
| 97 // classifier. Can be used for testing, or inside an iterative training | |
| 98 // system, including one that uses boosting. | |
| 99 // report_levels: | |
| 100 // 0 = no output. | |
| 101 // 1 = bottom-line error rate. | |
| 102 // 2 = bottom-line error rate + time. | |
| 103 // 3 = font-level error rate + time. | |
| 104 // 4 = list of all errors + short classifier debug output on 16 errors. | |
| 105 // 5 = list of all errors + short classifier debug output on 25 errors. | |
| 106 // * The boosting_mode determines which error type is used for computing the | |
| 107 // scaled_error output, and setting the is_error flag in the samples. | |
| 108 // * The fontinfo_table is used to get string font names for the debug | |
| 109 // output, and also to count font attributes errors. | |
| 110 // * The page_images vector may contain a Pix* (which may be nullptr) for each | |
| 111 // page index assigned to the samples. | |
| 112 // * The it provides encapsulated iteration over some sample set. | |
| 113 // * The outputs unichar_error, scaled_error and totals_report are all | |
| 114 // optional. | |
| 115 // * If not nullptr, unichar error gets the top1 unichar error rate. | |
| 116 // * Scaled_error gets the error chosen by boosting_mode weighted by the | |
| 117 // weights on the samples. | |
| 118 // * Fonts_report gets a string summarizing the error rates for each font in | |
| 119 // both human-readable form and as a tab-separated list of error counts. | |
| 120 // The human-readable form is all before the first tab. | |
| 121 // * The return value is the un-weighted version of the scaled_error. | |
| 122 static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, | |
| 123 CountTypes boosting_mode, const FontInfoTable &fontinfo_table, | |
| 124 const std::vector<Image > &page_images, SampleIterator *it, | |
| 125 double *unichar_error, double *scaled_error, std::string *fonts_report); | |
| 126 // Tests a pair of classifiers, debugging errors of the new against the old. | |
| 127 // See errorcounter.h for description of arguments. | |
| 128 // Iterates over the samples, calling the classifiers in normal/silent mode. | |
| 129 // If the new_classifier makes a boosting_mode error that the old_classifier | |
| 130 // does not, and the appropriate, it will then call the new_classifier again | |
| 131 // with a debug flag and a keep_this argument to find out what is going on. | |
| 132 static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, | |
| 133 CountTypes boosting_mode, const FontInfoTable &fontinfo_table, | |
| 134 const std::vector<Image > &page_images, SampleIterator *it); | |
| 135 | |
| 136 private: | |
| 137 // Simple struct to hold an array of counts. | |
| 138 struct Counts { | |
| 139 Counts(); | |
| 140 // Adds other into this for computing totals. | |
| 141 void operator+=(const Counts &other); | |
| 142 | |
| 143 int n[CT_SIZE]; | |
| 144 }; | |
| 145 | |
| 146 // Constructor is private. Only anticipated use of ErrorCounter is via | |
| 147 // the static ComputeErrorRate. | |
| 148 ErrorCounter(const UNICHARSET &unicharset, int fontsize); | |
| 149 ~ErrorCounter() = default; | |
| 150 | |
| 151 // Accumulates the errors from the classifier results on a single sample. | |
| 152 // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred. | |
| 153 // boosting_mode selects the type of error to be used for boosting and the | |
| 154 // is_error_ member of sample is set according to whether the required type | |
| 155 // of error occurred. The font_table provides access to font properties | |
| 156 // for error counting and shape_table is used to understand the relationship | |
| 157 // between unichar_ids and shape_ids in the results | |
| 158 bool AccumulateErrors(bool debug, CountTypes boosting_mode, const FontInfoTable &font_table, | |
| 159 const std::vector<UnicharRating> &results, TrainingSample *sample); | |
| 160 | |
| 161 // Accumulates counts for junk. Counts only whether the junk was correctly | |
| 162 // rejected or not. | |
| 163 bool AccumulateJunk(bool debug, const std::vector<UnicharRating> &results, | |
| 164 TrainingSample *sample); | |
| 165 | |
| 166 // Creates a report of the error rate. The report_level controls the detail | |
| 167 // that is reported to stderr via tprintf: | |
| 168 // 0 -> no output. | |
| 169 // >=1 -> bottom-line error rate. | |
| 170 // >=3 -> font-level error rate. | |
| 171 // boosting_mode determines the return value. It selects which (un-weighted) | |
| 172 // error rate to return. | |
| 173 // The fontinfo_table from MasterTrainer provides the names of fonts. | |
| 174 // The it determines the current subset of the training samples. | |
| 175 // If not nullptr, the top-choice unichar error rate is saved in | |
| 176 // unichar_error. If not nullptr, the report string is saved in fonts_report. | |
| 177 // (Ignoring report_level). | |
| 178 double ReportErrors(int report_level, CountTypes boosting_mode, | |
| 179 const FontInfoTable &fontinfo_table, const SampleIterator &it, | |
| 180 double *unichar_error, std::string *fonts_report); | |
| 181 | |
| 182 // Sets the report string to a combined human and machine-readable report | |
| 183 // string of the error rates. | |
| 184 // Returns false if there is no data, leaving report unchanged, unless | |
| 185 // even_if_empty is true. | |
| 186 static bool ReportString(bool even_if_empty, const Counts &counts, std::string &report); | |
| 187 | |
| 188 // Computes the error rates and returns in rates which is an array of size | |
| 189 // CT_SIZE. Returns false if there is no data, leaving rates unchanged. | |
| 190 static bool ComputeRates(const Counts &counts, double rates[CT_SIZE]); | |
| 191 | |
| 192 // Total scaled error used by boosting algorithms. | |
| 193 double scaled_error_; | |
| 194 // Difference in result rating to be thought of as an "equal" choice. | |
| 195 double rating_epsilon_; | |
| 196 // Vector indexed by font_id from the samples of error accumulators. | |
| 197 std::vector<Counts> font_counts_; | |
| 198 // Counts of the results that map each unichar_id (from samples) to an | |
| 199 // incorrect shape_id. | |
| 200 GENERIC_2D_ARRAY<int> unichar_counts_; | |
| 201 // Count of the number of times each shape_id occurs, is correct, and multi- | |
| 202 // unichar. | |
| 203 std::vector<int> multi_unichar_counts_; | |
| 204 // Histogram of scores (as percent) for correct answers. | |
| 205 STATS ok_score_hist_; | |
| 206 // Histogram of scores (as percent) for incorrect answers. | |
| 207 STATS bad_score_hist_; | |
| 208 // Unicharset for printing character ids in results. | |
| 209 const UNICHARSET &unicharset_; | |
| 210 }; | |
| 211 | |
| 212 } // namespace tesseract. | |
| 213 | |
| 214 #endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */ |
