Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/fixxht.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: fixxht.cpp (Formerly fixxht.c) | |
| 3 * Description: Improve x_ht and look out for case inconsistencies | |
| 4 * Author: Phil Cheatle | |
| 5 * Created: Thu Aug 5 14:11:08 BST 1993 | |
| 6 * | |
| 7 * (C) Copyright 1992, Hewlett-Packard Ltd. | |
| 8 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 ** you may not use this file except in compliance with the License. | |
| 10 ** You may obtain a copy of the License at | |
| 11 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 ** Unless required by applicable law or agreed to in writing, software | |
| 13 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 ** See the License for the specific language governing permissions and | |
| 16 ** limitations under the License. | |
| 17 * | |
| 18 **********************************************************************/ | |
| 19 | |
| 20 #include "float2int.h" | |
| 21 #include "params.h" | |
| 22 #include "tesseractclass.h" | |
| 23 | |
| 24 #include <algorithm> | |
| 25 #include <cctype> | |
| 26 #include <cmath> | |
| 27 #include <cstring> | |
| 28 | |
| 29 namespace tesseract { | |
| 30 | |
| 31 // Fixxht overview. | |
| 32 // Premise: Initial estimate of x-height is adequate most of the time, but | |
| 33 // occasionally it is incorrect. Most notable causes of failure are: | |
| 34 // 1. Small caps, where the top of the caps is the same as the body text | |
| 35 // xheight. For small caps words the xheight needs to be reduced to correctly | |
| 36 // recognize the caps in the small caps word. | |
| 37 // 2. All xheight lines, such as summer. Here the initial estimate will have | |
| 38 // guessed that the blob tops are caps and will have placed the xheight too low. | |
| 39 // 3. Noise/logos beside words, or changes in font size on a line. Such | |
| 40 // things can blow the statistics and cause an incorrect estimate. | |
| 41 // 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged. | |
| 42 // In this case the x-height is often still correct. | |
| 43 // | |
| 44 // Algorithm. | |
| 45 // Compare the vertical position (top only) of alphanumerics in a word with | |
| 46 // the range of positions in training data (in the unicharset). | |
| 47 // See CountMisfitTops. If any characters disagree sufficiently with the | |
| 48 // initial xheight estimate, then recalculate the xheight, re-run OCR on | |
| 49 // the word, and if the number of vertical misfits goes down, along with | |
| 50 // either the word rating or certainty, then keep the new xheight. | |
| 51 // The new xheight is calculated as follows (see ComputeCompatibleXheight): | |
| 52 // For each alphanumeric character that has a vertically misplaced top | |
| 53 // (a misfit), yet its bottom is within the acceptable range (ie it is not | |
| 54 // likely a sub- or super-script) calculate the range of acceptable xheight | |
| 55 // positions from its range of tops, and give each value in the range a | |
| 56 // number of votes equal to the distance of its top from its acceptance range. | |
| 57 // The x-height position with the median of the votes becomes the new | |
| 58 // x-height. This assumes that most characters will be correctly recognized | |
| 59 // even if the x-height is incorrect. This is not a terrible assumption, but | |
| 60 // it is not great. An improvement would be to use a classifier that does | |
| 61 // not care about vertical position or scaling at all. | |
| 62 // Separately collect stats on shifted baselines and apply the same logic to | |
| 63 // computing a best-fit shift to fix the error. If the baseline needs to be | |
| 64 // shifted, but the x-height is OK, returns the original x-height along with | |
| 65 // the baseline shift to indicate that recognition needs to re-run. | |
| 66 | |
// Upper bound on (max_top - min_top) for a unicharset character. A character
// whose top range is wider than this is skipped: its top is neither counted as
// a misfit nor used to vote for a new x-height.
constexpr int kMaxCharTopRange = 48;
| 70 | |
| 71 // Returns the number of misfit blob tops in this word. | |
| 72 int Tesseract::CountMisfitTops(WERD_RES *word_res) { | |
| 73 int bad_blobs = 0; | |
| 74 int num_blobs = word_res->rebuild_word->NumBlobs(); | |
| 75 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { | |
| 76 TBLOB *blob = word_res->rebuild_word->blobs[blob_id]; | |
| 77 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); | |
| 78 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { | |
| 79 int top = blob->bounding_box().top(); | |
| 80 if (top >= INT_FEAT_RANGE) { | |
| 81 top = INT_FEAT_RANGE - 1; | |
| 82 } | |
| 83 int min_bottom, max_bottom, min_top, max_top; | |
| 84 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top); | |
| 85 if (max_top - min_top > kMaxCharTopRange) { | |
| 86 continue; | |
| 87 } | |
| 88 bool bad = | |
| 89 top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance; | |
| 90 if (bad) { | |
| 91 ++bad_blobs; | |
| 92 } | |
| 93 if (debug_x_ht_level >= 1) { | |
| 94 tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n", | |
| 95 unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top, | |
| 96 static_cast<int>(x_ht_acceptance_tolerance)); | |
| 97 } | |
| 98 } | |
| 99 } | |
| 100 return bad_blobs; | |
| 101 } | |
| 102 | |
| 103 // Returns a new x-height maximally compatible with the result in word_res. | |
| 104 // See comment above for overall algorithm. | |
| 105 float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) { | |
| 106 STATS top_stats(0, UINT8_MAX - 1); | |
| 107 STATS shift_stats(-UINT8_MAX, UINT8_MAX - 1); | |
| 108 int bottom_shift = 0; | |
| 109 int num_blobs = word_res->rebuild_word->NumBlobs(); | |
| 110 do { | |
| 111 top_stats.clear(); | |
| 112 shift_stats.clear(); | |
| 113 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { | |
| 114 TBLOB *blob = word_res->rebuild_word->blobs[blob_id]; | |
| 115 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); | |
| 116 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { | |
| 117 int top = blob->bounding_box().top() + bottom_shift; | |
| 118 // Clip the top to the limit of normalized feature space. | |
| 119 if (top >= INT_FEAT_RANGE) { | |
| 120 top = INT_FEAT_RANGE - 1; | |
| 121 } | |
| 122 int bottom = blob->bounding_box().bottom() + bottom_shift; | |
| 123 int min_bottom, max_bottom, min_top, max_top; | |
| 124 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top); | |
| 125 // Chars with a wild top range would mess up the result so ignore them. | |
| 126 if (max_top - min_top > kMaxCharTopRange) { | |
| 127 continue; | |
| 128 } | |
| 129 int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top, | |
| 130 top - (max_top + x_ht_acceptance_tolerance)); | |
| 131 int height = top - kBlnBaselineOffset; | |
| 132 if (debug_x_ht_level >= 2) { | |
| 133 tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ", | |
| 134 unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top, | |
| 135 max_top, bottom, top); | |
| 136 } | |
| 137 // Use only chars that fit in the expected bottom range, and where | |
| 138 // the range of tops is sensibly near the xheight. | |
| 139 if (min_bottom <= bottom + x_ht_acceptance_tolerance && | |
| 140 bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset && | |
| 141 max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) { | |
| 142 // Compute the x-height position using proportionality between the | |
| 143 // actual height and expected height. | |
| 144 int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset); | |
| 145 int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset); | |
| 146 if (debug_x_ht_level >= 2) { | |
| 147 tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht); | |
| 148 } | |
| 149 // The range of expected heights gets a vote equal to the distance | |
| 150 // of the actual top from the expected top. | |
| 151 for (int y = min_xht; y <= max_xht; ++y) { | |
| 152 top_stats.add(y, misfit_dist); | |
| 153 } | |
| 154 } else if ((min_bottom > bottom + x_ht_acceptance_tolerance || | |
| 155 bottom - x_ht_acceptance_tolerance > max_bottom) && | |
| 156 bottom_shift == 0) { | |
| 157 // Get the range of required bottom shift. | |
| 158 int min_shift = min_bottom - bottom; | |
| 159 int max_shift = max_bottom - bottom; | |
| 160 if (debug_x_ht_level >= 2) { | |
| 161 tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift); | |
| 162 } | |
| 163 // The range of expected shifts gets a vote equal to the min distance | |
| 164 // of the actual bottom from the expected bottom, spread over the | |
| 165 // range of its acceptance. | |
| 166 int misfit_weight = abs(min_shift); | |
| 167 if (max_shift > min_shift) { | |
| 168 misfit_weight /= max_shift - min_shift; | |
| 169 } | |
| 170 for (int y = min_shift; y <= max_shift; ++y) { | |
| 171 shift_stats.add(y, misfit_weight); | |
| 172 } | |
| 173 } else { | |
| 174 if (bottom_shift == 0) { | |
| 175 // Things with bottoms that are already ok need to say so, on the | |
| 176 // 1st iteration only. | |
| 177 shift_stats.add(0, kBlnBaselineOffset); | |
| 178 } | |
| 179 if (debug_x_ht_level >= 2) { | |
| 180 tprintf(" already OK\n"); | |
| 181 } | |
| 182 } | |
| 183 } | |
| 184 } | |
| 185 if (shift_stats.get_total() > top_stats.get_total()) { | |
| 186 bottom_shift = IntCastRounded(shift_stats.median()); | |
| 187 if (debug_x_ht_level >= 2) { | |
| 188 tprintf("Applying bottom shift=%d\n", bottom_shift); | |
| 189 } | |
| 190 } | |
| 191 } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total()); | |
| 192 // Baseline shift is opposite sign to the bottom shift. | |
| 193 *baseline_shift = -bottom_shift / word_res->denorm.y_scale(); | |
| 194 if (debug_x_ht_level >= 2) { | |
| 195 tprintf("baseline shift=%g\n", *baseline_shift); | |
| 196 } | |
| 197 if (top_stats.get_total() == 0) { | |
| 198 return bottom_shift != 0 ? word_res->x_height : 0.0f; | |
| 199 } | |
| 200 // The new xheight is just the median vote, which is then scaled out | |
| 201 // of BLN space back to pixel space to get the x-height in pixel space. | |
| 202 float new_xht = top_stats.median(); | |
| 203 if (debug_x_ht_level >= 2) { | |
| 204 tprintf("Median xht=%f\n", new_xht); | |
| 205 tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht, | |
| 206 new_xht / word_res->denorm.y_scale()); | |
| 207 } | |
| 208 // The xheight must change by at least x_ht_min_change to be used. | |
| 209 if (std::fabs(new_xht - kBlnXHeight) >= x_ht_min_change) { | |
| 210 return new_xht / word_res->denorm.y_scale(); | |
| 211 } else { | |
| 212 return bottom_shift != 0 ? word_res->x_height : 0.0f; | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 } // namespace tesseract |
