Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/superscript.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************** | |
| 2 * File: superscript.cpp | |
| 3 * Description: Correction pass to fix superscripts and subscripts. | |
| 4 * Author: David Eger | |
| 5 * | |
| 6 * (C) Copyright 2012, Google, Inc. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #include "normalis.h" | |
| 20 #include "tesseractclass.h" | |
| 21 | |
| 22 namespace tesseract { | |
| 23 | |
| 24 static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) { | |
| 25 int num_chopped = 0; | |
| 26 for (int i = 0; i < num_unichars; i++) { | |
| 27 num_chopped += word->best_state[i]; | |
| 28 } | |
| 29 return num_chopped; | |
| 30 } | |
| 31 | |
| 32 static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) { | |
| 33 int num_chopped = 0; | |
| 34 for (int i = 0; i < num_unichars; i++) { | |
| 35 num_chopped += word->best_state[word->best_state.size() - 1 - i]; | |
| 36 } | |
| 37 return num_chopped; | |
| 38 } | |
| 39 | |
| 40 /** | |
| 41 * Given a recognized blob, see if a contiguous collection of sub-pieces | |
| 42 * (chopped blobs) starting at its left might qualify as being a subscript | |
| 43 * or superscript letter based only on y position. Also do this for the | |
| 44 * right side. | |
| 45 */ | |
| 46 static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom, | |
| 47 int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers, | |
| 48 ScriptPos *trailing_pos, int *num_trailing_outliers) { | |
| 49 ScriptPos sp_unused1, sp_unused2; | |
| 50 int unused1, unused2; | |
| 51 if (!leading_pos) { | |
| 52 leading_pos = &sp_unused1; | |
| 53 } | |
| 54 if (!num_leading_outliers) { | |
| 55 num_leading_outliers = &unused1; | |
| 56 } | |
| 57 if (!trailing_pos) { | |
| 58 trailing_pos = &sp_unused2; | |
| 59 } | |
| 60 if (!num_trailing_outliers) { | |
| 61 num_trailing_outliers = &unused2; | |
| 62 } | |
| 63 | |
| 64 *num_leading_outliers = *num_trailing_outliers = 0; | |
| 65 *leading_pos = *trailing_pos = SP_NORMAL; | |
| 66 | |
| 67 int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index); | |
| 68 int num_chopped_pieces = word->best_state[rebuilt_blob_index]; | |
| 69 ScriptPos last_pos = SP_NORMAL; | |
| 70 int trailing_outliers = 0; | |
| 71 for (int i = 0; i < num_chopped_pieces; i++) { | |
| 72 TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box(); | |
| 73 ScriptPos pos = SP_NORMAL; | |
| 74 if (box.bottom() >= super_y_bottom) { | |
| 75 pos = SP_SUPERSCRIPT; | |
| 76 } else if (box.top() <= sub_y_top) { | |
| 77 pos = SP_SUBSCRIPT; | |
| 78 } | |
| 79 if (pos == SP_NORMAL) { | |
| 80 if (trailing_outliers == i) { | |
| 81 *num_leading_outliers = trailing_outliers; | |
| 82 *leading_pos = last_pos; | |
| 83 } | |
| 84 trailing_outliers = 0; | |
| 85 } else { | |
| 86 if (pos == last_pos) { | |
| 87 trailing_outliers++; | |
| 88 } else { | |
| 89 trailing_outliers = 1; | |
| 90 } | |
| 91 } | |
| 92 last_pos = pos; | |
| 93 } | |
| 94 *num_trailing_outliers = trailing_outliers; | |
| 95 *trailing_pos = last_pos; | |
| 96 } | |
| 97 | |
| 98 /** | |
| 99 * Attempt to split off any high (or low) bits at the ends of the word with poor | |
| 100 * certainty and recognize them separately. If the certainty gets much better | |
| 101 * and other sanity checks pass, accept. | |
| 102 * | |
| 103 * This superscript fix is meant to be called in the second pass of recognition | |
| 104 * when we have tried once and already have a preliminary answer for word. | |
| 105 * | |
| 106 * @return Whether we modified the given word. | |
| 107 */ | |
| 108 bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) { | |
| 109 if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) { | |
| 110 return false; | |
| 111 } | |
| 112 int num_leading, num_trailing; | |
| 113 ScriptPos sp_leading, sp_trailing; | |
| 114 float leading_certainty, trailing_certainty; | |
| 115 float avg_certainty, unlikely_threshold; | |
| 116 | |
| 117 // Calculate the number of whole suspicious characters at the edges. | |
| 118 GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing, | |
| 119 &sp_trailing, &trailing_certainty, &avg_certainty, | |
| 120 &unlikely_threshold); | |
| 121 | |
| 122 const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super"; | |
| 123 const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super"; | |
| 124 | |
| 125 int num_blobs = word->best_choice->length(); | |
| 126 | |
| 127 // Calculate the remainder (partial characters) at the edges. | |
| 128 // This accounts for us having classified the best version of | |
| 129 // a word as [speaker?'] when it was instead [speaker.^{21}] | |
| 130 // (that is we accidentally thought the 2 was attached to the period). | |
| 131 int num_remainder_leading = 0, num_remainder_trailing = 0; | |
| 132 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) { | |
| 133 int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom; | |
| 134 int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top; | |
| 135 int last_word_char = num_blobs - 1 - num_trailing; | |
| 136 float last_char_certainty = word->best_choice->certainty(last_word_char); | |
| 137 if (word->best_choice->unichar_id(last_word_char) != 0 && | |
| 138 last_char_certainty <= unlikely_threshold) { | |
| 139 ScriptPos rpos; | |
| 140 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos, | |
| 141 &num_remainder_trailing); | |
| 142 if (num_trailing > 0 && rpos != sp_trailing) { | |
| 143 num_remainder_trailing = 0; | |
| 144 } | |
| 145 if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) { | |
| 146 trailing_certainty = last_char_certainty; | |
| 147 } | |
| 148 } | |
| 149 bool another_blob_available = | |
| 150 (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs; | |
| 151 int first_char_certainty = word->best_choice->certainty(num_leading); | |
| 152 if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 && | |
| 153 first_char_certainty <= unlikely_threshold) { | |
| 154 ScriptPos lpos; | |
| 155 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading, | |
| 156 nullptr, nullptr); | |
| 157 if (num_leading > 0 && lpos != sp_leading) { | |
| 158 num_remainder_leading = 0; | |
| 159 } | |
| 160 if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) { | |
| 161 leading_certainty = first_char_certainty; | |
| 162 } | |
| 163 } | |
| 164 } | |
| 165 | |
| 166 // If nothing to do, bail now. | |
| 167 if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) { | |
| 168 return false; | |
| 169 } | |
| 170 | |
| 171 if (superscript_debug >= 1) { | |
| 172 tprintf("Candidate for superscript detection: %s (", | |
| 173 word->best_choice->unichar_string().c_str()); | |
| 174 if (num_leading || num_remainder_leading) { | |
| 175 tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos); | |
| 176 } | |
| 177 if (num_trailing || num_remainder_trailing) { | |
| 178 tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos); | |
| 179 } | |
| 180 tprintf(")\n"); | |
| 181 } | |
| 182 if (superscript_debug >= 3) { | |
| 183 word->best_choice->print(); | |
| 184 } | |
| 185 if (superscript_debug >= 2) { | |
| 186 tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ", avg_certainty, | |
| 187 unlikely_threshold); | |
| 188 if (num_leading) { | |
| 189 tprintf("Orig. leading (min): %.2f ", leading_certainty); | |
| 190 } | |
| 191 if (num_trailing) { | |
| 192 tprintf("Orig. trailing (min): %.2f ", trailing_certainty); | |
| 193 } | |
| 194 tprintf("\n"); | |
| 195 } | |
| 196 | |
| 197 // We've now calculated the number of rebuilt blobs we want to carve off. | |
| 198 // However, split_word() works from TBLOBs in chopped_word, so we need to | |
| 199 // convert to those. | |
| 200 int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading; | |
| 201 int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing; | |
| 202 | |
| 203 int retry_leading = 0; | |
| 204 int retry_trailing = 0; | |
| 205 bool is_good = false; | |
| 206 WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading, | |
| 207 num_chopped_trailing, trailing_certainty, sp_trailing, | |
| 208 word, &is_good, &retry_leading, &retry_trailing); | |
| 209 if (is_good) { | |
| 210 word->ConsumeWordResults(revised); | |
| 211 } else if (retry_leading || retry_trailing) { | |
| 212 int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading); | |
| 213 int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing); | |
| 214 WERD_RES *revised2 = TrySuperscriptSplits( | |
| 215 retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing, | |
| 216 trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing); | |
| 217 if (is_good) { | |
| 218 word->ConsumeWordResults(revised2); | |
| 219 } | |
| 220 delete revised2; | |
| 221 } | |
| 222 delete revised; | |
| 223 return is_good; | |
| 224 } | |
| 225 | |
| 226 /** | |
| 227 * Determine how many characters (rebuilt blobs) on each end of a given word | |
| 228 * might plausibly be superscripts so SubAndSuperscriptFix can try to | |
| 229 * re-recognize them. Even if we find no whole blobs at either end, | |
| 230 * we will set *unlikely_threshold to a certainty that might be used to | |
| 231 * select "bad enough" outlier characters. If *unlikely_threshold is set to 0, | |
| 232 * though, there's really no hope. | |
| 233 * | |
| 234 * @param[in] word The word to examine. | |
| 235 * @param[out] num_rebuilt_leading the number of rebuilt blobs at the start | |
| 236 * of the word which are all up or down and | |
| 237 * seem badly classified. | |
| 238 * @param[out] leading_pos "super" or "sub" (for debugging) | |
| 239 * @param[out] leading_certainty the worst certainty in the leading blobs. | |
| 240 * @param[out] num_rebuilt_trailing the number of rebuilt blobs at the end | |
| 241 * of the word which are all up or down and | |
| 242 * seem badly classified. | |
| 243 * @param[out] trailing_pos "super" or "sub" (for debugging) | |
| 244 * @param[out] trailing_certainty the worst certainty in the trailing blobs. | |
| 245 * @param[out] avg_certainty the average certainty of "normal" blobs in | |
| 246 * the word. | |
| 247 * @param[out] unlikely_threshold the threshold (on certainty) we used to | |
| 248 * select "bad enough" outlier characters. | |
| 249 */ | |
| 250 void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, | |
| 251 ScriptPos *leading_pos, float *leading_certainty, | |
| 252 int *num_rebuilt_trailing, ScriptPos *trailing_pos, | |
| 253 float *trailing_certainty, float *avg_certainty, | |
| 254 float *unlikely_threshold) { | |
| 255 *avg_certainty = *unlikely_threshold = 0.0f; | |
| 256 *num_rebuilt_leading = *num_rebuilt_trailing = 0; | |
| 257 *leading_certainty = *trailing_certainty = 0.0f; | |
| 258 | |
| 259 int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom; | |
| 260 int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top; | |
| 261 | |
| 262 // Step one: Get an average certainty for "normally placed" characters. | |
| 263 | |
| 264 // Counts here are of blobs in the rebuild_word / unichars in best_choice. | |
| 265 *leading_pos = *trailing_pos = SP_NORMAL; | |
| 266 int leading_outliers = 0; | |
| 267 int trailing_outliers = 0; | |
| 268 int num_normal = 0; | |
| 269 float normal_certainty_total = 0.0f; | |
| 270 float worst_normal_certainty = 0.0f; | |
| 271 ScriptPos last_pos = SP_NORMAL; | |
| 272 int num_blobs = word->rebuild_word->NumBlobs(); | |
| 273 for (int b = 0; b < num_blobs; ++b) { | |
| 274 TBOX box = word->rebuild_word->blobs[b]->bounding_box(); | |
| 275 ScriptPos pos = SP_NORMAL; | |
| 276 if (box.bottom() >= super_y_bottom) { | |
| 277 pos = SP_SUPERSCRIPT; | |
| 278 } else if (box.top() <= sub_y_top) { | |
| 279 pos = SP_SUBSCRIPT; | |
| 280 } | |
| 281 if (pos == SP_NORMAL) { | |
| 282 if (word->best_choice->unichar_id(b) != 0) { | |
| 283 float char_certainty = word->best_choice->certainty(b); | |
| 284 if (char_certainty < worst_normal_certainty) { | |
| 285 worst_normal_certainty = char_certainty; | |
| 286 } | |
| 287 num_normal++; | |
| 288 normal_certainty_total += char_certainty; | |
| 289 } | |
| 290 if (trailing_outliers == b) { | |
| 291 leading_outliers = trailing_outliers; | |
| 292 *leading_pos = last_pos; | |
| 293 } | |
| 294 trailing_outliers = 0; | |
| 295 } else { | |
| 296 if (last_pos == pos) { | |
| 297 trailing_outliers++; | |
| 298 } else { | |
| 299 trailing_outliers = 1; | |
| 300 } | |
| 301 } | |
| 302 last_pos = pos; | |
| 303 } | |
| 304 *trailing_pos = last_pos; | |
| 305 if (num_normal >= 3) { // throw out the worst as an outlier. | |
| 306 num_normal--; | |
| 307 normal_certainty_total -= worst_normal_certainty; | |
| 308 } | |
| 309 if (num_normal > 0) { | |
| 310 *avg_certainty = normal_certainty_total / num_normal; | |
| 311 *unlikely_threshold = superscript_worse_certainty * (*avg_certainty); | |
| 312 } | |
| 313 if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) { | |
| 314 return; | |
| 315 } | |
| 316 | |
| 317 // Step two: Try to split off bits of the word that are both outliers | |
| 318 // and have much lower certainty than average | |
| 319 // Calculate num_leading and leading_certainty. | |
| 320 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers; | |
| 321 (*num_rebuilt_leading)++) { | |
| 322 float char_certainty = word->best_choice->certainty(*num_rebuilt_leading); | |
| 323 if (char_certainty > *unlikely_threshold) { | |
| 324 break; | |
| 325 } | |
| 326 if (char_certainty < *leading_certainty) { | |
| 327 *leading_certainty = char_certainty; | |
| 328 } | |
| 329 } | |
| 330 | |
| 331 // Calculate num_trailing and trailing_certainty. | |
| 332 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0; | |
| 333 *num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) { | |
| 334 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing; | |
| 335 float char_certainty = word->best_choice->certainty(blob_idx); | |
| 336 if (char_certainty > *unlikely_threshold) { | |
| 337 break; | |
| 338 } | |
| 339 if (char_certainty < *trailing_certainty) { | |
| 340 *trailing_certainty = char_certainty; | |
| 341 } | |
| 342 } | |
| 343 } | |
| 344 | |
| 345 /** | |
| 346 * Try splitting off the given number of (chopped) blobs from the front and | |
| 347 * back of the given word and recognizing the pieces. | |
| 348 * | |
| 349 * @param[in] num_chopped_leading how many chopped blobs from the left | |
| 350 * end of the word to chop off and try recognizing as a | |
| 351 * superscript (or subscript) | |
| 352 * @param[in] leading_certainty the (minimum) certainty had by the | |
| 353 * characters in the original leading section. | |
| 354 * @param[in] leading_pos "super" or "sub" (for debugging) | |
| 355 * @param[in] num_chopped_trailing how many chopped blobs from the right | |
| 356 * end of the word to chop off and try recognizing as a | |
| 357 * superscript (or subscript) | |
| 358 * @param[in] trailing_certainty the (minimum) certainty had by the | |
| 359 * characters in the original trailing section. | |
| 360 * @param[in] trailing_pos "super" or "sub" (for debugging) | |
| 361 * @param[in] word the word to try to chop up. | |
| 362 * @param[out] is_good do we believe our result? | |
| 363 * @param[out] retry_rebuild_leading, retry_rebuild_trailing | |
| 364 * If non-zero, and !is_good, then the caller may have luck trying | |
| 365 * to split the returned word with this number of (rebuilt) leading | |
| 366 * and trailing blobs / unichars. | |
| 367 * @return A word which is the result of re-recognizing as asked. | |
| 368 */ | |
| 369 WERD_RES *Tesseract::TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, | |
| 370 ScriptPos leading_pos, int num_chopped_trailing, | |
| 371 float trailing_certainty, ScriptPos trailing_pos, | |
| 372 WERD_RES *word, bool *is_good, int *retry_rebuild_leading, | |
| 373 int *retry_rebuild_trailing) { | |
| 374 int num_chopped = word->chopped_word->NumBlobs(); | |
| 375 | |
| 376 *retry_rebuild_leading = *retry_rebuild_trailing = 0; | |
| 377 | |
| 378 // Chop apart the word into up to three pieces. | |
| 379 | |
| 380 BlamerBundle *bb0 = nullptr; | |
| 381 BlamerBundle *bb1 = nullptr; | |
| 382 WERD_RES *prefix = nullptr; | |
| 383 WERD_RES *core = nullptr; | |
| 384 WERD_RES *suffix = nullptr; | |
| 385 if (num_chopped_leading > 0) { | |
| 386 prefix = new WERD_RES(*word); | |
| 387 split_word(prefix, num_chopped_leading, &core, &bb0); | |
| 388 } else { | |
| 389 core = new WERD_RES(*word); | |
| 390 } | |
| 391 | |
| 392 if (num_chopped_trailing > 0) { | |
| 393 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading; | |
| 394 split_word(core, split_pt, &suffix, &bb1); | |
| 395 } | |
| 396 | |
| 397 // Recognize the pieces in turn. | |
| 398 int saved_cp_multiplier = classify_class_pruner_multiplier; | |
| 399 int saved_im_multiplier = classify_integer_matcher_multiplier; | |
| 400 if (prefix) { | |
| 401 // Turn off Tesseract's y-position penalties for the leading superscript. | |
| 402 classify_class_pruner_multiplier.set_value(0); | |
| 403 classify_integer_matcher_multiplier.set_value(0); | |
| 404 | |
| 405 // Adjust our expectations about the baseline for this prefix. | |
| 406 if (superscript_debug >= 3) { | |
| 407 tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading); | |
| 408 } | |
| 409 recog_word_recursive(prefix); | |
| 410 if (superscript_debug >= 2) { | |
| 411 tprintf(" The leading bits look like %s %s\n", ScriptPosToString(leading_pos), | |
| 412 prefix->best_choice->unichar_string().c_str()); | |
| 413 } | |
| 414 | |
| 415 // Restore the normal y-position penalties. | |
| 416 classify_class_pruner_multiplier.set_value(saved_cp_multiplier); | |
| 417 classify_integer_matcher_multiplier.set_value(saved_im_multiplier); | |
| 418 } | |
| 419 | |
| 420 if (superscript_debug >= 3) { | |
| 421 tprintf(" recognizing middle %d chopped blobs\n", | |
| 422 num_chopped - num_chopped_leading - num_chopped_trailing); | |
| 423 } | |
| 424 | |
| 425 if (suffix) { | |
| 426 // Turn off Tesseract's y-position penalties for the trailing superscript. | |
| 427 classify_class_pruner_multiplier.set_value(0); | |
| 428 classify_integer_matcher_multiplier.set_value(0); | |
| 429 | |
| 430 if (superscript_debug >= 3) { | |
| 431 tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing); | |
| 432 } | |
| 433 recog_word_recursive(suffix); | |
| 434 if (superscript_debug >= 2) { | |
| 435 tprintf(" The trailing bits look like %s %s\n", ScriptPosToString(trailing_pos), | |
| 436 suffix->best_choice->unichar_string().c_str()); | |
| 437 } | |
| 438 | |
| 439 // Restore the normal y-position penalties. | |
| 440 classify_class_pruner_multiplier.set_value(saved_cp_multiplier); | |
| 441 classify_integer_matcher_multiplier.set_value(saved_im_multiplier); | |
| 442 } | |
| 443 | |
| 444 // Evaluate whether we think the results are believably better | |
| 445 // than what we already had. | |
| 446 bool good_prefix = | |
| 447 !prefix || BelievableSuperscript(superscript_debug >= 1, *prefix, | |
| 448 superscript_bettered_certainty * leading_certainty, | |
| 449 retry_rebuild_leading, nullptr); | |
| 450 bool good_suffix = | |
| 451 !suffix || BelievableSuperscript(superscript_debug >= 1, *suffix, | |
| 452 superscript_bettered_certainty * trailing_certainty, nullptr, | |
| 453 retry_rebuild_trailing); | |
| 454 | |
| 455 *is_good = good_prefix && good_suffix; | |
| 456 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) { | |
| 457 // None of it is any good. Quit now. | |
| 458 delete core; | |
| 459 delete prefix; | |
| 460 delete suffix; | |
| 461 delete bb1; | |
| 462 return nullptr; | |
| 463 } | |
| 464 recog_word_recursive(core); | |
| 465 | |
| 466 // Now paste the results together into core. | |
| 467 if (suffix) { | |
| 468 suffix->SetAllScriptPositions(trailing_pos); | |
| 469 join_words(core, suffix, bb1); | |
| 470 } | |
| 471 if (prefix) { | |
| 472 prefix->SetAllScriptPositions(leading_pos); | |
| 473 join_words(prefix, core, bb0); | |
| 474 core = prefix; | |
| 475 prefix = nullptr; | |
| 476 } | |
| 477 | |
| 478 if (superscript_debug >= 1) { | |
| 479 tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT", | |
| 480 core->best_choice->unichar_string().c_str()); | |
| 481 } | |
| 482 return core; | |
| 483 } | |
| 484 | |
| 485 /** | |
| 486 * Return whether this is believable superscript or subscript text. | |
| 487 * | |
| 488 * We insist that: | |
| 489 * + there are no punctuation marks. | |
| 490 * + there are no italics. | |
| 491 * + no normal-sized character is smaller than superscript_scaledown_ratio | |
| 492 * of what it ought to be, and | |
| 493 * + each character is at least as certain as certainty_threshold. | |
| 494 * | |
| 495 * @param[in] debug If true, spew debug output | |
| 496 * @param[in] word The word whose best_choice we're evaluating | |
| 497 * @param[in] certainty_threshold If any of the characters have less | |
| 498 * certainty than this, reject. | |
| 499 * @param[out] left_ok How many left-side characters were ok? | |
| 500 * @param[out] right_ok How many right-side characters were ok? | |
| 501 * @return Whether the complete best choice is believable as a superscript. | |
| 502 */ | |
| 503 bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, | |
| 504 int *left_ok, int *right_ok) const { | |
| 505 unsigned initial_ok_run_count = 0; | |
| 506 unsigned ok_run_count = 0; | |
| 507 float worst_certainty = 0.0f; | |
| 508 const WERD_CHOICE &wc = *word.best_choice; | |
| 509 | |
| 510 const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table(); | |
| 511 for (unsigned i = 0; i < wc.length(); i++) { | |
| 512 TBLOB *blob = word.rebuild_word->blobs[i]; | |
| 513 UNICHAR_ID unichar_id = wc.unichar_id(i); | |
| 514 float char_certainty = wc.certainty(i); | |
| 515 bool bad_certainty = char_certainty < certainty_threshold; | |
| 516 bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id); | |
| 517 bool is_italic = word.fontinfo && word.fontinfo->is_italic(); | |
| 518 BLOB_CHOICE *choice = word.GetBlobChoice(i); | |
| 519 if (choice && fontinfo_table.size() > 0) { | |
| 520 // Get better information from the specific choice, if available. | |
| 521 int font_id1 = choice->fontinfo_id(); | |
| 522 bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.at(font_id1).is_italic() : false; | |
| 523 int font_id2 = choice->fontinfo_id2(); | |
| 524 is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.at(font_id2).is_italic()); | |
| 525 } | |
| 526 | |
| 527 float height_fraction = 1.0f; | |
| 528 float char_height = blob->bounding_box().height(); | |
| 529 float normal_height = char_height; | |
| 530 if (wc.unicharset()->top_bottom_useful()) { | |
| 531 int min_bot, max_bot, min_top, max_top; | |
| 532 wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top); | |
| 533 float hi_height = max_top - max_bot; | |
| 534 float lo_height = min_top - min_bot; | |
| 535 normal_height = (hi_height + lo_height) / 2; | |
| 536 if (normal_height >= kBlnXHeight) { | |
| 537 // Only ding characters that we have decent information for because | |
| 538 // they're supposed to be normal sized, not tiny specks or dashes. | |
| 539 height_fraction = char_height / normal_height; | |
| 540 } | |
| 541 } | |
| 542 bool bad_height = height_fraction < superscript_scaledown_ratio; | |
| 543 | |
| 544 if (debug) { | |
| 545 if (is_italic) { | |
| 546 tprintf(" Rejecting: superscript is italic.\n"); | |
| 547 } | |
| 548 if (is_punc) { | |
| 549 tprintf(" Rejecting: punctuation present.\n"); | |
| 550 } | |
| 551 const char *char_str = wc.unicharset()->id_to_unichar(unichar_id); | |
| 552 if (bad_certainty) { | |
| 553 tprintf( | |
| 554 " Rejecting: don't believe character %s with certainty %.2f " | |
| 555 "which is less than threshold %.2f\n", | |
| 556 char_str, char_certainty, certainty_threshold); | |
| 557 } | |
| 558 if (bad_height) { | |
| 559 tprintf( | |
| 560 " Rejecting: character %s seems too small @ %.2f versus " | |
| 561 "expected %.2f\n", | |
| 562 char_str, char_height, normal_height); | |
| 563 } | |
| 564 } | |
| 565 if (bad_certainty || bad_height || is_punc || is_italic) { | |
| 566 if (ok_run_count == i) { | |
| 567 initial_ok_run_count = ok_run_count; | |
| 568 } | |
| 569 ok_run_count = 0; | |
| 570 } else { | |
| 571 ok_run_count++; | |
| 572 } | |
| 573 if (char_certainty < worst_certainty) { | |
| 574 worst_certainty = char_certainty; | |
| 575 } | |
| 576 } | |
| 577 bool all_ok = ok_run_count == wc.length(); | |
| 578 if (all_ok && debug) { | |
| 579 tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty); | |
| 580 } | |
| 581 if (!all_ok) { | |
| 582 if (left_ok) { | |
| 583 *left_ok = initial_ok_run_count; | |
| 584 } | |
| 585 if (right_ok) { | |
| 586 *right_ok = ok_run_count; | |
| 587 } | |
| 588 } | |
| 589 return all_ok; | |
| 590 } | |
| 591 | |
| 592 } // namespace tesseract |
