Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/wordrec/lm_consistency.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: lm_consistency.cpp | |
| 3 // Description: Struct for recording consistency of the paths representing | |
| 4 // OCR hypotheses. | |
| 5 // Author: Rika Antonova | |
| 6 // Created: Mon Jun 20 11:26:43 PST 2012 | |
| 7 // | |
| 8 // (C) Copyright 2012, Google Inc. | |
| 9 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 // you may not use this file except in compliance with the License. | |
| 11 // You may obtain a copy of the License at | |
| 12 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 // Unless required by applicable law or agreed to in writing, software | |
| 14 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 // See the License for the specific language governing permissions and | |
| 17 // limitations under the License. | |
| 18 // | |
| 19 //////////////////////////////////////////////////////////////////////// | |
| 20 | |
| 21 #include "lm_consistency.h" | |
| 22 | |
| 23 #include "associate.h" | |
| 24 #include "dict.h" | |
| 25 #include "ratngs.h" | |
| 26 | |
| 27 namespace tesseract { | |
| 28 | |
| 29 void LMConsistencyInfo::ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc) { | |
| 30 if (xht_decision == XH_INCONSISTENT) { | |
| 31 return; // It isn't going to get any better. | |
| 32 } | |
| 33 | |
| 34 // Compute xheight consistency. | |
| 35 bool parent_null = xht_sp < 0; | |
| 36 int parent_sp = xht_sp; | |
| 37 // Debug strings. | |
| 38 if (b->yshift() > LMConsistencyInfo::kShiftThresh) { | |
| 39 xht_sp = LMConsistencyInfo::kSUP; | |
| 40 } else if (b->yshift() < -LMConsistencyInfo::kShiftThresh) { | |
| 41 xht_sp = LMConsistencyInfo::kSUB; | |
| 42 } else { | |
| 43 xht_sp = LMConsistencyInfo::kNORM; | |
| 44 } | |
| 45 xht_count[xht_sp]++; | |
| 46 if (is_punc) { | |
| 47 xht_count_punc[xht_sp]++; | |
| 48 } | |
| 49 if (!parent_null) { | |
| 50 xpos_entropy += abs(parent_sp - xht_sp); | |
| 51 } | |
| 52 // TODO(eger): Figure out a better way to account for small caps. | |
| 53 // For the first character not y-shifted, we only care if it is too small. | |
| 54 // Too large is common in drop caps and small caps. | |
| 55 // int16_t small_xht = b->min_xheight(); | |
| 56 // if (parent_vse == nullptr && sp == LanguageModelConsistencyInfo::kNORM) { | |
| 57 // small_xht = 0; | |
| 58 // } | |
| 59 IntersectRange(b->min_xheight(), b->max_xheight(), &(xht_lo[xht_sp]), &(xht_hi[xht_sp])); | |
| 60 | |
| 61 // Compute xheight inconsistency kinds. | |
| 62 if (parent_null) { | |
| 63 if (xht_count[kNORM] == 1) { | |
| 64 xht_decision = XH_GOOD; | |
| 65 } else { | |
| 66 xht_decision = XH_SUBNORMAL; | |
| 67 } | |
| 68 return; | |
| 69 } | |
| 70 | |
| 71 // When we intersect the ranges of xheights in pixels for all characters in | |
| 72 // each position (subscript, normal, superscript), | |
| 73 // How much range must be left? 0? [exactly one pixel height for xheight] 1? | |
| 74 // TODO(eger): Extend this code to take a prior for the rest of the line. | |
| 75 const int kMinIntersectedXHeightRange = 0; | |
| 76 for (int i = 0; i < kNumPos; i++) { | |
| 77 if (xht_lo[i] > xht_hi[i] - kMinIntersectedXHeightRange) { | |
| 78 xht_decision = XH_INCONSISTENT; | |
| 79 return; | |
| 80 } | |
| 81 } | |
| 82 | |
| 83 // Reject as improbable anything where there's much punctuation in subscript | |
| 84 // or superscript regions. | |
| 85 if (xht_count_punc[kSUB] > xht_count[kSUB] * 0.4 || | |
| 86 xht_count_punc[kSUP] > xht_count[kSUP] * 0.4) { | |
| 87 xht_decision = XH_INCONSISTENT; | |
| 88 return; | |
| 89 } | |
| 90 | |
| 91 // Now check that the subscript and superscript aren't too small relative to | |
| 92 // the mainline. | |
| 93 auto mainline_xht = static_cast<double>(xht_lo[kNORM]); | |
| 94 double kMinSizeRatio = 0.4; | |
| 95 if (mainline_xht > 0.0 && (static_cast<double>(xht_hi[kSUB]) / mainline_xht < kMinSizeRatio || | |
| 96 static_cast<double>(xht_hi[kSUP]) / mainline_xht < kMinSizeRatio)) { | |
| 97 xht_decision = XH_INCONSISTENT; | |
| 98 return; | |
| 99 } | |
| 100 // TODO(eger): Check into inconsistency of super/subscript y offsets. | |
| 101 if (xpos_entropy > kMaxEntropy) { | |
| 102 xht_decision = XH_INCONSISTENT; | |
| 103 return; | |
| 104 } | |
| 105 if (xht_count[kSUB] == 0 && xht_count[kSUP] == 0) { | |
| 106 xht_decision = XH_GOOD; | |
| 107 return; | |
| 108 } | |
| 109 xht_decision = XH_SUBNORMAL; | |
| 110 } | |
| 111 | |
| 112 } // namespace tesseract |
