comparison mupdf-source/thirdparty/tesseract/src/wordrec/lm_consistency.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: lm_consistency.h
3 // Description: Struct for recording consistency of the paths representing
4 // OCR hypotheses.
5 // Author: Rika Antonova
6 //
7 // (C) Copyright 2012, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ////////////////////////////////////////////////////////////////////////
19
20 #ifndef TESSERACT_WORDREC_LM_CONSISTENCY_H_
21 #define TESSERACT_WORDREC_LM_CONSISTENCY_H_
22
23 #include <cstdint> // for INT16_MAX
24 #include "dawg.h" // for EDGE_REF, NO_EDGE
25 #include "dict.h" // for XH_GOOD, XH_INCONSISTENT, XHeightConsi...
26
27 class BLOB_CHOICE;
28
29 namespace tesseract {
30
// Printable names for the x-height consistency states.
// NOTE(review): presumably indexed by XHeightConsistencyEnum (declared in
// dict.h) -- confirm the order here matches the enum's declaration order.
static const char *const XHeightConsistencyEnumName[] = {"XH_GOOD", "XH_SUBNORMAL",
                                                         "XH_INCONSISTENT"};
36
37 // Struct for keeping track of the consistency of the path.
38 struct LMConsistencyInfo {
39 enum ChartypeEnum { CT_NONE, CT_ALPHA, CT_DIGIT, CT_OTHER };
40
41 // How much do characters have to be shifted away from normal parameters
42 // before we say they're not normal?
43 static const int kShiftThresh = 1;
44
45 // How much shifting from subscript to superscript and back
46 // before we declare shenanigans?
47 static const int kMaxEntropy = 1;
48
49 // Script positions - order important for entropy calculation.
50 static const int kSUB = 0, kNORM = 1, kSUP = 2;
51 static const int kNumPos = 3;
52
53 explicit LMConsistencyInfo(const LMConsistencyInfo *parent_info) {
54 if (parent_info == nullptr) {
55 // Initialize from scratch.
56 num_alphas = 0;
57 num_digits = 0;
58 num_punc = 0;
59 num_other = 0;
60 chartype = CT_NONE;
61 punc_ref = NO_EDGE;
62 invalid_punc = false;
63 num_non_first_upper = 0;
64 num_lower = 0;
65 script_id = 0;
66 inconsistent_script = false;
67 num_inconsistent_spaces = 0;
68 inconsistent_font = false;
69 // Initialize XHeight stats.
70 for (int i = 0; i < kNumPos; i++) {
71 xht_count[i] = 0;
72 xht_count_punc[i] = 0;
73 xht_lo[i] = 0;
74 xht_hi[i] = 256; // kBlnCellHeight
75 }
76 xht_sp = -1; // This invalid value indicates that there was no parent.
77 xpos_entropy = 0;
78 xht_decision = XH_GOOD;
79 } else {
80 // Copy parent info
81 *this = *parent_info;
82 }
83 }
84 inline int NumInconsistentPunc() const {
85 return invalid_punc ? num_punc : 0;
86 }
87 inline int NumInconsistentCase() const {
88 return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper;
89 }
90 inline int NumInconsistentChartype() const {
91 return (NumInconsistentPunc() + num_other +
92 ((num_alphas > num_digits) ? num_digits : num_alphas));
93 }
94 inline bool Consistent() const {
95 return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 &&
96 NumInconsistentChartype() == 0 && !inconsistent_script && !inconsistent_font &&
97 !InconsistentXHeight());
98 }
99 inline int NumInconsistentSpaces() const {
100 return num_inconsistent_spaces;
101 }
102 inline int InconsistentXHeight() const {
103 return xht_decision == XH_INCONSISTENT;
104 }
105 void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc);
106 float BodyMinXHeight() const {
107 if (InconsistentXHeight()) {
108 return 0.0f;
109 }
110 return xht_lo[kNORM];
111 }
112 float BodyMaxXHeight() const {
113 if (InconsistentXHeight()) {
114 return static_cast<float>(INT16_MAX);
115 }
116 return xht_hi[kNORM];
117 }
118
119 EDGE_REF punc_ref;
120 int num_alphas;
121 int num_digits;
122 int num_punc;
123 int num_other;
124 ChartypeEnum chartype;
125 XHeightConsistencyEnum xht_decision;
126 int num_non_first_upper;
127 int num_lower;
128 int script_id;
129 int num_inconsistent_spaces;
130 // Metrics clumped by position.
131 float xht_lo[kNumPos];
132 float xht_hi[kNumPos];
133 int16_t xht_count[kNumPos];
134 int16_t xht_count_punc[kNumPos];
135 int16_t xht_sp;
136 int16_t xpos_entropy;
137 bool invalid_punc;
138 bool inconsistent_script;
139 bool inconsistent_font;
140 };
141
142 } // namespace tesseract
143
144 #endif // TESSERACT_WORDREC_LM_CONSISTENCY_H_