comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validator.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #include "validator.h"
2
3 #include <algorithm>
4 #include <iterator>
5 #include <unordered_map>
6 #include <vector>
7
8 #include "icuerrorcode.h"
9 #include "unicode/uchar.h" // From libicu
10 #include "unicode/uscript.h" // From libicu
11 #include "validate_grapheme.h"
12 #include "validate_indic.h"
13 #include "validate_javanese.h"
14 #include "validate_khmer.h"
15 #include "validate_myanmar.h"
16
17 namespace tesseract {
18
19 // Some specific but universally useful unicodes.
20 const char32 Validator::kZeroWidthSpace = 0x200B;
21 const char32 Validator::kZeroWidthNonJoiner = 0x200C;
22 const char32 Validator::kZeroWidthJoiner = 0x200D;
23 const char32 Validator::kLeftToRightMark = 0x200E;
24 const char32 Validator::kRightToLeftMark = 0x200F;
25 const char32 Validator::kInvalid = 0xfffd;
26
27 // Destructor.
28 // It is defined here, so the compiler can create a single vtable
29 // instead of weak vtables in every compilation unit.
30 Validator::~Validator() = default;
31
32 // Validates and cleans the src vector of unicodes to the *dest, according to
33 // g_mode. In the case of kSingleString, a single vector containing the whole
34 // result is added to *dest. With kCombined, multiple vectors are added to
35 // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
36 // added to *dest with a smaller unit representing a glyph in each.
37 // In case of validation error, returns false and as much as possible of the
38 // input, without discarding invalid text.
39 /* static */
40 bool Validator::ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors,
41 const std::vector<char32> &src,
42 std::vector<std::vector<char32>> *dest) {
43 ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);
44 std::vector<std::vector<char32>> graphemes;
45 ViramaScript script = MostFrequentViramaScript(src);
46 bool success = true;
47 if (script == ViramaScript::kNonVirama) {
48 // The grapheme segmenter's maximum segmentation is the grapheme unit, so
49 // up the mode by 1 to get the desired effect.
50 if (g_mode == GraphemeNormMode::kCombined) {
51 g_mode = GraphemeNormMode::kGlyphSplit;
52 } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
53 g_mode = GraphemeNormMode::kIndividualUnicodes;
54 }
55 // Just do grapheme segmentation.
56 success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
57 } else {
58 success =
59 g_validator.ValidateCleanAndSegmentInternal(GraphemeNormMode::kGlyphSplit, src, &graphemes);
60 std::unique_ptr<Validator> validator(ScriptValidator(script, report_errors));
61 for (const auto &grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
63 success = false;
64 }
65 }
66 }
67 return success;
68 }
69
70 // Factory method that understands how to map script to the right subclass.
71 std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script, bool report_errors) {
72 switch (script) {
73 #define CASE(e, T) case ViramaScript::e: return std::make_unique<T>(script, report_errors)
74 CASE(kNonVirama, ValidateGrapheme);
75 CASE(kJavanese, ValidateJavanese);
76 CASE(kMyanmar, ValidateMyanmar);
77 CASE(kKhmer, ValidateKhmer);
78 #undef CASE
79 default:
80 return std::make_unique<ValidateIndic>(script, report_errors);
81 }
82 }
83
84 // Internal version of the public static ValidateCleanAndSegment.
85 // Validates and cleans the src vector of unicodes to the *dest, according to
86 // its type and the given g_mode.
87 // In case of validation error, returns false and returns as much as possible
88 // of the input, without discarding invalid text.
89 bool Validator::ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
90 const std::vector<char32> &src,
91 std::vector<std::vector<char32>> *dest) {
92 Clear();
93 ComputeClassCodes(src);
94 bool success = true;
95 for (codes_used_ = 0; codes_used_ < codes_.size();) {
96 if (!ConsumeGraphemeIfValid()) {
97 success = false;
98 ++codes_used_;
99 }
100 }
101 MoveResultsToDest(g_mode, dest);
102 return success;
103 }
104
105 // Moves the results from parts_ or output_ to dest according to g_mode.
106 void Validator::MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest) {
107 if (g_mode == GraphemeNormMode::kIndividualUnicodes) {
108 // Append each element of the combined output_ that we made as a new vector
109 // in dest.
110 dest->reserve(dest->size() + output_.size());
111 for (char32 ch : output_) {
112 dest->push_back({ch});
113 }
114 } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
115 // Append all the parts_ that we made onto dest.
116 std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));
117 } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
118 // Append the combined output_ that we made onto dest as one new vector.
119 dest->push_back(std::vector<char32>());
120 output_.swap(dest->back());
121 } else { // kNone.
122 // Append the combined output_ that we made onto the last existing element
123 // of dest.
124 dest->back().insert(dest->back().end(), output_.begin(), output_.end());
125 }
126 }
127
// Strict "less than" ordering on the second member of two int pairs; the
// first member is ignored.
static bool CmpPairSecond(const std::pair<int, int> &p1, const std::pair<int, int> &p2) {
  const int lhs_count = p1.second;
  const int rhs_count = p2.second;
  return rhs_count > lhs_count;
}
131
132 // Computes and returns the ViramaScript corresponding to the most frequent
133 // virama-using script in the input, or kNonVirama if none are present.
134 /* static */
135 ViramaScript Validator::MostFrequentViramaScript(const std::vector<char32> &utf32) {
136 std::unordered_map<int, int> histogram;
137 for (char32 ch : utf32) {
138 // Determine the codepage base. For the Indic scripts, Khmer and Javanese,
139 // it is sufficient to divide by kIndicCodePageSize but Myanmar is all over
140 // the unicode code space, so use its script id.
141 int base = ch / kIndicCodePageSize;
142 IcuErrorCode err;
143 UScriptCode script_code = uscript_getScript(ch, err);
144 if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode && script_code != USCRIPT_COMMON) ||
145 script_code == USCRIPT_MYANMAR) {
146 if (script_code == USCRIPT_MYANMAR) {
147 base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
148 }
149 ++histogram[base];
150 }
151 }
152 if (!histogram.empty()) {
153 int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first;
154 auto codebase = static_cast<char32>(base * kIndicCodePageSize);
155 // Check for validity.
156 if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
157 codebase == static_cast<char32>(ViramaScript::kJavanese) ||
158 codebase == static_cast<char32>(ViramaScript::kKhmer) ||
159 (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
160 codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
161 return static_cast<ViramaScript>(codebase);
162 }
163 }
164 return ViramaScript::kNonVirama;
165 }
166
167 // Returns true if the given UTF-32 unicode is a "virama" character.
168 /* static */
169 bool Validator::IsVirama(char32 unicode) {
170 return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
171 (unicode & 0x7f) == 0x4d) ||
172 unicode == kSinhalaVirama || unicode == kJavaneseVirama || unicode == kMyanmarVirama ||
173 unicode == kKhmerVirama;
174 }
175
176 // Returns true if the given UTF-32 unicode is a vedic accent.
177 /* static */
178 bool Validator::IsVedicAccent(char32 unicode) {
179 return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
180 (0x951 <= unicode && unicode <= 0x954);
181 }
182
183 // Returns true if the script is one that uses subscripts for conjuncts.
184 bool Validator::IsSubscriptScript() const {
185 return script_ == ViramaScript::kTelugu || script_ == ViramaScript::kKannada ||
186 script_ == ViramaScript::kJavanese || script_ == ViramaScript::kMyanmar ||
187 script_ == ViramaScript::kKhmer;
188 }
189
190 void Validator::ComputeClassCodes(const std::vector<char32> &text) {
191 codes_.reserve(text.size());
192 for (char32 c : text) {
193 codes_.emplace_back(UnicodeToCharClass(c), c);
194 }
195 }
196
197 // Resets to the initial state.
198 void Validator::Clear() {
199 codes_.clear();
200 parts_.clear();
201 output_.clear();
202 codes_used_ = 0;
203 output_used_ = 0;
204 }
205
206 } // namespace tesseract