comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_indic.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #include "validate_indic.h"
2 #include "errcode.h"
3 #include "tprintf.h"
4
5 namespace tesseract {
6
7 // Returns whether codes matches the pattern for an Indic Grapheme.
8 // The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
9 // has a BNF for valid syllables (Graphemes) which is modified slightly
10 // for Unicode. Notably U+200C and U+200D are used before/after the
11 // virama/virama to express explicit or soft viramas.
12 // Also the unicode v.9 Malayalam entry states that CZHC can be used in several
13 // Indic languages to request traditional ligatures, and CzHC is Malayalam-
14 // specific for requesting open conjuncts.
15 //
16 // + vowel Grapheme: V[D](v)*
17 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
18 bool ValidateIndic::ConsumeGraphemeIfValid() {
19 switch (codes_[codes_used_].first) {
20 case CharClass::kConsonant:
21 return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
22 case CharClass::kVowel:
23 case CharClass::kVedicMark:
24 return ConsumeVowelIfValid();
25 case CharClass::kZeroWidthJoiner:
26 case CharClass::kZeroWidthNonJoiner:
27 // Apart from within an aksara, joiners are silently dropped.
28 if (report_errors_) {
29 tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
30 }
31 ++codes_used_;
32 return true;
33 case CharClass::kOther:
34 UseMultiCode(1);
35 return true;
36 default:
37 if (report_errors_) {
38 tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
39 static_cast<int>(codes_[codes_used_].first),
40 codes_[codes_used_].second);
41 }
42 return false;
43 }
44 }
45
46 Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
47 if (IsVedicAccent(ch)) {
48 return CharClass::kVedicMark;
49 }
50 if (ch == kZeroWidthNonJoiner) {
51 return CharClass::kZeroWidthNonJoiner;
52 }
53 if (ch == kZeroWidthJoiner) {
54 return CharClass::kZeroWidthJoiner;
55 }
56 // Offset from the start of the relevant unicode code block aka code page.
57 int base = static_cast<char32>(script_);
58 int off = ch - base;
59 // Anything in another code block is other.
60 if (off < 0 || off >= kIndicCodePageSize) {
61 return CharClass::kOther;
62 }
63 // Exception for Tamil. The aytham character is considered a letter.
64 if (script_ == ViramaScript::kTamil && off == 0x03) {
65 return CharClass::kVowel;
66 }
67 if (off < 0x4) {
68 return CharClass::kVowelModifier;
69 }
70 if (script_ == ViramaScript::kSinhala) {
71 // Sinhala is an exception.
72 if (off <= 0x19) {
73 return CharClass::kVowel;
74 }
75 if (off <= 0x49) {
76 return CharClass::kConsonant;
77 }
78 if (off == 0x4a) {
79 return CharClass::kVirama;
80 }
81 if (off <= 0x5f) {
82 return CharClass::kMatra;
83 }
84 } else {
85 if (off <= 0x14 || off == 0x50) {
86 return CharClass::kVowel;
87 }
88 if (off <= 0x3b || (0x58 <= off && off <= 0x5f)) {
89 return CharClass::kConsonant;
90 }
91 // Sinhala doesn't have Nukta or Avagraha.
92 if (off == 0x3c) {
93 return CharClass::kNukta;
94 }
95 if (off == 0x3d) {
96 return CharClass::kVowel; // avagraha
97 }
98 if (off <= 0x4c || (0x51 <= off && off <= 0x54)) {
99 return CharClass::kMatra;
100 }
101 if (0x55 <= off && off <= 0x57) {
102 return CharClass::kMatraPiece;
103 }
104 if (off == 0x4d) {
105 return CharClass::kVirama;
106 }
107 }
108 if (off == 0x60 || off == 0x61) {
109 return CharClass::kVowel;
110 }
111 if (off == 0x62 || off == 0x63) {
112 return CharClass::kMatra;
113 }
114 // Danda and digits up to 6f are OK as other.
115 // 70-7f are script-specific.
116 // 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other.
117 if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72)) {
118 return CharClass::kOther;
119 }
120 // 0BF3-0BFA are other Tamil symbols.
121 if (script_ == ViramaScript::kTamil && (0x73 <= off && off <= 0x7A)) {
122 return CharClass::kOther;
123 }
124 if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71)) {
125 return CharClass::kConsonant;
126 }
127 if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73)) {
128 return CharClass::kConsonant;
129 }
130 if (script_ == ViramaScript::kSinhala && off == 0x70) {
131 return CharClass::kConsonant;
132 }
133 if (script_ == ViramaScript::kDevanagari && off == 0x70) {
134 return CharClass::kOther;
135 }
136 if (0x70 <= off && off <= 0x73) {
137 return CharClass::kVowelModifier;
138 }
139 // Non Indic, Digits, Measures, danda, etc.
140 return CharClass::kOther;
141 }
142
143 // Helper consumes/copies a virama and any associated post-virama joiners.
144 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
145 // no joiner at all) must be followed by a consonant.
146 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
147 // consonant, space, or character from a different script. We clean up the
148 // representation to make it consistent by adding a ZWNJ if missing from a
149 // non-linking virama. Returns false with an invalid sequence.
//
// joiner: the pre-virama joiner the caller saw before the virama, or
//   (CharClass::kOther, 0) when there was none.
// post_matra: true when the virama follows the matra/modifier tail of a
//   consonant (the tail helper passes true, the head helper passes false).
// Both callers in this file only call this when codes_[codes_used_] is a
// virama.
// NOTE(review): the comments below assume CodeOnlyToOutput() and
// UseMultiCode() append codes_[codes_used_] to output_, advance codes_used_,
// and return true once the end of codes_ is reached -- confirm against their
// definitions, which are not in this file.
150 bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
151 const unsigned num_codes = codes_.size();
152 if (joiner.first == CharClass::kOther) {
// No pre-virama joiner: copy the virama itself, then look at what follows.
153 CodeOnlyToOutput();
154 if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) {
155 // Post-matra viramas must be explicit, so no joiners allowed here.
156 if (post_matra) {
157 if (report_errors_) {
158 tprintf("ZWJ after a post-matra virama!!\n");
159 }
160 return false;
161 }
// codes_[codes_used_] is the ZWJ here, so presumably codes_used_ - 1 is the
// virama and codes_used_ - 2 the code before it; codes_used_ + 1 is the code
// after the ZWJ.
162 if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kRayana &&
163 (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
164 codes_[codes_used_ + 1].second == kYayana ||
165 codes_[codes_used_ + 1].second == kRayana)) {
166 // This combination will be picked up later.
// Only the ZWJ is copied to output_. The assert expects that this did not
// reach the end of codes_, which the codes_used_ + 1 < num_codes test above
// appears to guarantee.
167 ASSERT_HOST(!CodeOnlyToOutput());
168 } else {
169 // Half-form with optional Nukta.
// len covers everything in output_ from output_used_ onward, plus one for
// the ZWJ that UseMultiCode is about to consume.
170 unsigned len = output_.size() + 1 - output_used_;
171 if (UseMultiCode(len)) {
172 return true;
173 }
174 }
// A ZWNJ after [H Z] is only accepted when output_[output_used_] is kRayana;
// anything else is reported as an error.
175 if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) {
176 if (output_used_ == output_.size() || output_[output_used_] != kRayana) {
177 if (report_errors_) {
178 tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_));
179 }
180 return false;
181 }
182 // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
183 if (UseMultiCode(4)) {
184 return true;
185 }
186 }
// No ZWJ follows the virama. It is treated as explicit when the input has
// ended, the next code is not a consonant, or this is a post-matra virama.
// Otherwise (a consonant follows) nothing more is consumed here.
187 } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant ||
188 post_matra) {
189 if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) {
190 // It is valid to have an unterminated virama at the end of a word, but
191 // for consistency, we will always add ZWNJ if not present.
192 output_.push_back(kZeroWidthNonJoiner);
193 } else {
// The input already supplies the ZWNJ: copy it through.
194 CodeOnlyToOutput();
195 }
196 // Explicit virama [H z]
197 MultiCodePart(2);
198 }
199 } else {
200 // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// The head helper in this file pushes the joiner onto output_ before calling,
// so the 2-code part is presumably [joiner, virama]. A true result means the
// input ended right after the virama, which is reported as invalid.
201 if (UseMultiCode(2)) {
202 if (report_errors_) {
203 tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
204 }
205 return false;
206 }
// A second joiner straight after [joiner, virama] is rejected.
207 if (codes_[codes_used_].second == kZeroWidthJoiner ||
208 codes_[codes_used_].second == kZeroWidthNonJoiner) {
209 if (report_errors_) {
210 tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
211 codes_[codes_used_].second);
212 }
213 return false;
214 }
215 }
216 // It is good so far as it goes.
217 return true;
218 }
219
220 // Helper consumes/copies a series of consonants separated by viramas while
221 // valid, but not any vowel or other modifiers.
//
// Returns false only when ConsumeViramaIfValid rejects a virama sequence;
// every other path returns true.
// The only caller in this file invokes this when codes_[codes_used_] is a
// consonant.
// NOTE(review): the comments below assume CodeOnlyToOutput() appends
// codes_[codes_used_] to output_ and advances codes_used_, and that
// MultiCodePart(n) emits the last n codes of output_ as one part -- confirm
// against their definitions, which are not in this file.
222 bool ValidateIndic::ConsumeConsonantHeadIfValid() {
223 const unsigned num_codes = codes_.size();
224 // Consonant aksara
225 do {
// Each iteration starts on a consonant: copy it to output_.
226 CodeOnlyToOutput();
227 // Special Sinhala case of [H Z Yayana/Rayana].
// index is only used after the size test below confirms that at least three
// not-yet-emitted codes are present in output_.
228 int index = output_.size() - 3;
229 if (output_used_ + 3 <= output_.size() &&
230 (output_.back() == kYayana || output_.back() == kRayana) && IsVirama(output_[index]) &&
231 output_[index + 1] == kZeroWidthJoiner) {
232 MultiCodePart(3);
233 }
// An optional nukta directly after the consonant is copied along with it.
234 bool have_nukta = false;
235 if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) {
236 have_nukta = true;
237 CodeOnlyToOutput();
238 }
239 // Test for subscript conjunct.
// have_nukta counts as 0 or 1 here, so index points at the code just before
// the consonant (and its nukta, if any).
240 index = output_.size() - 2 - have_nukta;
241 if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
242 IsVirama(output_[index])) {
243 // Output previous virama, consonant + optional nukta.
244 MultiCodePart(2 + have_nukta);
245 }
// Look for a pre-virama joiner: a ZWJ in any script, or a ZWNJ in Malayalam
// only. (CharClass::kOther, 0) stands for "no joiner".
246 IndicPair joiner(CharClass::kOther, 0);
247 if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner ||
248 (codes_[codes_used_].second == kZeroWidthNonJoiner &&
249 script_ == ViramaScript::kMalayalam))) {
250 joiner = codes_[codes_used_];
// The joiner is skipped in codes_ but not yet copied to output_.
251 if (++codes_used_ == num_codes) {
252 if (report_errors_) {
253 tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), joiner.second);
254 }
// A joiner at the very end of the input is dropped. This path returns
// without reaching the MultiCodePart(1) call at the end of the function.
255 return true;
256 }
257 if (codes_[codes_used_].first == CharClass::kVirama) {
// The joiner is kept only when a virama follows it.
258 output_.push_back(joiner.second);
259 } else {
260 if (report_errors_) {
261 tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", output_.back(), joiner.second,
262 codes_[codes_used_].second);
263 }
// Otherwise the joiner is dropped and the "no joiner" marker restored.
264 joiner = std::make_pair(CharClass::kOther, 0);
265 }
266 }
267 if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) {
// Pass on the joiner seen above (or the "no joiner" marker); false marks
// this as a virama that is not after a matra.
268 if (!ConsumeViramaIfValid(joiner, false)) {
269 return false;
270 }
271 } else {
272 break; // No virama, so the run of consonants is over.
273 }
// Keep going only while another consonant follows the virama sequence.
274 } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant);
// Emit anything still pending in output_ before handing over to the tail.
275 if (output_used_ < output_.size()) {
276 MultiCodePart(1);
277 }
278 return true;
279 }
280
281 // Helper consumes/copies a tail part of a consonant, comprising optional
282 // matra/piece, vowel modifier, vedic mark, terminating virama.
283 bool ValidateIndic::ConsumeConsonantTailIfValid() {
284 if (codes_used_ == codes_.size()) {
285 return true;
286 }
287 // No virama: Finish the grapheme.
288 // Are multiple matras allowed?
289 if (codes_[codes_used_].first == CharClass::kMatra) {
290 if (UseMultiCode(1)) {
291 return true;
292 }
293 if (codes_[codes_used_].first == CharClass::kMatraPiece) {
294 if (UseMultiCode(1)) {
295 return true;
296 }
297 }
298 }
299 while (codes_[codes_used_].first == CharClass::kVowelModifier) {
300 if (UseMultiCode(1)) {
301 return true;
302 }
303 // Only Malayalam allows only repeated 0xd02.
304 if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) {
305 break;
306 }
307 }
308 while (codes_[codes_used_].first == CharClass::kVedicMark) {
309 if (UseMultiCode(1)) {
310 return true;
311 }
312 }
313 if (codes_[codes_used_].first == CharClass::kVirama) {
314 if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
315 return false;
316 }
317 }
318 // What we have consumed so far is a valid consonant cluster.
319 if (output_used_ < output_.size()) {
320 MultiCodePart(1);
321 }
322
323 return true;
324 }
325
326 // Helper consumes/copies a vowel and optional modifiers.
327 bool ValidateIndic::ConsumeVowelIfValid() {
328 if (UseMultiCode(1)) {
329 return true;
330 }
331 while (codes_[codes_used_].first == CharClass::kVowelModifier) {
332 if (UseMultiCode(1)) {
333 return true;
334 }
335 // Only Malayalam allows repeated modifiers?
336 if (script_ != ViramaScript::kMalayalam) {
337 break;
338 }
339 }
340 while (codes_[codes_used_].first == CharClass::kVedicMark) {
341 if (UseMultiCode(1)) {
342 return true;
343 }
344 }
345 // What we have consumed so far is a valid vowel cluster.
346 return true;
347 }
348
349 } // namespace tesseract