Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_indic.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include "validate_indic.h" | |
| 2 #include "errcode.h" | |
| 3 #include "tprintf.h" | |
| 4 | |
| 5 namespace tesseract { | |
| 6 | |
| 7 // Returns whether codes matches the pattern for an Indic Grapheme, dispatching on the CharClass of the next unconsumed code, codes_[codes_used_]. | |
| 8 // The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf | |
| 9 // has a BNF for valid syllables (Graphemes) which is modified slightly | |
| 10 // for Unicode. Notably U+200C and U+200D are used before/after the | |
| 11 // virama to express explicit or soft viramas. | |
| 12 // Also the unicode v.9 Malayalam entry states that CZHC can be used in several | |
| 13 // Indic languages to request traditional ligatures, and CzHC is Malayalam- | |
| 14 // specific for requesting open conjuncts. | |
| 15 // Legend (inferred from how the helpers in this file use the letters; confirm against the Validator base class): C consonant, N nukta, H virama, Z ZWJ, z ZWNJ, M matra, P matra piece, D vowel modifier, v vedic mark, V vowel. | |
| 16 // + vowel Grapheme: V[D](v)* | |
| 17 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)* | |
| 18 bool ValidateIndic::ConsumeGraphemeIfValid() { | |
| 19 switch (codes_[codes_used_].first) { /* .first is the CharClass of the code, .second its code point */ | |
| 20 case CharClass::kConsonant: | |
| 21 return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid(); /* the tail is only attempted if the head succeeded */ | |
| 22 case CharClass::kVowel: | |
| 23 case CharClass::kVedicMark: /* a vedic mark at the start is handled by the same path as a vowel */ | |
| 24 return ConsumeVowelIfValid(); | |
| 25 case CharClass::kZeroWidthJoiner: | |
| 26 case CharClass::kZeroWidthNonJoiner: | |
| 27 // Apart from within an aksara, joiners are silently dropped. This is not treated as an error: the function still returns true. | |
| 28 if (report_errors_) { | |
| 29 tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second); | |
| 30 } | |
| 31 ++codes_used_; /* skip the joiner; nothing is appended to the output here */ | |
| 32 return true; | |
| 33 case CharClass::kOther: | |
| 34 UseMultiCode(1); /* result is not checked on this path */ | |
| 35 return true; | |
| 36 default: | |
| 37 if (report_errors_) { | |
| 38 tprintf("Invalid start of grapheme sequence:%c=0x%x\n", | |
| 39 static_cast<int>(codes_[codes_used_].first), | |
| 40 codes_[codes_used_].second); | |
| 41 } | |
| 42 return false; /* every remaining class is rejected as a grapheme start */ | |
| 43 } | |
| 44 } | |
| 45 | |
| 46 Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const { /* Classifies code point ch for the current script_; checks run in order and the first match wins. */ | |
| 47 if (IsVedicAccent(ch)) { | |
| 48 return CharClass::kVedicMark; | |
| 49 } | |
| 50 if (ch == kZeroWidthNonJoiner) { | |
| 51 return CharClass::kZeroWidthNonJoiner; | |
| 52 } | |
| 53 if (ch == kZeroWidthJoiner) { | |
| 54 return CharClass::kZeroWidthJoiner; | |
| 55 } | |
| 56 // Offset from the start of the relevant unicode code block aka code page. The numeric value of script_ is used directly as the first code point of that block. | |
| 57 int base = static_cast<char32>(script_); | |
| 58 int off = ch - base; | |
| 59 // Anything in another code block is other. | |
| 60 if (off < 0 || off >= kIndicCodePageSize) { | |
| 61 return CharClass::kOther; | |
| 62 } | |
| 63 // Exception for Tamil. The aytham character is considered a letter. This test must precede the off < 0x4 test below, which would otherwise classify offset 0x03 as a vowel modifier. | |
| 64 if (script_ == ViramaScript::kTamil && off == 0x03) { | |
| 65 return CharClass::kVowel; | |
| 66 } | |
| 67 if (off < 0x4) { /* offsets 0x00-0x03 */ | |
| 68 return CharClass::kVowelModifier; | |
| 69 } | |
| 70 if (script_ == ViramaScript::kSinhala) { | |
| 71 // Sinhala is an exception: it uses its own offset ranges instead of the ranges in the else branch. Each test relies on the earlier ones having returned, so the ranges are effectively 0x04-0x19, 0x1a-0x49, 0x4a and 0x4b-0x5f. | |
| 72 if (off <= 0x19) { | |
| 73 return CharClass::kVowel; | |
| 74 } | |
| 75 if (off <= 0x49) { | |
| 76 return CharClass::kConsonant; | |
| 77 } | |
| 78 if (off == 0x4a) { | |
| 79 return CharClass::kVirama; | |
| 80 } | |
| 81 if (off <= 0x5f) { | |
| 82 return CharClass::kMatra; | |
| 83 } | |
| 84 } else { /* layout shared by all non-Sinhala scripts */ | |
| 85 if (off <= 0x14 || off == 0x50) { | |
| 86 return CharClass::kVowel; | |
| 87 } | |
| 88 if (off <= 0x3b || (0x58 <= off && off <= 0x5f)) { | |
| 89 return CharClass::kConsonant; | |
| 90 } | |
| 91 // Sinhala doesn't have Nukta or Avagraha, so the next two tests exist only in this non-Sinhala branch. | |
| 92 if (off == 0x3c) { | |
| 93 return CharClass::kNukta; | |
| 94 } | |
| 95 if (off == 0x3d) { | |
| 96 return CharClass::kVowel; // avagraha | |
| 97 } | |
| 98 if (off <= 0x4c || (0x51 <= off && off <= 0x54)) { /* 0x3e-0x4c given the returns above, plus 0x51-0x54 */ | |
| 99 return CharClass::kMatra; | |
| 100 } | |
| 101 if (0x55 <= off && off <= 0x57) { | |
| 102 return CharClass::kMatraPiece; | |
| 103 } | |
| 104 if (off == 0x4d) { | |
| 105 return CharClass::kVirama; | |
| 106 } | |
| 107 } | |
| 108 if (off == 0x60 || off == 0x61) { /* from here on the tests apply to every script whose branch above did not return */ | |
| 109 return CharClass::kVowel; | |
| 110 } | |
| 111 if (off == 0x62 || off == 0x63) { | |
| 112 return CharClass::kMatra; | |
| 113 } | |
| 114 // Danda and digits up to 6f are OK as other (they reach the final return). | |
| 115 // 70-7f are script-specific. | |
| 116 // 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other. | |
| 117 if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72)) { | |
| 118 return CharClass::kOther; | |
| 119 } | |
| 120 // 0BF3-0BFA are other Tamil symbols. | |
| 121 if (script_ == ViramaScript::kTamil && (0x73 <= off && off <= 0x7A)) { | |
| 122 return CharClass::kOther; | |
| 123 } | |
| 124 if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71)) { | |
| 125 return CharClass::kConsonant; | |
| 126 } | |
| 127 if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73)) { | |
| 128 return CharClass::kConsonant; | |
| 129 } | |
| 130 if (script_ == ViramaScript::kSinhala && off == 0x70) { | |
| 131 return CharClass::kConsonant; | |
| 132 } | |
| 133 if (script_ == ViramaScript::kDevanagari && off == 0x70) { | |
| 134 return CharClass::kOther; | |
| 135 } | |
| 136 if (0x70 <= off && off <= 0x73) { /* any 0x70-0x73 offset not claimed by a script-specific test above */ | |
| 137 return CharClass::kVowelModifier; | |
| 138 } | |
| 139 // Non Indic, Digits, Measures, danda, etc. | |
| 140 return CharClass::kOther; | |
| 141 } | |
| 142 | |
| 143 // Helper consumes/copies a virama and any associated post-virama joiners. | |
| 144 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or | |
| 145 // no joiner at all) must be followed by a consonant. | |
| 146 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non | |
| 147 // consonant, space, or character from a different script. We clean up the | |
| 148 // representation to make it consistent by adding a ZWNJ if missing from a | |
| 149 // non-linking virama. Returns false with an invalid sequence. joiner is the pre-virama joiner found by the caller, or (CharClass::kOther, 0) when there was none. post_matra is true when called from ConsumeConsonantTailIfValid and false when called from ConsumeConsonantHeadIfValid. Both callers check that codes_[codes_used_] is a virama before calling. NOTE(review): the comments below assume CodeOnlyToOutput copies one code to output_ and advances codes_used_, and that UseMultiCode returns true once the input is exhausted; confirm against the Validator base class. | |
| 150 bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) { | |
| 151 const unsigned num_codes = codes_.size(); | |
| 152 if (joiner.first == CharClass::kOther) { /* no pre-virama joiner */ | |
| 153 CodeOnlyToOutput(); /* presumably takes the virama itself */ | |
| 154 if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) { /* virama followed by ZWJ */ | |
| 155 // Post-matra viramas must be explicit, so no joiners allowed here. | |
| 156 if (post_matra) { | |
| 157 if (report_errors_) { | |
| 158 tprintf("ZWJ after a post-matra virama!!\n"); | |
| 159 } | |
| 160 return false; | |
| 161 } | |
| 162 if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kRayana && /* codes_used_ - 2 is presumably the code just before the virama; codes_used_ + 1 is the code after the ZWJ */ | |
| 163 (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner || | |
| 164 codes_[codes_used_ + 1].second == kYayana || | |
| 165 codes_[codes_used_ + 1].second == kRayana)) { | |
| 166 // This combination will be picked up later. | |
| 167 ASSERT_HOST(!CodeOnlyToOutput()); /* NOTE: the call inside the assertion has side effects (presumably it takes the ZWJ), so it is not a pure check */ | |
| 168 } else { | |
| 169 // Half-form with optional Nukta. | |
| 170 unsigned len = output_.size() + 1 - output_used_; /* count of output codes at or after output_used_, plus one; the extra one presumably covers the ZWJ that UseMultiCode adds */ | |
| 171 if (UseMultiCode(len)) { | |
| 172 return true; | |
| 173 } | |
| 174 } | |
| 175 if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) { /* a ZWNJ now follows the ZWJ */ | |
| 176 if (output_used_ == output_.size() || output_[output_used_] != kRayana) { /* only accepted when the first pending output code is kRayana */ | |
| 177 if (report_errors_) { | |
| 178 tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_)); | |
| 179 } | |
| 180 return false; | |
| 181 } | |
| 182 // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z] | |
| 183 if (UseMultiCode(4)) { | |
| 184 return true; | |
| 185 } | |
| 186 } | |
| 187 } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant || | |
| 188 post_matra) { /* no ZWJ follows, and the virama is at the end of input, before a non-consonant, or post-matra: treat it as explicit */ | |
| 189 if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) { | |
| 190 // It is valid to have an unterminated virama at the end of a word, but | |
| 191 // for consistency, we will always add ZWNJ if not present. | |
| 192 output_.push_back(kZeroWidthNonJoiner); /* synthesized ZWNJ: no input code is consumed for it */ | |
| 193 } else { | |
| 194 CodeOnlyToOutput(); /* the ZWNJ is present in the input, so take it from there */ | |
| 195 } | |
| 196 // Explicit virama [H z] | |
| 197 MultiCodePart(2); | |
| 198 } | |
| 199 } else { /* the caller supplied a pre-virama joiner */ | |
| 200 // Pre-virama joiner [{Z|z} H] requests specific conjunct. | |
| 201 if (UseMultiCode(2)) { /* a true result is treated as an error here: nothing follows the virama */ | |
| 202 if (report_errors_) { | |
| 203 tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n"); | |
| 204 } | |
| 205 return false; | |
| 206 } | |
| 207 if (codes_[codes_used_].second == kZeroWidthJoiner || | |
| 208 codes_[codes_used_].second == kZeroWidthNonJoiner) { /* joiner, virama, joiner is rejected */ | |
| 209 if (report_errors_) { | |
| 210 tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(), | |
| 211 codes_[codes_used_].second); | |
| 212 } | |
| 213 return false; | |
| 214 } | |
| 215 } | |
| 216 // It is good so far as it goes. | |
| 217 return true; | |
| 218 } | |
| 219 | |
| 220 // Helper consumes/copies a series of consonants separated by viramas while | |
| 221 // valid, but not any vowel or other modifiers. Returns false only when ConsumeViramaIfValid rejects a virama; a joiner that ends the input or is not followed by a virama is logged (if report_errors_) and skipped, not treated as an error. The only caller in this file invokes it when codes_[codes_used_] is a consonant. | |
| 222 bool ValidateIndic::ConsumeConsonantHeadIfValid() { | |
| 223 const unsigned num_codes = codes_.size(); | |
| 224 // Consonant aksara | |
| 225 do { /* one iteration per consonant; the loop continues only while a virama is followed by another consonant */ | |
| 226 CodeOnlyToOutput(); /* presumably copies the consonant to output_ and advances codes_used_; confirm in Validator */ | |
| 227 // Special Sinhala case of [H Z Yayana/Rayana]. | |
| 228 int index = output_.size() - 3; /* only used as a subscript after the size test on the next row has passed */ | |
| 229 if (output_used_ + 3 <= output_.size() && | |
| 230 (output_.back() == kYayana || output_.back() == kRayana) && IsVirama(output_[index]) && | |
| 231 output_[index + 1] == kZeroWidthJoiner) { | |
| 232 MultiCodePart(3); | |
| 233 } | |
| 234 bool have_nukta = false; | |
| 235 if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) { | |
| 236 have_nukta = true; | |
| 237 CodeOnlyToOutput(); | |
| 238 } | |
| 239 // Test for subscript conjunct. | |
| 240 index = output_.size() - 2 - have_nukta; /* position where the preceding virama would be; have_nukta contributes 0 or 1 */ | |
| 241 if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() && | |
| 242 IsVirama(output_[index])) { | |
| 243 // Output previous virama, consonant + optional nukta. | |
| 244 MultiCodePart(2 + have_nukta); | |
| 245 } | |
| 246 IndicPair joiner(CharClass::kOther, 0); /* kOther means no pre-virama joiner */ | |
| 247 if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner || | |
| 248 (codes_[codes_used_].second == kZeroWidthNonJoiner && | |
| 249 script_ == ViramaScript::kMalayalam))) { /* ZWJ for any script; ZWNJ only for Malayalam */ | |
| 250 joiner = codes_[codes_used_]; | |
| 251 if (++codes_used_ == num_codes) { /* the joiner was the last code: it is consumed but never written to output_ */ | |
| 252 if (report_errors_) { | |
| 253 tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), joiner.second); | |
| 254 } | |
| 255 return true; | |
| 256 } | |
| 257 if (codes_[codes_used_].first == CharClass::kVirama) { | |
| 258 output_.push_back(joiner.second); /* keep the joiner only when a virama follows it */ | |
| 259 } else { | |
| 260 if (report_errors_) { | |
| 261 tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", output_.back(), joiner.second, | |
| 262 codes_[codes_used_].second); | |
| 263 } | |
| 264 joiner = std::make_pair(CharClass::kOther, 0); /* reset so that no joiner is carried past this point */ | |
| 265 } | |
| 266 } | |
| 267 if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) { | |
| 268 if (!ConsumeViramaIfValid(joiner, false)) { /* false: this virama is not post-matra */ | |
| 269 return false; | |
| 270 } | |
| 271 } else { | |
| 272 break; // No virama, so the run of consonants is over. | |
| 273 } | |
| 274 } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant); | |
| 275 if (output_used_ < output_.size()) { /* some output codes have not yet been grouped */ | |
| 276 MultiCodePart(1); | |
| 277 } | |
| 278 return true; | |
| 279 } | |
| 280 | |
| 281 // Helper consumes/copies a tail part of a consonant, comprising optional | |
| 282 // matra/piece, vowel modifier, vedic mark, terminating virama. Returns false only when ConsumeViramaIfValid rejects the terminating virama. Each UseMultiCode(1) that returns true ends the function with success; presumably true means the input is exhausted (TODO confirm in Validator), which is why codes_[codes_used_] is read afterwards without a further bounds check. | |
| 283 bool ValidateIndic::ConsumeConsonantTailIfValid() { | |
| 284 if (codes_used_ == codes_.size()) { /* nothing left after the consonant head */ | |
| 285 return true; | |
| 286 } | |
| 287 // No virama: Finish the grapheme. | |
| 288 // Are multiple matras allowed? | |
| 289 if (codes_[codes_used_].first == CharClass::kMatra) { /* at most one matra is taken here */ | |
| 290 if (UseMultiCode(1)) { | |
| 291 return true; | |
| 292 } | |
| 293 if (codes_[codes_used_].first == CharClass::kMatraPiece) { /* a matra piece is only accepted directly after a matra */ | |
| 294 if (UseMultiCode(1)) { | |
| 295 return true; | |
| 296 } | |
| 297 } | |
| 298 } | |
| 299 while (codes_[codes_used_].first == CharClass::kVowelModifier) { | |
| 300 if (UseMultiCode(1)) { | |
| 301 return true; | |
| 302 } | |
| 303 // Only Malayalam allows only repeated 0xd02. For any other script, or any other modifier, the loop stops after one vowel modifier. | |
| 304 if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) { | |
| 305 break; | |
| 306 } | |
| 307 } | |
| 308 while (codes_[codes_used_].first == CharClass::kVedicMark) { /* any number of vedic marks */ | |
| 309 if (UseMultiCode(1)) { | |
| 310 return true; | |
| 311 } | |
| 312 } | |
| 313 if (codes_[codes_used_].first == CharClass::kVirama) { | |
| 314 if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) { /* no pre-virama joiner; true marks the virama as post-matra */ | |
| 315 return false; | |
| 316 } | |
| 317 } | |
| 318 // What we have consumed so far is a valid consonant cluster. | |
| 319 if (output_used_ < output_.size()) { /* some output codes have not yet been grouped */ | |
| 320 MultiCodePart(1); | |
| 321 } | |
| 322 | |
| 323 return true; | |
| 324 } | |
| 325 | |
| 326 // Helper consumes/copies a vowel and optional modifiers. Always returns true: the first code is taken unconditionally, then vowel modifiers and vedic marks are taken while present. ConsumeGraphemeIfValid calls this when the first code is a vowel or a vedic mark. Each UseMultiCode(1) that returns true ends the function early; presumably true means the input is exhausted (TODO confirm in Validator). | |
| 327 bool ValidateIndic::ConsumeVowelIfValid() { | |
| 328 if (UseMultiCode(1)) { /* takes the leading vowel or vedic mark */ | |
| 329 return true; | |
| 330 } | |
| 331 while (codes_[codes_used_].first == CharClass::kVowelModifier) { | |
| 332 if (UseMultiCode(1)) { | |
| 333 return true; | |
| 334 } | |
| 335 // Only Malayalam allows repeated modifiers? Unlike the consonant tail, this loop does not also test for 0xd02. | |
| 336 if (script_ != ViramaScript::kMalayalam) { | |
| 337 break; | |
| 338 } | |
| 339 } | |
| 340 while (codes_[codes_used_].first == CharClass::kVedicMark) { /* any number of vedic marks */ | |
| 341 if (UseMultiCode(1)) { | |
| 342 return true; | |
| 343 } | |
| 344 } | |
| 345 // What we have consumed so far is a valid vowel cluster. | |
| 346 return true; | |
| 347 } | |
| 348 | |
| 349 } // namespace tesseract |
