Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_javanese.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: validate_javanese.cpp | |
| 3 * Description: Text validator for Javanese Script - aksara jawa. | |
| 4 * Author: Shree Devi Kumar | |
| 5 * | |
| 6 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 * you may not use this file except in compliance with the License. | |
| 8 * You may obtain a copy of the License at | |
| 9 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 * Unless required by applicable law or agreed to in writing, software | |
| 11 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 * See the License for the specific language governing permissions and | |
| 14 * limitations under the License. | |
| 15 * | |
| 16 **********************************************************************/ | |
| 17 | |
| 18 #include "validate_javanese.h" | |
| 19 #include "errcode.h" | |
| 20 #include "tprintf.h" | |
| 21 | |
| 22 namespace tesseract { | |
| 23 | |
| 24 // Returns whether codes matches the pattern for a Javanese Grapheme. | |
| 25 // Taken from unicode standard: | |
| 26 // http://www.unicode.org/charts/PDF/UA980.pdf | |
| 27 // http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf | |
| 28 // The Consonant class here includes independent vowels. | |
| 29 // The order of components in an orthographic syllable as expressed in BNF is: | |
| 30 // {C F} C {{R}Y} {V{A}} {Z} | |
| 31 // Translated to the codes used by the CharClass enum: | |
| 32 // [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v] | |
| 33 // Also see https://r12a.github.io/scripts/javanese/ for detailed notes. | |
| 34 // Validation rules copied from validate_indic.cpp and modified for Javanese. | |
| 35 // Indic - for reference | |
| 36 // + vowel Grapheme: V[D](v)* | |
| 37 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)* | |
| 38 | |
| 39 bool ValidateJavanese::ConsumeGraphemeIfValid() { | |
| 40 switch (codes_[codes_used_].first) { | |
| 41 case CharClass::kConsonant: | |
| 42 return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid(); | |
| 43 case CharClass::kVowel: | |
| 44 case CharClass::kVedicMark: | |
| 45 return ConsumeVowelIfValid(); | |
| 46 case CharClass::kZeroWidthJoiner: | |
| 47 case CharClass::kZeroWidthNonJoiner: | |
| 48 // Apart from within an aksara, joiners are silently dropped. | |
| 49 if (report_errors_) { | |
| 50 tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second); | |
| 51 } | |
| 52 ++codes_used_; | |
| 53 return true; | |
| 54 case CharClass::kOther: | |
| 55 UseMultiCode(1); | |
| 56 return true; | |
| 57 default: | |
| 58 if (report_errors_) { | |
| 59 tprintf("Invalid start of grapheme sequence:%c=0x%x\n", | |
| 60 static_cast<int>(codes_[codes_used_].first), | |
| 61 codes_[codes_used_].second); | |
| 62 } | |
| 63 return false; | |
| 64 } | |
| 65 } | |
| 66 | |
| 67 // Helper consumes/copies a virama and any associated post-virama joiners. | |
| 68 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or | |
| 69 // no joiner at all) must be followed by a consonant. | |
| 70 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non | |
| 71 // consonant, space, or character from a different script. We clean up the | |
| 72 // representation to make it consistent by adding a ZWNJ if missing from a | |
| 73 // non-linking virama. Returns false with an invalid sequence. | |
| 74 bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) { | |
| 75 const unsigned num_codes = codes_.size(); | |
| 76 if (joiner.first == CharClass::kOther) { | |
| 77 CodeOnlyToOutput(); | |
| 78 if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) { | |
| 79 // Post-matra viramas must be explicit, so no joiners allowed here. | |
| 80 if (post_matra) { | |
| 81 if (report_errors_) { | |
| 82 tprintf("ZWJ after a post-matra virama!!\n"); | |
| 83 } | |
| 84 return false; | |
| 85 } | |
| 86 if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kCakra && | |
| 87 (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner || | |
| 88 codes_[codes_used_ + 1].second == kPengkal || | |
| 89 codes_[codes_used_ + 1].second == kCakra)) { | |
| 90 // This combination will be picked up later. | |
| 91 ASSERT_HOST(!CodeOnlyToOutput()); | |
| 92 } else { | |
| 93 // Half-form with optional Nukta. | |
| 94 unsigned len = output_.size() + 1 - output_used_; | |
| 95 if (UseMultiCode(len)) { | |
| 96 return true; | |
| 97 } | |
| 98 } | |
| 99 if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) { | |
| 100 if (output_used_ == output_.size() || output_[output_used_] != kCakra) { | |
| 101 if (report_errors_) { | |
| 102 tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_)); | |
| 103 } | |
| 104 return false; | |
| 105 } | |
| 106 // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z] | |
| 107 if (UseMultiCode(4)) { | |
| 108 return true; | |
| 109 } | |
| 110 } | |
| 111 } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant || | |
| 112 post_matra) { | |
| 113 if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) { | |
| 114 // It is valid to have an unterminated virama at the end of a word, but | |
| 115 // for consistency, we will always add ZWNJ if not present. | |
| 116 CodeOnlyToOutput(); | |
| 117 } else { | |
| 118 CodeOnlyToOutput(); | |
| 119 } | |
| 120 // Explicit virama [H z] | |
| 121 MultiCodePart(2); | |
| 122 } | |
| 123 } else { | |
| 124 // Pre-virama joiner [{Z|z} H] requests specific conjunct. | |
| 125 if (UseMultiCode(2)) { | |
| 126 if (report_errors_) { | |
| 127 tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n"); | |
| 128 } | |
| 129 return false; | |
| 130 } | |
| 131 if (codes_[codes_used_].second == kZeroWidthJoiner || | |
| 132 codes_[codes_used_].second == kZeroWidthNonJoiner) { | |
| 133 if (report_errors_) { | |
| 134 tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(), | |
| 135 codes_[codes_used_].second); | |
| 136 } | |
| 137 return false; | |
| 138 } | |
| 139 } | |
| 140 // It is good so far as it goes. | |
| 141 return true; | |
| 142 } | |
| 143 | |
| 144 // Helper consumes/copies a series of consonants separated by viramas while | |
| 145 // valid, but not any vowel or other modifiers. | |
| 146 bool ValidateJavanese::ConsumeConsonantHeadIfValid() { | |
| 147 const unsigned num_codes = codes_.size(); | |
| 148 // Consonant aksara | |
| 149 do { | |
| 150 CodeOnlyToOutput(); | |
| 151 // Special Sinhala case of [H Z Yayana/Rayana]. | |
| 152 int index = output_.size() - 3; | |
| 153 if (output_used_ + 3 <= output_.size() && | |
| 154 (output_.back() == kPengkal || output_.back() == kCakra) && IsVirama(output_[index]) && | |
| 155 output_[index + 1] == kZeroWidthJoiner) { | |
| 156 MultiCodePart(3); | |
| 157 } | |
| 158 bool have_nukta = false; | |
| 159 if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) { | |
| 160 have_nukta = true; | |
| 161 CodeOnlyToOutput(); | |
| 162 } | |
| 163 // Test for subscript conjunct. | |
| 164 index = output_.size() - 2 - have_nukta; | |
| 165 if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() && | |
| 166 IsVirama(output_[index])) { | |
| 167 // Output previous virama, consonant + optional nukta. | |
| 168 MultiCodePart(2 + have_nukta); | |
| 169 } | |
| 170 IndicPair joiner(CharClass::kOther, 0); | |
| 171 if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner || | |
| 172 (codes_[codes_used_].second == kZeroWidthNonJoiner && | |
| 173 script_ == ViramaScript::kMalayalam))) { | |
| 174 joiner = codes_[codes_used_]; | |
| 175 if (++codes_used_ == num_codes) { | |
| 176 if (report_errors_) { | |
| 177 tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), joiner.second); | |
| 178 } | |
| 179 return true; | |
| 180 } | |
| 181 if (codes_[codes_used_].first == CharClass::kVirama) { | |
| 182 output_.push_back(joiner.second); | |
| 183 } else { | |
| 184 if (report_errors_) { | |
| 185 tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", output_.back(), joiner.second, | |
| 186 codes_[codes_used_].second); | |
| 187 } | |
| 188 joiner = std::make_pair(CharClass::kOther, 0); | |
| 189 } | |
| 190 } | |
| 191 if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) { | |
| 192 if (!ConsumeViramaIfValid(joiner, false)) { | |
| 193 return false; | |
| 194 } | |
| 195 } else { | |
| 196 break; // No virama, so the run of consonants is over. | |
| 197 } | |
| 198 } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant); | |
| 199 if (output_used_ < output_.size()) { | |
| 200 MultiCodePart(1); | |
| 201 } | |
| 202 return true; | |
| 203 } | |
| 204 | |
| 205 // Helper consumes/copies a tail part of a consonant, comprising optional | |
| 206 // matra/piece, vowel modifier, vedic mark, terminating virama. | |
| 207 bool ValidateJavanese::ConsumeConsonantTailIfValid() { | |
| 208 if (codes_used_ == codes_.size()) { | |
| 209 return true; | |
| 210 } | |
| 211 // No virama: Finish the grapheme. | |
| 212 // Are multiple matras allowed? | |
| 213 if (codes_[codes_used_].first == CharClass::kMatra) { | |
| 214 if (UseMultiCode(1)) { | |
| 215 return true; | |
| 216 } | |
| 217 if (codes_[codes_used_].first == CharClass::kMatraPiece) { | |
| 218 if (UseMultiCode(1)) { | |
| 219 return true; | |
| 220 } | |
| 221 } | |
| 222 } | |
| 223 // Tarung also used for long versions of u and o vowels and vocalic r | |
| 224 // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ | |
| 225 while (codes_[codes_used_].first == CharClass::kMatraPiece) { | |
| 226 if (UseMultiCode(1)) { | |
| 227 return true; | |
| 228 } | |
| 229 } | |
| 230 while (codes_[codes_used_].first == CharClass::kVowelModifier) { | |
| 231 if (UseMultiCode(1)) { | |
| 232 return true; | |
| 233 } | |
| 234 // Only Malayalam allows only repeated 0xd02. | |
| 235 if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) { | |
| 236 break; | |
| 237 } | |
| 238 } | |
| 239 while (codes_[codes_used_].first == CharClass::kVedicMark) { | |
| 240 if (UseMultiCode(1)) { | |
| 241 return true; | |
| 242 } | |
| 243 } | |
| 244 if (codes_[codes_used_].first == CharClass::kVirama) { | |
| 245 if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) { | |
| 246 return false; | |
| 247 } | |
| 248 } | |
| 249 // What we have consumed so far is a valid consonant cluster. | |
| 250 if (output_used_ < output_.size()) { | |
| 251 MultiCodePart(1); | |
| 252 } | |
| 253 | |
| 254 return true; | |
| 255 } | |
| 256 | |
| 257 // Helper consumes/copies a vowel and optional modifiers. | |
| 258 bool ValidateJavanese::ConsumeVowelIfValid() { | |
| 259 if (UseMultiCode(1)) { | |
| 260 return true; | |
| 261 } | |
| 262 while (codes_[codes_used_].first == CharClass::kVowelModifier) { | |
| 263 if (UseMultiCode(1)) { | |
| 264 return true; | |
| 265 } | |
| 266 // Only Malayalam allows repeated modifiers? | |
| 267 if (script_ != ViramaScript::kMalayalam) { | |
| 268 break; | |
| 269 } | |
| 270 } | |
| 271 while (codes_[codes_used_].first == CharClass::kVedicMark) { | |
| 272 if (UseMultiCode(1)) { | |
| 273 return true; | |
| 274 } | |
| 275 } | |
| 276 // What we have consumed so far is a valid vowel cluster. | |
| 277 return true; | |
| 278 } | |
| 279 | |
| 280 Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const { | |
| 281 if (ch == kZeroWidthNonJoiner) { | |
| 282 return CharClass::kZeroWidthNonJoiner; | |
| 283 } | |
| 284 if (ch == kZeroWidthJoiner) { | |
| 285 return CharClass::kZeroWidthJoiner; | |
| 286 } | |
| 287 // Offset from the start of the relevant unicode code block aka code page. | |
| 288 int off = ch - static_cast<char32>(script_); | |
| 289 // Anything in another code block is other. | |
| 290 if (off < 0 || off >= kIndicCodePageSize) { | |
| 291 return CharClass::kOther; | |
| 292 } | |
| 293 if (off < 0x4) { | |
| 294 return CharClass::kVowelModifier; | |
| 295 } | |
| 296 if (off <= 0x32) { | |
| 297 return CharClass::kConsonant; // includes independent vowels | |
| 298 } | |
| 299 if (off == 0x33) { | |
| 300 return CharClass::kNukta; // A9B3 CECAK TELU | |
| 301 } | |
| 302 if (off == 0x34) { | |
| 303 return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels | |
| 304 } | |
| 305 if (off <= 0x39) { | |
| 306 return CharClass::kMatra; | |
| 307 } | |
| 308 if (off <= 0x3a) { | |
| 309 return CharClass::kConsonant; // A9BA TALING - pre base vowel | |
| 310 } | |
| 311 if (off <= 0x3d) { | |
| 312 return CharClass::kMatra; | |
| 313 } | |
| 314 if (off <= 0x3f) { | |
| 315 return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants | |
| 316 } | |
| 317 if (off == 0x40) { | |
| 318 return CharClass::kVirama; // A9C0 PANGKON | |
| 319 } | |
| 320 return CharClass::kOther; | |
| 321 } | |
| 322 | |
| 323 } // namespace tesseract |
