Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_myanmar.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include "validate_myanmar.h" | |
| 2 #include "errcode.h" | |
| 3 #include "icuerrorcode.h" | |
| 4 #include "tprintf.h" | |
| 5 #include "unicode/uchar.h" // From libicu | |
| 6 #include "unicode/uscript.h" // From libicu | |
| 7 | |
| 8 namespace tesseract { | |
| 9 | |
| 10 // Returns whether codes matches the pattern for a Myanmar Grapheme. | |
| 11 // Taken directly from the unicode table 16-3. | |
| 12 // See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf | |
| 13 bool ValidateMyanmar::ConsumeGraphemeIfValid() { | |
| 14 const unsigned num_codes = codes_.size(); | |
| 15 if (codes_used_ == num_codes) { | |
| 16 return true; | |
| 17 } | |
| 18 // Other. | |
| 19 if (IsMyanmarOther(codes_[codes_used_].second)) { | |
| 20 UseMultiCode(1); | |
| 21 return true; | |
| 22 } | |
| 23 // Kinzi. | |
| 24 if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 && | |
| 25 codes_[codes_used_ + 1].second == kMyanmarAsat && | |
| 26 codes_[codes_used_ + 2].second == kMyanmarVirama) { | |
| 27 ASSERT_HOST(!CodeOnlyToOutput()); | |
| 28 ASSERT_HOST(!CodeOnlyToOutput()); | |
| 29 if (UseMultiCode(3)) { | |
| 30 return true; | |
| 31 } | |
| 32 } | |
| 33 // Base consonant/vowel. NOTE that since everything in Myanmar appears to be | |
| 34 // optional, except the base, this is the only place where invalid input can | |
| 35 // be detected and false returned. | |
| 36 if (IsMyanmarLetter(codes_[codes_used_].second)) { | |
| 37 if (UseMultiCode(1)) { | |
| 38 return true; | |
| 39 } | |
| 40 } else { | |
| 41 if (report_errors_) { | |
| 42 tprintf("Invalid start of Myanmar syllable:0x%x\n", codes_[codes_used_].second); | |
| 43 } | |
| 44 return false; // One of these is required. | |
| 45 } | |
| 46 if (ConsumeSubscriptIfPresent()) { | |
| 47 return true; | |
| 48 } | |
| 49 ConsumeOptionalSignsIfPresent(); | |
| 50 // What we have consumed so far is a valid syllable. | |
| 51 return true; | |
| 52 } | |
| 53 | |
| 54 // TODO(rays) Doesn't use intermediate coding like the other scripts, as there | |
| 55 // is little correspondence between the content of table 16-3 and the char | |
| 56 // classes of the Indic languages. (Experts may disagree and improve!) | |
| 57 // In unicode table 16-3 there is basically a long list of optional characters, | |
| 58 // which can be coded quite easily. | |
| 59 // Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!! | |
| 60 // The table also allows sequences that still result in dotted circles!! | |
| 61 // So with a lot of guesswork the rest have been added in a reasonable place. | |
| 62 Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const { | |
| 63 if (IsMyanmarLetter(ch)) { | |
| 64 return CharClass::kConsonant; | |
| 65 } | |
| 66 return CharClass::kOther; | |
| 67 } | |
| 68 | |
| 69 // Helper consumes/copies a virama and any subscript consonant. | |
| 70 // Returns true if the end of input is reached. | |
| 71 bool ValidateMyanmar::ConsumeSubscriptIfPresent() { | |
| 72 // Subscript consonant. It appears there can be only one. | |
| 73 const unsigned num_codes = codes_.size(); | |
| 74 if (codes_used_ + 1 < num_codes && codes_[codes_used_].second == kMyanmarVirama) { | |
| 75 if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) { | |
| 76 ASSERT_HOST(!CodeOnlyToOutput()); | |
| 77 if (UseMultiCode(2)) { | |
| 78 return true; | |
| 79 } | |
| 80 } | |
| 81 } | |
| 82 return false; | |
| 83 } | |
| 84 | |
| 85 // Helper consumes/copies a series of optional signs. | |
| 86 // Returns true if the end of input is reached. | |
| 87 bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() { | |
| 88 // The following characters are allowed, all optional, and in sequence. | |
| 89 // An exception is kMyanmarMedialYa, which can include kMyanmarAsat. | |
| 90 const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c, 0x103d, 0x103e, | |
| 91 0x105e, 0x105f, 0x1060, 0x1081, 0x1031}); | |
| 92 for (char32 ch : kMedials) { | |
| 93 if (codes_[codes_used_].second == ch) { | |
| 94 if (UseMultiCode(1)) { | |
| 95 return true; | |
| 96 } | |
| 97 if (ch == kMyanmarMedialYa && codes_[codes_used_].second == kMyanmarAsat) { | |
| 98 if (UseMultiCode(1)) { | |
| 99 return true; | |
| 100 } | |
| 101 } | |
| 102 } | |
| 103 } | |
| 104 // Vowel sign i, ii, ai. | |
| 105 char32 ch = codes_[codes_used_].second; | |
| 106 if (ch == 0x102d || ch == 0x102e || ch == 0x1032) { | |
| 107 if (UseMultiCode(1)) { | |
| 108 return true; | |
| 109 } | |
| 110 } | |
| 111 // Vowel sign u, uu, and extensions. | |
| 112 ch = codes_[codes_used_].second; | |
| 113 if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) || ch == 0x1062 || | |
| 114 ch == 0x1067 || ch == 0x1068 || (0x1071 <= ch && ch <= 0x1074) || | |
| 115 (0x1083 <= ch && ch <= 0x1086) || ch == 0x109c || ch == 0x109d) { | |
| 116 if (UseMultiCode(1)) { | |
| 117 return true; | |
| 118 } | |
| 119 } | |
| 120 // Tall aa, aa with optional asat. | |
| 121 if (codes_[codes_used_].second == 0x102b || codes_[codes_used_].second == 0x102c) { | |
| 122 if (UseMultiCode(1)) { | |
| 123 return true; | |
| 124 } | |
| 125 if (codes_[codes_used_].second == kMyanmarAsat) { | |
| 126 if (UseMultiCode(1)) { | |
| 127 return true; | |
| 128 } | |
| 129 } | |
| 130 } | |
| 131 // The following characters are allowed, all optional, and in sequence. | |
| 132 // Anusvar, Dot below, Visarga | |
| 133 const std::vector<char32> kSigns({0x1036, 0x1037, 0x1038}); | |
| 134 for (char32 ch : kSigns) { | |
| 135 if (codes_[codes_used_].second == ch) { | |
| 136 if (UseMultiCode(1)) { | |
| 137 return true; | |
| 138 } | |
| 139 } | |
| 140 } | |
| 141 // Tone mark extensions. | |
| 142 ch = codes_[codes_used_].second; | |
| 143 if (ch == 0x102c || ch == 0x1038 || ch == kMyanmarAsat || (0x1062 <= ch && ch <= 0x1064) || | |
| 144 (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) || ch == 0x108f || | |
| 145 ch == 0x109a || ch == 0x109b || (0xaa7b <= ch && ch <= 0xaa7d)) { | |
| 146 if (UseMultiCode(1)) { | |
| 147 return true; | |
| 148 } | |
| 149 } | |
| 150 // Sgaw tones 0x1062, 0x1063 must be followed by asat. | |
| 151 // W Pwo tones 0x1069, 0x106a, and 0x106b may be followed by dot below or visarga (nasal). | |
| 152 ch = codes_[codes_used_].second; | |
| 153 if (ch == 0x103a || ch == 0x1037 || ch == 0x1038) { | |
| 154 if (UseMultiCode(1)) { | |
| 155 return true; | |
| 156 } | |
| 157 } | |
| 158 return false; | |
| 159 } | |
| 160 | |
| 161 // Returns true if the unicode is a Myanmar "letter" including consonants | |
| 162 // and independent vowels. Although table 16-3 distinguishes between some | |
| 163 // base consonants and vowels, the extensions make no such distinction, so we | |
| 164 // put them all into a single bucket. | |
| 165 // Update MYANMAR LETTER based on following: | |
| 166 // https://unicode.org/charts/PDF/U1000.pdf - Myanmar | |
| 167 // http://unicode.org/charts/PDF/UAA60.pdf - Myanmar Extended-A | |
| 168 // http://unicode.org/charts/PDF/UA9E0.pdf - Myanmar Extended-B | |
| 169 /* static */ | |
| 170 bool ValidateMyanmar::IsMyanmarLetter(char32 ch) { | |
| 171 return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f || (0x104c <= ch && ch <= 0x1055) || | |
| 172 (0x105a <= ch && ch <= 0x105d) || ch == 0x1061 || ch == 0x1065 || ch == 0x1066 || | |
| 173 (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1081) || ch == 0x108e || | |
| 174 (0xa9e0 <= ch && ch <= 0xa9e4) || (0xa9e7 <= ch && ch <= 0xa9ef) || | |
| 175 (0xa9fa <= ch && ch <= 0xa9fe) || (0xaa60 <= ch && ch <= 0xaa6f) || | |
| 176 (0xaa71 <= ch && ch <= 0xaa73) || ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f; | |
| 177 } | |
| 178 | |
| 179 // Returns true if ch is a Myanmar digit or other symbol that does not take | |
| 180 // part in being a syllable eg. punctuation marks. | |
| 181 // MYANMAR DIGIT, MYANMAR SYMBOL, MYANMAR LOGOGRAM | |
| 182 // REDUPLICATION MARKS | |
| 183 /* static */ | |
| 184 bool ValidateMyanmar::IsMyanmarOther(char32 ch) { | |
| 185 IcuErrorCode err; | |
| 186 UScriptCode script_code = uscript_getScript(ch, err); | |
| 187 if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner && | |
| 188 ch != Validator::kZeroWidthNonJoiner) { | |
| 189 return true; | |
| 190 } | |
| 191 return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) || | |
| 192 (0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) || | |
| 193 (ch == 0xa9e6 || ch == 0xaa70) || (0xaa74 <= ch && ch <= 0xaa79); | |
| 194 } | |
| 195 | |
| 196 } // namespace tesseract |
