comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_myanmar.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #include "validate_myanmar.h"
2 #include "errcode.h"
3 #include "icuerrorcode.h"
4 #include "tprintf.h"
5 #include "unicode/uchar.h" // From libicu
6 #include "unicode/uscript.h" // From libicu
7
8 namespace tesseract {
9
10 // Returns whether codes matches the pattern for a Myanmar Grapheme.
11 // Taken directly from the unicode table 16-3.
12 // See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
13 bool ValidateMyanmar::ConsumeGraphemeIfValid() {
14 const unsigned num_codes = codes_.size();
15 if (codes_used_ == num_codes) {
16 return true;
17 }
18 // Other.
19 if (IsMyanmarOther(codes_[codes_used_].second)) {
20 UseMultiCode(1);
21 return true;
22 }
23 // Kinzi.
24 if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
25 codes_[codes_used_ + 1].second == kMyanmarAsat &&
26 codes_[codes_used_ + 2].second == kMyanmarVirama) {
27 ASSERT_HOST(!CodeOnlyToOutput());
28 ASSERT_HOST(!CodeOnlyToOutput());
29 if (UseMultiCode(3)) {
30 return true;
31 }
32 }
33 // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
34 // optional, except the base, this is the only place where invalid input can
35 // be detected and false returned.
36 if (IsMyanmarLetter(codes_[codes_used_].second)) {
37 if (UseMultiCode(1)) {
38 return true;
39 }
40 } else {
41 if (report_errors_) {
42 tprintf("Invalid start of Myanmar syllable:0x%x\n", codes_[codes_used_].second);
43 }
44 return false; // One of these is required.
45 }
46 if (ConsumeSubscriptIfPresent()) {
47 return true;
48 }
49 ConsumeOptionalSignsIfPresent();
50 // What we have consumed so far is a valid syllable.
51 return true;
52 }
53
54 // TODO(rays) Doesn't use intermediate coding like the other scripts, as there
55 // is little correspondence between the content of table 16-3 and the char
56 // classes of the Indic languages. (Experts may disagree and improve!)
57 // In unicode table 16-3 there is basically a long list of optional characters,
58 // which can be coded quite easily.
59 // Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
60 // The table also allows sequences that still result in dotted circles!!
61 // So with a lot of guesswork the rest have been added in a reasonable place.
62 Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
63 if (IsMyanmarLetter(ch)) {
64 return CharClass::kConsonant;
65 }
66 return CharClass::kOther;
67 }
68
69 // Helper consumes/copies a virama and any subscript consonant.
70 // Returns true if the end of input is reached.
71 bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
72 // Subscript consonant. It appears there can be only one.
73 const unsigned num_codes = codes_.size();
74 if (codes_used_ + 1 < num_codes && codes_[codes_used_].second == kMyanmarVirama) {
75 if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
76 ASSERT_HOST(!CodeOnlyToOutput());
77 if (UseMultiCode(2)) {
78 return true;
79 }
80 }
81 }
82 return false;
83 }
84
85 // Helper consumes/copies a series of optional signs.
86 // Returns true if the end of input is reached.
87 bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
88 // The following characters are allowed, all optional, and in sequence.
89 // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
90 const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c, 0x103d, 0x103e,
91 0x105e, 0x105f, 0x1060, 0x1081, 0x1031});
92 for (char32 ch : kMedials) {
93 if (codes_[codes_used_].second == ch) {
94 if (UseMultiCode(1)) {
95 return true;
96 }
97 if (ch == kMyanmarMedialYa && codes_[codes_used_].second == kMyanmarAsat) {
98 if (UseMultiCode(1)) {
99 return true;
100 }
101 }
102 }
103 }
104 // Vowel sign i, ii, ai.
105 char32 ch = codes_[codes_used_].second;
106 if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
107 if (UseMultiCode(1)) {
108 return true;
109 }
110 }
111 // Vowel sign u, uu, and extensions.
112 ch = codes_[codes_used_].second;
113 if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) || ch == 0x1062 ||
114 ch == 0x1067 || ch == 0x1068 || (0x1071 <= ch && ch <= 0x1074) ||
115 (0x1083 <= ch && ch <= 0x1086) || ch == 0x109c || ch == 0x109d) {
116 if (UseMultiCode(1)) {
117 return true;
118 }
119 }
120 // Tall aa, aa with optional asat.
121 if (codes_[codes_used_].second == 0x102b || codes_[codes_used_].second == 0x102c) {
122 if (UseMultiCode(1)) {
123 return true;
124 }
125 if (codes_[codes_used_].second == kMyanmarAsat) {
126 if (UseMultiCode(1)) {
127 return true;
128 }
129 }
130 }
131 // The following characters are allowed, all optional, and in sequence.
132 // Anusvar, Dot below, Visarga
133 const std::vector<char32> kSigns({0x1036, 0x1037, 0x1038});
134 for (char32 ch : kSigns) {
135 if (codes_[codes_used_].second == ch) {
136 if (UseMultiCode(1)) {
137 return true;
138 }
139 }
140 }
141 // Tone mark extensions.
142 ch = codes_[codes_used_].second;
143 if (ch == 0x102c || ch == 0x1038 || ch == kMyanmarAsat || (0x1062 <= ch && ch <= 0x1064) ||
144 (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) || ch == 0x108f ||
145 ch == 0x109a || ch == 0x109b || (0xaa7b <= ch && ch <= 0xaa7d)) {
146 if (UseMultiCode(1)) {
147 return true;
148 }
149 }
150 // Sgaw tones 0x1062, 0x1063 must be followed by asat.
151 // W Pwo tones 0x1069, 0x106a, and 0x106b may be followed by dot below or visarga (nasal).
152 ch = codes_[codes_used_].second;
153 if (ch == 0x103a || ch == 0x1037 || ch == 0x1038) {
154 if (UseMultiCode(1)) {
155 return true;
156 }
157 }
158 return false;
159 }
160
161 // Returns true if the unicode is a Myanmar "letter" including consonants
162 // and independent vowels. Although table 16-3 distinguishes between some
163 // base consonants and vowels, the extensions make no such distinction, so we
164 // put them all into a single bucket.
165 // Update MYANMAR LETTER based on following:
166 // https://unicode.org/charts/PDF/U1000.pdf - Myanmar
167 // http://unicode.org/charts/PDF/UAA60.pdf - Myanmar Extended-A
168 // http://unicode.org/charts/PDF/UA9E0.pdf - Myanmar Extended-B
169 /* static */
170 bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
171 return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f || (0x104c <= ch && ch <= 0x1055) ||
172 (0x105a <= ch && ch <= 0x105d) || ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
173 (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1081) || ch == 0x108e ||
174 (0xa9e0 <= ch && ch <= 0xa9e4) || (0xa9e7 <= ch && ch <= 0xa9ef) ||
175 (0xa9fa <= ch && ch <= 0xa9fe) || (0xaa60 <= ch && ch <= 0xaa6f) ||
176 (0xaa71 <= ch && ch <= 0xaa73) || ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
177 }
178
179 // Returns true if ch is a Myanmar digit or other symbol that does not take
180 // part in being a syllable eg. punctuation marks.
181 // MYANMAR DIGIT, MYANMAR SYMBOL, MYANMAR LOGOGRAM
182 // REDUPLICATION MARKS
183 /* static */
184 bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
185 IcuErrorCode err;
186 UScriptCode script_code = uscript_getScript(ch, err);
187 if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
188 ch != Validator::kZeroWidthNonJoiner) {
189 return true;
190 }
191 return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) ||
192 (0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
193 (ch == 0xa9e6 || ch == 0xaa70) || (0xaa74 <= ch && ch <= 0xaa79);
194 }
195
196 } // namespace tesseract