Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_grapheme.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include "validate_grapheme.h" | |
| 2 #include "tprintf.h" | |
| 3 #include "unicode/uchar.h" // From libicu | |
| 4 | |
| 5 namespace tesseract { | |
| 6 | |
| 7 bool ValidateGrapheme::ConsumeGraphemeIfValid() { | |
| 8 const unsigned num_codes = codes_.size(); | |
| 9 char32 prev_prev_ch = ' '; | |
| 10 char32 prev_ch = ' '; | |
| 11 CharClass prev_cc = CharClass::kWhitespace; | |
| 12 int num_codes_in_grapheme = 0; | |
| 13 while (codes_used_ < num_codes) { | |
| 14 CharClass cc = codes_[codes_used_].first; | |
| 15 char32 ch = codes_[codes_used_].second; | |
| 16 const bool is_combiner = cc == CharClass::kCombiner || cc == CharClass::kVirama; | |
| 17 // TODO: Make this code work well with RTL text. | |
| 18 // See | |
| 19 // https://github.com/tesseract-ocr/tesseract/pull/2266#issuecomment-467114751 | |
| 20 #if 0 | |
| 21 // Reject easily detected badly formed sequences. | |
| 22 if (prev_cc == CharClass::kWhitespace && is_combiner) { | |
| 23 if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch); | |
| 24 return false; | |
| 25 } | |
| 26 #endif | |
| 27 if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) { | |
| 28 if (report_errors_) { | |
| 29 tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch); | |
| 30 } | |
| 31 return false; | |
| 32 } | |
| 33 if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace && | |
| 34 IsBadlyFormed(prev_ch, ch)) { | |
| 35 return false; | |
| 36 } | |
| 37 bool prev_is_fwd_combiner = prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama || | |
| 38 (prev_ch == kZeroWidthNonJoiner && | |
| 39 (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner)); | |
| 40 if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner) { | |
| 41 break; | |
| 42 } | |
| 43 CodeOnlyToOutput(); | |
| 44 ++num_codes_in_grapheme; | |
| 45 prev_prev_ch = prev_ch; | |
| 46 prev_ch = ch; | |
| 47 prev_cc = cc; | |
| 48 } | |
| 49 if (num_codes_in_grapheme > 0) { | |
| 50 MultiCodePart(num_codes_in_grapheme); | |
| 51 } | |
| 52 return true; | |
| 53 } | |
| 54 | |
| 55 Validator::CharClass ValidateGrapheme::UnicodeToCharClass(char32 ch) const { | |
| 56 if (IsVedicAccent(ch)) { | |
| 57 return CharClass::kVedicMark; | |
| 58 } | |
| 59 // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they | |
| 60 // always combine with the previous character. | |
| 61 if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) { | |
| 62 return CharClass::kVirama; | |
| 63 } | |
| 64 if (u_isUWhiteSpace(ch)) { | |
| 65 return CharClass::kWhitespace; | |
| 66 } | |
| 67 // Workaround for Javanese Aksara's Taling, do not label it as a combiner | |
| 68 if (ch == 0xa9ba) { | |
| 69 return CharClass::kConsonant; | |
| 70 } | |
| 71 int char_type = u_charType(ch); | |
| 72 if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK || | |
| 73 char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner || | |
| 74 ch == kZeroWidthJoiner) { | |
| 75 return CharClass::kCombiner; | |
| 76 } | |
| 77 return CharClass::kOther; | |
| 78 } | |
| 79 | |
| 80 // Helper returns true if the sequence prev_ch,ch is invalid. | |
| 81 bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) { | |
| 82 // Reject badly formed Indic vowels. | |
| 83 if (IsBadlyFormedIndicVowel(prev_ch, ch)) { | |
| 84 if (report_errors_) { | |
| 85 tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch); | |
| 86 } | |
| 87 return true; | |
| 88 } | |
| 89 if (IsBadlyFormedThai(prev_ch, ch)) { | |
| 90 if (report_errors_) { | |
| 91 tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch); | |
| 92 } | |
| 93 return true; | |
| 94 } | |
| 95 return false; | |
| 96 } | |
| 97 | |
| 98 // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel. | |
| 99 // Some vowels in Indic scripts may be analytically decomposed into atomic pairs | |
| 100 // of components that are themselves valid unicode symbols. (See Table 12-1 in | |
| 101 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf | |
| 102 // for examples in Devanagari). The Unicode standard discourages specifying | |
| 103 // vowels this way, but they are sometimes encountered in text, probably because | |
| 104 // some editors still permit it. Renderers however dislike such pairs, and so | |
| 105 // this function may be used to detect their occurrence for removal. | |
| 106 // TODO(rays) This function only covers a subset of Indic languages and doesn't | |
| 107 // include all rules. Add rules as appropriate to support other languages or | |
| 108 // find a way to generalize these existing rules that makes use of the | |
| 109 // regularity of the mapping from ISCII to Unicode. | |
| 110 /* static */ | |
| 111 bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) { | |
| 112 return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) || (prev_ch == 0x909 && ch == 0x941) || | |
| 113 (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) || | |
| 114 (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) || | |
| 115 (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) || | |
| 116 // Illegal combinations of two dependent Devanagari vowels. | |
| 117 (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) || | |
| 118 // Dependent Devanagari vowels following a virama. | |
| 119 (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) || | |
| 120 // Bengali vowels (Table 9-5, pg 313) | |
| 121 (prev_ch == 0x985 && ch == 0x9BE) || | |
| 122 // Telugu vowels (Table 9-19, pg 331) | |
| 123 (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) || | |
| 124 // Kannada vowels (Table 9-20, pg 332) | |
| 125 (prev_ch == 0xC92 && ch == 0xCCC)); | |
| 126 } | |
| 127 | |
| 128 // Helper returns true if ch is a Thai consonant. | |
| 129 static bool IsThaiConsonant(char32 ch) { | |
| 130 return 0xe01 <= ch && ch <= 0xe2e; | |
| 131 } | |
| 132 | |
| 133 // Helper returns true is ch is a before-consonant vowel. | |
| 134 static bool IsThaiBeforeConsonantVowel(char32 ch) { | |
| 135 return 0xe40 <= ch && ch <= 0xe44; | |
| 136 } | |
| 137 | |
| 138 // Helper returns true if ch is a Thai tone mark. | |
| 139 static bool IsThaiToneMark(char32 ch) { | |
| 140 return 0xe48 <= ch && ch <= 0xe4b; | |
| 141 } | |
| 142 | |
| 143 // Helper returns true if ch is a Thai vowel that may be followed by a tone | |
| 144 // mark. | |
| 145 static bool IsThaiTonableVowel(char32 ch) { | |
| 146 return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31; | |
| 147 } | |
| 148 | |
| 149 // Helper returns true if the sequence prev_ch,ch is invalid Thai. | |
| 150 // These rules come from a native Thai speaker, and are not covered by the | |
| 151 // Thai section in the unicode book: | |
| 152 // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf | |
| 153 // Comments below added by Ray interpreting the code ranges. | |
| 154 /* static */ | |
| 155 bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) { | |
| 156 // Tone marks must follow consonants or specific vowels. | |
| 157 if (IsThaiToneMark(ch) && !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) { | |
| 158 return true; | |
| 159 } | |
| 160 // Tonable vowels must follow consonants. | |
| 161 if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) { | |
| 162 return true; | |
| 163 } | |
| 164 // Thanthakhat must follow consonant or specific vowels. | |
| 165 if (ch == 0xe4c && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) { | |
| 166 return true; | |
| 167 } | |
| 168 // Nikkhahit must follow a consonant ?or certain markers?. | |
| 169 // TODO(rays) confirm this, but there were so many in the ground truth of the | |
| 170 // validation set that it seems reasonable to assume it is valid. | |
| 171 if (ch == 0xe4d && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) { | |
| 172 return true; | |
| 173 } | |
| 174 // The vowels e30, e32, e33 can be used more liberally. | |
| 175 if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) && | |
| 176 !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) && | |
| 177 !(prev_ch == 0xe32 && ch == 0xe30) && !(prev_ch == 0xe4d && ch == 0xe32)) { | |
| 178 return true; | |
| 179 } | |
| 180 // Some vowels come before consonants, and therefore cannot follow things | |
| 181 // that cannot end a syllable. | |
| 182 if (IsThaiBeforeConsonantVowel(ch) && | |
| 183 (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 || prev_ch == 0xe37)) { | |
| 184 return true; | |
| 185 } | |
| 186 // Don't allow the standalone vowel U+0e24 to be followed by other vowels. | |
| 187 if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) { | |
| 188 return true; | |
| 189 } | |
| 190 return false; | |
| 191 } | |
| 192 | |
| 193 } // namespace tesseract |
