Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_khmer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include "validate_khmer.h" | |
| 2 #include "errcode.h" | |
| 3 #include "tprintf.h" | |
| 4 | |
| 5 namespace tesseract { | |
| 6 | |
| 7 // Returns whether codes matches the pattern for a Khmer Grapheme. | |
| 8 // Taken from the Unicode standard: | |
| 9 // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf. | |
| 10 // where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation | |
| 11 // to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf. | |
| 12 // Translated to the codes used by the CharClass enum: | |
| 13 // C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC} | |
| 14 // Where R is a new symbol (Robat) and N is repurposed as a consonant shifter. | |
| 15 // Also the Consonant class here includes independent vowels, as they are | |
| 16 // treated the same anyway. | |
| 17 // In the split grapheme mode, the only characters that get grouped are the | |
| 18 // HC and the {Z|z}M. The Unicode chapter on Khmer only mentions the joiners | |
| 19 // in the BNF syntax, so their intended effect is unclear. | |
| 20 bool ValidateKhmer::ConsumeGraphemeIfValid() { /* Consumes one grapheme starting at codes_[codes_used_]; returns false if nothing is left or the sequence is rejected. */ | |
| 21 const unsigned num_codes = codes_.size(); | |
| 22 if (codes_used_ == num_codes) { /* nothing left to consume */ | |
| 23 return false; | |
| 24 } | |
| 25 if (codes_[codes_used_].first == CharClass::kOther) { /* kOther codes are taken one at a time, outside the syllable grammar */ | |
| 26 UseMultiCode(1); | |
| 27 return true; | |
| 28 } | |
| 29 if (codes_[codes_used_].first != CharClass::kConsonant) { /* a syllable must start with C */ | |
| 30 if (report_errors_) { | |
| 31 tprintf("Invalid start of Khmer syllable:0x%x\n", codes_[codes_used_].second); | |
| 32 } | |
| 33 return false; | |
| 34 } | |
| 35 if (UseMultiCode(1)) { /* C: the base consonant. NOTE(review): UseMultiCode presumably returns true once every code has been used - confirm in Validator */ | |
| 36 return true; | |
| 37 } | |
| 38 if (codes_[codes_used_].first == CharClass::kRobat || /* {R | N}: optional Robat or consonant shifter */ | |
| 39 codes_[codes_used_].first == CharClass::kNukta) { | |
| 40 if (UseMultiCode(1)) { | |
| 41 return true; | |
| 42 } | |
| 43 } | |
| 44 while (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama && /* {HC {R}}*: repeated virama + consonant pairs */ | |
| 45 codes_[codes_used_ + 1].first == CharClass::kConsonant) { | |
| 46 ASSERT_HOST(!CodeOnlyToOutput()); /* NOTE(review): the call lives inside the assert, so it must always execute; presumably it emits the virama and advances codes_used_ - confirm */ | |
| 47 if (UseMultiCode(2)) { /* the 2 is presumably virama + consonant taken as one unit - confirm */ | |
| 48 return true; | |
| 49 } | |
| 50 if (codes_[codes_used_].first == CharClass::kRobat) { /* optional R after each HC pair */ | |
| 51 if (UseMultiCode(1)) { | |
| 52 return true; | |
| 53 } | |
| 54 } | |
| 55 } | |
| 56 unsigned num_matra_parts = 0; /* number of codes (joiner and/or matra) handed to UseMultiCode together */ | |
| 57 if (codes_[codes_used_].second == kZeroWidthJoiner || /* {Z|z}: optional joiner before the matra */ | |
| 58 codes_[codes_used_].second == kZeroWidthNonJoiner) { | |
| 59 if (CodeOnlyToOutput()) { /* NOTE(review): a true result is reported as an unterminated joiner, so presumably it means the joiner was the last code - confirm */ | |
| 60 if (report_errors_) { | |
| 61 tprintf("Unterminated joiner: 0x%x\n", output_.back()); | |
| 62 } | |
| 63 return false; | |
| 64 } | |
| 65 ++num_matra_parts; /* count the joiner so it is passed to UseMultiCode along with the matra */ | |
| 66 } | |
| 67 // Not quite as shown by the BNF, the matra piece is allowed as a matra on its | |
| 68 // own or as an addition to other matras. | |
| 69 if (codes_[codes_used_].first == CharClass::kMatra || | |
| 70 codes_[codes_used_].first == CharClass::kMatraPiece) { | |
| 71 ++num_matra_parts; | |
| 72 if (UseMultiCode(num_matra_parts)) { /* 1 for a bare matra, 2 when a joiner preceded it */ | |
| 73 return true; | |
| 74 } | |
| 75 } else if (num_matra_parts) { /* a joiner was seen but no matra follows it */ | |
| 76 if (report_errors_) { | |
| 77 tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", output_.back(), | |
| 78 codes_[codes_used_].second); | |
| 79 } | |
| 80 return false; | |
| 81 } | |
| 82 if (codes_[codes_used_].first == CharClass::kMatraPiece && /* {P}: extra matra piece, taken only if the previous code was not itself a matra piece */ | |
| 83 codes_[codes_used_ - 1].first != CharClass::kMatraPiece) { | |
| 84 if (UseMultiCode(1)) { | |
| 85 return true; | |
| 86 } | |
| 87 } | |
| 88 if (codes_[codes_used_].first == CharClass::kVowelModifier) { /* {D}: optional vowel modifier */ | |
| 89 if (UseMultiCode(1)) { | |
| 90 return true; | |
| 91 } | |
| 92 } | |
| 93 if (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama && /* final optional {HC}; unlike the loop above it is taken at most once, with no Robat check */ | |
| 94 codes_[codes_used_ + 1].first == CharClass::kConsonant) { | |
| 95 ASSERT_HOST(!CodeOnlyToOutput()); /* same side-effecting call as in the loop above */ | |
| 96 if (UseMultiCode(2)) { | |
| 97 return true; | |
| 98 } | |
| 99 } | |
| 100 return true; | |
| 101 } | |
| 102 | |
| 103 Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const { /* Returns the CharClass for code point ch; the checks run in order and the first match wins. */ | |
| 104 if (IsVedicAccent(ch)) { | |
| 105 return CharClass::kVedicMark; | |
| 106 } | |
| 107 if (ch == kZeroWidthNonJoiner) { /* joiners are recognised before the code-page offset is computed */ | |
| 108 return CharClass::kZeroWidthNonJoiner; | |
| 109 } | |
| 110 if (ch == kZeroWidthJoiner) { | |
| 111 return CharClass::kZeroWidthJoiner; | |
| 112 } | |
| 113 // Offset from the start of the relevant unicode code block aka code page. | |
| 114 int off = ch - static_cast<char32>(script_); /* NOTE(review): assumes script_ holds the first code point of the Khmer block - confirm */ | |
| 115 // Anything in another code block is other. | |
| 116 if (off < 0 || off >= kIndicCodePageSize) { | |
| 117 return CharClass::kOther; | |
| 118 } | |
| 119 if (off <= 0x33) { /* 0x00-0x33; per the grammar comment on ConsumeGraphemeIfValid, the consonant class also covers independent vowels */ | |
| 120 return CharClass::kConsonant; | |
| 121 } | |
| 122 if (off <= 0x45) { /* 0x34-0x45 */ | |
| 123 return CharClass::kMatra; | |
| 124 } | |
| 125 if (off == 0x46) { | |
| 126 return CharClass::kMatraPiece; | |
| 127 } | |
| 128 if (off == 0x4c) { /* must be tested before the <= 0x51 range below, which would otherwise claim it */ | |
| 129 return CharClass::kRobat; | |
| 130 } | |
| 131 if (off == 0x49 || off == 0x4a) { /* kNukta is repurposed for the Khmer consonant shifters; also tested before the <= 0x51 range */ | |
| 132 return CharClass::kNukta; | |
| 133 } | |
| 134 if (off <= 0x51) { /* whatever is left up to 0x51: 0x47, 0x48, 0x4b, 0x4d-0x51 */ | |
| 135 return CharClass::kVowelModifier; | |
| 136 } | |
| 137 if (off == 0x52) { | |
| 138 return CharClass::kVirama; | |
| 139 } | |
| 140 return CharClass::kOther; /* 0x53 and above within the code page */ | |
| 141 } | |
| 142 | |
| 143 } // namespace tesseract |
