Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_myanmar.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ | |
| 2 #define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ | |
| 3 | |
| 4 #include "validator.h" | |
| 5 | |
| 6 namespace tesseract { | |
| 7 | |
| 8 // Subclass of Validator that validates and segments Myanmar text. | |
| 9 class ValidateMyanmar : public Validator { | |
| 10 public: | |
| 11 ValidateMyanmar(ViramaScript script, bool report_errors) : Validator(script, report_errors) {} | |
| 12 ~ValidateMyanmar() override = default; | |
| 13 | |
| 14 protected: | |
| 15 // Returns whether the codes match the pattern for a Myanmar Grapheme. | |
| 16 // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to | |
| 17 // parts_ and output_. Returns true if a valid Grapheme was consumed; | |
| 18 // otherwise codes_used_ is not incremented. | |
| 19 bool ConsumeGraphemeIfValid() override; | |
| 20 // Returns the Validator::CharClass corresponding to the Unicode code point ch. | |
| 21 Validator::CharClass UnicodeToCharClass(char32 ch) const override; | |
| 22 | |
| 23 private: | |
| 24 // Helper that consumes/copies a virama and any subscript consonant. | |
| 25 // Returns true if the end of input is reached. | |
| 26 bool ConsumeSubscriptIfPresent(); | |
| 27 // Helper that consumes/copies a series of optional signs. | |
| 28 // Returns true if the end of input is reached. | |
| 29 bool ConsumeOptionalSignsIfPresent(); | |
| 30 // Returns true if ch is a Myanmar "letter", which here includes consonants | |
| 31 // and independent vowels. Although table 16-3 distinguishes between some | |
| 32 // base consonants and vowels, the extensions make no such distinction, so we | |
| 33 // put them all into a single bucket. | |
| 34 static bool IsMyanmarLetter(char32 ch); | |
| 35 // Returns true if ch is a Myanmar digit or other symbol that does not take | |
| 36 // part in forming a syllable. | |
| 37 static bool IsMyanmarOther(char32 ch); | |
| 38 | |
| 39 // Special Unicode code points used only for Myanmar processing. | |
| 40 static const char32 kMyanmarAsat = 0x103a; // U+103A | |
| 41 static const char32 kMyanmarMedialYa = 0x103b; // U+103B | |
| 42 }; | |
| 43 | |
| 44 } // namespace tesseract | |
| 45 | |
| 46 #endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ |
