Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validator.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: validator.h | |
| 3 * Description: Base class for various text validators. Intended mainly for | |
| 4 * scripts that use a virama character. | |
| 5 * Author: Ray Smith | |
| 6 * | |
| 7 * (C) Copyright 2017, Google Inc. | |
| 8 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 * you may not use this file except in compliance with the License. | |
| 10 * You may obtain a copy of the License at | |
| 11 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 * Unless required by applicable law or agreed to in writing, software | |
| 13 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 * See the License for the specific language governing permissions and | |
| 16 * limitations under the License. | |
| 17 * | |
| 18 **********************************************************************/ | |
| 19 | |
| 20 #ifndef TESSERACT_TRAINING_VALIDATOR_H_ | |
| 21 #define TESSERACT_TRAINING_VALIDATOR_H_ | |
| 22 | |
| 23 #include "export.h" | |
| 24 | |
| 25 #include <tesseract/unichar.h> | |
| 26 | |
| 27 #include <memory> | |
| 28 #include <vector> | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
// Selects how validated text is normalized and grouped into graphemes. This
// applies to all scripts, not only Indic ones: in Indic scripts a grapheme is
// a syllable unit that may span several unicodes, while elsewhere it is a base
// character together with its accents/diacritics (not every accented
// character has a single composed form).
enum class GraphemeNormMode {
  // The whole validated input is returned as one string, even when the input
  // contains multiple words.
  kSingleString,
  // Standard unicode graphemes are validated and each one is output as a unit.
  kCombined,
  // Graphemes are validated and then sub-divided.
  // - Virama-using scripts: units corresponding to repeatable glyphs are
  //   produced. These are mostly single unicodes, but viramas and joiners are
  //   paired with the most sensible neighbor.
  // - Other scripts: base/accent pairs are separated, ie the output is
  //   individual unicodes.
  kGlyphSplit,
  // Every output unit is a single unicode, whatever the script.
  kIndividualUnicodes,
};
| 50 | |
| 51 // An enum representing the scripts that use a virama character. It is | |
| 52 // guaranteed that the value of any element, (except kNonVirama) can be cast | |
| 53 // to a unicode (char32) value that represents the start of the unicode range | |
| 54 // of the corresponding script. | |
| 55 enum class ViramaScript : char32 { | |
| 56 kNonVirama = 0, | |
| 57 kDevanagari = 0x900, | |
| 58 kBengali = 0x980, | |
| 59 kGurmukhi = 0xa00, | |
| 60 kGujarati = 0xa80, | |
| 61 kOriya = 0xb00, | |
| 62 kTamil = 0xb80, | |
| 63 kTelugu = 0xc00, | |
| 64 kKannada = 0xc80, | |
| 65 kMalayalam = 0xd00, | |
| 66 kSinhala = 0xd80, | |
| 67 kMyanmar = 0x1000, | |
| 68 kKhmer = 0x1780, | |
| 69 kJavanese = 0xa980, | |
| 70 }; | |
| 71 | |
| 72 // Base class offers a validation API and protected methods to allow subclasses | |
| 73 // to easily build the validated/segmented output. | |
| 74 class TESS_UNICHARSET_TRAINING_API Validator { | |
| 75 public: | |
| 76 // Validates and cleans the src vector of unicodes to the *dest, according to | |
| 77 // g_mode. In the case of kSingleString, a single vector containing the whole | |
| 78 // result is added to *dest. With kCombined, multiple vectors are added to | |
| 79 // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are | |
| 80 // added to *dest with a smaller unit representing a glyph in each. | |
| 81 // In case of validation error, returns false and as much as possible of the | |
| 82 // input, without discarding invalid text. | |
| 83 static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, | |
| 84 const std::vector<char32> &src, | |
| 85 std::vector<std::vector<char32>> *dest); | |
| 86 | |
| 87 // Returns true if the unicode ch is a non-printing zero-width mark of no | |
| 88 // significance to OCR training or evaluation. | |
| 89 static bool IsZeroWidthMark(char32 ch) { | |
| 90 return ch == kZeroWidthSpace || ch == kLeftToRightMark || ch == kRightToLeftMark || | |
| 91 ch == kInvalid; | |
| 92 } | |
| 93 virtual ~Validator(); | |
| 94 | |
| 95 // Some specific but universally useful unicodes. | |
| 96 static const char32 kZeroWidthSpace; | |
| 97 static const char32 kZeroWidthNonJoiner; | |
| 98 static const char32 kZeroWidthJoiner; | |
| 99 static const char32 kLeftToRightMark; | |
| 100 static const char32 kRightToLeftMark; | |
| 101 static const char32 kInvalid; | |
| 102 | |
| 103 protected: | |
| 104 // These are more or less the character class identifiers in the ISCII | |
| 105 // standard, section 8. They have been augmented with the Unicode meta | |
| 106 // characters Zero Width Joiner and Zero Width Non Joiner, and the | |
| 107 // Unicode Vedic Marks. | |
| 108 // The best sources of information on Unicode and Indic scripts are: | |
| 109 // http://varamozhi.sourceforge.net/iscii91.pdf | |
| 110 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf | |
| 111 // http://unicode.org/faq/indic.html | |
| 112 // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx | |
| 113 enum class CharClass { | |
| 114 // NOTE: The values of the enum members are meaningless and arbitrary, ie | |
| 115 // they are not used for sorting, or any other risky application. | |
| 116 // The reason they are what they are is they are a single character | |
| 117 // abbreviation that can be used in a regexp/BNF definition of a grammar, | |
| 118 // IN A COMMENT, and still not relied upon in the code. | |
| 119 kConsonant = 'C', | |
| 120 kVowel = 'V', | |
| 121 kVirama = 'H', // (aka Halant) | |
| 122 kMatra = 'M', // (aka Dependent Vowel) | |
| 123 kMatraPiece = 'P', // unicode provides pieces of Matras. | |
| 124 kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks) | |
| 125 kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C | |
| 126 kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D | |
| 127 kVedicMark = 'v', // Modifiers can come modify any indic syllable. | |
| 128 kNukta = 'N', // Occurs only immediately after consonants. | |
| 129 kRobat = 'R', // Khmer only. | |
| 130 kOther = 'O', // (digits, measures, non-Indic, etc) | |
| 131 // Additional classes used only by ValidateGrapheme. | |
| 132 kWhitespace = ' ', | |
| 133 kCombiner = 'c', // Combiners other than virama. | |
| 134 }; | |
| 135 using IndicPair = std::pair<CharClass, char32>; | |
| 136 | |
| 137 Validator(ViramaScript script, bool report_errors) | |
| 138 : script_(script), codes_used_(0), output_used_(0), report_errors_(report_errors) {} | |
| 139 | |
| 140 // Factory method that understands how to map script to the right subclass. | |
| 141 static std::unique_ptr<Validator> ScriptValidator(ViramaScript script, bool report_errors); | |
| 142 | |
| 143 // Internal version of the public static ValidateCleanAndSegment. | |
| 144 // Validates and cleans the src vector of unicodes to the *dest, according to | |
| 145 // its type and the given g_mode. | |
| 146 // In case of validation error, returns false and returns as much as possible | |
| 147 // of the input, without discarding invalid text. | |
| 148 bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector<char32> &src, | |
| 149 std::vector<std::vector<char32>> *dest); | |
| 150 // Moves the results from parts_ or output_ to dest according to g_mode. | |
| 151 void MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest); | |
| 152 | |
| 153 // Computes and returns the ViramaScript corresponding to the most frequent | |
| 154 // virama-using script in the input, or kNonVirama if none are present. | |
| 155 static ViramaScript MostFrequentViramaScript(const std::vector<char32> &utf32); | |
| 156 // Returns true if the given UTF-32 unicode is a "virama" character. | |
| 157 static bool IsVirama(char32 unicode); | |
| 158 // Returns true if the given UTF-32 unicode is a vedic accent. | |
| 159 static bool IsVedicAccent(char32 unicode); | |
| 160 // Returns true if the script is one that uses subscripts for conjuncts. | |
| 161 bool IsSubscriptScript() const; | |
| 162 | |
| 163 // Helper function appends the next element of codes_ only to output_, | |
| 164 // without touching parts_ | |
| 165 // Returns true at the end of codes_. | |
| 166 bool CodeOnlyToOutput() { | |
| 167 output_.push_back(codes_[codes_used_].second); | |
| 168 return ++codes_used_ == codes_.size(); | |
| 169 } | |
| 170 | |
| 171 // Helper function adds a length-element vector to parts_ from the last length | |
| 172 // elements of output_. If there are more than length unused elements in | |
| 173 // output_, adds unicodes as single-element vectors to parts_ to catch | |
| 174 // output_used_ up to output->size() - length before adding the length-element | |
| 175 // vector. | |
| 176 void MultiCodePart(unsigned length) { | |
| 177 while (output_used_ + length < output_.size()) { | |
| 178 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_++]}); | |
| 179 } | |
| 180 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]}); | |
| 181 while (++output_used_ < output_.size()) { | |
| 182 parts_.back().push_back(output_[output_used_]); | |
| 183 } | |
| 184 } | |
| 185 | |
| 186 // Helper function appends the next element of codes_ to output_, and then | |
| 187 // calls MultiCodePart to add the appropriate components to parts_. | |
| 188 // Returns true at the end of codes_. | |
| 189 bool UseMultiCode(unsigned length) { | |
| 190 output_.push_back(codes_[codes_used_].second); | |
| 191 MultiCodePart(length); | |
| 192 return ++codes_used_ == codes_.size(); | |
| 193 } | |
| 194 | |
| 195 // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to | |
| 196 // parts_ and output_. Returns true if a valid Grapheme was consumed, | |
| 197 // otherwise does not increment codes_used_. | |
| 198 virtual bool ConsumeGraphemeIfValid() = 0; | |
| 199 // Sets codes_ to the class codes for the given unicode text. | |
| 200 void ComputeClassCodes(const std::vector<char32> &text); | |
| 201 // Returns the CharClass corresponding to the given Unicode ch. | |
| 202 virtual CharClass UnicodeToCharClass(char32 ch) const = 0; | |
| 203 // Resets to the initial state. | |
| 204 void Clear(); | |
| 205 | |
| 206 // Number of unicodes in each Indic codepage. | |
| 207 static const int kIndicCodePageSize = 128; | |
| 208 // Lowest unicode value of any Indic script. (Devanagari). | |
| 209 static const char32 kMinIndicUnicode = 0x900; | |
| 210 // Highest unicode value of any consistent (ISCII-based) Indic script. | |
| 211 static const char32 kMaxSinhalaUnicode = 0xdff; | |
| 212 // Highest unicode value of any virama-using script. (Khmer). | |
| 213 static const char32 kMaxViramaScriptUnicode = 0x17ff; | |
| 214 // Some special unicodes. | |
| 215 static const char32 kSinhalaVirama = 0xdca; | |
| 216 static const char32 kMyanmarVirama = 0x1039; | |
| 217 static const char32 kKhmerVirama = 0x17d2; | |
| 218 // Javanese Script - aksarajawa | |
| 219 static const char32 kJavaneseVirama = 0xa9c0; | |
| 220 static const char32 kMaxJavaneseUnicode = 0xa9df; | |
| 221 | |
| 222 // Script we are operating on. | |
| 223 ViramaScript script_; | |
| 224 // Input unicodes with assigned CharClass is the data to be validated. | |
| 225 std::vector<IndicPair> codes_; | |
| 226 // Glyph-like components of the input. | |
| 227 std::vector<std::vector<char32>> parts_; | |
| 228 // Copied validated unicodes from codes_ that are OK to output. | |
| 229 std::vector<char32> output_; | |
| 230 // The number of elements of codes_ that have been processed so far. | |
| 231 unsigned codes_used_; | |
| 232 // The number of elements of output_ that have already been added to parts_. | |
| 233 unsigned output_used_; | |
| 234 // Log error messages for reasons why text is invalid. | |
| 235 bool report_errors_; | |
| 236 }; | |
| 237 | |
| 238 } // namespace tesseract | |
| 239 | |
| 240 #endif // TESSERACT_TRAINING_VALIDATOR_H_ |
