Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validator.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include "validator.h" | |
| 2 | |
| 3 #include <algorithm> | |
| 4 #include <iterator> | |
| 5 #include <unordered_map> | |
| 6 #include <vector> | |
| 7 | |
| 8 #include "icuerrorcode.h" | |
| 9 #include "unicode/uchar.h" // From libicu | |
| 10 #include "unicode/uscript.h" // From libicu | |
| 11 #include "validate_grapheme.h" | |
| 12 #include "validate_indic.h" | |
| 13 #include "validate_javanese.h" | |
| 14 #include "validate_khmer.h" | |
| 15 #include "validate_myanmar.h" | |
| 16 | |
| 17 namespace tesseract { | |
| 18 | |
| 19 // Some specific but universally useful unicodes. | |
| 20 const char32 Validator::kZeroWidthSpace = 0x200B; | |
| 21 const char32 Validator::kZeroWidthNonJoiner = 0x200C; | |
| 22 const char32 Validator::kZeroWidthJoiner = 0x200D; | |
| 23 const char32 Validator::kLeftToRightMark = 0x200E; | |
| 24 const char32 Validator::kRightToLeftMark = 0x200F; | |
| 25 const char32 Validator::kInvalid = 0xfffd; | |
| 26 | |
| 27 // Destructor. | |
| 28 // It is defined here, so the compiler can create a single vtable | |
| 29 // instead of weak vtables in every compilation unit. | |
| 30 Validator::~Validator() = default; | |
| 31 | |
| 32 // Validates and cleans the src vector of unicodes to the *dest, according to | |
| 33 // g_mode. In the case of kSingleString, a single vector containing the whole | |
| 34 // result is added to *dest. With kCombined, multiple vectors are added to | |
| 35 // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are | |
| 36 // added to *dest with a smaller unit representing a glyph in each. | |
| 37 // In case of validation error, returns false and as much as possible of the | |
| 38 // input, without discarding invalid text. | |
| 39 /* static */ | |
| 40 bool Validator::ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, | |
| 41 const std::vector<char32> &src, | |
| 42 std::vector<std::vector<char32>> *dest) { | |
| 43 ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors); | |
| 44 std::vector<std::vector<char32>> graphemes; | |
| 45 ViramaScript script = MostFrequentViramaScript(src); | |
| 46 bool success = true; | |
| 47 if (script == ViramaScript::kNonVirama) { | |
| 48 // The grapheme segmenter's maximum segmentation is the grapheme unit, so | |
| 49 // up the mode by 1 to get the desired effect. | |
| 50 if (g_mode == GraphemeNormMode::kCombined) { | |
| 51 g_mode = GraphemeNormMode::kGlyphSplit; | |
| 52 } else if (g_mode == GraphemeNormMode::kGlyphSplit) { | |
| 53 g_mode = GraphemeNormMode::kIndividualUnicodes; | |
| 54 } | |
| 55 // Just do grapheme segmentation. | |
| 56 success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest); | |
| 57 } else { | |
| 58 success = | |
| 59 g_validator.ValidateCleanAndSegmentInternal(GraphemeNormMode::kGlyphSplit, src, &graphemes); | |
| 60 std::unique_ptr<Validator> validator(ScriptValidator(script, report_errors)); | |
| 61 for (const auto &grapheme : graphemes) { | |
| 62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) { | |
| 63 success = false; | |
| 64 } | |
| 65 } | |
| 66 } | |
| 67 return success; | |
| 68 } | |
| 69 | |
| 70 // Factory method that understands how to map script to the right subclass. | |
| 71 std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script, bool report_errors) { | |
| 72 switch (script) { | |
| 73 #define CASE(e, T) case ViramaScript::e: return std::make_unique<T>(script, report_errors) | |
| 74 CASE(kNonVirama, ValidateGrapheme); | |
| 75 CASE(kJavanese, ValidateJavanese); | |
| 76 CASE(kMyanmar, ValidateMyanmar); | |
| 77 CASE(kKhmer, ValidateKhmer); | |
| 78 #undef CASE | |
| 79 default: | |
| 80 return std::make_unique<ValidateIndic>(script, report_errors); | |
| 81 } | |
| 82 } | |
| 83 | |
| 84 // Internal version of the public static ValidateCleanAndSegment. | |
| 85 // Validates and cleans the src vector of unicodes to the *dest, according to | |
| 86 // its type and the given g_mode. | |
| 87 // In case of validation error, returns false and returns as much as possible | |
| 88 // of the input, without discarding invalid text. | |
| 89 bool Validator::ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, | |
| 90 const std::vector<char32> &src, | |
| 91 std::vector<std::vector<char32>> *dest) { | |
| 92 Clear(); | |
| 93 ComputeClassCodes(src); | |
| 94 bool success = true; | |
| 95 for (codes_used_ = 0; codes_used_ < codes_.size();) { | |
| 96 if (!ConsumeGraphemeIfValid()) { | |
| 97 success = false; | |
| 98 ++codes_used_; | |
| 99 } | |
| 100 } | |
| 101 MoveResultsToDest(g_mode, dest); | |
| 102 return success; | |
| 103 } | |
| 104 | |
| 105 // Moves the results from parts_ or output_ to dest according to g_mode. | |
| 106 void Validator::MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest) { | |
| 107 if (g_mode == GraphemeNormMode::kIndividualUnicodes) { | |
| 108 // Append each element of the combined output_ that we made as a new vector | |
| 109 // in dest. | |
| 110 dest->reserve(dest->size() + output_.size()); | |
| 111 for (char32 ch : output_) { | |
| 112 dest->push_back({ch}); | |
| 113 } | |
| 114 } else if (g_mode == GraphemeNormMode::kGlyphSplit) { | |
| 115 // Append all the parts_ that we made onto dest. | |
| 116 std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest)); | |
| 117 } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) { | |
| 118 // Append the combined output_ that we made onto dest as one new vector. | |
| 119 dest->push_back(std::vector<char32>()); | |
| 120 output_.swap(dest->back()); | |
| 121 } else { // kNone. | |
| 122 // Append the combined output_ that we made onto the last existing element | |
| 123 // of dest. | |
| 124 dest->back().insert(dest->back().end(), output_.begin(), output_.end()); | |
| 125 } | |
| 126 } | |
| 127 | |
// Strict ordering of (key, count) pairs by their second element only; the
// first element is ignored.
static bool CmpPairSecond(const std::pair<int, int> &p1, const std::pair<int, int> &p2) {
  const int lhs_count = p1.second;
  const int rhs_count = p2.second;
  return rhs_count > lhs_count;
}
| 131 | |
| 132 // Computes and returns the ViramaScript corresponding to the most frequent | |
| 133 // virama-using script in the input, or kNonVirama if none are present. | |
| 134 /* static */ | |
| 135 ViramaScript Validator::MostFrequentViramaScript(const std::vector<char32> &utf32) { | |
| 136 std::unordered_map<int, int> histogram; | |
| 137 for (char32 ch : utf32) { | |
| 138 // Determine the codepage base. For the Indic scripts, Khmer and Javanese, | |
| 139 // it is sufficient to divide by kIndicCodePageSize but Myanmar is all over | |
| 140 // the unicode code space, so use its script id. | |
| 141 int base = ch / kIndicCodePageSize; | |
| 142 IcuErrorCode err; | |
| 143 UScriptCode script_code = uscript_getScript(ch, err); | |
| 144 if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode && script_code != USCRIPT_COMMON) || | |
| 145 script_code == USCRIPT_MYANMAR) { | |
| 146 if (script_code == USCRIPT_MYANMAR) { | |
| 147 base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize; | |
| 148 } | |
| 149 ++histogram[base]; | |
| 150 } | |
| 151 } | |
| 152 if (!histogram.empty()) { | |
| 153 int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first; | |
| 154 auto codebase = static_cast<char32>(base * kIndicCodePageSize); | |
| 155 // Check for validity. | |
| 156 if (codebase == static_cast<char32>(ViramaScript::kMyanmar) || | |
| 157 codebase == static_cast<char32>(ViramaScript::kJavanese) || | |
| 158 codebase == static_cast<char32>(ViramaScript::kKhmer) || | |
| 159 (static_cast<char32>(ViramaScript::kDevanagari) <= codebase && | |
| 160 codebase <= static_cast<char32>(ViramaScript::kSinhala))) { | |
| 161 return static_cast<ViramaScript>(codebase); | |
| 162 } | |
| 163 } | |
| 164 return ViramaScript::kNonVirama; | |
| 165 } | |
| 166 | |
| 167 // Returns true if the given UTF-32 unicode is a "virama" character. | |
| 168 /* static */ | |
| 169 bool Validator::IsVirama(char32 unicode) { | |
| 170 return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode && | |
| 171 (unicode & 0x7f) == 0x4d) || | |
| 172 unicode == kSinhalaVirama || unicode == kJavaneseVirama || unicode == kMyanmarVirama || | |
| 173 unicode == kKhmerVirama; | |
| 174 } | |
| 175 | |
| 176 // Returns true if the given UTF-32 unicode is a vedic accent. | |
| 177 /* static */ | |
| 178 bool Validator::IsVedicAccent(char32 unicode) { | |
| 179 return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) || | |
| 180 (0x951 <= unicode && unicode <= 0x954); | |
| 181 } | |
| 182 | |
| 183 // Returns true if the script is one that uses subscripts for conjuncts. | |
| 184 bool Validator::IsSubscriptScript() const { | |
| 185 return script_ == ViramaScript::kTelugu || script_ == ViramaScript::kKannada || | |
| 186 script_ == ViramaScript::kJavanese || script_ == ViramaScript::kMyanmar || | |
| 187 script_ == ViramaScript::kKhmer; | |
| 188 } | |
| 189 | |
| 190 void Validator::ComputeClassCodes(const std::vector<char32> &text) { | |
| 191 codes_.reserve(text.size()); | |
| 192 for (char32 c : text) { | |
| 193 codes_.emplace_back(UnicodeToCharClass(c), c); | |
| 194 } | |
| 195 } | |
| 196 | |
| 197 // Resets to the initial state. | |
| 198 void Validator::Clear() { | |
| 199 codes_.clear(); | |
| 200 parts_.clear(); | |
| 201 output_.clear(); | |
| 202 codes_used_ = 0; | |
| 203 output_used_ = 0; | |
| 204 } | |
| 205 | |
| 206 } // namespace tesseract |
