Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/training/unicharset/validator.cpp @ 21:2f43e400f144
Provide an "all" target to build both the sdist and the wheel
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Fri, 19 Sep 2025 10:28:53 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
#include "validator.h"

#include <algorithm>
#include <iterator>
#include <unordered_map>
#include <vector>

#include "icuerrorcode.h"
#include "unicode/uchar.h"   // From libicu
#include "unicode/uscript.h" // From libicu
#include "validate_grapheme.h"
#include "validate_indic.h"
#include "validate_javanese.h"
#include "validate_khmer.h"
#include "validate_myanmar.h"

namespace tesseract {

// Definitions of the class-level code point constants: the zero-width
// space/joiner/non-joiner, the two directional marks, and kInvalid (U+FFFD).
const char32 Validator::kZeroWidthSpace = 0x200B;
const char32 Validator::kZeroWidthNonJoiner = 0x200C;
const char32 Validator::kZeroWidthJoiner = 0x200D;
const char32 Validator::kLeftToRightMark = 0x200E;
const char32 Validator::kRightToLeftMark = 0x200F;
const char32 Validator::kInvalid = 0xfffd;

// Out-of-line destructor. Defining it in this translation unit lets the
// compiler emit one vtable here rather than a weak vtable in every
// compilation unit that sees the class.
Validator::~Validator() = default;

// Public entry point: validates and cleans src, appending the result to *dest
// in the shape requested by g_mode:
//   kSingleString - one vector holding the whole result.
//   kCombined     - one vector per grapheme.
//   kGlyphSplit   - one vector per smaller, glyph-sized unit.
// Returns false if any part of the input failed validation. Even then as much
// of the input as possible is written to *dest; invalid text is not discarded.
/* static */
bool Validator::ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors,
                                        const std::vector<char32> &src,
                                        std::vector<std::vector<char32>> *dest) {
  ValidateGrapheme grapheme_validator(ViramaScript::kNonVirama, report_errors);
  const ViramaScript script = MostFrequentViramaScript(src);

  if (script == ViramaScript::kNonVirama) {
    // Plain grapheme segmentation is all that is needed. The grapheme
    // segmenter cannot split finer than a grapheme, so ask it for the next
    // finer mode to obtain the granularity the caller wanted.
    switch (g_mode) {
      case GraphemeNormMode::kCombined:
        g_mode = GraphemeNormMode::kGlyphSplit;
        break;
      case GraphemeNormMode::kGlyphSplit:
        g_mode = GraphemeNormMode::kIndividualUnicodes;
        break;
      default:
        break;
    }
    return grapheme_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
  }

  // Virama script: first cut the text into graphemes, then hand every
  // grapheme to the script-specific validator, which writes to *dest.
  std::vector<std::vector<char32>> graphemes;
  bool all_valid = grapheme_validator.ValidateCleanAndSegmentInternal(
      GraphemeNormMode::kGlyphSplit, src, &graphemes);
  const std::unique_ptr<Validator> script_validator = ScriptValidator(script, report_errors);
  for (const auto &grapheme : graphemes) {
    // Every grapheme is processed, even after an earlier one has failed.
    const bool grapheme_ok =
        script_validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest);
    all_valid = all_valid && grapheme_ok;
  }
  return all_valid;
}

// Factory: builds the Validator subclass that handles the given script.
// Scripts without a dedicated case are handled by ValidateIndic.
std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script, bool report_errors) {
  switch (script) {
    case ViramaScript::kNonVirama:
      return std::make_unique<ValidateGrapheme>(script, report_errors);
    case ViramaScript::kJavanese:
      return std::make_unique<ValidateJavanese>(script, report_errors);
    case ViramaScript::kMyanmar:
      return std::make_unique<ValidateMyanmar>(script, report_errors);
    case ViramaScript::kKhmer:
      return std::make_unique<ValidateKhmer>(script, report_errors);
    default:
      return std::make_unique<ValidateIndic>(script, report_errors);
  }
}

// Per-instance worker behind the static ValidateCleanAndSegment.
// Resets this validator, classifies src, consumes it grapheme by grapheme and
// finally appends the result to *dest as dictated by g_mode.
// Returns false if any position failed to start a valid grapheme; the results
// gathered so far are still moved to *dest in that case.
bool Validator::ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
                                                const std::vector<char32> &src,
                                                std::vector<std::vector<char32>> *dest) {
  Clear();
  ComputeClassCodes(src);

  bool all_valid = true;
  codes_used_ = 0;
  while (codes_used_ < codes_.size()) {
    if (ConsumeGraphemeIfValid()) {
      continue;
    }
    // Nothing valid starts here: record the failure and step over one code so
    // that the loop keeps making progress.
    all_valid = false;
    ++codes_used_;
  }

  MoveResultsToDest(g_mode, dest);
  return all_valid;
}

// Moves the results from parts_ or output_ to dest according to g_mode.
void Validator::MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest) { if (g_mode == GraphemeNormMode::kIndividualUnicodes) { // Append each element of the combined output_ that we made as a new vector // in dest. dest->reserve(dest->size() + output_.size()); for (char32 ch : output_) { dest->push_back({ch}); } } else if (g_mode == GraphemeNormMode::kGlyphSplit) { // Append all the parts_ that we made onto dest. std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest)); } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) { // Append the combined output_ that we made onto dest as one new vector. dest->push_back(std::vector<char32>()); output_.swap(dest->back()); } else { // kNone. // Append the combined output_ that we made onto the last existing element // of dest. dest->back().insert(dest->back().end(), output_.begin(), output_.end()); } } static bool CmpPairSecond(const std::pair<int, int> &p1, const std::pair<int, int> &p2) { return p1.second < p2.second; } // Computes and returns the ViramaScript corresponding to the most frequent // virama-using script in the input, or kNonVirama if none are present. /* static */ ViramaScript Validator::MostFrequentViramaScript(const std::vector<char32> &utf32) { std::unordered_map<int, int> histogram; for (char32 ch : utf32) { // Determine the codepage base. For the Indic scripts, Khmer and Javanese, // it is sufficient to divide by kIndicCodePageSize but Myanmar is all over // the unicode code space, so use its script id. 
int base = ch / kIndicCodePageSize; IcuErrorCode err; UScriptCode script_code = uscript_getScript(ch, err); if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode && script_code != USCRIPT_COMMON) || script_code == USCRIPT_MYANMAR) { if (script_code == USCRIPT_MYANMAR) { base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize; } ++histogram[base]; } } if (!histogram.empty()) { int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first; auto codebase = static_cast<char32>(base * kIndicCodePageSize); // Check for validity. if (codebase == static_cast<char32>(ViramaScript::kMyanmar) || codebase == static_cast<char32>(ViramaScript::kJavanese) || codebase == static_cast<char32>(ViramaScript::kKhmer) || (static_cast<char32>(ViramaScript::kDevanagari) <= codebase && codebase <= static_cast<char32>(ViramaScript::kSinhala))) { return static_cast<ViramaScript>(codebase); } } return ViramaScript::kNonVirama; } // Returns true if the given UTF-32 unicode is a "virama" character. /* static */ bool Validator::IsVirama(char32 unicode) { return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode && (unicode & 0x7f) == 0x4d) || unicode == kSinhalaVirama || unicode == kJavaneseVirama || unicode == kMyanmarVirama || unicode == kKhmerVirama; } // Returns true if the given UTF-32 unicode is a vedic accent. /* static */ bool Validator::IsVedicAccent(char32 unicode) { return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) || (0x951 <= unicode && unicode <= 0x954); } // Returns true if the script is one that uses subscripts for conjuncts. 
bool Validator::IsSubscriptScript() const { return script_ == ViramaScript::kTelugu || script_ == ViramaScript::kKannada || script_ == ViramaScript::kJavanese || script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer; } void Validator::ComputeClassCodes(const std::vector<char32> &text) { codes_.reserve(text.size()); for (char32 c : text) { codes_.emplace_back(UnicodeToCharClass(c), c); } } // Resets to the initial state. void Validator::Clear() { codes_.clear(); parts_.clear(); output_.clear(); codes_used_ = 0; output_used_ = 0; } } // namespace tesseract
