Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/training/unicharset/validator.cpp comparison

comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validator.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+#include "validator.h"
+#include <algorithm>
+#include <iterator>
+#include <unordered_map>
+#include <vector>
+#include "icuerrorcode.h"
+#include "unicode/uchar.h"   // From libicu
+#include "unicode/uscript.h" // From libicu
+#include "validate_grapheme.h"
+#include "validate_indic.h"
+#include "validate_javanese.h"
+#include "validate_khmer.h"
+#include "validate_myanmar.h"
+namespace tesseract {
+// Some specific but universally useful unicodes.
+const char32 Validator::kZeroWidthSpace = 0x200B;
+const char32 Validator::kZeroWidthNonJoiner = 0x200C;
+const char32 Validator::kZeroWidthJoiner = 0x200D;
+const char32 Validator::kLeftToRightMark = 0x200E;
+const char32 Validator::kRightToLeftMark = 0x200F;
+const char32 Validator::kInvalid = 0xfffd;
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+Validator::~Validator() = default;
+// Validates and cleans the src vector of unicodes to the *dest, according to
+// g_mode. In the case of kSingleString, a single vector containing the whole
+// result is added to *dest. With kCombined, multiple vectors are added to
+// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
+// added to *dest with a smaller unit representing a glyph in each.
+// In case of validation error, returns false and as much as possible of the
+// input, without discarding invalid text.
+/* static */
+bool Validator::ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors,
+const std::vector<char32> &src,
+std::vector<std::vector<char32>> *dest) {
+ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);
+std::vector<std::vector<char32>> graphemes;
+ViramaScript script = MostFrequentViramaScript(src);
+bool success = true;
+if (script == ViramaScript::kNonVirama) {
+// The grapheme segmenter's maximum segmentation is the grapheme unit, so
+// up the mode by 1 to get the desired effect.
+if (g_mode == GraphemeNormMode::kCombined) {
+g_mode = GraphemeNormMode::kGlyphSplit;
+} else if (g_mode == GraphemeNormMode::kGlyphSplit) {
+g_mode = GraphemeNormMode::kIndividualUnicodes;
+}
+// Just do grapheme segmentation.
+success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
+} else {
+success =
+g_validator.ValidateCleanAndSegmentInternal(GraphemeNormMode::kGlyphSplit, src, &graphemes);
+std::unique_ptr<Validator> validator(ScriptValidator(script, report_errors));
+for (const auto &grapheme : graphemes) {
+if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
+success = false;
+}
+}
+}
+return success;
+}
+// Factory method that understands how to map script to the right subclass.
+std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script, bool report_errors) {
+switch (script) {
+#define CASE(e, T) case ViramaScript::e: return std::make_unique<T>(script, report_errors)
+CASE(kNonVirama, ValidateGrapheme);
+CASE(kJavanese, ValidateJavanese);
+CASE(kMyanmar, ValidateMyanmar);
+CASE(kKhmer, ValidateKhmer);
+#undef CASE
+default:
+return std::make_unique<ValidateIndic>(script, report_errors);
+}
+}
+// Internal version of the public static ValidateCleanAndSegment.
+// Validates and cleans the src vector of unicodes to the *dest, according to
+// its type and the given g_mode.
+// In case of validation error, returns false and returns as much as possible
+// of the input, without discarding invalid text.
+bool Validator::ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
+const std::vector<char32> &src,
+std::vector<std::vector<char32>> *dest) {
+Clear();
+ComputeClassCodes(src);
+bool success = true;
+for (codes_used_ = 0; codes_used_ < codes_.size();) {
+if (!ConsumeGraphemeIfValid()) {
+success = false;
+++codes_used_;
+}
+}
+MoveResultsToDest(g_mode, dest);
+return success;
+}
+// Moves the results from parts_ or output_ to dest according to g_mode.
+void Validator::MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>> *dest) {
+if (g_mode == GraphemeNormMode::kIndividualUnicodes) {
+// Append each element of the combined output_ that we made as a new vector
+// in dest.
+dest->reserve(dest->size() + output_.size());
+for (char32 ch : output_) {
+dest->push_back({ch});
+}
+} else if (g_mode == GraphemeNormMode::kGlyphSplit) {
+// Append all the parts_ that we made onto dest.
+std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));
+} else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
+// Append the combined output_ that we made onto dest as one new vector.
+dest->push_back(std::vector<char32>());
+output_.swap(dest->back());
+} else { // kNone.
+// Append the combined output_ that we made onto the last existing element
+// of dest.
+dest->back().insert(dest->back().end(), output_.begin(), output_.end());
+}
+}
+static bool CmpPairSecond(const std::pair<int, int> &p1, const std::pair<int, int> &p2) {
+return p1.second < p2.second;
+}
+// Computes and returns the ViramaScript corresponding to the most frequent
+// virama-using script in the input, or kNonVirama if none are present.
+/* static */
+ViramaScript Validator::MostFrequentViramaScript(const std::vector<char32> &utf32) {
+std::unordered_map<int, int> histogram;
+for (char32 ch : utf32) {
+// Determine the codepage base. For the Indic scripts, Khmer and Javanese,
+// it is sufficient to divide by kIndicCodePageSize but Myanmar is all over
+// the unicode code space, so use its script id.
+int base = ch / kIndicCodePageSize;
+IcuErrorCode err;
+UScriptCode script_code = uscript_getScript(ch, err);
+if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode && script_code != USCRIPT_COMMON) ||
+script_code == USCRIPT_MYANMAR) {
+if (script_code == USCRIPT_MYANMAR) {
+base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
+}
+++histogram[base];
+}
+}
+if (!histogram.empty()) {
+int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first;
+auto codebase = static_cast<char32>(base * kIndicCodePageSize);
+// Check for validity.
+if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
+codebase == static_cast<char32>(ViramaScript::kJavanese) ||
+codebase == static_cast<char32>(ViramaScript::kKhmer) ||
+(static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
+codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
+return static_cast<ViramaScript>(codebase);
+}
+}
+return ViramaScript::kNonVirama;
+}
+// Returns true if the given UTF-32 unicode is a "virama" character.
+/* static */
+bool Validator::IsVirama(char32 unicode) {
+return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
+(unicode & 0x7f) == 0x4d) ||
+unicode == kSinhalaVirama || unicode == kJavaneseVirama || unicode == kMyanmarVirama ||
+unicode == kKhmerVirama;
+}
+// Returns true if the given UTF-32 unicode is a vedic accent.
+/* static */
+bool Validator::IsVedicAccent(char32 unicode) {
+return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
+(0x951 <= unicode && unicode <= 0x954);
+}
+// Returns true if the script is one that uses subscripts for conjuncts.
+bool Validator::IsSubscriptScript() const {
+return script_ == ViramaScript::kTelugu || script_ == ViramaScript::kKannada ||
+script_ == ViramaScript::kJavanese || script_ == ViramaScript::kMyanmar ||
+script_ == ViramaScript::kKhmer;
+}
+void Validator::ComputeClassCodes(const std::vector<char32> &text) {
+codes_.reserve(text.size());
+for (char32 c : text) {
+codes_.emplace_back(UnicodeToCharClass(c), c);
+}
+}
+// Resets to the initial state.
+void Validator::Clear() {
+codes_.clear();
+parts_.clear();
+output_.clear();
+codes_used_ = 0;
+output_used_ = 0;
+}
+} // namespace tesseract

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/validator.cpp @ 2:b50eed0cc0ef upstream