diff mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_khmer.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_khmer.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,143 @@
+#include "validate_khmer.h"
+#include "errcode.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Returns whether codes matches the pattern for a Khmer Grapheme.
+// Taken from unicode standard:
+// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
+// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
+// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
+// Translated to the codes used by the CharClass enum:
+// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
+// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
+// Also the Consonant class here includes independent vowels, as they are
+// treated the same anyway.
+// In the split grapheme mode, the only characters that get grouped are the
+// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in
+// the BNF syntax, so who knows what they do.
+bool ValidateKhmer::ConsumeGraphemeIfValid() {
+  const unsigned num_codes = codes_.size();
+  if (codes_used_ == num_codes) {
+    return false;
+  }
+  if (codes_[codes_used_].first == CharClass::kOther) {
+    UseMultiCode(1);
+    return true;
+  }
+  if (codes_[codes_used_].first != CharClass::kConsonant) {
+    if (report_errors_) {
+      tprintf("Invalid start of Khmer syllable:0x%x\n", codes_[codes_used_].second);
+    }
+    return false;
+  }
+  if (UseMultiCode(1)) {
+    return true;
+  }
+  if (codes_[codes_used_].first == CharClass::kRobat ||
+      codes_[codes_used_].first == CharClass::kNukta) {
+    if (UseMultiCode(1)) {
+      return true;
+    }
+  }
+  while (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama &&
+         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(2)) {
+      return true;
+    }
+    if (codes_[codes_used_].first == CharClass::kRobat) {
+      if (UseMultiCode(1)) {
+        return true;
+      }
+    }
+  }
+  unsigned num_matra_parts = 0;
+  if (codes_[codes_used_].second == kZeroWidthJoiner ||
+      codes_[codes_used_].second == kZeroWidthNonJoiner) {
+    if (CodeOnlyToOutput()) {
+      if (report_errors_) {
+        tprintf("Unterminated joiner: 0x%x\n", output_.back());
+      }
+      return false;
+    }
+    ++num_matra_parts;
+  }
+  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
+  // own or as an addition to other matras.
+  if (codes_[codes_used_].first == CharClass::kMatra ||
+      codes_[codes_used_].first == CharClass::kMatraPiece) {
+    ++num_matra_parts;
+    if (UseMultiCode(num_matra_parts)) {
+      return true;
+    }
+  } else if (num_matra_parts) {
+    if (report_errors_) {
+      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", output_.back(),
+              codes_[codes_used_].second);
+    }
+    return false;
+  }
+  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
+      codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
+    if (UseMultiCode(1)) {
+      return true;
+    }
+  }
+  if (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    if (UseMultiCode(1)) {
+      return true;
+    }
+  }
+  if (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama &&
+      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(2)) {
+      return true;
+    }
+  }
+  return true;
+}
+
+Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
+  if (IsVedicAccent(ch)) {
+    return CharClass::kVedicMark;
+  }
+  if (ch == kZeroWidthNonJoiner) {
+    return CharClass::kZeroWidthNonJoiner;
+  }
+  if (ch == kZeroWidthJoiner) {
+    return CharClass::kZeroWidthJoiner;
+  }
+  // Offset from the start of the relevant unicode code block aka code page.
+  int off = ch - static_cast<char32>(script_);
+  // Anything in another code block is other.
+  if (off < 0 || off >= kIndicCodePageSize) {
+    return CharClass::kOther;
+  }
+  if (off <= 0x33) {
+    return CharClass::kConsonant;
+  }
+  if (off <= 0x45) {
+    return CharClass::kMatra;
+  }
+  if (off == 0x46) {
+    return CharClass::kMatraPiece;
+  }
+  if (off == 0x4c) {
+    return CharClass::kRobat;
+  }
+  if (off == 0x49 || off == 0x4a) {
+    return CharClass::kNukta;
+  }
+  if (off <= 0x51) {
+    return CharClass::kVowelModifier;
+  }
+  if (off == 0x52) {
+    return CharClass::kVirama;
+  }
+  return CharClass::kOther;
+}
+
+} // namespace tesseract