diff mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_indic.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_indic.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,349 @@
+#include "validate_indic.h"
+#include "errcode.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Consumes the grapheme that starts at codes_[codes_used_], returning true if
+// it matches the pattern for an Indic grapheme and false otherwise.
+// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf has a BNF
+// for valid syllables (graphemes), which is modified slightly for Unicode:
+// U+200C and U+200D may be placed before or after a virama to express
+// explicit or soft viramas. The Unicode v.9 Malayalam entry also states that
+// CZHC can be used in several Indic languages to request traditional
+// ligatures, and that CzHC is Malayalam-specific for requesting open conjuncts.
+//
+//  + vowel Grapheme:  V[D](v)*
+//  + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
+bool ValidateIndic::ConsumeGraphemeIfValid() {
+  const auto lead = codes_[codes_used_];
+  if (lead.first == CharClass::kConsonant) {
+    return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
+  }
+  if (lead.first == CharClass::kVowel || lead.first == CharClass::kVedicMark) {
+    return ConsumeVowelIfValid();
+  }
+  if (lead.first == CharClass::kZeroWidthJoiner ||
+      lead.first == CharClass::kZeroWidthNonJoiner) {
+    // A joiner that is not inside an aksara is skipped without any output.
+    if (report_errors_) {
+      tprintf("Dropping isolated joiner: 0x%x\n", lead.second);
+    }
+    ++codes_used_;
+    return true;
+  }
+  if (lead.first == CharClass::kOther) {
+    UseMultiCode(1);
+    return true;
+  }
+  if (report_errors_) {
+    tprintf("Invalid start of grapheme sequence:%c=0x%x\n", static_cast<int>(lead.first),
+            lead.second);
+  }
+  return false;
+}
+
+// Maps a code point to the character class used by the grapheme grammar.
+// Vedic accents and the two zero-width joiners are recognised first. Every
+// other code point is classified by its offset from the start of the Unicode
+// code block of script_, with a handful of per-script exceptions. Code points
+// outside that block, and offsets with no specific rule, are CharClass::kOther.
+Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
+  if (IsVedicAccent(ch)) {
+    return CharClass::kVedicMark;
+  }
+  if (ch == kZeroWidthNonJoiner || ch == kZeroWidthJoiner) {
+    return ch == kZeroWidthNonJoiner ? CharClass::kZeroWidthNonJoiner
+                                     : CharClass::kZeroWidthJoiner;
+  }
+  // Offset of ch from the start of the script's code block (aka code page).
+  const int block_start = static_cast<char32>(script_);
+  const int pos = ch - block_start;
+  const auto within = [pos](int first, int last) { return first <= pos && pos <= last; };
+  // A code point from any other code block is "other".
+  if (pos < 0 || pos >= kIndicCodePageSize) {
+    return CharClass::kOther;
+  }
+  const bool is_tamil = script_ == ViramaScript::kTamil;
+  const bool is_sinhala = script_ == ViramaScript::kSinhala;
+  // Vowel modifiers, except that the Tamil aytham (0x03) is considered a letter.
+  if (pos < 0x4) {
+    return (is_tamil && pos == 0x03) ? CharClass::kVowel : CharClass::kVowelModifier;
+  }
+  if (!is_sinhala) {
+    // Offsets that have a class of their own are dispatched first.
+    // Sinhala (handled in the other branch) doesn't have nukta or avagraha.
+    switch (pos) {
+      case 0x3c:
+        return CharClass::kNukta;
+      case 0x3d: // avagraha
+      case 0x50:
+        return CharClass::kVowel;
+      case 0x4d:
+        return CharClass::kVirama;
+      default:
+        break;
+    }
+    // Ranges; each "<=" test relies on the smaller offsets being handled above.
+    if (pos <= 0x14) {
+      return CharClass::kVowel;
+    }
+    if (pos <= 0x3b || within(0x58, 0x5f)) {
+      return CharClass::kConsonant;
+    }
+    if (pos <= 0x4c || within(0x51, 0x54)) {
+      return CharClass::kMatra;
+    }
+    if (within(0x55, 0x57)) {
+      return CharClass::kMatraPiece;
+    }
+  } else {
+    // Sinhala is an exception with its own layout.
+    if (pos <= 0x19) {
+      return CharClass::kVowel;
+    }
+    if (pos <= 0x49) {
+      return CharClass::kConsonant;
+    }
+    if (pos == 0x4a) {
+      return CharClass::kVirama;
+    }
+    if (pos <= 0x5f) {
+      return CharClass::kMatra;
+    }
+  }
+  if (within(0x60, 0x61)) {
+    return CharClass::kVowel;
+  }
+  if (within(0x62, 0x63)) {
+    return CharClass::kMatra;
+  }
+  // Danda and digits up to 6f are OK as other; 70-7f are script-specific.
+  // Tamil: 0BF0-0BF2 are the numbers 10, 100 and 1000 and 0BF3-0BFA are other
+  // symbols; all of them are treated as other.
+  if (is_tamil && within(0x70, 0x7a)) {
+    return CharClass::kOther;
+  }
+  if (script_ == ViramaScript::kBengali && within(0x70, 0x71)) {
+    return CharClass::kConsonant;
+  }
+  if (script_ == ViramaScript::kGurmukhi && within(0x72, 0x73)) {
+    return CharClass::kConsonant;
+  }
+  if (is_sinhala && pos == 0x70) {
+    return CharClass::kConsonant;
+  }
+  if (script_ == ViramaScript::kDevanagari && pos == 0x70) {
+    return CharClass::kOther;
+  }
+  // Whatever is left: non Indic, digits, measures, danda, etc. are other.
+  return within(0x70, 0x73) ? CharClass::kVowelModifier : CharClass::kOther;
+}
+
+// Helper consumes/copies a virama and any associated post-virama joiners. A linking virama (with
+// either type of pre-virama joiner, post-virama ZWJ, or no joiner at all) must be followed by a
+// consonant. A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non-consonant,
+// space, or character from a different script. We clean up the representation to make it
+// consistent by adding a ZWNJ if missing from a non-linking virama.
+// joiner is the pre-virama joiner already handled by the caller, or (CharClass::kOther, 0) if
+// none; post_matra is true when called for the consonant tail. Returns false if invalid.
+bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
+  const unsigned num_codes = codes_.size();
+  if (joiner.first == CharClass::kOther) { // No pre-virama joiner was supplied.
+    CodeOnlyToOutput(); // Presumably moves the virama at codes_used_ into output_.
+    if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) {
+      // Post-matra viramas must be explicit, so no joiners allowed here.
+      if (post_matra) {
+        if (report_errors_) {
+          tprintf("ZWJ after a post-matra virama!!\n");
+        }
+        return false;
+      }
+      if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kRayana &&
+          (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
+           codes_[codes_used_ + 1].second == kYayana ||
+           codes_[codes_used_ + 1].second == kRayana)) {
+        // ZWJ then ZWNJ/Yayana/Rayana (no Rayana two codes back): picked up later.
+        ASSERT_HOST(!CodeOnlyToOutput());
+      } else {
+        // Half-form with optional Nukta: pending output plus one more code (presumably the ZWJ).
+        unsigned len = output_.size() + 1 - output_used_;
+        if (UseMultiCode(len)) {
+          return true;
+        }
+      }
+      if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) {
+        if (output_used_ == output_.size() || output_[output_used_] != kRayana) {
+          if (report_errors_) {
+            tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_));
+          }
+          return false;
+        }
+        // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
+        if (UseMultiCode(4)) {
+          return true;
+        }
+      }
+    } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant ||
+               post_matra) {
+      if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) {
+        // It is valid to have an unterminated virama at the end of a word, but
+        // for consistency, we will always add ZWNJ if not present.
+        output_.push_back(kZeroWidthNonJoiner);
+      } else {
+        CodeOnlyToOutput(); // The next code is the ZWNJ; presumably copied through as-is.
+      }
+      // Explicit virama [H z]
+      MultiCodePart(2);
+    }
+  } else {
+    // Pre-virama joiner [{Z|z} H] requests specific conjunct.
+    if (UseMultiCode(2)) { // A true result is treated as "nothing follows the virama".
+      if (report_errors_) {
+        tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
+      }
+      return false;
+    }
+    if (codes_[codes_used_].second == kZeroWidthJoiner ||
+        codes_[codes_used_].second == kZeroWidthNonJoiner) {
+      if (report_errors_) {
+        tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
+                codes_[codes_used_].second);
+      }
+      return false;
+    }
+  }
+  // It is good so far as it goes.
+  return true;
+}
+
+// Helper consumes/copies a series of consonants separated by viramas while valid, but not any
+// vowel or other modifiers. Returns false only if ConsumeViramaIfValid rejects a virama.
+bool ValidateIndic::ConsumeConsonantHeadIfValid() {
+  const unsigned num_codes = codes_.size();
+  // Consonant aksara
+  do {
+    CodeOnlyToOutput(); // Presumably moves the consonant at codes_used_ into output_.
+    // Special Sinhala case of [H Z Yayana/Rayana].
+    int index = output_.size() - 3; // Only meaningful once the size check below has passed.
+    if (output_used_ + 3 <= output_.size() &&
+        (output_.back() == kYayana || output_.back() == kRayana) && IsVirama(output_[index]) &&
+        output_[index + 1] == kZeroWidthJoiner) {
+      MultiCodePart(3);
+    }
+    bool have_nukta = false;
+    if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) {
+      have_nukta = true;
+      CodeOnlyToOutput();
+    }
+    // Test for subscript conjunct.
+    index = output_.size() - 2 - have_nukta; // have_nukta contributes 0 or 1.
+    if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
+        IsVirama(output_[index])) {
+      // Output previous virama, consonant + optional nukta.
+      MultiCodePart(2 + have_nukta);
+    }
+    IndicPair joiner(CharClass::kOther, 0); // kOther stands for "no pre-virama joiner".
+    if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner ||
+                                    (codes_[codes_used_].second == kZeroWidthNonJoiner &&
+                                     script_ == ViramaScript::kMalayalam))) {
+      joiner = codes_[codes_used_];
+      if (++codes_used_ == num_codes) { // Joiner is consumed here but not yet written out.
+        if (report_errors_) {
+          tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), joiner.second);
+        }
+        return true;
+      }
+      if (codes_[codes_used_].first == CharClass::kVirama) {
+        output_.push_back(joiner.second); // Keep the joiner: it precedes a virama.
+      } else {
+        if (report_errors_) {
+          tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", output_.back(), joiner.second,
+                  codes_[codes_used_].second);
+        }
+        joiner = std::make_pair(CharClass::kOther, 0); // Forget the dropped joiner.
+      }
+    }
+    if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) {
+      if (!ConsumeViramaIfValid(joiner, false)) {
+        return false;
+      }
+    } else {
+      break; // No virama, so the run of consonants is over.
+    }
+  } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant);
+  if (output_used_ < output_.size()) { // Some output has not been assigned to a part yet.
+    MultiCodePart(1);
+  }
+  return true;
+}
+
+// Helper consumes/copies the tail of a consonant grapheme: an optional matra
+// with optional matra piece, vowel modifier(s), vedic marks and an optional
+// terminating virama. Returns false only if that virama is rejected.
+bool ValidateIndic::ConsumeConsonantTailIfValid() {
+  if (codes_used_ == codes_.size()) {
+    return true; // Nothing follows the consonant head.
+  }
+  // Whenever UseMultiCode(1) returns true the grapheme is finished successfully
+  // (presumably because the input is exhausted), so we return at once.
+  // Optional matra, optionally followed by a matra piece. Are multiple matras allowed?
+  if (codes_[codes_used_].first == CharClass::kMatra &&
+      (UseMultiCode(1) ||
+       (codes_[codes_used_].first == CharClass::kMatraPiece && UseMultiCode(1)))) {
+    return true;
+  }
+  // Vowel modifier. Only Malayalam may repeat it, and only after 0xd02.
+  if (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    do {
+      if (UseMultiCode(1)) {
+        return true;
+      }
+    } while (script_ == ViramaScript::kMalayalam && output_.back() == 0xd02 &&
+             codes_[codes_used_].first == CharClass::kVowelModifier);
+  }
+  // Any number of vedic marks.
+  for (;;) {
+    if (codes_[codes_used_].first != CharClass::kVedicMark) {
+      break;
+    }
+    if (UseMultiCode(1)) {
+      return true;
+    }
+  }
+  // Terminating virama, handed over as post-matra with no pre-virama joiner.
+  const bool at_virama = codes_[codes_used_].first == CharClass::kVirama;
+  if (at_virama && !ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
+    return false;
+  }
+  // What we have consumed so far is a valid consonant cluster.
+  if (output_used_ < output_.size()) {
+    MultiCodePart(1);
+  }
+  return true;
+}
+
+// Helper consumes/copies a vowel (or a leading vedic mark) and its optional
+// vowel modifier(s) and vedic marks. Always returns true.
+bool ValidateIndic::ConsumeVowelIfValid() {
+  // A true result from UseMultiCode is treated throughout as "grapheme done".
+  if (UseMultiCode(1)) {
+    return true;
+  }
+  // One vowel modifier; only Malayalam allows repeated modifiers?
+  if (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    do {
+      if (UseMultiCode(1)) {
+        return true;
+      }
+    } while (script_ == ViramaScript::kMalayalam &&
+             codes_[codes_used_].first == CharClass::kVowelModifier);
+  }
+  // Any number of vedic marks. Wherever the run stops, what we have consumed
+  // so far is a valid vowel cluster.
+  while (codes_[codes_used_].first == CharClass::kVedicMark && !UseMultiCode(1)) {
+  }
+  return true;
+}
+
+} // namespace tesseract