diff mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_myanmar.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_myanmar.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,196 @@
+#include "validate_myanmar.h"
+#include "errcode.h"
+#include "icuerrorcode.h"
+#include "tprintf.h"
+#include "unicode/uchar.h"   // From libicu
+#include "unicode/uscript.h" // From libicu
+
+namespace tesseract {
+
+// Tries to consume one Myanmar grapheme starting at codes_[codes_used_].
+// The accepted shape is taken from Unicode table 16-3, see
+// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
+// Returns false only when the current code is neither an "other" code nor a
+// Myanmar letter; returns true in every other case, including when there is
+// no input left to consume.
+bool ValidateMyanmar::ConsumeGraphemeIfValid() {
+  const unsigned total = codes_.size();
+  if (codes_used_ == total) {
+    return true;
+  }
+  // "Other" codes form a grapheme on their own.
+  if (IsMyanmarOther(codes_[codes_used_].second)) {
+    UseMultiCode(1);
+    return true;
+  }
+  // Kinzi: 0x1004 followed by asat and virama.
+  const bool at_kinzi = codes_used_ + 2 < total && codes_[codes_used_].second == 0x1004 &&
+                        codes_[codes_used_ + 1].second == kMyanmarAsat &&
+                        codes_[codes_used_ + 2].second == kMyanmarVirama;
+  if (at_kinzi) {
+    // The call is repeated on purpose and must not be merged; presumably each
+    // call passes one code through before UseMultiCode(3) handles the third
+    // -- confirm against the Validator base class.
+    ASSERT_HOST(!CodeOnlyToOutput());
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(3)) {
+      return true;
+    }
+  }
+  // Base consonant/vowel. Everything else in a Myanmar syllable appears to be
+  // optional, so a missing base is the only way input can be rejected.
+  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
+    if (report_errors_) {
+      tprintf("Invalid start of Myanmar syllable:0x%x\n", codes_[codes_used_].second);
+    }
+    return false; // A base letter is required.
+  }
+  if (UseMultiCode(1)) {
+    return true;
+  }
+  // Optional subscript, then optional signs (skipped if the subscript helper
+  // already reported the end of input). Whatever has been consumed by now is
+  // a valid syllable.
+  if (!ConsumeSubscriptIfPresent()) {
+    ConsumeOptionalSignsIfPresent();
+  }
+  return true;
+}
+
+// TODO(rays) Unlike the other scripts, no intermediate coding is used here,
+// because the content of table 16-3 has little in common with the char
+// classes of the Indic languages. (Experts may disagree and improve!)
+// Unicode table 16-3 is basically a long list of optional characters, which
+// is easy to code directly.
+// Unfortunately, table 16-3 doesn't cover even half of the Myanmar unicodes,
+// and it also allows sequences that still result in dotted circles, so the
+// remaining codes have been slotted in at a reasonable place by guesswork.
+//
+// Maps ch to CharClass::kConsonant when it is a Myanmar letter and to
+// CharClass::kOther for everything else.
+Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
+  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
+}
+
+// Helper consumes/copies a virama together with the subscript consonant that
+// follows it, if that pair is present at the current position. It appears
+// there can be only one subscript consonant.
+// Returns true if the end of input is reached, false otherwise (including
+// when nothing was consumed).
+bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
+  const unsigned total = codes_.size();
+  // A virama plus a following letter needs at least two remaining codes.
+  if (codes_used_ + 1 >= total) {
+    return false;
+  }
+  if (codes_[codes_used_].second != kMyanmarVirama) {
+    return false;
+  }
+  if (!IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
+    return false;
+  }
+  // Handle the virama first, then the letter as the end of a 2-code group.
+  ASSERT_HOST(!CodeOnlyToOutput());
+  return UseMultiCode(2);
+}
+
+// Helper consumes/copies a series of optional signs. Every stage below is
+// optional and the stages are tried once each, in the order written:
+// medials, vowel signs i/ii/ai, vowel signs u/uu and extensions, aa/tall aa
+// (with optional asat), anusvar/dot below/visarga, tone mark extensions, and
+// finally a trailing 0x103a/dot below/visarga.
+// Precondition: codes_[codes_used_] must be a valid element on entry, since
+// the first stage reads it without a bounds check.
+// Returns true if the end of input is reached, false otherwise.
+bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
+  // The lookup tables are function-local statics so that they are set up once
+  // instead of being heap-allocated on every call.
+  // Medials: allowed, all optional, and in sequence.
+  static const char32 kMedials[] = {kMyanmarAsat, kMyanmarMedialYa, 0x103c, 0x103d, 0x103e,
+                                    0x105e,       0x105f,           0x1060, 0x1081, 0x1031};
+  // Anusvar, Dot below, Visarga: allowed, all optional, and in sequence.
+  static const char32 kSigns[] = {0x1036, 0x1037, 0x1038};
+  for (const char32 medial : kMedials) {
+    if (codes_[codes_used_].second == medial) {
+      if (UseMultiCode(1)) {
+        return true;
+      }
+      // Exception: kMyanmarMedialYa can include a following kMyanmarAsat.
+      if (medial == kMyanmarMedialYa && codes_[codes_used_].second == kMyanmarAsat) {
+        if (UseMultiCode(1)) {
+          return true;
+        }
+      }
+    }
+  }
+  // Vowel sign i, ii, ai.
+  char32 current = codes_[codes_used_].second;
+  if (current == 0x102d || current == 0x102e || current == 0x1032) {
+    if (UseMultiCode(1)) {
+      return true;
+    }
+  }
+  // Vowel sign u, uu, and extensions.
+  current = codes_[codes_used_].second;
+  if (current == 0x102f || current == 0x1030 || (0x1056 <= current && current <= 0x1059) ||
+      current == 0x1062 || current == 0x1067 || current == 0x1068 ||
+      (0x1071 <= current && current <= 0x1074) || (0x1083 <= current && current <= 0x1086) ||
+      current == 0x109c || current == 0x109d) {
+    if (UseMultiCode(1)) {
+      return true;
+    }
+  }
+  // Tall aa, aa with optional asat.
+  current = codes_[codes_used_].second;
+  if (current == 0x102b || current == 0x102c) {
+    if (UseMultiCode(1)) {
+      return true;
+    }
+    if (codes_[codes_used_].second == kMyanmarAsat) {
+      if (UseMultiCode(1)) {
+        return true;
+      }
+    }
+  }
+  for (const char32 sign : kSigns) {
+    if (codes_[codes_used_].second == sign) {
+      if (UseMultiCode(1)) {
+        return true;
+      }
+    }
+  }
+  // Tone mark extensions.
+  current = codes_[codes_used_].second;
+  if (current == 0x102c || current == 0x1038 || current == kMyanmarAsat ||
+      (0x1062 <= current && current <= 0x1064) || (0x1069 <= current && current <= 0x106d) ||
+      (0x1087 <= current && current <= 0x108d) || current == 0x108f || current == 0x109a ||
+      current == 0x109b || (0xaa7b <= current && current <= 0xaa7d)) {
+    if (UseMultiCode(1)) {
+      return true;
+    }
+  }
+  // Sgaw tones 0x1062, 0x1063 must be followed by asat.
+  // W Pwo tones 0x1069, 0x106a, and 0x106b may be followed by dot below or visarga (nasal).
+  // NOTE(review): the literal 0x103a is presumably the same code as
+  // kMyanmarAsat; it is kept as a literal to match the original comparison.
+  current = codes_[codes_used_].second;
+  if (current == 0x103a || current == 0x1037 || current == 0x1038) {
+    if (UseMultiCode(1)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if ch is a Myanmar "letter", i.e. a consonant or an
+// independent vowel. Table 16-3 separates some base consonants from vowels,
+// but the extensions make no such distinction, so all of them are treated as
+// a single bucket here.
+// The MYANMAR LETTER ranges are based on the following charts:
+// https://unicode.org/charts/PDF/U1000.pdf - Myanmar
+// http://unicode.org/charts/PDF/UAA60.pdf - Myanmar Extended-A
+// http://unicode.org/charts/PDF/UA9E0.pdf - Myanmar Extended-B
+/* static */
+bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
+  // Inclusive range test on ch.
+  const auto between = [ch](char32 lo, char32 hi) { return lo <= ch && ch <= hi; };
+  // Myanmar (U+1000 chart).
+  if (between(0x1000, 0x102a) || ch == 0x103f || between(0x104c, 0x1055) ||
+      between(0x105a, 0x105d) || ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
+      between(0x106e, 0x1070) || between(0x1075, 0x1081) || ch == 0x108e) {
+    return true;
+  }
+  // Myanmar Extended-B (U+A9E0 chart).
+  if (between(0xa9e0, 0xa9e4) || between(0xa9e7, 0xa9ef) || between(0xa9fa, 0xa9fe)) {
+    return true;
+  }
+  // Myanmar Extended-A (U+AA60 chart).
+  return between(0xaa60, 0xaa6f) || between(0xaa71, 0xaa73) || ch == 0xaa7a || ch == 0xaa7e ||
+         ch == 0xaa7f;
+}
+
+// Returns true if ch does not take part in forming a syllable: either it is
+// not in the Myanmar script at all (zero-width joiner/non-joiner excepted),
+// or it is one of the Myanmar codes listed below, eg. punctuation marks.
+// MYANMAR DIGIT, MYANMAR SYMBOL, MYANMAR LOGOGRAM
+// REDUPLICATION MARKS
+/* static */
+bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
+  IcuErrorCode err;
+  const UScriptCode script = uscript_getScript(ch, err);
+  const bool is_joiner =
+      ch == Validator::kZeroWidthJoiner || ch == Validator::kZeroWidthNonJoiner;
+  // Anything outside the Myanmar script counts as "other", unless it is one
+  // of the two joiners, which fall through to the range checks below.
+  if (script != USCRIPT_MYANMAR && !is_joiner) {
+    return true;
+  }
+  // Inclusive range test on ch.
+  const auto between = [ch](char32 lo, char32 hi) { return lo <= ch && ch <= hi; };
+  if (between(0x1040, 0x104f) || between(0x1090, 0x1099) || between(0x109e, 0x109f)) {
+    return true;
+  }
+  if (between(0xa9f0, 0xa9f9) || ch == 0xa9e6 || ch == 0xaa70) {
+    return true;
+  }
+  return between(0xaa74, 0xaa79);
+}
+
+} // namespace tesseract