Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_javanese.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/unicharset/validate_javanese.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,323 @@ +/********************************************************************** + * File: validate_javanese.cpp + * Description: Text validator for Javanese Script - aksara jawa. + * Author: Shree Devi Kumar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#include "validate_javanese.h" +#include "errcode.h" +#include "tprintf.h" + +namespace tesseract { + +// Returns whether codes matches the pattern for a Javanese Grapheme. +// Taken from unicode standard: +// http://www.unicode.org/charts/PDF/UA980.pdf +// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf +// The Consonant class here includes independent vowels. +// The order of components in an orthographic syllable as expressed in BNF is: +// {C F} C {{R}Y} {V{A}} {Z} +// Translated to the codes used by the CharClass enum: +// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v] +// Also see https://r12a.github.io/scripts/javanese/ for detailed notes. +// Validation rules copied from validate_indic.cpp and modified for Javanese. +// Indic - for reference +// + vowel Grapheme: V[D](v)* +// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)* + +bool ValidateJavanese::ConsumeGraphemeIfValid() { + switch (codes_[codes_used_].first) { + case CharClass::kConsonant: + return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid(); + case CharClass::kVowel: + case CharClass::kVedicMark: + return ConsumeVowelIfValid(); + case CharClass::kZeroWidthJoiner: + case CharClass::kZeroWidthNonJoiner: + // Apart from within an aksara, joiners are silently dropped. + if (report_errors_) { + tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second); + } + ++codes_used_; + return true; + case CharClass::kOther: + UseMultiCode(1); + return true; + default: + if (report_errors_) { + tprintf("Invalid start of grapheme sequence:%c=0x%x\n", + static_cast<int>(codes_[codes_used_].first), + codes_[codes_used_].second); + } + return false; + } +} + +// Helper consumes/copies a virama and any associated post-virama joiners. +// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or +// no joiner at all) must be followed by a consonant. +// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non +// consonant, space, or character from a different script. We clean up the +// representation to make it consistent by adding a ZWNJ if missing from a +// non-linking virama. Returns false with an invalid sequence. +bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) { + const unsigned num_codes = codes_.size(); + if (joiner.first == CharClass::kOther) { + CodeOnlyToOutput(); + if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) { + // Post-matra viramas must be explicit, so no joiners allowed here. + if (post_matra) { + if (report_errors_) { + tprintf("ZWJ after a post-matra virama!!\n"); + } + return false; + } + if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kCakra && + (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner || + codes_[codes_used_ + 1].second == kPengkal || + codes_[codes_used_ + 1].second == kCakra)) { + // This combination will be picked up later. + ASSERT_HOST(!CodeOnlyToOutput()); + } else { + // Half-form with optional Nukta. + unsigned len = output_.size() + 1 - output_used_; + if (UseMultiCode(len)) { + return true; + } + } + if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (output_used_ == output_.size() || output_[output_used_] != kCakra) { + if (report_errors_) { + tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_)); + } + return false; + } + // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z] + if (UseMultiCode(4)) { + return true; + } + } + } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant || + post_matra) { + if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) { + // It is valid to have an unterminated virama at the end of a word, but + // for consistency, we will always add ZWNJ if not present. + CodeOnlyToOutput(); + } else { + CodeOnlyToOutput(); + } + // Explicit virama [H z] + MultiCodePart(2); + } + } else { + // Pre-virama joiner [{Z|z} H] requests specific conjunct. + if (UseMultiCode(2)) { + if (report_errors_) { + tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n"); + } + return false; + } + if (codes_[codes_used_].second == kZeroWidthJoiner || + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (report_errors_) { + tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(), + codes_[codes_used_].second); + } + return false; + } + } + // It is good so far as it goes. + return true; +} + +// Helper consumes/copies a series of consonants separated by viramas while +// valid, but not any vowel or other modifiers. +bool ValidateJavanese::ConsumeConsonantHeadIfValid() { + const unsigned num_codes = codes_.size(); + // Consonant aksara + do { + CodeOnlyToOutput(); + // Special Sinhala case of [H Z Yayana/Rayana]. + int index = output_.size() - 3; + if (output_used_ + 3 <= output_.size() && + (output_.back() == kPengkal || output_.back() == kCakra) && IsVirama(output_[index]) && + output_[index + 1] == kZeroWidthJoiner) { + MultiCodePart(3); + } + bool have_nukta = false; + if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) { + have_nukta = true; + CodeOnlyToOutput(); + } + // Test for subscript conjunct. + index = output_.size() - 2 - have_nukta; + if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() && + IsVirama(output_[index])) { + // Output previous virama, consonant + optional nukta. + MultiCodePart(2 + have_nukta); + } + IndicPair joiner(CharClass::kOther, 0); + if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner || + (codes_[codes_used_].second == kZeroWidthNonJoiner && + script_ == ViramaScript::kMalayalam))) { + joiner = codes_[codes_used_]; + if (++codes_used_ == num_codes) { + if (report_errors_) { + tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), joiner.second); + } + return true; + } + if (codes_[codes_used_].first == CharClass::kVirama) { + output_.push_back(joiner.second); + } else { + if (report_errors_) { + tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", output_.back(), joiner.second, + codes_[codes_used_].second); + } + joiner = std::make_pair(CharClass::kOther, 0); + } + } + if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) { + if (!ConsumeViramaIfValid(joiner, false)) { + return false; + } + } else { + break; // No virama, so the run of consonants is over. + } + } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant); + if (output_used_ < output_.size()) { + MultiCodePart(1); + } + return true; +} + +// Helper consumes/copies a tail part of a consonant, comprising optional +// matra/piece, vowel modifier, vedic mark, terminating virama. +bool ValidateJavanese::ConsumeConsonantTailIfValid() { + if (codes_used_ == codes_.size()) { + return true; + } + // No virama: Finish the grapheme. + // Are multiple matras allowed? + if (codes_[codes_used_].first == CharClass::kMatra) { + if (UseMultiCode(1)) { + return true; + } + if (codes_[codes_used_].first == CharClass::kMatraPiece) { + if (UseMultiCode(1)) { + return true; + } + } + } + // Tarung also used for long versions of u and o vowels and vocalic r + // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ + while (codes_[codes_used_].first == CharClass::kMatraPiece) { + if (UseMultiCode(1)) { + return true; + } + } + while (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) { + return true; + } + // Only Malayalam allows only repeated 0xd02. + if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) { + break; + } + } + while (codes_[codes_used_].first == CharClass::kVedicMark) { + if (UseMultiCode(1)) { + return true; + } + } + if (codes_[codes_used_].first == CharClass::kVirama) { + if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) { + return false; + } + } + // What we have consumed so far is a valid consonant cluster. + if (output_used_ < output_.size()) { + MultiCodePart(1); + } + + return true; +} + +// Helper consumes/copies a vowel and optional modifiers. +bool ValidateJavanese::ConsumeVowelIfValid() { + if (UseMultiCode(1)) { + return true; + } + while (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) { + return true; + } + // Only Malayalam allows repeated modifiers? + if (script_ != ViramaScript::kMalayalam) { + break; + } + } + while (codes_[codes_used_].first == CharClass::kVedicMark) { + if (UseMultiCode(1)) { + return true; + } + } + // What we have consumed so far is a valid vowel cluster. + return true; +} + +Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const { + if (ch == kZeroWidthNonJoiner) { + return CharClass::kZeroWidthNonJoiner; + } + if (ch == kZeroWidthJoiner) { + return CharClass::kZeroWidthJoiner; + } + // Offset from the start of the relevant unicode code block aka code page. + int off = ch - static_cast<char32>(script_); + // Anything in another code block is other. + if (off < 0 || off >= kIndicCodePageSize) { + return CharClass::kOther; + } + if (off < 0x4) { + return CharClass::kVowelModifier; + } + if (off <= 0x32) { + return CharClass::kConsonant; // includes independent vowels + } + if (off == 0x33) { + return CharClass::kNukta; // A9B3 CECAK TELU + } + if (off == 0x34) { + return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels + } + if (off <= 0x39) { + return CharClass::kMatra; + } + if (off <= 0x3a) { + return CharClass::kConsonant; // A9BA TALING - pre base vowel + } + if (off <= 0x3d) { + return CharClass::kMatra; + } + if (off <= 0x3f) { + return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants + } + if (off == 0x40) { + return CharClass::kVirama; // A9C0 PANGKON + } + return CharClass::kOther; +} + +} // namespace tesseract
