Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/unicharset/normstrngs.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/unicharset/normstrngs.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,317 @@ +/********************************************************************** + * File: normstrngs.cpp + * Description: Utilities to normalize and manipulate UTF-32 and + * UTF-8 strings. + * Author: Ranjith Unnikrishnan + * + * (C) Copyright 2013, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#include "normstrngs.h" + +#include <string> +#include <unordered_map> +#include <vector> + +#include <tesseract/unichar.h> +#include "errcode.h" +#include "icuerrorcode.h" +#include "unicode/normalizer2.h" // From libicu +#include "unicode/translit.h" // From libicu +#include "unicode/uchar.h" // From libicu +#include "unicode/unorm2.h" // From libicu +#include "unicode/uscript.h" // From libicu + +namespace tesseract { + +static bool is_hyphen_punc(const char32 ch) { + static const char32 kHyphenPuncUnicodes[] = { + '-', + 0x2010, // hyphen + 0x2011, // non-breaking hyphen + 0x2012, // figure dash + 0x2013, // en dash + 0x2014, // em dash + 0x2015, // horizontal bar + // how about 0x2043 hyphen bullet? + // how about 0x2500 box drawings light horizontal? + 0x207b, // superscript minus + 0x208b, // subscript minus + 0x2212, // minus sign + 0xfe58, // small em dash + 0xfe63, // small hyphen-minus + 0xff0d, // fullwidth hyphen-minus + 0x2e17 // double oblique hyphen (Fraktur) + }; + for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) { + if (kHyphenPuncUnicode == ch) { + return true; + } + } + return false; +} + +static bool is_single_quote(const char32 ch) { + static const char32 kSingleQuoteUnicodes[] = { + '\'', '`', + 0x2018, // left single quotation mark (English, others) + 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.) + // We may have to introduce a comma set with 0x201a + 0x201A, // single low-9 quotation mark (German) + 0x201B, // single high-reversed-9 quotation mark (PropList.txt) + 0x2032, // prime + 0x300C, // left corner bracket (East Asian languages) + 0xFF07 // fullwidth apostrophe + }; + for (int kSingleQuoteUnicode : kSingleQuoteUnicodes) { + if (kSingleQuoteUnicode == ch) { + return true; + } + } + return false; +} + +static bool is_double_quote(const char32 ch) { + static const char32 kDoubleQuoteUnicodes[] = { + '"', + 0x201C, // left double quotation mark (English, others) + 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.) + 0x201F, // double high-reversed-9 quotation mark (PropList.txt) + 0x2033, // double prime + 0x201E, // double low-9 quotation mark (German) + 0x301D, // reversed double prime quotation mark (East Asian langs, + // horiz.) + 0x301E, // close double prime (East Asian languages written horizontally) + 0xFF02 // fullwidth quotation mark + }; + for (int kDoubleQuoteUnicode : kDoubleQuoteUnicodes) { + if (kDoubleQuoteUnicode == ch) { + return true; + } + } + return false; +} + +// Helper runs a standard unicode normalization, optional OCR normalization, +// and leaves the result as char32 for subsequent processing. +static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize, const char *str8, + std::vector<char32> *normed32) { + // Convert to ICU string for unicode normalization. + icu::UnicodeString uch_str(str8, "UTF-8"); + IcuErrorCode error_code; + // Convert the enum to the new weird icu representation. + const char *norm_type = + u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC ? "nfkc" : "nfc"; + UNormalization2Mode compose = u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC + ? UNORM2_COMPOSE + : UNORM2_DECOMPOSE; + // Pointer to singleton does not require deletion. + const icu::Normalizer2 *normalizer = + icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code); + error_code.assertSuccess(); + error_code.reset(); + icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code); + error_code.assertSuccess(); + // Convert to char32 for output. OCR normalization if required. + normed32->reserve(norm_str.length()); // An approximation. + for (int offset = 0; offset < norm_str.length(); offset = norm_str.moveIndex32(offset, 1)) { + char32 ch = norm_str.char32At(offset); + // Skip all ZWS, RTL and LTR marks. + if (Validator::IsZeroWidthMark(ch)) { + continue; + } + if (ocr_normalize == OCRNorm::kNormalize) { + ch = OCRNormalize(ch); + } + normed32->push_back(ch); + } +} + +// Helper removes joiners from strings that contain no letters. +static void StripJoiners(std::vector<char32> *str32) { + for (char32 ch : *str32) { + if (u_isalpha(ch)) { + return; + } + } + int len = 0; + for (char32 ch : *str32) { + if (ch != Validator::kZeroWidthJoiner && ch != Validator::kZeroWidthNonJoiner) { + (*str32)[len++] = ch; + } + } + str32->resize(len); +} + +// Normalizes a UTF8 string according to the given modes. Returns true on +// success. If false is returned, some failure or invalidity was present, and +// the result string is produced on a "best effort" basis. +bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, + GraphemeNorm grapheme_normalize, const char *str8, + std::string *normalized) { + std::vector<char32> normed32; + NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32); + if (grapheme_normalize == GraphemeNorm::kNormalize) { + StripJoiners(&normed32); + std::vector<std::vector<char32>> graphemes; + bool success = Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, false, + normed32, &graphemes); + if (graphemes.empty() || graphemes[0].empty()) { + success = false; + } else if (normalized != nullptr) { + *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]); + } + return success; + } + if (normalized != nullptr) { + *normalized = UNICHAR::UTF32ToUTF8(normed32); + } + return true; +} + +// Normalizes a UTF8 string according to the given modes and splits into +// graphemes according to g_mode. Returns true on success. If false is returned, +// some failure or invalidity was present, and the result string is produced on +// a "best effort" basis. +bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, + GraphemeNormMode g_mode, bool report_errors, const char *str8, + std::vector<std::string> *graphemes) { + std::vector<char32> normed32; + NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32); + StripJoiners(&normed32); + std::vector<std::vector<char32>> graphemes32; + bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors, normed32, &graphemes32); + if (g_mode != GraphemeNormMode::kSingleString && success) { + // If we modified the string to clean it up, the segmentation may not be + // correct, so check for changes and do it again. + std::vector<char32> cleaned32; + for (const auto &g : graphemes32) { + cleaned32.insert(cleaned32.end(), g.begin(), g.end()); + } + if (cleaned32 != normed32) { + graphemes32.clear(); + success = Validator::ValidateCleanAndSegment(g_mode, report_errors, cleaned32, &graphemes32); + } + } + graphemes->clear(); + graphemes->reserve(graphemes32.size()); + for (const auto &grapheme : graphemes32) { + graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme)); + } + return success; +} + +// Apply just the OCR-specific normalizations and return the normalized char. +char32 OCRNormalize(char32 ch) { + if (is_hyphen_punc(ch)) { + return '-'; + } else if (is_single_quote(ch)) { + return '\''; + } else if (is_double_quote(ch)) { + return '"'; + } + return ch; +} + +bool IsOCREquivalent(char32 ch1, char32 ch2) { + return OCRNormalize(ch1) == OCRNormalize(ch2); +} + +bool IsValidCodepoint(const char32 ch) { + // In the range [0, 0xD800) or [0xE000, 0x10FFFF] + return (static_cast<uint32_t>(ch) < 0xD800) || (ch >= 0xE000 && ch <= 0x10FFFF); +} + +bool IsWhitespace(const char32 ch) { + ASSERT_HOST_MSG(IsValidCodepoint(ch), "Invalid Unicode codepoint: 0x%x\n", ch); + return u_isUWhiteSpace(static_cast<UChar32>(ch)); +} + +bool IsUTF8Whitespace(const char *text) { + return SpanUTF8Whitespace(text) == strlen(text); +} + +unsigned int SpanUTF8Whitespace(const char *text) { + int n_white = 0; + for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text)); + it != UNICHAR::end(text, strlen(text)); ++it) { + if (!IsWhitespace(*it)) { + break; + } + n_white += it.utf8_len(); + } + return n_white; +} + +unsigned int SpanUTF8NotWhitespace(const char *text) { + int n_notwhite = 0; + for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text)); + it != UNICHAR::end(text, strlen(text)); ++it) { + if (IsWhitespace(*it)) { + break; + } + n_notwhite += it.utf8_len(); + } + return n_notwhite; +} + +bool IsInterchangeValid(const char32 ch) { + return IsValidCodepoint(ch) && !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters. + !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) && + !(ch >= 0x2FFFE && ch <= 0x2FFFF) && !(ch >= 0x3FFFE && ch <= 0x3FFFF) && + !(ch >= 0x4FFFE && ch <= 0x4FFFF) && !(ch >= 0x5FFFE && ch <= 0x5FFFF) && + !(ch >= 0x6FFFE && ch <= 0x6FFFF) && !(ch >= 0x7FFFE && ch <= 0x7FFFF) && + !(ch >= 0x8FFFE && ch <= 0x8FFFF) && !(ch >= 0x9FFFE && ch <= 0x9FFFF) && + !(ch >= 0xAFFFE && ch <= 0xAFFFF) && !(ch >= 0xBFFFE && ch <= 0xBFFFF) && + !(ch >= 0xCFFFE && ch <= 0xCFFFF) && !(ch >= 0xDFFFE && ch <= 0xDFFFF) && + !(ch >= 0xEFFFE && ch <= 0xEFFFF) && !(ch >= 0xFFFFE && ch <= 0xFFFFF) && + !(ch >= 0x10FFFE && ch <= 0x10FFFF) && + (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' || ch == '\f' || ch == '\t' || + ch == '\r'); +} + +bool IsInterchangeValid7BitAscii(const char32 ch) { + return IsValidCodepoint(ch) && ch <= 128 && + (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' || ch == '\f' || ch == '\t' || + ch == '\r'); +} + +char32 FullwidthToHalfwidth(const char32 ch) { + // Return unchanged if not in the fullwidth-halfwidth Unicode block. + if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) { + if (ch != 0x3000) { + return ch; + } + } + // Special case for fullwidth left and right "white parentheses". + if (ch == 0xFF5F) { + return 0x2985; + } + if (ch == 0xFF60) { + return 0x2986; + } + // Construct a full-to-half width transliterator. + IcuErrorCode error_code; + icu::UnicodeString uch_str(static_cast<UChar32>(ch)); + const icu::Transliterator *fulltohalf = + icu::Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, error_code); + error_code.assertSuccess(); + error_code.reset(); + + fulltohalf->transliterate(uch_str); + delete fulltohalf; + ASSERT_HOST(uch_str.length() != 0); + return uch_str[0]; +} + +} // namespace tesseract
