Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/normstrngs.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: normstrngs.cpp | |
| 3 * Description: Utilities to normalize and manipulate UTF-32 and | |
| 4 * UTF-8 strings. | |
| 5 * Author: Ranjith Unnikrishnan | |
| 6 * | |
| 7 * (C) Copyright 2013, Google Inc. | |
| 8 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 * you may not use this file except in compliance with the License. | |
| 10 * You may obtain a copy of the License at | |
| 11 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 * Unless required by applicable law or agreed to in writing, software | |
| 13 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 * See the License for the specific language governing permissions and | |
| 16 * limitations under the License. | |
| 17 * | |
| 18 **********************************************************************/ | |
| 19 | |
| 20 #include "normstrngs.h" | |
| 21 | |
| 22 #include <string> | |
| 23 #include <unordered_map> | |
| 24 #include <vector> | |
| 25 | |
| 26 #include <tesseract/unichar.h> | |
| 27 #include "errcode.h" | |
| 28 #include "icuerrorcode.h" | |
| 29 #include "unicode/normalizer2.h" // From libicu | |
| 30 #include "unicode/translit.h" // From libicu | |
| 31 #include "unicode/uchar.h" // From libicu | |
| 32 #include "unicode/unorm2.h" // From libicu | |
| 33 #include "unicode/uscript.h" // From libicu | |
| 34 | |
| 35 namespace tesseract { | |
| 36 | |
| 37 static bool is_hyphen_punc(const char32 ch) { | |
| 38 static const char32 kHyphenPuncUnicodes[] = { | |
| 39 '-', | |
| 40 0x2010, // hyphen | |
| 41 0x2011, // non-breaking hyphen | |
| 42 0x2012, // figure dash | |
| 43 0x2013, // en dash | |
| 44 0x2014, // em dash | |
| 45 0x2015, // horizontal bar | |
| 46 // how about 0x2043 hyphen bullet? | |
| 47 // how about 0x2500 box drawings light horizontal? | |
| 48 0x207b, // superscript minus | |
| 49 0x208b, // subscript minus | |
| 50 0x2212, // minus sign | |
| 51 0xfe58, // small em dash | |
| 52 0xfe63, // small hyphen-minus | |
| 53 0xff0d, // fullwidth hyphen-minus | |
| 54 0x2e17 // double oblique hyphen (Fraktur) | |
| 55 }; | |
| 56 for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) { | |
| 57 if (kHyphenPuncUnicode == ch) { | |
| 58 return true; | |
| 59 } | |
| 60 } | |
| 61 return false; | |
| 62 } | |
| 63 | |
| 64 static bool is_single_quote(const char32 ch) { | |
| 65 static const char32 kSingleQuoteUnicodes[] = { | |
| 66 '\'', '`', | |
| 67 0x2018, // left single quotation mark (English, others) | |
| 68 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.) | |
| 69 // We may have to introduce a comma set with 0x201a | |
| 70 0x201A, // single low-9 quotation mark (German) | |
| 71 0x201B, // single high-reversed-9 quotation mark (PropList.txt) | |
| 72 0x2032, // prime | |
| 73 0x300C, // left corner bracket (East Asian languages) | |
| 74 0xFF07 // fullwidth apostrophe | |
| 75 }; | |
| 76 for (int kSingleQuoteUnicode : kSingleQuoteUnicodes) { | |
| 77 if (kSingleQuoteUnicode == ch) { | |
| 78 return true; | |
| 79 } | |
| 80 } | |
| 81 return false; | |
| 82 } | |
| 83 | |
| 84 static bool is_double_quote(const char32 ch) { | |
| 85 static const char32 kDoubleQuoteUnicodes[] = { | |
| 86 '"', | |
| 87 0x201C, // left double quotation mark (English, others) | |
| 88 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.) | |
| 89 0x201F, // double high-reversed-9 quotation mark (PropList.txt) | |
| 90 0x2033, // double prime | |
| 91 0x201E, // double low-9 quotation mark (German) | |
| 92 0x301D, // reversed double prime quotation mark (East Asian langs, | |
| 93 // horiz.) | |
| 94 0x301E, // close double prime (East Asian languages written horizontally) | |
| 95 0xFF02 // fullwidth quotation mark | |
| 96 }; | |
| 97 for (int kDoubleQuoteUnicode : kDoubleQuoteUnicodes) { | |
| 98 if (kDoubleQuoteUnicode == ch) { | |
| 99 return true; | |
| 100 } | |
| 101 } | |
| 102 return false; | |
| 103 } | |
| 104 | |
| 105 // Helper runs a standard unicode normalization, optional OCR normalization, | |
| 106 // and leaves the result as char32 for subsequent processing. | |
| 107 static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize, const char *str8, | |
| 108 std::vector<char32> *normed32) { | |
| 109 // Convert to ICU string for unicode normalization. | |
| 110 icu::UnicodeString uch_str(str8, "UTF-8"); | |
| 111 IcuErrorCode error_code; | |
| 112 // Convert the enum to the new weird icu representation. | |
| 113 const char *norm_type = | |
| 114 u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC ? "nfkc" : "nfc"; | |
| 115 UNormalization2Mode compose = u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC | |
| 116 ? UNORM2_COMPOSE | |
| 117 : UNORM2_DECOMPOSE; | |
| 118 // Pointer to singleton does not require deletion. | |
| 119 const icu::Normalizer2 *normalizer = | |
| 120 icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code); | |
| 121 error_code.assertSuccess(); | |
| 122 error_code.reset(); | |
| 123 icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code); | |
| 124 error_code.assertSuccess(); | |
| 125 // Convert to char32 for output. OCR normalization if required. | |
| 126 normed32->reserve(norm_str.length()); // An approximation. | |
| 127 for (int offset = 0; offset < norm_str.length(); offset = norm_str.moveIndex32(offset, 1)) { | |
| 128 char32 ch = norm_str.char32At(offset); | |
| 129 // Skip all ZWS, RTL and LTR marks. | |
| 130 if (Validator::IsZeroWidthMark(ch)) { | |
| 131 continue; | |
| 132 } | |
| 133 if (ocr_normalize == OCRNorm::kNormalize) { | |
| 134 ch = OCRNormalize(ch); | |
| 135 } | |
| 136 normed32->push_back(ch); | |
| 137 } | |
| 138 } | |
| 139 | |
| 140 // Helper removes joiners from strings that contain no letters. | |
| 141 static void StripJoiners(std::vector<char32> *str32) { | |
| 142 for (char32 ch : *str32) { | |
| 143 if (u_isalpha(ch)) { | |
| 144 return; | |
| 145 } | |
| 146 } | |
| 147 int len = 0; | |
| 148 for (char32 ch : *str32) { | |
| 149 if (ch != Validator::kZeroWidthJoiner && ch != Validator::kZeroWidthNonJoiner) { | |
| 150 (*str32)[len++] = ch; | |
| 151 } | |
| 152 } | |
| 153 str32->resize(len); | |
| 154 } | |
| 155 | |
| 156 // Normalizes a UTF8 string according to the given modes. Returns true on | |
| 157 // success. If false is returned, some failure or invalidity was present, and | |
| 158 // the result string is produced on a "best effort" basis. | |
| 159 bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, | |
| 160 GraphemeNorm grapheme_normalize, const char *str8, | |
| 161 std::string *normalized) { | |
| 162 std::vector<char32> normed32; | |
| 163 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32); | |
| 164 if (grapheme_normalize == GraphemeNorm::kNormalize) { | |
| 165 StripJoiners(&normed32); | |
| 166 std::vector<std::vector<char32>> graphemes; | |
| 167 bool success = Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, false, | |
| 168 normed32, &graphemes); | |
| 169 if (graphemes.empty() || graphemes[0].empty()) { | |
| 170 success = false; | |
| 171 } else if (normalized != nullptr) { | |
| 172 *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]); | |
| 173 } | |
| 174 return success; | |
| 175 } | |
| 176 if (normalized != nullptr) { | |
| 177 *normalized = UNICHAR::UTF32ToUTF8(normed32); | |
| 178 } | |
| 179 return true; | |
| 180 } | |
| 181 | |
| 182 // Normalizes a UTF8 string according to the given modes and splits into | |
| 183 // graphemes according to g_mode. Returns true on success. If false is returned, | |
| 184 // some failure or invalidity was present, and the result string is produced on | |
| 185 // a "best effort" basis. | |
| 186 bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, | |
| 187 GraphemeNormMode g_mode, bool report_errors, const char *str8, | |
| 188 std::vector<std::string> *graphemes) { | |
| 189 std::vector<char32> normed32; | |
| 190 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32); | |
| 191 StripJoiners(&normed32); | |
| 192 std::vector<std::vector<char32>> graphemes32; | |
| 193 bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors, normed32, &graphemes32); | |
| 194 if (g_mode != GraphemeNormMode::kSingleString && success) { | |
| 195 // If we modified the string to clean it up, the segmentation may not be | |
| 196 // correct, so check for changes and do it again. | |
| 197 std::vector<char32> cleaned32; | |
| 198 for (const auto &g : graphemes32) { | |
| 199 cleaned32.insert(cleaned32.end(), g.begin(), g.end()); | |
| 200 } | |
| 201 if (cleaned32 != normed32) { | |
| 202 graphemes32.clear(); | |
| 203 success = Validator::ValidateCleanAndSegment(g_mode, report_errors, cleaned32, &graphemes32); | |
| 204 } | |
| 205 } | |
| 206 graphemes->clear(); | |
| 207 graphemes->reserve(graphemes32.size()); | |
| 208 for (const auto &grapheme : graphemes32) { | |
| 209 graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme)); | |
| 210 } | |
| 211 return success; | |
| 212 } | |
| 213 | |
| 214 // Apply just the OCR-specific normalizations and return the normalized char. | |
| 215 char32 OCRNormalize(char32 ch) { | |
| 216 if (is_hyphen_punc(ch)) { | |
| 217 return '-'; | |
| 218 } else if (is_single_quote(ch)) { | |
| 219 return '\''; | |
| 220 } else if (is_double_quote(ch)) { | |
| 221 return '"'; | |
| 222 } | |
| 223 return ch; | |
| 224 } | |
| 225 | |
| 226 bool IsOCREquivalent(char32 ch1, char32 ch2) { | |
| 227 return OCRNormalize(ch1) == OCRNormalize(ch2); | |
| 228 } | |
| 229 | |
| 230 bool IsValidCodepoint(const char32 ch) { | |
| 231 // In the range [0, 0xD800) or [0xE000, 0x10FFFF] | |
| 232 return (static_cast<uint32_t>(ch) < 0xD800) || (ch >= 0xE000 && ch <= 0x10FFFF); | |
| 233 } | |
| 234 | |
| 235 bool IsWhitespace(const char32 ch) { | |
| 236 ASSERT_HOST_MSG(IsValidCodepoint(ch), "Invalid Unicode codepoint: 0x%x\n", ch); | |
| 237 return u_isUWhiteSpace(static_cast<UChar32>(ch)); | |
| 238 } | |
| 239 | |
| 240 bool IsUTF8Whitespace(const char *text) { | |
| 241 return SpanUTF8Whitespace(text) == strlen(text); | |
| 242 } | |
| 243 | |
| 244 unsigned int SpanUTF8Whitespace(const char *text) { | |
| 245 int n_white = 0; | |
| 246 for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text)); | |
| 247 it != UNICHAR::end(text, strlen(text)); ++it) { | |
| 248 if (!IsWhitespace(*it)) { | |
| 249 break; | |
| 250 } | |
| 251 n_white += it.utf8_len(); | |
| 252 } | |
| 253 return n_white; | |
| 254 } | |
| 255 | |
| 256 unsigned int SpanUTF8NotWhitespace(const char *text) { | |
| 257 int n_notwhite = 0; | |
| 258 for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text)); | |
| 259 it != UNICHAR::end(text, strlen(text)); ++it) { | |
| 260 if (IsWhitespace(*it)) { | |
| 261 break; | |
| 262 } | |
| 263 n_notwhite += it.utf8_len(); | |
| 264 } | |
| 265 return n_notwhite; | |
| 266 } | |
| 267 | |
| 268 bool IsInterchangeValid(const char32 ch) { | |
| 269 return IsValidCodepoint(ch) && !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters. | |
| 270 !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) && | |
| 271 !(ch >= 0x2FFFE && ch <= 0x2FFFF) && !(ch >= 0x3FFFE && ch <= 0x3FFFF) && | |
| 272 !(ch >= 0x4FFFE && ch <= 0x4FFFF) && !(ch >= 0x5FFFE && ch <= 0x5FFFF) && | |
| 273 !(ch >= 0x6FFFE && ch <= 0x6FFFF) && !(ch >= 0x7FFFE && ch <= 0x7FFFF) && | |
| 274 !(ch >= 0x8FFFE && ch <= 0x8FFFF) && !(ch >= 0x9FFFE && ch <= 0x9FFFF) && | |
| 275 !(ch >= 0xAFFFE && ch <= 0xAFFFF) && !(ch >= 0xBFFFE && ch <= 0xBFFFF) && | |
| 276 !(ch >= 0xCFFFE && ch <= 0xCFFFF) && !(ch >= 0xDFFFE && ch <= 0xDFFFF) && | |
| 277 !(ch >= 0xEFFFE && ch <= 0xEFFFF) && !(ch >= 0xFFFFE && ch <= 0xFFFFF) && | |
| 278 !(ch >= 0x10FFFE && ch <= 0x10FFFF) && | |
| 279 (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' || ch == '\f' || ch == '\t' || | |
| 280 ch == '\r'); | |
| 281 } | |
| 282 | |
| 283 bool IsInterchangeValid7BitAscii(const char32 ch) { | |
| 284 return IsValidCodepoint(ch) && ch <= 128 && | |
| 285 (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' || ch == '\f' || ch == '\t' || | |
| 286 ch == '\r'); | |
| 287 } | |
| 288 | |
| 289 char32 FullwidthToHalfwidth(const char32 ch) { | |
| 290 // Return unchanged if not in the fullwidth-halfwidth Unicode block. | |
| 291 if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) { | |
| 292 if (ch != 0x3000) { | |
| 293 return ch; | |
| 294 } | |
| 295 } | |
| 296 // Special case for fullwidth left and right "white parentheses". | |
| 297 if (ch == 0xFF5F) { | |
| 298 return 0x2985; | |
| 299 } | |
| 300 if (ch == 0xFF60) { | |
| 301 return 0x2986; | |
| 302 } | |
| 303 // Construct a full-to-half width transliterator. | |
| 304 IcuErrorCode error_code; | |
| 305 icu::UnicodeString uch_str(static_cast<UChar32>(ch)); | |
| 306 const icu::Transliterator *fulltohalf = | |
| 307 icu::Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, error_code); | |
| 308 error_code.assertSuccess(); | |
| 309 error_code.reset(); | |
| 310 | |
| 311 fulltohalf->transliterate(uch_str); | |
| 312 delete fulltohalf; | |
| 313 ASSERT_HOST(uch_str.length() != 0); | |
| 314 return uch_str[0]; | |
| 315 } | |
| 316 | |
| 317 } // namespace tesseract |
