comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/normstrngs.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: normstrngs.cpp
3 * Description: Utilities to normalize and manipulate UTF-32 and
4 * UTF-8 strings.
5 * Author: Ranjith Unnikrishnan
6 *
7 * (C) Copyright 2013, Google Inc.
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 **********************************************************************/
19
20 #include "normstrngs.h"
21
22 #include <string>
23 #include <unordered_map>
24 #include <vector>
25
26 #include <tesseract/unichar.h>
27 #include "errcode.h"
28 #include "icuerrorcode.h"
29 #include "unicode/normalizer2.h" // From libicu
30 #include "unicode/translit.h" // From libicu
31 #include "unicode/uchar.h" // From libicu
32 #include "unicode/unorm2.h" // From libicu
33 #include "unicode/uscript.h" // From libicu
34
35 namespace tesseract {
36
37 static bool is_hyphen_punc(const char32 ch) {
38 static const char32 kHyphenPuncUnicodes[] = {
39 '-',
40 0x2010, // hyphen
41 0x2011, // non-breaking hyphen
42 0x2012, // figure dash
43 0x2013, // en dash
44 0x2014, // em dash
45 0x2015, // horizontal bar
46 // how about 0x2043 hyphen bullet?
47 // how about 0x2500 box drawings light horizontal?
48 0x207b, // superscript minus
49 0x208b, // subscript minus
50 0x2212, // minus sign
51 0xfe58, // small em dash
52 0xfe63, // small hyphen-minus
53 0xff0d, // fullwidth hyphen-minus
54 0x2e17 // double oblique hyphen (Fraktur)
55 };
56 for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) {
57 if (kHyphenPuncUnicode == ch) {
58 return true;
59 }
60 }
61 return false;
62 }
63
64 static bool is_single_quote(const char32 ch) {
65 static const char32 kSingleQuoteUnicodes[] = {
66 '\'', '`',
67 0x2018, // left single quotation mark (English, others)
68 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
69 // We may have to introduce a comma set with 0x201a
70 0x201A, // single low-9 quotation mark (German)
71 0x201B, // single high-reversed-9 quotation mark (PropList.txt)
72 0x2032, // prime
73 0x300C, // left corner bracket (East Asian languages)
74 0xFF07 // fullwidth apostrophe
75 };
76 for (int kSingleQuoteUnicode : kSingleQuoteUnicodes) {
77 if (kSingleQuoteUnicode == ch) {
78 return true;
79 }
80 }
81 return false;
82 }
83
84 static bool is_double_quote(const char32 ch) {
85 static const char32 kDoubleQuoteUnicodes[] = {
86 '"',
87 0x201C, // left double quotation mark (English, others)
88 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
89 0x201F, // double high-reversed-9 quotation mark (PropList.txt)
90 0x2033, // double prime
91 0x201E, // double low-9 quotation mark (German)
92 0x301D, // reversed double prime quotation mark (East Asian langs,
93 // horiz.)
94 0x301E, // close double prime (East Asian languages written horizontally)
95 0xFF02 // fullwidth quotation mark
96 };
97 for (int kDoubleQuoteUnicode : kDoubleQuoteUnicodes) {
98 if (kDoubleQuoteUnicode == ch) {
99 return true;
100 }
101 }
102 return false;
103 }
104
105 // Helper runs a standard unicode normalization, optional OCR normalization,
106 // and leaves the result as char32 for subsequent processing.
107 static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize, const char *str8,
108 std::vector<char32> *normed32) {
109 // Convert to ICU string for unicode normalization.
110 icu::UnicodeString uch_str(str8, "UTF-8");
111 IcuErrorCode error_code;
112 // Convert the enum to the new weird icu representation.
113 const char *norm_type =
114 u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC ? "nfkc" : "nfc";
115 UNormalization2Mode compose = u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC
116 ? UNORM2_COMPOSE
117 : UNORM2_DECOMPOSE;
118 // Pointer to singleton does not require deletion.
119 const icu::Normalizer2 *normalizer =
120 icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);
121 error_code.assertSuccess();
122 error_code.reset();
123 icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
124 error_code.assertSuccess();
125 // Convert to char32 for output. OCR normalization if required.
126 normed32->reserve(norm_str.length()); // An approximation.
127 for (int offset = 0; offset < norm_str.length(); offset = norm_str.moveIndex32(offset, 1)) {
128 char32 ch = norm_str.char32At(offset);
129 // Skip all ZWS, RTL and LTR marks.
130 if (Validator::IsZeroWidthMark(ch)) {
131 continue;
132 }
133 if (ocr_normalize == OCRNorm::kNormalize) {
134 ch = OCRNormalize(ch);
135 }
136 normed32->push_back(ch);
137 }
138 }
139
140 // Helper removes joiners from strings that contain no letters.
141 static void StripJoiners(std::vector<char32> *str32) {
142 for (char32 ch : *str32) {
143 if (u_isalpha(ch)) {
144 return;
145 }
146 }
147 int len = 0;
148 for (char32 ch : *str32) {
149 if (ch != Validator::kZeroWidthJoiner && ch != Validator::kZeroWidthNonJoiner) {
150 (*str32)[len++] = ch;
151 }
152 }
153 str32->resize(len);
154 }
155
156 // Normalizes a UTF8 string according to the given modes. Returns true on
157 // success. If false is returned, some failure or invalidity was present, and
158 // the result string is produced on a "best effort" basis.
159 bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
160 GraphemeNorm grapheme_normalize, const char *str8,
161 std::string *normalized) {
162 std::vector<char32> normed32;
163 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
164 if (grapheme_normalize == GraphemeNorm::kNormalize) {
165 StripJoiners(&normed32);
166 std::vector<std::vector<char32>> graphemes;
167 bool success = Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, false,
168 normed32, &graphemes);
169 if (graphemes.empty() || graphemes[0].empty()) {
170 success = false;
171 } else if (normalized != nullptr) {
172 *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);
173 }
174 return success;
175 }
176 if (normalized != nullptr) {
177 *normalized = UNICHAR::UTF32ToUTF8(normed32);
178 }
179 return true;
180 }
181
182 // Normalizes a UTF8 string according to the given modes and splits into
183 // graphemes according to g_mode. Returns true on success. If false is returned,
184 // some failure or invalidity was present, and the result string is produced on
185 // a "best effort" basis.
186 bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
187 GraphemeNormMode g_mode, bool report_errors, const char *str8,
188 std::vector<std::string> *graphemes) {
189 std::vector<char32> normed32;
190 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
191 StripJoiners(&normed32);
192 std::vector<std::vector<char32>> graphemes32;
193 bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors, normed32, &graphemes32);
194 if (g_mode != GraphemeNormMode::kSingleString && success) {
195 // If we modified the string to clean it up, the segmentation may not be
196 // correct, so check for changes and do it again.
197 std::vector<char32> cleaned32;
198 for (const auto &g : graphemes32) {
199 cleaned32.insert(cleaned32.end(), g.begin(), g.end());
200 }
201 if (cleaned32 != normed32) {
202 graphemes32.clear();
203 success = Validator::ValidateCleanAndSegment(g_mode, report_errors, cleaned32, &graphemes32);
204 }
205 }
206 graphemes->clear();
207 graphemes->reserve(graphemes32.size());
208 for (const auto &grapheme : graphemes32) {
209 graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme));
210 }
211 return success;
212 }
213
214 // Apply just the OCR-specific normalizations and return the normalized char.
215 char32 OCRNormalize(char32 ch) {
216 if (is_hyphen_punc(ch)) {
217 return '-';
218 } else if (is_single_quote(ch)) {
219 return '\'';
220 } else if (is_double_quote(ch)) {
221 return '"';
222 }
223 return ch;
224 }
225
226 bool IsOCREquivalent(char32 ch1, char32 ch2) {
227 return OCRNormalize(ch1) == OCRNormalize(ch2);
228 }
229
230 bool IsValidCodepoint(const char32 ch) {
231 // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
232 return (static_cast<uint32_t>(ch) < 0xD800) || (ch >= 0xE000 && ch <= 0x10FFFF);
233 }
234
235 bool IsWhitespace(const char32 ch) {
236 ASSERT_HOST_MSG(IsValidCodepoint(ch), "Invalid Unicode codepoint: 0x%x\n", ch);
237 return u_isUWhiteSpace(static_cast<UChar32>(ch));
238 }
239
240 bool IsUTF8Whitespace(const char *text) {
241 return SpanUTF8Whitespace(text) == strlen(text);
242 }
243
244 unsigned int SpanUTF8Whitespace(const char *text) {
245 int n_white = 0;
246 for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
247 it != UNICHAR::end(text, strlen(text)); ++it) {
248 if (!IsWhitespace(*it)) {
249 break;
250 }
251 n_white += it.utf8_len();
252 }
253 return n_white;
254 }
255
256 unsigned int SpanUTF8NotWhitespace(const char *text) {
257 int n_notwhite = 0;
258 for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
259 it != UNICHAR::end(text, strlen(text)); ++it) {
260 if (IsWhitespace(*it)) {
261 break;
262 }
263 n_notwhite += it.utf8_len();
264 }
265 return n_notwhite;
266 }
267
268 bool IsInterchangeValid(const char32 ch) {
269 return IsValidCodepoint(ch) && !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters.
270 !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
271 !(ch >= 0x2FFFE && ch <= 0x2FFFF) && !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
272 !(ch >= 0x4FFFE && ch <= 0x4FFFF) && !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
273 !(ch >= 0x6FFFE && ch <= 0x6FFFF) && !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
274 !(ch >= 0x8FFFE && ch <= 0x8FFFF) && !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
275 !(ch >= 0xAFFFE && ch <= 0xAFFFF) && !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
276 !(ch >= 0xCFFFE && ch <= 0xCFFFF) && !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
277 !(ch >= 0xEFFFE && ch <= 0xEFFFF) && !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
278 !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
279 (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' || ch == '\f' || ch == '\t' ||
280 ch == '\r');
281 }
282
283 bool IsInterchangeValid7BitAscii(const char32 ch) {
284 return IsValidCodepoint(ch) && ch <= 128 &&
285 (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' || ch == '\f' || ch == '\t' ||
286 ch == '\r');
287 }
288
289 char32 FullwidthToHalfwidth(const char32 ch) {
290 // Return unchanged if not in the fullwidth-halfwidth Unicode block.
291 if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {
292 if (ch != 0x3000) {
293 return ch;
294 }
295 }
296 // Special case for fullwidth left and right "white parentheses".
297 if (ch == 0xFF5F) {
298 return 0x2985;
299 }
300 if (ch == 0xFF60) {
301 return 0x2986;
302 }
303 // Construct a full-to-half width transliterator.
304 IcuErrorCode error_code;
305 icu::UnicodeString uch_str(static_cast<UChar32>(ch));
306 const icu::Transliterator *fulltohalf =
307 icu::Transliterator::createInstance("Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
308 error_code.assertSuccess();
309 error_code.reset();
310
311 fulltohalf->transliterate(uch_str);
312 delete fulltohalf;
313 ASSERT_HOST(uch_str.length() != 0);
314 return uch_str[0];
315 }
316
317 } // namespace tesseract