comparison mupdf-source/thirdparty/zxing-cpp/core/src/Utf.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*
2 * Copyright 2016 Nu-book Inc.
3 * Copyright 2021 gitlost
4 * Copyright 2022 Axel Waggershauser
5 */
6 // SPDX-License-Identifier: Apache-2.0
7
8 #include "Utf.h"
9
10 #include "ZXTestSupport.h"
11 #include "ZXAlgorithms.h"
12
13 #include <iomanip>
14 #include <cstdint>
15 #include <sstream>
16
17 namespace ZXing {
18
// TODO: C++20 ships a built-in char8_t. For older standards (and unless the build
// already announces one through ZXING_HAS_CHAR8) a byte-sized stand-in is provided.
#if !defined(ZXING_HAS_CHAR8) && __cplusplus <= 201703L
using char8_t = uint8_t;
#endif

// Non-owning view over a sequence of UTF-8 code units.
using utf8_t = std::basic_string_view<char8_t>;
26
// Identifier of a state of the UTF-8 decoding automaton used by Utf8Decode.
using state_t = uint8_t;
// State in which no multi-byte sequence is pending (start state / code point complete).
constexpr state_t kAccepted{0};
// State the transition table assigns to invalid input.
[[maybe_unused]] constexpr state_t kRejected{12};
30
31 inline char32_t Utf8Decode(char8_t byte, state_t& state, char32_t& codep)
32 {
33 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
34 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
35 static constexpr const state_t kUtf8Data[] = {
36 /* The first part of the table maps bytes to character classes that
37 * reduce the size of the transition table and create bitmasks. */
38 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
39 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
40 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
41 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
42 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
43 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
44 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
45 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
46
47 /* The second part is a transition table that maps a combination
48 * of a state of the automaton and a character class to a state. */
49 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
50 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
51 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
52 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
53 12,36,12,12,12,12,12,12,12,12,12,12,
54 };
55
56 state_t type = kUtf8Data[byte];
57 codep = (state != kAccepted) ? (byte & 0x3fu) | (codep << 6) : (0xff >> type) & (byte);
58 state = kUtf8Data[256 + state + type];
59 return state;
60 }
61
static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "wchar_t needs to be 2 or 4 bytes wide");

// True if wchar_t is a 16 bit type and str starts with a high surrogate (0xD800..0xDBFF)
// directly followed by a low surrogate (0xDC00..0xDFFF). Always false for a 32 bit wchar_t.
inline bool IsUtf16SurrogatePair(std::wstring_view str)
{
	if (sizeof(wchar_t) != 2 || str.size() < 2)
		return false;

	const bool startsWithHigh = (str[0] & 0xfc00) == 0xd800;
	const bool followedByLow = (str[1] & 0xfc00) == 0xdc00;
	return startsWithHigh && followedByLow;
}
68
// Combine the surrogate pair found in the first two units of str into one code point.
// The caller has to make sure that str holds at least two units.
inline char32_t Utf32FromUtf16Surrogates(std::wstring_view str)
{
	// 0x35fdc00 == (0xd800 << 10) + 0xdc00 - 0x10000, i.e. one subtraction removes both
	// surrogate offsets and adds the supplementary plane base.
	const char32_t highShifted = static_cast<char32_t>(str[0]) << 10;
	return highShifted + str[1] - 0x35fdc00;
}
73
74 static size_t Utf8CountCodePoints(utf8_t utf8)
75 {
76 size_t count = 0;
77
78 for (size_t i = 0; i < utf8.size();) {
79 if (utf8[i] < 128) {
80 ++i;
81 } else {
82 switch (utf8[i] & 0xf0) {
83 case 0xc0: [[fallthrough]];
84 case 0xd0: i += 2; break;
85 case 0xe0: i += 3; break;
86 case 0xf0: i += 4; break;
87 default: // we are in middle of a sequence
88 ++i;
89 while (i < utf8.size() && (utf8[i] & 0xc0) == 0x80)
90 ++i;
91 break;
92 }
93 }
94 ++count;
95 }
96
97 return count;
98 }
99
100 static void AppendFromUtf8(utf8_t utf8, std::wstring& buffer)
101 {
102 buffer.reserve(buffer.size() + Utf8CountCodePoints(utf8));
103
104 char32_t codePoint = 0;
105 state_t state = kAccepted;
106
107 for (auto b : utf8) {
108 if (Utf8Decode(b, state, codePoint) != kAccepted)
109 continue;
110
111 if (sizeof(wchar_t) == 2 && codePoint > 0xffff) { // surrogate pair
112 buffer.push_back(narrow_cast<wchar_t>(0xd7c0 + (codePoint >> 10)));
113 buffer.push_back(narrow_cast<wchar_t>(0xdc00 + (codePoint & 0x3ff)));
114 } else {
115 buffer.push_back(narrow_cast<wchar_t>(codePoint));
116 }
117 }
118 }
119
120 std::wstring FromUtf8(std::string_view utf8)
121 {
122 std::wstring str;
123 AppendFromUtf8({reinterpret_cast<const char8_t*>(utf8.data()), utf8.size()}, str);
124 return str;
125 }
126
#if __cplusplus > 201703L
// Overload for C++20 UTF-8 string views: convert the code units into a wide string.
std::wstring FromUtf8(std::u8string_view utf8)
{
	std::wstring wide;
	AppendFromUtf8(utf8, wide);
	return wide;
}
#endif
135
136 // Count the number of bytes required to store given code points in UTF-8.
137 static size_t Utf8CountBytes(std::wstring_view str)
138 {
139 int result = 0;
140 for (; str.size(); str.remove_prefix(1)) {
141 if (str.front() < 0x80)
142 result += 1;
143 else if (str.front() < 0x800)
144 result += 2;
145 else if (sizeof(wchar_t) == 4) {
146 if (str.front() < 0x10000)
147 result += 3;
148 else
149 result += 4;
150 } else {
151 if (IsUtf16SurrogatePair(str)) {
152 result += 4;
153 str.remove_prefix(1);
154 } else
155 result += 3;
156 }
157 }
158 return result;
159 }
160
161 ZXING_EXPORT_TEST_ONLY
162 int Utf32ToUtf8(char32_t utf32, char* out)
163 {
164 if (utf32 < 0x80) {
165 *out++ = narrow_cast<char8_t>(utf32);
166 return 1;
167 }
168 if (utf32 < 0x800) {
169 *out++ = narrow_cast<char8_t>((utf32 >> 6) | 0xc0);
170 *out++ = narrow_cast<char8_t>((utf32 & 0x3f) | 0x80);
171 return 2;
172 }
173 if (utf32 < 0x10000) {
174 *out++ = narrow_cast<char8_t>((utf32 >> 12) | 0xe0);
175 *out++ = narrow_cast<char8_t>(((utf32 >> 6) & 0x3f) | 0x80);
176 *out++ = narrow_cast<char8_t>((utf32 & 0x3f) | 0x80);
177 return 3;
178 }
179
180 *out++ = narrow_cast<char8_t>((utf32 >> 18) | 0xf0);
181 *out++ = narrow_cast<char8_t>(((utf32 >> 12) & 0x3f) | 0x80);
182 *out++ = narrow_cast<char8_t>(((utf32 >> 6) & 0x3f) | 0x80);
183 *out++ = narrow_cast<char8_t>((utf32 & 0x3f) | 0x80);
184 return 4;
185 }
186
187 static void AppendToUtf8(std::wstring_view str, std::string& utf8)
188 {
189 utf8.reserve(utf8.size() + Utf8CountBytes(str));
190
191 char buffer[4];
192 for (; str.size(); str.remove_prefix(1))
193 {
194 uint32_t cp;
195 if (IsUtf16SurrogatePair(str)) {
196 cp = Utf32FromUtf16Surrogates(str);
197 str.remove_prefix(1);
198 } else
199 cp = str.front();
200
201 auto bufLength = Utf32ToUtf8(cp, buffer);
202 utf8.append(buffer, bufLength);
203 }
204 }
205
206 std::string ToUtf8(std::wstring_view str)
207 {
208 std::string utf8;
209 AppendToUtf8(str, utf8);
210 return utf8;
211 }
212
// Locale independent replacement for the C library function of the same name.
// Returns true if wc is a legal code point that is considered graphical.
static bool iswgraph(wchar_t wc)
{
	/* Consider all legal codepoints as graphical except for:
	 * - whitespace
	 * - C0 and C1 control characters
	 * - U+2028 and U+2029 (line/para break)
	 * - U+FFF9 through U+FFFB (interlinear annotation controls)
	 * The following code is based on musl libc's implementation */

	// The range checks below rely on unsigned wrap-around ('cp - lo < hi - lo' is only true
	// for lo <= cp < hi). wchar_t is either signed or gets promoted to a signed int, which
	// made these checks succeed for values below 'lo' as well, so do all the math on an
	// explicitly unsigned copy.
	const uint32_t cp = static_cast<uint32_t>(wc);

	if (cp == U' ' || cp - U'\t' < 5u)
		return false;
	if (cp < 0xffu)
		return ((cp + 1u) & 0x7fu) >= 0x21u;
	if (cp < 0x2028u || cp - 0x202au < 0xd800u - 0x202au || cp - 0xe000u < 0xfff9u - 0xe000u)
		return true;
	if (cp - 0xfffcu > 0x10ffffu - 0xfffcu || (cp & 0xfffeu) == 0xfffeu)
		return false;
	return true;
}
232
233 std::wstring EscapeNonGraphical(std::wstring_view str)
234 {
235 static const char* const ascii_nongraphs[33] = {
236 "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
237 "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI",
238 "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
239 "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US",
240 "DEL",
241 };
242
243 std::wostringstream ws;
244 ws.fill(L'0');
245
246 for (; str.size(); str.remove_prefix(1)) {
247 wchar_t wc = str.front();
248 if (wc < 32 || wc == 127) // Non-graphical ASCII, excluding space
249 ws << "<" << ascii_nongraphs[wc == 127 ? 32 : wc] << ">";
250 else if (wc < 128) // ASCII
251 ws << wc;
252 else if (IsUtf16SurrogatePair(str)) {
253 ws.write(str.data(), 2);
254 str.remove_prefix(1);
255 }
256 // Exclude unpaired surrogates and NO-BREAK spaces NBSP and NUMSP
257 else if ((wc < 0xd800 || wc >= 0xe000) && (iswgraph(wc) && wc != 0xA0 && wc != 0x2007 && wc != 0x2000 && wc != 0xfffd))
258 ws << wc;
259 else // Non-graphical Unicode
260 ws << "<U+" << std::setw(wc < 256 ? 2 : 4) << std::uppercase << std::hex << static_cast<uint32_t>(wc) << ">";
261 }
262
263 return ws.str();
264 }
265
266 std::string EscapeNonGraphical(std::string_view utf8)
267 {
268 return ToUtf8(EscapeNonGraphical(FromUtf8(utf8)));
269 }
270
271 } // namespace ZXing