Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/zxing-cpp/core/src/Utf.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /* | |
| 2 * Copyright 2016 Nu-book Inc. | |
| 3 * Copyright 2021 gitlost | |
| 4 * Copyright 2022 Axel Waggershauser | |
| 5 */ | |
| 6 // SPDX-License-Identifier: Apache-2.0 | |
| 7 | |
| 8 #include "Utf.h" | |
| 9 | |
| 10 #include "ZXTestSupport.h" | |
| 11 #include "ZXAlgorithms.h" | |
| 12 | |
| 13 #include <iomanip> | |
| 14 #include <cstdint> | |
| 15 #include <sstream> | |
| 16 | |
| 17 namespace ZXing { | |
| 18 | |
// TODO: the built-in char8_t of C++20 makes this fallback alias unnecessary
#if !defined(ZXING_HAS_CHAR8) && __cplusplus <= 201703L
using char8_t = uint8_t;
#endif

// Non-owning view over a sequence of UTF-8 code units.
using utf8_t = std::basic_string_view<char8_t>;

// State of the UTF-8 decoding automaton (see Utf8Decode).
using state_t = uint8_t;
constexpr state_t kAccepted{0};
constexpr state_t kRejected [[maybe_unused]]{12};
| 30 | |
| 31 inline char32_t Utf8Decode(char8_t byte, state_t& state, char32_t& codep) | |
| 32 { | |
| 33 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
| 34 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | |
| 35 static constexpr const state_t kUtf8Data[] = { | |
| 36 /* The first part of the table maps bytes to character classes that | |
| 37 * reduce the size of the transition table and create bitmasks. */ | |
| 38 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
| 39 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
| 40 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
| 41 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
| 42 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, | |
| 43 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, | |
| 44 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
| 45 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, | |
| 46 | |
| 47 /* The second part is a transition table that maps a combination | |
| 48 * of a state of the automaton and a character class to a state. */ | |
| 49 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, | |
| 50 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, | |
| 51 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, | |
| 52 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, | |
| 53 12,36,12,12,12,12,12,12,12,12,12,12, | |
| 54 }; | |
| 55 | |
| 56 state_t type = kUtf8Data[byte]; | |
| 57 codep = (state != kAccepted) ? (byte & 0x3fu) | (codep << 6) : (0xff >> type) & (byte); | |
| 58 state = kUtf8Data[256 + state + type]; | |
| 59 return state; | |
| 60 } | |
| 61 | |
| 62 static_assert(sizeof(wchar_t) == 4 || sizeof(wchar_t) == 2, "wchar_t needs to be 2 or 4 bytes wide"); | |
| 63 | |
/// Returns true if wchar_t is 16 bits wide and str starts with a high surrogate
/// (0xD800..0xDBFF) immediately followed by a low surrogate (0xDC00..0xDFFF).
inline bool IsUtf16SurrogatePair(std::wstring_view str)
{
	if (sizeof(wchar_t) != 2 || str.size() < 2)
		return false;

	const bool leadIsHigh = (str[0] & 0xfc00) == 0xd800;
	const bool trailIsLow = (str[1] & 0xfc00) == 0xdc00;
	return leadIsHigh && trailIsLow;
}
| 68 | |
/// Combines the surrogate pair at the start of str (str[0] high, str[1] low) into
/// one code point. The caller must ensure that str holds at least two elements.
inline char32_t Utf32FromUtf16Surrogates(std::wstring_view str)
{
	const auto high = static_cast<char32_t>(str[0]);
	const auto low = static_cast<char32_t>(str[1]);

	// 0x35fdc00 == (0xd800 << 10) + 0xdc00 - 0x10000, i.e. both surrogate offsets
	// and the supplementary plane base folded into a single constant.
	return (high << 10) + low - 0x35fdc00;
}
| 73 | |
| 74 static size_t Utf8CountCodePoints(utf8_t utf8) | |
| 75 { | |
| 76 size_t count = 0; | |
| 77 | |
| 78 for (size_t i = 0; i < utf8.size();) { | |
| 79 if (utf8[i] < 128) { | |
| 80 ++i; | |
| 81 } else { | |
| 82 switch (utf8[i] & 0xf0) { | |
| 83 case 0xc0: [[fallthrough]]; | |
| 84 case 0xd0: i += 2; break; | |
| 85 case 0xe0: i += 3; break; | |
| 86 case 0xf0: i += 4; break; | |
| 87 default: // we are in middle of a sequence | |
| 88 ++i; | |
| 89 while (i < utf8.size() && (utf8[i] & 0xc0) == 0x80) | |
| 90 ++i; | |
| 91 break; | |
| 92 } | |
| 93 } | |
| 94 ++count; | |
| 95 } | |
| 96 | |
| 97 return count; | |
| 98 } | |
| 99 | |
| 100 static void AppendFromUtf8(utf8_t utf8, std::wstring& buffer) | |
| 101 { | |
| 102 buffer.reserve(buffer.size() + Utf8CountCodePoints(utf8)); | |
| 103 | |
| 104 char32_t codePoint = 0; | |
| 105 state_t state = kAccepted; | |
| 106 | |
| 107 for (auto b : utf8) { | |
| 108 if (Utf8Decode(b, state, codePoint) != kAccepted) | |
| 109 continue; | |
| 110 | |
| 111 if (sizeof(wchar_t) == 2 && codePoint > 0xffff) { // surrogate pair | |
| 112 buffer.push_back(narrow_cast<wchar_t>(0xd7c0 + (codePoint >> 10))); | |
| 113 buffer.push_back(narrow_cast<wchar_t>(0xdc00 + (codePoint & 0x3ff))); | |
| 114 } else { | |
| 115 buffer.push_back(narrow_cast<wchar_t>(codePoint)); | |
| 116 } | |
| 117 } | |
| 118 } | |
| 119 | |
| 120 std::wstring FromUtf8(std::string_view utf8) | |
| 121 { | |
| 122 std::wstring str; | |
| 123 AppendFromUtf8({reinterpret_cast<const char8_t*>(utf8.data()), utf8.size()}, str); | |
| 124 return str; | |
| 125 } | |
| 126 | |
#if __cplusplus > 201703L
/// Converts a UTF-8 encoded string into a wide string (overload for C++20 char8_t views).
std::wstring FromUtf8(std::u8string_view utf8)
{
	std::wstring result;
	AppendFromUtf8(utf8, result);
	return result;
}
#endif
| 135 | |
| 136 // Count the number of bytes required to store given code points in UTF-8. | |
| 137 static size_t Utf8CountBytes(std::wstring_view str) | |
| 138 { | |
| 139 int result = 0; | |
| 140 for (; str.size(); str.remove_prefix(1)) { | |
| 141 if (str.front() < 0x80) | |
| 142 result += 1; | |
| 143 else if (str.front() < 0x800) | |
| 144 result += 2; | |
| 145 else if (sizeof(wchar_t) == 4) { | |
| 146 if (str.front() < 0x10000) | |
| 147 result += 3; | |
| 148 else | |
| 149 result += 4; | |
| 150 } else { | |
| 151 if (IsUtf16SurrogatePair(str)) { | |
| 152 result += 4; | |
| 153 str.remove_prefix(1); | |
| 154 } else | |
| 155 result += 3; | |
| 156 } | |
| 157 } | |
| 158 return result; | |
| 159 } | |
| 160 | |
| 161 ZXING_EXPORT_TEST_ONLY | |
| 162 int Utf32ToUtf8(char32_t utf32, char* out) | |
| 163 { | |
| 164 if (utf32 < 0x80) { | |
| 165 *out++ = narrow_cast<char8_t>(utf32); | |
| 166 return 1; | |
| 167 } | |
| 168 if (utf32 < 0x800) { | |
| 169 *out++ = narrow_cast<char8_t>((utf32 >> 6) | 0xc0); | |
| 170 *out++ = narrow_cast<char8_t>((utf32 & 0x3f) | 0x80); | |
| 171 return 2; | |
| 172 } | |
| 173 if (utf32 < 0x10000) { | |
| 174 *out++ = narrow_cast<char8_t>((utf32 >> 12) | 0xe0); | |
| 175 *out++ = narrow_cast<char8_t>(((utf32 >> 6) & 0x3f) | 0x80); | |
| 176 *out++ = narrow_cast<char8_t>((utf32 & 0x3f) | 0x80); | |
| 177 return 3; | |
| 178 } | |
| 179 | |
| 180 *out++ = narrow_cast<char8_t>((utf32 >> 18) | 0xf0); | |
| 181 *out++ = narrow_cast<char8_t>(((utf32 >> 12) & 0x3f) | 0x80); | |
| 182 *out++ = narrow_cast<char8_t>(((utf32 >> 6) & 0x3f) | 0x80); | |
| 183 *out++ = narrow_cast<char8_t>((utf32 & 0x3f) | 0x80); | |
| 184 return 4; | |
| 185 } | |
| 186 | |
| 187 static void AppendToUtf8(std::wstring_view str, std::string& utf8) | |
| 188 { | |
| 189 utf8.reserve(utf8.size() + Utf8CountBytes(str)); | |
| 190 | |
| 191 char buffer[4]; | |
| 192 for (; str.size(); str.remove_prefix(1)) | |
| 193 { | |
| 194 uint32_t cp; | |
| 195 if (IsUtf16SurrogatePair(str)) { | |
| 196 cp = Utf32FromUtf16Surrogates(str); | |
| 197 str.remove_prefix(1); | |
| 198 } else | |
| 199 cp = str.front(); | |
| 200 | |
| 201 auto bufLength = Utf32ToUtf8(cp, buffer); | |
| 202 utf8.append(buffer, bufLength); | |
| 203 } | |
| 204 } | |
| 205 | |
| 206 std::string ToUtf8(std::wstring_view str) | |
| 207 { | |
| 208 std::string utf8; | |
| 209 AppendToUtf8(str, utf8); | |
| 210 return utf8; | |
| 211 } | |
| 212 | |
/**
 * Platform independent replacement for the C library's iswgraph().
 *
 * @param wc  character to classify
 * @return true if wc is considered graphical according to the rules listed below
 */
static bool iswgraph(wchar_t wc)
{
	/* Consider all legal codepoints as graphical except for:
	 * - whitespace
	 * - C0 and C1 control characters
	 * - U+2028 and U+2029 (line/para break)
	 * - U+FFF9 through U+FFFB (interlinear annotation controls)
	 * The following code is based on libmusls implementation */

	// The range checks below rely on unsigned wrap-around: 'cp - lo < hi - lo' is only
	// true for lo <= cp < hi. wchar_t is signed on many platforms (and a 16-bit wchar_t is
	// promoted to int), where the difference would go negative and wrongly pass the test,
	// so all arithmetic is done on an unsigned copy.
	const auto cp = static_cast<uint32_t>(wc);

	if (cp == ' ' || cp - '\t' < 5u)
		return false;
	if (cp < 0xffu)
		return ((cp + 1u) & 0x7fu) >= 0x21u;
	if (cp < 0x2028u || cp - 0x202au < 0xd800u - 0x202au || cp - 0xe000u < 0xfff9u - 0xe000u)
		return true;
	if (cp - 0xfffcu > 0x10ffffu - 0xfffcu || (cp & 0xfffeu) == 0xfffeu)
		return false;
	return true;
}
| 232 | |
| 233 std::wstring EscapeNonGraphical(std::wstring_view str) | |
| 234 { | |
| 235 static const char* const ascii_nongraphs[33] = { | |
| 236 "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", | |
| 237 "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI", | |
| 238 "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", | |
| 239 "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US", | |
| 240 "DEL", | |
| 241 }; | |
| 242 | |
| 243 std::wostringstream ws; | |
| 244 ws.fill(L'0'); | |
| 245 | |
| 246 for (; str.size(); str.remove_prefix(1)) { | |
| 247 wchar_t wc = str.front(); | |
| 248 if (wc < 32 || wc == 127) // Non-graphical ASCII, excluding space | |
| 249 ws << "<" << ascii_nongraphs[wc == 127 ? 32 : wc] << ">"; | |
| 250 else if (wc < 128) // ASCII | |
| 251 ws << wc; | |
| 252 else if (IsUtf16SurrogatePair(str)) { | |
| 253 ws.write(str.data(), 2); | |
| 254 str.remove_prefix(1); | |
| 255 } | |
| 256 // Exclude unpaired surrogates and NO-BREAK spaces NBSP and NUMSP | |
| 257 else if ((wc < 0xd800 || wc >= 0xe000) && (iswgraph(wc) && wc != 0xA0 && wc != 0x2007 && wc != 0x2000 && wc != 0xfffd)) | |
| 258 ws << wc; | |
| 259 else // Non-graphical Unicode | |
| 260 ws << "<U+" << std::setw(wc < 256 ? 2 : 4) << std::uppercase << std::hex << static_cast<uint32_t>(wc) << ">"; | |
| 261 } | |
| 262 | |
| 263 return ws.str(); | |
| 264 } | |
| 265 | |
| 266 std::string EscapeNonGraphical(std::string_view utf8) | |
| 267 { | |
| 268 return ToUtf8(EscapeNonGraphical(FromUtf8(utf8))); | |
| 269 } | |
| 270 | |
| 271 } // namespace ZXing |
