Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/zxing-cpp/core/src/TextDecoder.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /* | |
| 2 * Copyright 2016 Nu-book Inc. | |
| 3 * Copyright 2022 gitlost | |
| 4 */ | |
| 5 // SPDX-License-Identifier: Apache-2.0 | |
| 6 | |
| 7 #include "TextDecoder.h" | |
| 8 | |
| 9 #include "CharacterSet.h" | |
| 10 #include "ECI.h" | |
| 11 #include "Utf.h" | |
| 12 #include "ZXAlgorithms.h" | |
| 13 #include "libzueci/zueci.h" | |
| 14 | |
| 15 #include <cassert> | |
| 16 #include <stdexcept> | |
| 17 | |
| 18 namespace ZXing { | |
| 19 | |
| 20 void TextDecoder::Append(std::string& str, const uint8_t* bytes, size_t length, CharacterSet charset, bool sjisASCII) | |
| 21 { | |
| 22 int eci = ToInt(ToECI(charset)); | |
| 23 const size_t str_len = str.length(); | |
| 24 const int bytes_len = narrow_cast<int>(length); | |
| 25 constexpr unsigned int replacement = 0xFFFD; | |
| 26 const unsigned int flags = ZUECI_FLAG_SB_STRAIGHT_THRU | (sjisASCII ? ZUECI_FLAG_SJIS_STRAIGHT_THRU : 0); | |
| 27 int utf8_len; | |
| 28 | |
| 29 if (eci == -1) | |
| 30 eci = 899; // Binary | |
| 31 | |
| 32 int error_number = zueci_dest_len_utf8(eci, bytes, bytes_len, replacement, flags, &utf8_len); | |
| 33 if (error_number >= ZUECI_ERROR) | |
| 34 throw std::runtime_error("zueci_dest_len_utf8 failed"); | |
| 35 | |
| 36 str.resize(str_len + utf8_len); // Precise length | |
| 37 unsigned char *utf8_buf = reinterpret_cast<unsigned char *>(str.data()) + str_len; | |
| 38 | |
| 39 error_number = zueci_eci_to_utf8(eci, bytes, bytes_len, replacement, flags, utf8_buf, &utf8_len); | |
| 40 if (error_number >= ZUECI_ERROR) { | |
| 41 str.resize(str_len); | |
| 42 throw std::runtime_error("zueci_eci_to_utf8 failed"); | |
| 43 } | |
| 44 assert(str.length() == str_len + utf8_len); | |
| 45 } | |
| 46 | |
| 47 void TextDecoder::Append(std::wstring& str, const uint8_t* bytes, size_t length, CharacterSet charset) | |
| 48 { | |
| 49 std::string u8str; | |
| 50 Append(u8str, bytes, length, charset); | |
| 51 str.append(FromUtf8(u8str)); | |
| 52 } | |
| 53 | |
| 54 /** | |
| 55 * @param bytes bytes encoding a string, whose encoding should be guessed | |
| 56 * @return name of guessed encoding; at the moment will only guess one of: | |
| 57 * {@link #SHIFT_JIS}, {@link #UTF8}, {@link #ISO88591}, or the platform | |
| 58 * default encoding if none of these can possibly be correct | |
| 59 */ | |
| 60 CharacterSet | |
| 61 TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fallback) | |
| 62 { | |
| 63 // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS, | |
| 64 // which should be by far the most common encodings. | |
| 65 bool canBeISO88591 = true; | |
| 66 bool canBeShiftJIS = true; | |
| 67 bool canBeUTF8 = true; | |
| 68 int utf8BytesLeft = 0; | |
| 69 //int utf8LowChars = 0; | |
| 70 int utf2BytesChars = 0; | |
| 71 int utf3BytesChars = 0; | |
| 72 int utf4BytesChars = 0; | |
| 73 int sjisBytesLeft = 0; | |
| 74 //int sjisLowChars = 0; | |
| 75 int sjisKatakanaChars = 0; | |
| 76 //int sjisDoubleBytesChars = 0; | |
| 77 int sjisCurKatakanaWordLength = 0; | |
| 78 int sjisCurDoubleBytesWordLength = 0; | |
| 79 int sjisMaxKatakanaWordLength = 0; | |
| 80 int sjisMaxDoubleBytesWordLength = 0; | |
| 81 //int isoLowChars = 0; | |
| 82 //int isoHighChars = 0; | |
| 83 int isoHighOther = 0; | |
| 84 | |
| 85 bool utf8bom = length > 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF; | |
| 86 | |
| 87 for (size_t i = 0; i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); ++i) | |
| 88 { | |
| 89 int value = bytes[i]; | |
| 90 | |
| 91 // UTF-8 stuff | |
| 92 if (canBeUTF8) { | |
| 93 if (utf8BytesLeft > 0) { | |
| 94 if ((value & 0x80) == 0) { | |
| 95 canBeUTF8 = false; | |
| 96 } | |
| 97 else { | |
| 98 utf8BytesLeft--; | |
| 99 } | |
| 100 } | |
| 101 else if ((value & 0x80) != 0) { | |
| 102 if ((value & 0x40) == 0) { | |
| 103 canBeUTF8 = false; | |
| 104 } | |
| 105 else { | |
| 106 utf8BytesLeft++; | |
| 107 if ((value & 0x20) == 0) { | |
| 108 utf2BytesChars++; | |
| 109 } | |
| 110 else { | |
| 111 utf8BytesLeft++; | |
| 112 if ((value & 0x10) == 0) { | |
| 113 utf3BytesChars++; | |
| 114 } | |
| 115 else { | |
| 116 utf8BytesLeft++; | |
| 117 if ((value & 0x08) == 0) { | |
| 118 utf4BytesChars++; | |
| 119 } | |
| 120 else { | |
| 121 canBeUTF8 = false; | |
| 122 } | |
| 123 } | |
| 124 } | |
| 125 } | |
| 126 } //else { | |
| 127 //utf8LowChars++; | |
| 128 //} | |
| 129 } | |
| 130 | |
| 131 // ISO-8859-1 stuff | |
| 132 if (canBeISO88591) { | |
| 133 if (value > 0x7F && value < 0xA0) { | |
| 134 canBeISO88591 = false; | |
| 135 } | |
| 136 else if (value > 0x9F) { | |
| 137 if (value < 0xC0 || value == 0xD7 || value == 0xF7) { | |
| 138 isoHighOther++; | |
| 139 } //else { | |
| 140 //isoHighChars++; | |
| 141 //} | |
| 142 } //else { | |
| 143 //isoLowChars++; | |
| 144 //} | |
| 145 } | |
| 146 | |
| 147 // Shift_JIS stuff | |
| 148 if (canBeShiftJIS) { | |
| 149 if (sjisBytesLeft > 0) { | |
| 150 if (value < 0x40 || value == 0x7F || value > 0xFC) { | |
| 151 canBeShiftJIS = false; | |
| 152 } | |
| 153 else { | |
| 154 sjisBytesLeft--; | |
| 155 } | |
| 156 } | |
| 157 else if (value == 0x80 || value == 0xA0 || value > 0xEF) { | |
| 158 canBeShiftJIS = false; | |
| 159 } | |
| 160 else if (value < 0x20 && value != 0xa && value != 0xd) { | |
| 161 canBeShiftJIS = false; // use non-printable ASCII as indication for binary content | |
| 162 } | |
| 163 else if (value > 0xA0 && value < 0xE0) { | |
| 164 sjisKatakanaChars++; | |
| 165 sjisCurDoubleBytesWordLength = 0; | |
| 166 sjisCurKatakanaWordLength++; | |
| 167 if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength) { | |
| 168 sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength; | |
| 169 } | |
| 170 } | |
| 171 else if (value > 0x7F) { | |
| 172 sjisBytesLeft++; | |
| 173 //sjisDoubleBytesChars++; | |
| 174 sjisCurKatakanaWordLength = 0; | |
| 175 sjisCurDoubleBytesWordLength++; | |
| 176 if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength) { | |
| 177 sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength; | |
| 178 } | |
| 179 } | |
| 180 else { | |
| 181 //sjisLowChars++; | |
| 182 sjisCurKatakanaWordLength = 0; | |
| 183 sjisCurDoubleBytesWordLength = 0; | |
| 184 } | |
| 185 } | |
| 186 } | |
| 187 | |
| 188 if (canBeUTF8 && utf8BytesLeft > 0) { | |
| 189 canBeUTF8 = false; | |
| 190 } | |
| 191 if (canBeShiftJIS && sjisBytesLeft > 0) { | |
| 192 canBeShiftJIS = false; | |
| 193 } | |
| 194 | |
| 195 // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done | |
| 196 if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) { | |
| 197 return CharacterSet::UTF8; | |
| 198 } | |
| 199 | |
| 200 bool assumeShiftJIS = fallback == CharacterSet::Shift_JIS || fallback == CharacterSet::EUC_JP; | |
| 201 // Easy -- if assuming Shift_JIS or at least 3 valid consecutive not-ascii characters (and no evidence it can't be), done | |
| 202 if (canBeShiftJIS && (assumeShiftJIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) { | |
| 203 return CharacterSet::Shift_JIS; | |
| 204 } | |
| 205 // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is: | |
| 206 // - If we saw | |
| 207 // - only two consecutive katakana chars in the whole text, or | |
| 208 // - at least 10% of bytes that could be "upper" not-alphanumeric Latin1, | |
| 209 // - then we conclude Shift_JIS, else ISO-8859-1 | |
| 210 if (canBeISO88591 && canBeShiftJIS) { | |
| 211 return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= (int)length | |
| 212 ? CharacterSet::Shift_JIS : CharacterSet::ISO8859_1; | |
| 213 } | |
| 214 | |
| 215 // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding | |
| 216 if (canBeISO88591) { | |
| 217 return CharacterSet::ISO8859_1; | |
| 218 } | |
| 219 if (canBeShiftJIS) { | |
| 220 return CharacterSet::Shift_JIS; | |
| 221 } | |
| 222 if (canBeUTF8) { | |
| 223 return CharacterSet::UTF8; | |
| 224 } | |
| 225 // Otherwise, we take a wild guess with platform encoding | |
| 226 return fallback; | |
| 227 } | |
| 228 | |
| 229 CharacterSet | |
| 230 TextDecoder::DefaultEncoding() | |
| 231 { | |
| 232 return CharacterSet::ISO8859_1; | |
| 233 } | |
| 234 | |
| 235 } // ZXing |
