comparison mupdf-source/thirdparty/zxing-cpp/core/src/TextDecoder.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*
2 * Copyright 2016 Nu-book Inc.
3 * Copyright 2022 gitlost
4 */
5 // SPDX-License-Identifier: Apache-2.0
6
7 #include "TextDecoder.h"
8
9 #include "CharacterSet.h"
10 #include "ECI.h"
11 #include "Utf.h"
12 #include "ZXAlgorithms.h"
13 #include "libzueci/zueci.h"
14
15 #include <cassert>
16 #include <stdexcept>
17
18 namespace ZXing {
19
20 void TextDecoder::Append(std::string& str, const uint8_t* bytes, size_t length, CharacterSet charset, bool sjisASCII)
21 {
22 int eci = ToInt(ToECI(charset));
23 const size_t str_len = str.length();
24 const int bytes_len = narrow_cast<int>(length);
25 constexpr unsigned int replacement = 0xFFFD;
26 const unsigned int flags = ZUECI_FLAG_SB_STRAIGHT_THRU | (sjisASCII ? ZUECI_FLAG_SJIS_STRAIGHT_THRU : 0);
27 int utf8_len;
28
29 if (eci == -1)
30 eci = 899; // Binary
31
32 int error_number = zueci_dest_len_utf8(eci, bytes, bytes_len, replacement, flags, &utf8_len);
33 if (error_number >= ZUECI_ERROR)
34 throw std::runtime_error("zueci_dest_len_utf8 failed");
35
36 str.resize(str_len + utf8_len); // Precise length
37 unsigned char *utf8_buf = reinterpret_cast<unsigned char *>(str.data()) + str_len;
38
39 error_number = zueci_eci_to_utf8(eci, bytes, bytes_len, replacement, flags, utf8_buf, &utf8_len);
40 if (error_number >= ZUECI_ERROR) {
41 str.resize(str_len);
42 throw std::runtime_error("zueci_eci_to_utf8 failed");
43 }
44 assert(str.length() == str_len + utf8_len);
45 }
46
47 void TextDecoder::Append(std::wstring& str, const uint8_t* bytes, size_t length, CharacterSet charset)
48 {
49 std::string u8str;
50 Append(u8str, bytes, length, charset);
51 str.append(FromUtf8(u8str));
52 }
53
54 /**
55 * @param bytes bytes encoding a string, whose encoding should be guessed
56 * @return name of guessed encoding; at the moment will only guess one of:
57 * {@link #SHIFT_JIS}, {@link #UTF8}, {@link #ISO88591}, or the platform
58 * default encoding if none of these can possibly be correct
59 */
60 CharacterSet
61 TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fallback)
62 {
63 // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
64 // which should be by far the most common encodings.
65 bool canBeISO88591 = true;
66 bool canBeShiftJIS = true;
67 bool canBeUTF8 = true;
68 int utf8BytesLeft = 0;
69 //int utf8LowChars = 0;
70 int utf2BytesChars = 0;
71 int utf3BytesChars = 0;
72 int utf4BytesChars = 0;
73 int sjisBytesLeft = 0;
74 //int sjisLowChars = 0;
75 int sjisKatakanaChars = 0;
76 //int sjisDoubleBytesChars = 0;
77 int sjisCurKatakanaWordLength = 0;
78 int sjisCurDoubleBytesWordLength = 0;
79 int sjisMaxKatakanaWordLength = 0;
80 int sjisMaxDoubleBytesWordLength = 0;
81 //int isoLowChars = 0;
82 //int isoHighChars = 0;
83 int isoHighOther = 0;
84
85 bool utf8bom = length > 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF;
86
87 for (size_t i = 0; i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); ++i)
88 {
89 int value = bytes[i];
90
91 // UTF-8 stuff
92 if (canBeUTF8) {
93 if (utf8BytesLeft > 0) {
94 if ((value & 0x80) == 0) {
95 canBeUTF8 = false;
96 }
97 else {
98 utf8BytesLeft--;
99 }
100 }
101 else if ((value & 0x80) != 0) {
102 if ((value & 0x40) == 0) {
103 canBeUTF8 = false;
104 }
105 else {
106 utf8BytesLeft++;
107 if ((value & 0x20) == 0) {
108 utf2BytesChars++;
109 }
110 else {
111 utf8BytesLeft++;
112 if ((value & 0x10) == 0) {
113 utf3BytesChars++;
114 }
115 else {
116 utf8BytesLeft++;
117 if ((value & 0x08) == 0) {
118 utf4BytesChars++;
119 }
120 else {
121 canBeUTF8 = false;
122 }
123 }
124 }
125 }
126 } //else {
127 //utf8LowChars++;
128 //}
129 }
130
131 // ISO-8859-1 stuff
132 if (canBeISO88591) {
133 if (value > 0x7F && value < 0xA0) {
134 canBeISO88591 = false;
135 }
136 else if (value > 0x9F) {
137 if (value < 0xC0 || value == 0xD7 || value == 0xF7) {
138 isoHighOther++;
139 } //else {
140 //isoHighChars++;
141 //}
142 } //else {
143 //isoLowChars++;
144 //}
145 }
146
147 // Shift_JIS stuff
148 if (canBeShiftJIS) {
149 if (sjisBytesLeft > 0) {
150 if (value < 0x40 || value == 0x7F || value > 0xFC) {
151 canBeShiftJIS = false;
152 }
153 else {
154 sjisBytesLeft--;
155 }
156 }
157 else if (value == 0x80 || value == 0xA0 || value > 0xEF) {
158 canBeShiftJIS = false;
159 }
160 else if (value < 0x20 && value != 0xa && value != 0xd) {
161 canBeShiftJIS = false; // use non-printable ASCII as indication for binary content
162 }
163 else if (value > 0xA0 && value < 0xE0) {
164 sjisKatakanaChars++;
165 sjisCurDoubleBytesWordLength = 0;
166 sjisCurKatakanaWordLength++;
167 if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength) {
168 sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength;
169 }
170 }
171 else if (value > 0x7F) {
172 sjisBytesLeft++;
173 //sjisDoubleBytesChars++;
174 sjisCurKatakanaWordLength = 0;
175 sjisCurDoubleBytesWordLength++;
176 if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength) {
177 sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength;
178 }
179 }
180 else {
181 //sjisLowChars++;
182 sjisCurKatakanaWordLength = 0;
183 sjisCurDoubleBytesWordLength = 0;
184 }
185 }
186 }
187
188 if (canBeUTF8 && utf8BytesLeft > 0) {
189 canBeUTF8 = false;
190 }
191 if (canBeShiftJIS && sjisBytesLeft > 0) {
192 canBeShiftJIS = false;
193 }
194
195 // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
196 if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) {
197 return CharacterSet::UTF8;
198 }
199
200 bool assumeShiftJIS = fallback == CharacterSet::Shift_JIS || fallback == CharacterSet::EUC_JP;
201 // Easy -- if assuming Shift_JIS or at least 3 valid consecutive not-ascii characters (and no evidence it can't be), done
202 if (canBeShiftJIS && (assumeShiftJIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) {
203 return CharacterSet::Shift_JIS;
204 }
205 // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
206 // - If we saw
207 // - only two consecutive katakana chars in the whole text, or
208 // - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
209 // - then we conclude Shift_JIS, else ISO-8859-1
210 if (canBeISO88591 && canBeShiftJIS) {
211 return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= (int)length
212 ? CharacterSet::Shift_JIS : CharacterSet::ISO8859_1;
213 }
214
215 // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
216 if (canBeISO88591) {
217 return CharacterSet::ISO8859_1;
218 }
219 if (canBeShiftJIS) {
220 return CharacterSet::Shift_JIS;
221 }
222 if (canBeUTF8) {
223 return CharacterSet::UTF8;
224 }
225 // Otherwise, we take a wild guess with platform encoding
226 return fallback;
227 }
228
229 CharacterSet
230 TextDecoder::DefaultEncoding()
231 {
232 return CharacterSet::ISO8859_1;
233 }
234
235 } // ZXing