Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/zint/backend/eci.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /* eci.c - Extended Channel Interpretations */ | |
| 2 /* | |
| 3 libzint - the open source barcode library | |
| 4 Copyright (C) 2009-2024 Robin Stuart <rstuart114@gmail.com> | |
| 5 | |
| 6 Redistribution and use in source and binary forms, with or without | |
| 7 modification, are permitted provided that the following conditions | |
| 8 are met: | |
| 9 | |
| 10 1. Redistributions of source code must retain the above copyright | |
| 11 notice, this list of conditions and the following disclaimer. | |
| 12 2. Redistributions in binary form must reproduce the above copyright | |
| 13 notice, this list of conditions and the following disclaimer in the | |
| 14 documentation and/or other materials provided with the distribution. | |
| 15 3. Neither the name of the project nor the names of its contributors | |
| 16 may be used to endorse or promote products derived from this software | |
| 17 without specific prior written permission. | |
| 18 | |
| 19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 21 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 22 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
| 23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 25 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 26 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 27 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 28 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 29 SUCH DAMAGE. | |
| 30 */ | |
| 31 /* SPDX-License-Identifier: BSD-3-Clause */ | |
| 32 | |
| 33 #include <assert.h> | |
| 34 #include "common.h" | |
| 35 #include "eci.h" | |
| 36 #include "eci_sb.h" | |
| 37 #include "big5.h" | |
| 38 #include "gb18030.h" | |
| 39 #include "gb2312.h" | |
| 40 #include "gbk.h" | |
| 41 #include "ksx1001.h" | |
| 42 #include "sjis.h" | |
| 43 | |
| 44 /* Single-byte stuff */ | |
| 45 | |
/* Shared ISO/IEC 8859 encoder for Unicode codepoint `u`, writing a single byte to `dest`.
   `tab_s` is a bitmap (16 codepoints per entry) flagging which of U+00A0-00FF map to themselves,
   `tab_u` lists the remaining mappable codepoints (bisected, so must be ascending), `tab_sb` holds
   the byte for each `tab_u` entry and `e` is the last index of `tab_u`.
   Returns 1 on success, 0 if `u` cannot be represented */
static int u_iso8859(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u,
            const unsigned char *tab_sb, int e, unsigned char *dest) {
    int lo = 0;

    if (u < 0x80) { /* ASCII maps to itself */
        *dest = (unsigned char) u;
        return 1;
    }
    if (u < 0xA0) { /* U+0080-9F fail */
        return 0;
    }
    if (u <= 0xFF) {
        const unsigned int off = u - 0xA0;
        const unsigned int bit = 1u << (off & 0xF);
        if ((tab_s[off >> 4] & bit) != 0) { /* Straight-thru */
            *dest = (unsigned char) u;
            return 1;
        }
    }

    /* Bisect the explicit mapping table over `lo..e` */
    while (lo <= e) {
        const int mid = (lo + e) >> 1;
        const unsigned int cand = tab_u[mid];
        if (cand == u) {
            *dest = tab_sb[mid];
            return 1;
        }
        if (cand < u) {
            lo = mid + 1;
        } else {
            e = mid - 1;
        }
    }
    return 0;
}
| 79 | |
/* Shared Windows-125x encoder for Unicode codepoint `u`, writing a single byte to `dest`.
   Same table layout as `u_iso8859()`: `tab_s` is the straight-through bitmap for U+00A0-00FF,
   `tab_u`/`tab_sb` the bisected codepoint/byte pairs and `e` the last index of `tab_u`.
   Unlike ISO/IEC 8859, U+0080-9F are not rejected outright but looked up in `tab_u`.
   Returns 1 on success, 0 if `u` cannot be represented */
static int u_cp125x(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u,
            const unsigned char *tab_sb, int e, unsigned char *dest) {
    int lo = 0;

    if (u < 0x80) { /* ASCII maps to itself */
        *dest = (unsigned char) u;
        return 1;
    }
    if (u >= 0xA0 && u <= 0xFF) {
        const unsigned int off = u - 0xA0;
        const unsigned int bit = 1u << (off & 0xF);
        if ((tab_s[off >> 4] & bit) != 0) { /* Straight-thru */
            *dest = (unsigned char) u;
            return 1;
        }
    }

    /* Bisect the explicit mapping table over `lo..e` */
    while (lo <= e) {
        const int mid = (lo + e) >> 1;
        const unsigned int cand = tab_u[mid];
        if (cand == u) {
            *dest = tab_sb[mid];
            return 1;
        }
        if (cand < u) {
            lo = mid + 1;
        } else {
            e = mid - 1;
        }
    }
    return 0;
}
| 110 | |
/* ECI 27 ASCII (ISO/IEC 646:1991 IRV (US)): only U+0000-007F are representable.
   Returns 1 with the byte in `dest`, or 0 if `u` is outside that range */
static int u_ascii(const unsigned int u, unsigned char *dest) {
    if (u >= 0x80) {
        return 0;
    }
    *dest = (unsigned char) u;
    return 1;
}
| 119 | |
/* ECI 170 ASCII subset (ISO/IEC 646:1991 Invariant): ASCII minus the 12 chars that historically had
   national variants, namely "#$@[\]^`{|}~".
   Returns 1 with the byte in `dest`, or 0 if `u` is not in the invariant set */
static int u_ascii_inv(const unsigned int u, unsigned char *dest) {
    if (u > 0x7F) {
        return 0;
    }
    switch (u) {
        case '#': case '$': case '@':
        case '[': case '\\': case ']': case '^':
        case '`':
        case '{': case '|': case '}': case '~':
            return 0; /* Had national variants */
        default:
            break;
    }
    *dest = (unsigned char) u;
    return 1;
}
| 129 | |
/* ECI 25 UTF-16 Big Endian (ISO/IEC 10646) - assumes valid Unicode.
   Writes 2 bytes to `dest` for U+0000-FFFF, else a 4-byte surrogate pair. Returns the byte count */
static int u_utf16be(const unsigned int u, unsigned char *dest) {
    if (u >= 0x10000) {
        const unsigned int off = u - 0x10000;
        const unsigned int high = 0xD800 + (off >> 10);   /* Leading surrogate */
        const unsigned int low = 0xDC00 + (off & 0x3FF);  /* Trailing surrogate */
        dest[0] = (unsigned char) (high >> 8);
        dest[1] = (unsigned char) high;
        dest[2] = (unsigned char) (low >> 8);
        dest[3] = (unsigned char) low;
        return 4;
    }
    dest[0] = (unsigned char) (u >> 8);
    dest[1] = (unsigned char) u;
    return 2;
}
| 147 | |
/* ECI 33 UTF-16 Little Endian (ISO/IEC 10646) - assumes valid Unicode.
   Writes 2 bytes to `dest` for U+0000-FFFF, else a 4-byte surrogate pair. Returns the byte count */
static int u_utf16le(const unsigned int u, unsigned char *dest) {
    if (u >= 0x10000) {
        const unsigned int off = u - 0x10000;
        const unsigned int high = 0xD800 + (off >> 10);   /* Leading surrogate */
        const unsigned int low = 0xDC00 + (off & 0x3FF);  /* Trailing surrogate */
        dest[0] = (unsigned char) high;
        dest[1] = (unsigned char) (high >> 8);
        dest[2] = (unsigned char) low;
        dest[3] = (unsigned char) (low >> 8);
        return 4;
    }
    dest[0] = (unsigned char) u;
    dest[1] = (unsigned char) (u >> 8);
    return 2;
}
| 165 | |
/* ECI 34 UTF-32 Big Endian (ISO/IEC 10646) - assumes valid Unicode.
   Always writes 4 bytes to `dest`, most significant first; the top byte is written as zero.
   Returns 4 */
static int u_utf32be(const unsigned int u, unsigned char *dest) {
    dest[3] = (unsigned char) (u & 0xFF);
    dest[2] = (unsigned char) ((u >> 8) & 0xFF);
    dest[1] = (unsigned char) ((u >> 16) & 0xFF);
    dest[0] = 0;
    return 4;
}
| 174 | |
/* ECI 35 UTF-32 Little Endian (ISO/IEC 10646) - assumes valid Unicode.
   Always writes 4 bytes to `dest`, least significant first; the top byte is written as zero.
   Returns 4 */
static int u_utf32le(const unsigned int u, unsigned char *dest) {
    dest[3] = 0;
    dest[2] = (unsigned char) ((u >> 16) & 0xFF);
    dest[1] = (unsigned char) ((u >> 8) & 0xFF);
    dest[0] = (unsigned char) (u & 0xFF);
    return 4;
}
| 183 | |
| 184 /* Multibyte stuff */ | |
| 185 | |
| 186 /* Acknowledgements to Bruno Haible <bruno@clisp.org> for a no. of techniques used here */ | |
| 187 | |
/* Helper to lookup Unicode codepoint `u` in the URO (Unified Repertoire and Ordering) block (U+4E00-9FFF).
   `tab_u` is a presence bitmap with one 16-bit entry per block of 16 codepoints, `tab_mb_ind[]` gives
   for each block the index into `tab_mb` of its first mapped codepoint, and `tab_mb` holds the
   multibyte values. Callers must ensure `u` >= 0x4E00 and within the range covered by the tables.
   Returns 2 with the value in `d` if mapped, else 0 */
static int eci_u_lookup_uro_int(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind,
            const unsigned short *tab_mb, unsigned int *d) {
    const unsigned int blk = (u - 0x4E00) >> 4; /* Blocks of 16 */
    const unsigned int bit = 1u << (u & 0xF);
    const unsigned int present = tab_u[blk];
    unsigned int prior;
    unsigned int rank = 0;

    if ((present & bit) == 0) {
        return 0;
    }
    /* Rank within block = number of mapped codepoints prior to this one (clear lowest set bit until none) */
    for (prior = present & (bit - 1); prior != 0; prior &= prior - 1) {
        rank++;
    }
    *d = tab_mb[tab_mb_ind[blk] + rank];
    return 2;
}
| 204 | |
/* Byte-oriented front end to `eci_u_lookup_uro_int()`: on a hit the 16-bit value is stored
   big-endian in `dest[0..1]`. Returns what the lookup returned (2 if mapped, 0 if not) */
static int eci_u_lookup_uro(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind,
            const unsigned short *tab_mb, unsigned char *dest) {
    unsigned int mb;
    const int nbytes = eci_u_lookup_uro_int(u, tab_u, tab_mb_ind, tab_mb, &mb);

    if (nbytes == 0) {
        return 0; /* Unmapped, `dest` untouched */
    }
    dest[0] = (unsigned char) (mb >> 8);
    dest[1] = (unsigned char) mb;
    return nbytes;
}
| 216 | |
| 217 /* ECI 20 Shift JIS */ | |
| 218 static int u_sjis_int(const unsigned int u, unsigned int *d) { | |
| 219 unsigned int u2, dv, md; | |
| 220 int s, e; | |
| 221 | |
| 222 if (u < 0x80 && u != 0x5C && u != 0x7E) { /* Backslash & tilde re-mapped according to JIS X 0201 Roman */ | |
| 223 *d = u; | |
| 224 return 1; | |
| 225 } | |
| 226 /* Special case URO block sequential mappings (considerably lessens size of `sjis_u[]` array) */ | |
| 227 if (u >= 0x4E00 && u <= 0xDFFF) { /* 0xE000 next used value >= 0x4E00 */ | |
| 228 if (u >= 0x9FB0) { | |
| 229 return 0; | |
| 230 } | |
| 231 return eci_u_lookup_uro_int(u, sjis_uro_u, sjis_uro_mb_ind, sjis_mb, d); | |
| 232 } | |
| 233 /* PUA to user-defined (Table 4-86, Lunde, 2nd ed.) */ | |
| 234 if (u >= 0xE000 && u <= 0xE757) { | |
| 235 u2 = u - 0xE000; | |
| 236 dv = u2 / 188; | |
| 237 md = u2 - dv * 188; | |
| 238 *d = ((dv + 0xF0) << 8) | (md + 0x40 + (md >= 0x3F)); | |
| 239 return 2; | |
| 240 } | |
| 241 if (u >= sjis_u[0] && u <= sjis_u[ARRAY_SIZE(sjis_u) - 1]) { | |
| 242 s = 0; | |
| 243 e = ARRAY_SIZE(sjis_u) - 1; | |
| 244 while (s <= e) { | |
| 245 const int m = (s + e) >> 1; | |
| 246 if (sjis_u[m] < u) { | |
| 247 s = m + 1; | |
| 248 } else if (sjis_u[m] > u) { | |
| 249 e = m - 1; | |
| 250 } else { | |
| 251 *d = sjis_mb[u >= 0x4E00 ? m + 6356 : m]; /* Adjust for URO block */ | |
| 252 return 1 + (*d > 0xFF); | |
| 253 } | |
| 254 } | |
| 255 } | |
| 256 return 0; | |
| 257 } | |
| 258 | |
| 259 #ifdef ZINT_TEST /* Wrapper for direct testing */ | |
| 260 INTERNAL int u_sjis_int_test(const unsigned int u, unsigned int *d) { | |
| 261 return u_sjis_int(u, d); | |
| 262 } | |
| 263 #endif | |
| 264 | |
/* Byte-oriented front end to `u_sjis_int()`, for use by `utf8_to_eci()`: stores a 1-byte result in
   `dest[0]` or a 2-byte result big-endian in `dest[0..1]`. Returns the byte count, 0 if unmappable */
static int u_sjis(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int nbytes = u_sjis_int(u, &mb);

    if (nbytes == 1) {
        dest[0] = (unsigned char) mb;
    } else if (nbytes != 0) {
        dest[0] = (unsigned char) (mb >> 8);
        dest[1] = (unsigned char) mb;
    }
    return nbytes;
}
| 279 | |
| 280 /* ECI 28 Big5 Chinese (Taiwan) */ | |
| 281 static int u_big5(const unsigned int u, unsigned char *dest) { | |
| 282 int s, e; | |
| 283 | |
| 284 if (u < 0x80) { | |
| 285 *dest = (unsigned char) u; | |
| 286 return 1; | |
| 287 } | |
| 288 /* Special case URO block sequential mappings (considerably lessens size of `big5_u[]` array) */ | |
| 289 if (u >= 0x4E00 && u <= 0xFA0B) { /* 0xFA0C next used value >= 0x4E00 */ | |
| 290 if (u >= 0x9FB0) { | |
| 291 return 0; | |
| 292 } | |
| 293 return eci_u_lookup_uro(u, big5_uro_u, big5_uro_mb_ind, big5_mb, dest); | |
| 294 } | |
| 295 if (u >= big5_u[0] && u <= big5_u[ARRAY_SIZE(big5_u) - 1]) { | |
| 296 s = 0; | |
| 297 e = ARRAY_SIZE(big5_u) - 1; | |
| 298 while (s <= e) { | |
| 299 const int m = (s + e) >> 1; | |
| 300 if (big5_u[m] < u) { | |
| 301 s = m + 1; | |
| 302 } else if (big5_u[m] > u) { | |
| 303 e = m - 1; | |
| 304 } else { | |
| 305 const unsigned short mb = big5_mb[u >= 0x4E00 ? m + 13061 : m]; /* Adjust for URO block */ | |
| 306 dest[0] = (unsigned char) (mb >> 8); | |
| 307 dest[1] = (unsigned char) mb; | |
| 308 return 2; | |
| 309 } | |
| 310 } | |
| 311 } | |
| 312 return 0; | |
| 313 } | |
| 314 | |
| 315 #ifdef ZINT_TEST /* Wrapper for direct testing */ | |
| 316 INTERNAL int u_big5_test(const unsigned int u, unsigned char *dest) { | |
| 317 return u_big5(u, dest); | |
| 318 } | |
| 319 #endif | |
| 320 | |
| 321 /* ECI 30 EUC-KR (KS X 1001, formerly KS C 5601) Korean */ | |
| 322 static int u_ksx1001(const unsigned int u, unsigned char *dest) { | |
| 323 int s, e; | |
| 324 | |
| 325 if (u < 0x80) { | |
| 326 *dest = (unsigned char) u; | |
| 327 return 1; | |
| 328 } | |
| 329 /* Special case URO block sequential mappings (considerably lessens size of `ksx1001_u[]` array) */ | |
| 330 if (u >= 0x4E00 && u <= 0xABFF) { /* 0xAC00 next used value >= 0x4E00 */ | |
| 331 if (u >= 0x9FA0) { | |
| 332 return 0; | |
| 333 } | |
| 334 return eci_u_lookup_uro(u, ksx1001_uro_u, ksx1001_uro_mb_ind, ksx1001_mb, dest); | |
| 335 } | |
| 336 if (u >= ksx1001_u[0] && u <= ksx1001_u[ARRAY_SIZE(ksx1001_u) - 1]) { | |
| 337 s = ksx1001_u_ind[(u - ksx1001_u[0]) >> 8]; | |
| 338 e = s + 0x100 > ARRAY_SIZE(ksx1001_u) ? ARRAY_SIZE(ksx1001_u) - 1 : s + 0x100 - 1; | |
| 339 while (s <= e) { | |
| 340 const int m = (s + e) >> 1; | |
| 341 if (ksx1001_u[m] < u) { | |
| 342 s = m + 1; | |
| 343 } else if (ksx1001_u[m] > u) { | |
| 344 e = m - 1; | |
| 345 } else { | |
| 346 const unsigned short mb = ksx1001_mb[u >= 0x4E00 ? m + 4620 : m]; /* Adjust for URO block */ | |
| 347 dest[0] = (unsigned char) (mb >> 8); | |
| 348 dest[1] = (unsigned char) mb; | |
| 349 return 2; | |
| 350 } | |
| 351 } | |
| 352 } | |
| 353 return 0; | |
| 354 } | |
| 355 | |
| 356 #ifdef ZINT_TEST /* Wrapper for direct testing */ | |
| 357 INTERNAL int u_ksx1001_test(const unsigned int u, unsigned char *dest) { | |
| 358 return u_ksx1001(u, dest); | |
| 359 } | |
| 360 #endif | |
| 361 | |
| 362 /* ECI 29 GB 2312 Chinese (PRC) */ | |
| 363 static int u_gb2312_int(const unsigned int u, unsigned int *d) { | |
| 364 int s, e; | |
| 365 | |
| 366 if (u < 0x80) { | |
| 367 *d = u; | |
| 368 return 1; | |
| 369 } | |
| 370 /* Special case URO block sequential mappings (considerably lessens size of `gb2312_u[]` array) */ | |
| 371 if (u >= 0x4E00 && u <= 0x9E1E) { /* 0x9E1F next used value >= 0x4E00 */ | |
| 372 if (u >= 0x9CF0) { | |
| 373 return 0; | |
| 374 } | |
| 375 return eci_u_lookup_uro_int(u, gb2312_uro_u, gb2312_uro_mb_ind, gb2312_mb, d); | |
| 376 } | |
| 377 if (u >= gb2312_u[0] && u <= gb2312_u[ARRAY_SIZE(gb2312_u) - 1]) { | |
| 378 s = gb2312_u_ind[(u - gb2312_u[0]) >> 8]; | |
| 379 e = s + 0x100 > ARRAY_SIZE(gb2312_u) ? ARRAY_SIZE(gb2312_u) - 1 : s + 0x100 - 1; | |
| 380 while (s <= e) { | |
| 381 const int m = (s + e) >> 1; | |
| 382 if (gb2312_u[m] < u) { | |
| 383 s = m + 1; | |
| 384 } else if (gb2312_u[m] > u) { | |
| 385 e = m - 1; | |
| 386 } else { | |
| 387 *d = gb2312_mb[u > 0x4E00 ? m + 6627 : m]; /* Adjust for URO block */ | |
| 388 return 2; | |
| 389 } | |
| 390 } | |
| 391 } | |
| 392 return 0; | |
| 393 } | |
| 394 | |
| 395 #ifdef ZINT_TEST /* Wrapper for direct testing */ | |
| 396 INTERNAL int u_gb2312_int_test(const unsigned int u, unsigned int *d) { | |
| 397 return u_gb2312_int(u, d); | |
| 398 } | |
| 399 #endif | |
| 400 | |
/* Byte-oriented front end to `u_gb2312_int()`, for use by `utf8_to_eci()`: stores a 1-byte result in
   `dest[0]` or a 2-byte result big-endian in `dest[0..1]`. Returns the byte count, 0 if unmappable */
static int u_gb2312(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int nbytes = u_gb2312_int(u, &mb);

    if (nbytes == 1) {
        dest[0] = (unsigned char) mb;
    } else if (nbytes != 0) {
        dest[0] = (unsigned char) (mb >> 8);
        dest[1] = (unsigned char) mb;
    }
    return nbytes;
}
| 415 | |
| 416 /* ECI 31 GBK Chinese */ | |
| 417 static int u_gbk_int(const unsigned int u, unsigned int *d) { | |
| 418 int s, e; | |
| 419 | |
| 420 if (u < 0x80) { | |
| 421 *d = u; | |
| 422 return 1; | |
| 423 } | |
| 424 | |
| 425 /* Check GB 2312 first */ | |
| 426 if (u == 0x30FB) { | |
| 427 /* KATAKANA MIDDLE DOT, mapped by GB 2312 but not by GBK (U+00B7 MIDDLE DOT mapped to 0xA1A4 instead) */ | |
| 428 return 0; | |
| 429 } | |
| 430 if (u == 0x2015) { | |
| 431 /* HORIZONTAL BAR, mapped to 0xA844 by GBK rather than 0xA1AA (U+2014 EM DASH mapped there instead) */ | |
| 432 *d = 0xA844; | |
| 433 return 2; | |
| 434 } | |
| 435 if (u_gb2312_int(u, d)) { /* Includes the 2 GB 6345.1-86 corrections given in Table 3-22, Lunde, 2nd ed. */ | |
| 436 return 2; | |
| 437 } | |
| 438 | |
| 439 /* Special case URO block sequential mappings (considerably lessens size of `gbk_u[]` array) */ | |
| 440 if (u >= 0x4E00 && u <= 0xF92B) { /* 0xF92C next used value >= 0x4E00 */ | |
| 441 if (u >= 0x9FB0) { | |
| 442 return 0; | |
| 443 } | |
| 444 return eci_u_lookup_uro_int(u, gbk_uro_u, gbk_uro_mb_ind, gbk_mb, d); | |
| 445 } | |
| 446 if (u >= gbk_u[0] && u <= gbk_u[ARRAY_SIZE(gbk_u) - 1]) { | |
| 447 s = 0; | |
| 448 e = ARRAY_SIZE(gbk_u) - 1; | |
| 449 while (s <= e) { | |
| 450 const int m = (s + e) >> 1; | |
| 451 if (gbk_u[m] < u) { | |
| 452 s = m + 1; | |
| 453 } else if (gbk_u[m] > u) { | |
| 454 e = m - 1; | |
| 455 } else { | |
| 456 *d = gbk_mb[u >= 0x4E00 ? m + 14139 : m]; /* Adjust for URO block */ | |
| 457 return 2; | |
| 458 } | |
| 459 } | |
| 460 } | |
| 461 return 0; | |
| 462 } | |
| 463 | |
| 464 #ifdef ZINT_TEST /* Wrapper for direct testing */ | |
| 465 INTERNAL int u_gbk_int_test(const unsigned int u, unsigned int *d) { | |
| 466 return u_gbk_int(u, d); | |
| 467 } | |
| 468 #endif | |
| 469 | |
/* Byte-oriented front end to `u_gbk_int()`, for use by `utf8_to_eci()`: stores a 1-byte result in
   `dest[0]` or a 2-byte result big-endian in `dest[0..1]`. Returns the byte count, 0 if unmappable */
static int u_gbk(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int nbytes = u_gbk_int(u, &mb);

    if (nbytes == 1) {
        dest[0] = (unsigned char) mb;
    } else if (nbytes != 0) {
        dest[0] = (unsigned char) (mb >> 8);
        dest[1] = (unsigned char) mb;
    }
    return nbytes;
}
| 484 | |
/* Helper for `u_gb18030_int()` to output 4-byte sequential blocks: treats `u2` as a linear offset and
   splits it into mixed-radix digits (10, 126, 10, remainder) from least to most significant.
   `d1` gets bytes 1-2 (lead byte `mb_lead` + quotient, then 0x30-based digit), `d2` gets bytes 3-4
   (0x81-based digit, then 0x30-based digit). Always returns 4 */
static int u_gb18030_4_sequential_int(unsigned int u2, unsigned int mb_lead, unsigned int *d1, unsigned int *d2) {
    unsigned int byte4, byte3, byte2;

    byte4 = u2 % 10 + 0x30;
    u2 /= 10;
    byte3 = u2 % 126 + 0x81;
    u2 /= 126;
    byte2 = u2 % 10 + 0x30;
    u2 /= 10; /* What's left is added to the lead byte */

    *d2 = (byte3 << 8) | byte4;
    *d1 = ((u2 + mb_lead) << 8) | byte2;
    return 4;
}
| 499 | |
| 500 /* ECI 32 GB 18030 Chinese - assumes valid Unicode */ | |
| 501 static int u_gb18030_int(const unsigned int u, unsigned int *d1, unsigned int *d2) { | |
| 502 unsigned int u2, dv; | |
| 503 int s, e; | |
| 504 | |
| 505 if (u < 0x80) { | |
| 506 *d1 = u; | |
| 507 return 1; | |
| 508 } | |
| 509 | |
| 510 /* Check GBK first */ | |
| 511 if (u_gbk_int(u, d1)) { | |
| 512 return 2; | |
| 513 } | |
| 514 | |
| 515 if (u >= 0x10000) { | |
| 516 /* Non-PUA, non-BMP, see Table 3-37, Lunde, 2nd ed. */ | |
| 517 if (u == 0x20087) { | |
| 518 *d1 = 0xFE51; | |
| 519 return 2; | |
| 520 } | |
| 521 if (u == 0x20089) { | |
| 522 *d1 = 0xFE52; | |
| 523 return 2; | |
| 524 } | |
| 525 if (u == 0x200CC) { | |
| 526 *d1 = 0xFE53; | |
| 527 return 2; | |
| 528 } | |
| 529 if (u == 0x215D7) { | |
| 530 *d1 = 0xFE6C; | |
| 531 return 2; | |
| 532 } | |
| 533 if (u == 0x2298F) { | |
| 534 *d1 = 0xFE76; | |
| 535 return 2; | |
| 536 } | |
| 537 if (u == 0x241FE) { | |
| 538 *d1 = 0xFE91; | |
| 539 return 2; | |
| 540 } | |
| 541 /* All other non-BMP U+10000-10FFFF */ | |
| 542 return u_gb18030_4_sequential_int(u - 0x10000, 0x90, d1, d2); | |
| 543 } | |
| 544 if (u >= 0xE000 && u <= 0xE765) { /* PUA to user-defined */ | |
| 545 if (u <= 0xE4C5) { | |
| 546 u2 = u - 0xE000; | |
| 547 dv = u2 / 94; | |
| 548 *d1 = ((dv + (dv < 6 ? 0xAA : 0xF2)) << 8) | (u2 - dv * 94 + 0xA1); | |
| 549 } else { | |
| 550 unsigned int md; | |
| 551 u2 = u - 0xE4C6; | |
| 552 dv = u2 / 96; | |
| 553 md = u2 - dv * 96; | |
| 554 *d1 = ((dv + 0xA1) << 8) | (md + 0x40 + (md >= 0x3F)); | |
| 555 } | |
| 556 return 2; | |
| 557 } | |
| 558 if (u >= gb18030_2_u[0] && u <= gb18030_2_u[ARRAY_SIZE(gb18030_2_u) - 1]) { | |
| 559 s = 0; | |
| 560 e = ARRAY_SIZE(gb18030_2_u) - 1; | |
| 561 while (s <= e) { | |
| 562 const int m = (s + e) >> 1; | |
| 563 if (gb18030_2_u[m] < u) { | |
| 564 s = m + 1; | |
| 565 } else if (gb18030_2_u[m] > u) { | |
| 566 e = m - 1; | |
| 567 } else { | |
| 568 *d1 = gb18030_2_mb[m]; | |
| 569 return 2; | |
| 570 } | |
| 571 } | |
| 572 } | |
| 573 /* All other BMP U+0080-FFFF */ | |
| 574 if (u == 0xE7C7) { /* PUA change to non-PUA, see Table 3-39, Lunde, 2nd ed. */ | |
| 575 *d1 = 0x8135; | |
| 576 *d2 = 0xF437; | |
| 577 return 4; | |
| 578 } | |
| 579 s = 0; | |
| 580 e = ARRAY_SIZE(gb18030_4_u_e) - 1; | |
| 581 while (s < e) { /* Lower bound */ | |
| 582 const int m = (s + e) >> 1; | |
| 583 if (gb18030_4_u_e[m] < u) { | |
| 584 s = m + 1; | |
| 585 } else { | |
| 586 e = m; | |
| 587 } | |
| 588 } | |
| 589 assert(s < ARRAY_SIZE(gb18030_4_u_e)); | |
| 590 return u_gb18030_4_sequential_int(u - gb18030_4_mb_o[s] - 0x80, 0x81, d1, d2); | |
| 591 } | |
| 592 | |
| 593 #ifdef ZINT_TEST /* Wrapper for direct testing */ | |
| 594 INTERNAL int u_gb18030_int_test(const unsigned int u, unsigned int *d1, unsigned int *d2) { | |
| 595 return u_gb18030_int(u, d1, d2); | |
| 596 } | |
| 597 #endif | |
| 598 | |
/* Byte-oriented front end to `u_gb18030_int()`, for use by `utf8_to_eci()`: stores the 1, 2 or 4 byte
   result in `dest`, each 16-bit half big-endian. Returns the byte count */
static int u_gb18030(const unsigned int u, unsigned char *dest) {
    unsigned int first, second;
    const int nbytes = u_gb18030_int(u, &first, &second);

    if (nbytes == 0) {
        return 0;
    }
    if (nbytes == 1) {
        dest[0] = (unsigned char) first;
        return nbytes;
    }
    dest[0] = (unsigned char) (first >> 8);
    dest[1] = (unsigned char) first;
    if (nbytes == 4) { /* `second` only set for 4-byters */
        dest[2] = (unsigned char) (second >> 8);
        dest[3] = (unsigned char) second;
    }
    return nbytes;
}
| 617 | |
| 618 /* Main ECI stuff */ | |
| 619 | |
/* Helper to count the bytes of `string` (first `length` only) whose value lies in `c1`..`c2` inclusive */
static int chr_range_cnt(const unsigned char string[], const int length, const unsigned char c1,
            const unsigned char c2) {
    const unsigned char *p = string;
    const unsigned char *const end = string + (length > 0 ? length : 0);
    int total = 0;

    for (; p < end; p++) {
        /* When `c1` is zero the lower test is trivially true for unsigned bytes */
        total += (*p >= c1 && *p <= c2);
    }
    return total;
}
| 640 | |
| 641 /* Is ECI convertible from UTF-8? */ | |
| 642 INTERNAL int is_eci_convertible(const int eci) { | |
| 643 if (eci == 26 || (eci > 35 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */ | |
| 644 /* UTF-8 (26) or 8-bit binary data (899) or undefined (> 35 and < 899) or not character set (> 899) */ | |
| 645 return 0; | |
| 646 } | |
| 647 return 1; | |
| 648 } | |
| 649 | |
| 650 /* Are any of the ECIs in the segments convertible from UTF-8? | |
| 651 Sets `convertible[]` for each, which must be at least `seg_count` in size */ | |
| 652 INTERNAL int is_eci_convertible_segs(const struct zint_seg segs[], const int seg_count, int convertible[]) { | |
| 653 int ret = 0; | |
| 654 int i; | |
| 655 for (i = 0; i < seg_count; i++) { | |
| 656 convertible[i] = is_eci_convertible(segs[i].eci); | |
| 657 ret |= convertible[i]; | |
| 658 } | |
| 659 return ret; | |
| 660 } | |
| 661 | |
| 662 /* Calculate length required to convert UTF-8 to (double-byte) encoding */ | |
| 663 INTERNAL int get_eci_length(const int eci, const unsigned char source[], int length) { | |
| 664 if (eci == 20) { /* Shift JIS */ | |
| 665 /* Only ASCII backslash (reverse solidus) exceeds UTF-8 length */ | |
| 666 length += chr_cnt(source, length, '\\'); | |
| 667 | |
| 668 } else if (eci == 25 || eci == 33) { /* UTF-16 */ | |
| 669 /* All ASCII chars take 2 bytes */ | |
| 670 length += chr_range_cnt(source, length, 0, 0x7F); | |
| 671 /* Surrogate pairs are 4 UTF-8 bytes long so fit */ | |
| 672 | |
| 673 } else if (eci == 32) { /* GB 18030 */ | |
| 674 /* Allow for GB 18030 4 byters */ | |
| 675 length *= 2; | |
| 676 | |
| 677 } else if (eci == 34 || eci == 35) { /* UTF-32 */ | |
| 678 /* Quadruple-up ASCII and double-up non-ASCII */ | |
| 679 length += chr_range_cnt(source, length, 0, 0x7F) * 2 + length; | |
| 680 } | |
| 681 | |
| 682 /* Big5, GB 2312, EUC-KR and GBK fit in UTF-8 length */ | |
| 683 | |
| 684 return length; | |
| 685 } | |
| 686 | |
| 687 /* Call `get_eci_length()` for each segment, returning total */ | |
| 688 INTERNAL int get_eci_length_segs(const struct zint_seg segs[], const int seg_count) { | |
| 689 int length = 0; | |
| 690 int i; | |
| 691 | |
| 692 for (i = 0; i < seg_count; i++) { | |
| 693 length += get_eci_length(segs[i].eci, segs[i].source, segs[i].length); | |
| 694 } | |
| 695 | |
| 696 return length; | |
| 697 } | |
| 698 | |
| 699 /* Convert UTF-8 to other character encodings */ | |
| 700 typedef int (*eci_func_t)(const unsigned int u, unsigned char *dest); | |
| 701 INTERNAL int utf8_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *p_length) { | |
| 702 | |
| 703 static const eci_func_t eci_funcs[36] = { | |
| 704 NULL, NULL, NULL, NULL, u_iso8859_2, /*0-4*/ | |
| 705 u_iso8859_3, u_iso8859_4, u_iso8859_5, u_iso8859_6, u_iso8859_7, /*5-9*/ | |
| 706 u_iso8859_8, u_iso8859_9, u_iso8859_10, u_iso8859_11, NULL, /*10-14*/ | |
| 707 u_iso8859_13, u_iso8859_14, u_iso8859_15, u_iso8859_16, NULL, /*15-19*/ | |
| 708 u_sjis, u_cp1250, u_cp1251, u_cp1252, u_cp1256, /*20-24*/ | |
| 709 u_utf16be, NULL, u_ascii, u_big5, u_gb2312, /*25-29*/ | |
| 710 u_ksx1001, u_gbk, u_gb18030, u_utf16le, u_utf32be, /*30-34*/ | |
| 711 u_utf32le, | |
| 712 }; | |
| 713 eci_func_t eci_func; | |
| 714 unsigned int codepoint, state = 0; | |
| 715 int in_posn = 0; | |
| 716 int out_posn = 0; | |
| 717 int length = *p_length; | |
| 718 | |
| 719 /* Special case ISO/IEC 8859-1 */ | |
| 720 if (eci == 0 || eci == 3) { /* Default ECI 0 to ISO/IEC 8859-1 */ | |
| 721 while (in_posn < length) { | |
| 722 do { | |
| 723 decode_utf8(&state, &codepoint, source[in_posn++]); | |
| 724 } while (in_posn < length && state != 0 && state != 12); | |
| 725 if (state != 0) { | |
| 726 return ZINT_ERROR_INVALID_DATA; | |
| 727 } | |
| 728 if (codepoint >= 0x80 && (codepoint < 0xA0 || codepoint >= 0x100)) { | |
| 729 return ZINT_ERROR_INVALID_DATA; | |
| 730 } | |
| 731 dest[out_posn++] = (unsigned char) codepoint; | |
| 732 } | |
| 733 dest[out_posn] = '\0'; | |
| 734 *p_length = out_posn; | |
| 735 return 0; | |
| 736 } | |
| 737 | |
| 738 if (eci == 170) { /* ASCII Invariant (archaic subset) */ | |
| 739 eci_func = u_ascii_inv; | |
| 740 } else { | |
| 741 eci_func = eci_funcs[eci]; | |
| 742 if (eci_func == NULL) { | |
| 743 return ZINT_ERROR_INVALID_DATA; | |
| 744 } | |
| 745 } | |
| 746 | |
| 747 while (in_posn < length) { | |
| 748 int incr; | |
| 749 do { | |
| 750 decode_utf8(&state, &codepoint, source[in_posn++]); | |
| 751 } while (in_posn < length && state != 0 && state != 12); | |
| 752 if (state != 0) { | |
| 753 return ZINT_ERROR_INVALID_DATA; | |
| 754 } | |
| 755 incr = (*eci_func)(codepoint, dest + out_posn); | |
| 756 if (incr == 0) { | |
| 757 return ZINT_ERROR_INVALID_DATA; | |
| 758 } | |
| 759 out_posn += incr; | |
| 760 } | |
| 761 dest[out_posn] = '\0'; | |
| 762 *p_length = out_posn; | |
| 763 | |
| 764 return 0; | |
| 765 } | |
| 766 | |
| 767 /* Find the lowest single-byte ECI mode which will encode a given set of Unicode text, assuming valid UTF-8 */ | |
| 768 INTERNAL int get_best_eci(const unsigned char source[], int length) { | |
| 769 int eci = 3; | |
| 770 /* Note: attempting single-byte conversions only, so get_eci_length() unnecessary */ | |
| 771 unsigned char *local_source = (unsigned char *) z_alloca(length + 1); | |
| 772 | |
| 773 do { | |
| 774 if (eci == 14) { /* Reserved */ | |
| 775 eci = 15; | |
| 776 } else if (eci == 19) { /* Reserved */ | |
| 777 eci = 21; /* Skip 20 Shift JIS */ | |
| 778 } | |
| 779 if (utf8_to_eci(eci, source, local_source, &length) == 0) { | |
| 780 return eci; | |
| 781 } | |
| 782 eci++; | |
| 783 } while (eci < 25); | |
| 784 | |
| 785 assert(is_valid_utf8(source, length)); | |
| 786 | |
| 787 return 26; /* If all of these fail, use UTF-8! */ | |
| 788 } | |
| 789 | |
| 790 /* Call `get_best_eci()` for each segment, assuming valid UTF-8. Returns 0 on failure, first ECI set on success */ | |
| 791 INTERNAL int get_best_eci_segs(struct zint_symbol *symbol, struct zint_seg segs[], const int seg_count) { | |
| 792 const int default_eci = symbol->symbology == BARCODE_GRIDMATRIX ? 29 : symbol->symbology == BARCODE_UPNQR ? 4 : 3; | |
| 793 int first_eci_set = 0; | |
| 794 int i; | |
| 795 | |
| 796 for (i = 0; i < seg_count; i++) { | |
| 797 if (segs[i].eci == 0) { | |
| 798 const int eci = get_best_eci(segs[i].source, segs[i].length); | |
| 799 if (eci == default_eci) { | |
| 800 if (i != 0 && segs[i - 1].eci != 0 && segs[i - 1].eci != default_eci) { | |
| 801 segs[i].eci = eci; | |
| 802 if (first_eci_set == 0) { | |
| 803 first_eci_set = eci; | |
| 804 } | |
| 805 } | |
| 806 } else { | |
| 807 segs[i].eci = eci; | |
| 808 if (first_eci_set == 0) { | |
| 809 first_eci_set = eci; | |
| 810 if (i == 0) { | |
| 811 symbol->eci = eci; | |
| 812 } | |
| 813 } | |
| 814 } | |
| 815 } | |
| 816 } | |
| 817 | |
| 818 return first_eci_set; | |
| 819 } | |
| 820 | |
| 821 /* QRCODE Shift JIS helpers */ | |
| 822 | |
| 823 /* Convert UTF-8 string to Shift JIS and place in array of ints */ | |
| 824 INTERNAL int sjis_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length, | |
| 825 unsigned int *ddata) { | |
| 826 int error_number; | |
| 827 unsigned int i, length; | |
| 828 unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1)); | |
| 829 | |
| 830 error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/); | |
| 831 if (error_number != 0) { | |
| 832 return error_number; | |
| 833 } | |
| 834 | |
| 835 for (i = 0, length = *p_length; i < length; i++) { | |
| 836 if (!u_sjis_int(utfdata[i], ddata + i)) { | |
| 837 return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 800, "Invalid character in input"); | |
| 838 } | |
| 839 } | |
| 840 | |
| 841 return 0; | |
| 842 } | |
| 843 | |
/* Copy the byte stream `source` (length `*p_length`) into the int array `ddata`.
 * Without `full_multibyte` every byte becomes one entry and `*p_length` is left untouched.
 * With `full_multibyte`, a lead byte followed by a trail byte that together can be encoded in QR Kanji mode
 * is packed into a single entry as `(lead << 8) | trail`, and `*p_length` is set to the number of entries
 * written */
INTERNAL void sjis_cpy(const unsigned char source[], int *p_length, unsigned int *ddata, const int full_multibyte) {
    const unsigned int in_len = *p_length;
    unsigned int in_pos, out_pos;

    if (!full_multibyte) {
        /* Straight copy */
        for (in_pos = 0; in_pos < in_len; in_pos++) {
            ddata[in_pos] = source[in_pos];
        }
        return;
    }

    in_pos = 0;
    out_pos = 0;
    while (in_pos < in_len) {
        const unsigned char lead = source[in_pos++];
        unsigned int value = lead;

        /* Stricter interpretation of the standard: only these lead bytes, and only if a trail byte follows */
        if (in_pos < in_len && ((lead >= 0x81 && lead <= 0x9F) || (lead >= 0xE0 && lead <= 0xEB))) {
            const unsigned char trail = source[in_pos];

            /* Certain trailing bytes are excluded. The pair may or may not be valid Shift JIS, which doesn't
             * matter as long as it can be encoded in QR Kanji mode */
            if (trail >= 0x40 && trail <= 0xFC && trail != 0x7F && (lead != 0xEB || trail <= 0xBF)) {
                value = (lead << 8) | trail;
                in_pos++;
            }
        }
        ddata[out_pos++] = value;
    }
    *p_length = out_pos;
}
| 876 | |
| 877 /* Call `sjis_cpy()` for each segment */ | |
| 878 INTERNAL void sjis_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata, | |
| 879 const int full_multibyte) { | |
| 880 int i; | |
| 881 unsigned int *dd = ddata; | |
| 882 | |
| 883 for (i = 0; i < seg_count; i++) { | |
| 884 sjis_cpy(segs[i].source, &segs[i].length, dd, full_multibyte); | |
| 885 dd += segs[i].length; | |
| 886 } | |
| 887 } | |
| 888 | |
| 889 /* Convert UTF-8 string to ECI and place in array of ints using `sjis_cpy()` */ | |
| 890 INTERNAL int sjis_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata, | |
| 891 const int full_multibyte) { | |
| 892 | |
| 893 if (is_eci_convertible(eci)) { | |
| 894 int error_number; | |
| 895 const int eci_length = get_eci_length(eci, source, *p_length); | |
| 896 unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1); | |
| 897 | |
| 898 error_number = utf8_to_eci(eci, source, converted, p_length); | |
| 899 if (error_number != 0) { | |
| 900 /* Note not setting `symbol->errtxt`, up to caller */ | |
| 901 return error_number; | |
| 902 } | |
| 903 | |
| 904 sjis_cpy(converted, p_length, ddata, full_multibyte || eci == 20); | |
| 905 } else { | |
| 906 sjis_cpy(source, p_length, ddata, full_multibyte); | |
| 907 } | |
| 908 | |
| 909 return 0; | |
| 910 } | |
| 911 | |
| 912 /* GRIDMATRIX GB 2312 helpers */ | |
| 913 | |
| 914 /* Convert UTF-8 string to GB 2312 (EUC-CN) and place in array of ints */ | |
| 915 INTERNAL int gb2312_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length, | |
| 916 unsigned int *ddata) { | |
| 917 int error_number; | |
| 918 unsigned int i, length; | |
| 919 unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1)); | |
| 920 | |
| 921 error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/); | |
| 922 if (error_number != 0) { | |
| 923 return error_number; | |
| 924 } | |
| 925 | |
| 926 for (i = 0, length = *p_length; i < length; i++) { | |
| 927 if (utfdata[i] < 0x80) { | |
| 928 ddata[i] = utfdata[i]; | |
| 929 } else { | |
| 930 if (!u_gb2312_int(utfdata[i], ddata + i)) { | |
| 931 return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 810, "Invalid character in input"); | |
| 932 } | |
| 933 } | |
| 934 } | |
| 935 | |
| 936 return 0; | |
| 937 } | |
| 938 | |
/* Copy the byte stream `source` (length `*p_length`) into the int array `ddata`.
 * Without `full_multibyte` every byte becomes one entry and `*p_length` is left untouched.
 * With `full_multibyte`, a byte pair that can be encoded in GRIDMATRIX Chinese mode is packed into a single
 * entry as `(lead << 8) | trail`, and `*p_length` is set to the number of entries written */
static void gb2312_cpy(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    const unsigned int in_len = *p_length;
    unsigned int in_pos, out_pos;

    if (!full_multibyte) {
        /* Straight copy */
        for (in_pos = 0; in_pos < in_len; in_pos++) {
            ddata[in_pos] = source[in_pos];
        }
        return;
    }

    in_pos = 0;
    out_pos = 0;
    while (in_pos < in_len) {
        const unsigned char lead = source[in_pos++];
        unsigned int value = lead;

        if (in_pos < in_len) { /* A second byte is available */
            const unsigned char trail = source[in_pos];
            const int lead_ok = (lead >= 0xA1 && lead <= 0xA9) || (lead >= 0xB0 && lead <= 0xF7);

            /* The pair may or may not be valid GB 2312 (EUC-CN), which doesn't matter as long as it can be
             * encoded in GRIDMATRIX Chinese mode */
            if (lead_ok && trail >= 0xA1 && trail <= 0xFE) {
                value = (lead << 8) | trail;
                in_pos++;
            }
        }
        ddata[out_pos++] = value;
    }
    *p_length = out_pos;
}
| 971 | |
#if defined(ZINT_TEST)
/* Test-only entry point: forwards its arguments unchanged to the file-local `gb2312_cpy()` so it can be
 * exercised directly */
INTERNAL void gb2312_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    gb2312_cpy(source, p_length, ddata, full_multibyte);
}
#endif /* ZINT_TEST */
| 978 | |
| 979 /* Call `gb2312_cpy()` for each segment */ | |
| 980 INTERNAL void gb2312_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata, | |
| 981 const int full_multibyte) { | |
| 982 int i; | |
| 983 unsigned int *dd = ddata; | |
| 984 | |
| 985 for (i = 0; i < seg_count; i++) { | |
| 986 gb2312_cpy(segs[i].source, &segs[i].length, dd, full_multibyte); | |
| 987 dd += segs[i].length; | |
| 988 } | |
| 989 } | |
| 990 | |
| 991 /* Convert UTF-8 string to ECI and place in array of ints using `gb2312_cpy()` */ | |
| 992 INTERNAL int gb2312_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata, | |
| 993 const int full_multibyte) { | |
| 994 | |
| 995 if (is_eci_convertible(eci)) { | |
| 996 int error_number; | |
| 997 const int eci_length = get_eci_length(eci, source, *p_length); | |
| 998 unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1); | |
| 999 | |
| 1000 error_number = utf8_to_eci(eci, source, converted, p_length); | |
| 1001 if (error_number != 0) { | |
| 1002 /* Note not setting `symbol->errtxt`, up to caller */ | |
| 1003 return error_number; | |
| 1004 } | |
| 1005 | |
| 1006 gb2312_cpy(converted, p_length, ddata, full_multibyte || eci == 29); | |
| 1007 } else { | |
| 1008 gb2312_cpy(source, p_length, ddata, full_multibyte); | |
| 1009 } | |
| 1010 | |
| 1011 return 0; | |
| 1012 } | |
| 1013 | |
| 1014 /* HANXIN GB 18030 helpers */ | |
| 1015 | |
| 1016 /* Convert UTF-8 string to GB 18030 and place in array of ints */ | |
| 1017 INTERNAL int gb18030_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length, | |
| 1018 unsigned int *ddata) { | |
| 1019 int error_number, ret; | |
| 1020 unsigned int i, j, length; | |
| 1021 unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1)); | |
| 1022 | |
| 1023 error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 0 /*disallow_4byte*/); | |
| 1024 if (error_number != 0) { | |
| 1025 return error_number; | |
| 1026 } | |
| 1027 | |
| 1028 for (i = 0, j = 0, length = *p_length; i < length; i++, j++) { | |
| 1029 if (utfdata[i] < 0x80) { | |
| 1030 ddata[j] = utfdata[i]; | |
| 1031 } else { | |
| 1032 ret = u_gb18030_int(utfdata[i], ddata + j, ddata + j + 1); | |
| 1033 if (ret == 0) { /* Should never happen, as GB 18030 is a UTF i.e. maps all Unicode codepoints */ | |
| 1034 return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 820, "Invalid character in input"); /* Not reached */ | |
| 1035 } | |
| 1036 if (ret == 4) { | |
| 1037 j++; | |
| 1038 } | |
| 1039 } | |
| 1040 } | |
| 1041 | |
| 1042 *p_length = j; | |
| 1043 | |
| 1044 return 0; | |
| 1045 } | |
| 1046 | |
/* Copy the byte stream `source` (length `*p_length`) into the int array `ddata`.
 * Without `full_multibyte` every byte becomes one entry and `*p_length` is left untouched.
 * With `full_multibyte`, byte sequences matching HANXIN Chinese mode are packed: a double-byte goes in a single
 * entry as `(b1 << 8) | b2`, a quad-byte in two entries as `(b1 << 8) | b2` then `(b3 << 8) | b4`; anything
 * else is copied byte by byte. `*p_length` is then set to the number of entries written */
static void gb18030_cpy(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    const unsigned int in_len = *p_length;
    unsigned int in_pos, out_pos;

    if (!full_multibyte) {
        /* Straight copy */
        for (in_pos = 0; in_pos < in_len; in_pos++) {
            ddata[in_pos] = source[in_pos];
        }
        return;
    }

    in_pos = 0;
    out_pos = 0;
    while (in_pos < in_len) {
        const unsigned int left = in_len - in_pos; /* Bytes remaining, including the current one */
        const unsigned char b1 = source[in_pos];
        unsigned int value = b1; /* Falls back to the lone byte if no multibyte pattern matches */
        unsigned int used = 1;

        if (left >= 2 && b1 >= 0x81 && b1 <= 0xFE) {
            const unsigned char b2 = source[in_pos + 1];

            if ((b2 >= 0x40 && b2 <= 0x7E) || (b2 >= 0x80 && b2 <= 0xFE)) {
                /* Double-byte */
                value = (b1 << 8) | b2;
                used = 2;
            } else if (left >= 4 && b2 >= 0x30 && b2 <= 0x39) {
                const unsigned char b3 = source[in_pos + 2];
                const unsigned char b4 = source[in_pos + 3];

                if (b3 >= 0x81 && b3 <= 0xFE && b4 >= 0x30 && b4 <= 0x39) {
                    /* Quad-byte: first half stored here, second half stored below */
                    ddata[out_pos++] = (b1 << 8) | b2;
                    value = (b3 << 8) | b4;
                    used = 4;
                }
            }
        }
        ddata[out_pos++] = value;
        in_pos += used;
    }
    *p_length = out_pos;
}
| 1090 | |
#if defined(ZINT_TEST)
/* Test-only entry point: forwards its arguments unchanged to the file-local `gb18030_cpy()` so it can be
 * exercised directly */
INTERNAL void gb18030_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    gb18030_cpy(source, p_length, ddata, full_multibyte);
}
#endif /* ZINT_TEST */
| 1097 | |
| 1098 /* Call `gb18030_cpy()` for each segment */ | |
| 1099 INTERNAL void gb18030_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata, | |
| 1100 const int full_multibyte) { | |
| 1101 int i; | |
| 1102 unsigned int *dd = ddata; | |
| 1103 | |
| 1104 for (i = 0; i < seg_count; i++) { | |
| 1105 gb18030_cpy(segs[i].source, &segs[i].length, dd, full_multibyte); | |
| 1106 dd += segs[i].length; | |
| 1107 } | |
| 1108 } | |
| 1109 | |
| 1110 /* Convert UTF-8 string to ECI and place in array of ints using `gb18030_cpy()` */ | |
| 1111 INTERNAL int gb18030_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata, | |
| 1112 const int full_multibyte) { | |
| 1113 | |
| 1114 if (is_eci_convertible(eci)) { | |
| 1115 int error_number; | |
| 1116 const int eci_length = get_eci_length(eci, source, *p_length); | |
| 1117 unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1); | |
| 1118 | |
| 1119 error_number = utf8_to_eci(eci, source, converted, p_length); | |
| 1120 if (error_number != 0) { | |
| 1121 /* Note not setting `symbol->errtxt`, up to caller */ | |
| 1122 return error_number; | |
| 1123 } | |
| 1124 | |
| 1125 /* GB 18030 (ECI 32) superset of GB 2312 (ECI 29) and GBK (ECI 31) */ | |
| 1126 gb18030_cpy(converted, p_length, ddata, full_multibyte || eci == 32 || eci == 29 || eci == 31); | |
| 1127 } else { | |
| 1128 gb18030_cpy(source, p_length, ddata, full_multibyte); | |
| 1129 } | |
| 1130 | |
| 1131 return 0; | |
| 1132 } | |
| 1133 | |
| 1134 /* vim: set ts=4 sw=4 et : */ |
