comparison mupdf-source/thirdparty/zint/backend/eci.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /* eci.c - Extended Channel Interpretations */
2 /*
3 libzint - the open source barcode library
4 Copyright (C) 2009-2024 Robin Stuart <rstuart114@gmail.com>
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions
8 are met:
9
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. Neither the name of the project nor the names of its contributors
16 may be used to endorse or promote products derived from this software
17 without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 SUCH DAMAGE.
30 */
31 /* SPDX-License-Identifier: BSD-3-Clause */
32
33 #include <assert.h>
34 #include "common.h"
35 #include "eci.h"
36 #include "eci_sb.h"
37 #include "big5.h"
38 #include "gb18030.h"
39 #include "gb2312.h"
40 #include "gbk.h"
41 #include "ksx1001.h"
42 #include "sjis.h"
43
44 /* Single-byte stuff */
45
/* Shared ISO/IEC 8859-x converter for Unicode codepoint `u`.
   `tab_s` is a bitmap (16 codepoints per entry) flagging which of U+00A0-FF map to the identical byte value.
   `tab_u[0..e]` holds the remaining mapped codepoints (bisected, so must be ascending), with the byte for
   `tab_u[i]` at `tab_sb[i]`.
   Returns 1 with the byte in `*dest`, or 0 if `u` has no mapping */
static int u_iso8859(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u,
            const unsigned char *tab_sb, int e, unsigned char *dest) {
    int lo = 0;

    if (u < 0x80) { /* ASCII maps to itself */
        *dest = (unsigned char) u;
        return 1;
    }
    if (u < 0xA0) { /* U+0080-9F never representable */
        return 0;
    }
    if (u <= 0xFF) {
        const unsigned int offset = u - 0xA0;
        const unsigned int bit = 1U << (offset & 0xF);
        if (tab_s[offset >> 4] & bit) { /* Same value in both sets */
            *dest = (unsigned char) u;
            return 1;
        }
    }

    /* Bisect the table of non-identity mappings */
    while (lo <= e) {
        const int mid = (lo + e) >> 1;
        const unsigned int cand = tab_u[mid];
        if (cand == u) {
            *dest = tab_sb[mid];
            return 1;
        }
        if (cand < u) {
            lo = mid + 1;
        } else {
            e = mid - 1;
        }
    }
    return 0;
}
79
/* Shared Windows-125x converter for Unicode codepoint `u`.
   Same table layout as `u_iso8859()`, except that U+0080-9F are not rejected outright but looked up in
   `tab_u[0..e]` like any other non-identity codepoint.
   Returns 1 with the byte in `*dest`, or 0 if `u` has no mapping */
static int u_cp125x(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u,
            const unsigned char *tab_sb, int e, unsigned char *dest) {
    int lo = 0;

    if (u < 0x80) { /* ASCII maps to itself */
        *dest = (unsigned char) u;
        return 1;
    }
    if (u >= 0xA0 && u <= 0xFF) {
        const unsigned int offset = u - 0xA0;
        const unsigned int bit = 1U << (offset & 0xF);
        if (tab_s[offset >> 4] & bit) { /* Same value in both sets */
            *dest = (unsigned char) u;
            return 1;
        }
    }

    /* Bisect the table of non-identity mappings */
    while (lo <= e) {
        const int mid = (lo + e) >> 1;
        const unsigned int cand = tab_u[mid];
        if (cand == u) {
            *dest = tab_sb[mid];
            return 1;
        }
        if (cand < u) {
            lo = mid + 1;
        } else {
            e = mid - 1;
        }
    }
    return 0;
}
110
/* ECI 27 ASCII (ISO/IEC 646:1991 IRV (US)): only U+0000-7F accepted, copied unchanged.
   Returns 1 with the byte in `*dest`, or 0 if `u` is outside ASCII */
static int u_ascii(const unsigned int u, unsigned char *dest) {
    if (u >= 0x80) {
        return 0;
    }
    *dest = (unsigned char) u;
    return 1;
}
119
/* ECI 170 ASCII subset (ISO/IEC 646:1991 Invariant): U+0000-7F minus the 12 chars that historically had
   national variants, namely "#$@[\]^`{|}~".
   Returns 1 with the byte in `*dest`, or 0 if `u` is not in the subset */
static int u_ascii_inv(const unsigned int u, unsigned char *dest) {
    if (u > 0x7F) {
        return 0;
    }
    switch (u) {
        case '#': case '$': case '@':
        case '[': case '\\': case ']': case '^': case '`':
        case '{': case '|': case '}': case '~':
            return 0;
        default:
            break;
    }
    *dest = (unsigned char) u;
    return 1;
}
129
/* ECI 25 UTF-16 Big Endian (ISO/IEC 10646) - assumes valid Unicode.
   Writes 2 bytes for `u` below U+10000, otherwise a 4-byte surrogate pair; returns the no. of bytes written */
static int u_utf16be(const unsigned int u, unsigned char *dest) {
    if (u >= 0x10000) {
        const unsigned int offset = u - 0x10000;
        const unsigned int high10 = offset >> 10; /* Goes in lead surrogate (0xD800 base) */
        const unsigned int low10 = offset & 0x3FF; /* Goes in trail surrogate (0xDC00 base) */
        dest[0] = (unsigned char) (0xD8 + (high10 >> 8));
        dest[1] = (unsigned char) high10;
        dest[2] = (unsigned char) (0xDC + (low10 >> 8));
        dest[3] = (unsigned char) low10;
        return 4;
    }
    dest[0] = (unsigned char) (u >> 8);
    dest[1] = (unsigned char) u;
    return 2;
}
147
/* ECI 33 UTF-16 Little Endian (ISO/IEC 10646) - assumes valid Unicode.
   Writes 2 bytes for `u` below U+10000, otherwise a 4-byte surrogate pair; returns the no. of bytes written */
static int u_utf16le(const unsigned int u, unsigned char *dest) {
    if (u >= 0x10000) {
        const unsigned int offset = u - 0x10000;
        const unsigned int high10 = offset >> 10; /* Goes in lead surrogate (0xD800 base) */
        const unsigned int low10 = offset & 0x3FF; /* Goes in trail surrogate (0xDC00 base) */
        dest[0] = (unsigned char) high10;
        dest[1] = (unsigned char) (0xD8 + (high10 >> 8));
        dest[2] = (unsigned char) low10;
        dest[3] = (unsigned char) (0xDC + (low10 >> 8));
        return 4;
    }
    dest[0] = (unsigned char) u;
    dest[1] = (unsigned char) (u >> 8);
    return 2;
}
165
/* ECI 34 UTF-32 Big Endian (ISO/IEC 10646) - assumes valid Unicode.
   Always writes 4 bytes, most significant first (top byte fixed at zero); returns 4 */
static int u_utf32be(const unsigned int u, unsigned char *dest) {
    unsigned char *p = dest;

    *p++ = 0;
    *p++ = (unsigned char) (u >> 16);
    *p++ = (unsigned char) (u >> 8);
    *p = (unsigned char) u;
    return 4;
}
174
/* ECI 35 UTF-32 Little Endian (ISO/IEC 10646) - assumes valid Unicode.
   Always writes 4 bytes, least significant first (top byte fixed at zero); returns 4 */
static int u_utf32le(const unsigned int u, unsigned char *dest) {
    unsigned char *p = dest;

    *p++ = (unsigned char) u;
    *p++ = (unsigned char) (u >> 8);
    *p++ = (unsigned char) (u >> 16);
    *p = 0;
    return 4;
}
183
184 /* Multibyte stuff */
185
186 /* Acknowledgements to Bruno Haible <bruno@clisp.org> for a no. of techniques used here */
187
/* Helper to lookup Unicode codepoint `u` in the URO (Unified Repertoire and Ordering) block (U+4E00-9FFF).
   `tab_u` is a bitmap with one entry per 16 codepoints, `tab_mb_ind` gives for each such entry the index into
   `tab_mb` of its first mapped codepoint. Caller must ensure `u` >= 0x4E00 and within the tables (not checked).
   Returns 2 with the multibyte value in `*d` if mapped, else 0 (`*d` untouched) */
static int eci_u_lookup_uro_int(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind,
            const unsigned short *tab_mb, unsigned int *d) {
    const unsigned int block = (u - 0x4E00) >> 4; /* Which run of 16 */
    const unsigned int bit = (unsigned int) 1 << (u & 0xF);
    unsigned int prior;

    if (!(tab_u[block] & bit)) {
        return 0;
    }
    /* The no. of mapped codepoints before this one in the block is its offset from the block's first entry */
    prior = tab_u[block] & (bit - 1);
    /* Count bits set (http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel) */
    prior = prior - ((prior >> 1) & 0x55555555);
    prior = (prior & 0x33333333) + ((prior >> 2) & 0x33333333);
    prior = (((prior + (prior >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;

    *d = tab_mb[tab_mb_ind[block] + prior];
    return 2;
}
204
/* Byte-output flavour of `eci_u_lookup_uro_int()`: on a hit the 16-bit value is stored big-endian in `dest[0..1]`.
   Returns what `eci_u_lookup_uro_int()` returns (2 if mapped, else 0 with `dest` untouched) */
static int eci_u_lookup_uro(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind,
            const unsigned short *tab_mb, unsigned char *dest) {
    unsigned int mb;
    const int len = eci_u_lookup_uro_int(u, tab_u, tab_mb_ind, tab_mb, &mb);

    if (len == 0) {
        return len;
    }
    dest[0] = (unsigned char) (mb >> 8);
    dest[1] = (unsigned char) mb;
    return len;
}
216
217 /* ECI 20 Shift JIS */
218 static int u_sjis_int(const unsigned int u, unsigned int *d) {
219 unsigned int u2, dv, md;
220 int s, e;
221
222 if (u < 0x80 && u != 0x5C && u != 0x7E) { /* Backslash & tilde re-mapped according to JIS X 0201 Roman */
223 *d = u;
224 return 1;
225 }
226 /* Special case URO block sequential mappings (considerably lessens size of `sjis_u[]` array) */
227 if (u >= 0x4E00 && u <= 0xDFFF) { /* 0xE000 next used value >= 0x4E00 */
228 if (u >= 0x9FB0) {
229 return 0;
230 }
231 return eci_u_lookup_uro_int(u, sjis_uro_u, sjis_uro_mb_ind, sjis_mb, d);
232 }
233 /* PUA to user-defined (Table 4-86, Lunde, 2nd ed.) */
234 if (u >= 0xE000 && u <= 0xE757) {
235 u2 = u - 0xE000;
236 dv = u2 / 188;
237 md = u2 - dv * 188;
238 *d = ((dv + 0xF0) << 8) | (md + 0x40 + (md >= 0x3F));
239 return 2;
240 }
241 if (u >= sjis_u[0] && u <= sjis_u[ARRAY_SIZE(sjis_u) - 1]) {
242 s = 0;
243 e = ARRAY_SIZE(sjis_u) - 1;
244 while (s <= e) {
245 const int m = (s + e) >> 1;
246 if (sjis_u[m] < u) {
247 s = m + 1;
248 } else if (sjis_u[m] > u) {
249 e = m - 1;
250 } else {
251 *d = sjis_mb[u >= 0x4E00 ? m + 6356 : m]; /* Adjust for URO block */
252 return 1 + (*d > 0xFF);
253 }
254 }
255 }
256 return 0;
257 }
258
#ifdef ZINT_TEST /* Test builds only: non-static entry point onto `u_sjis_int()` */
INTERNAL int u_sjis_int_test(const unsigned int u, unsigned int *d) {
    return u_sjis_int(u, d);
}
#endif
264
/* Byte-output flavour of `u_sjis_int()`, for use by `utf8_to_eci()`: stores 1 byte, or 2 bytes big-endian,
   in `dest` according to the length returned. Returns that length (0 if no mapping, `dest` untouched) */
static int u_sjis(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int len = u_sjis_int(u, &mb);

    if (len == 1) {
        dest[0] = (unsigned char) mb;
    } else if (len != 0) {
        dest[0] = (unsigned char) (mb >> 8);
        dest[1] = (unsigned char) mb;
    }
    return len;
}
279
280 /* ECI 28 Big5 Chinese (Taiwan) */
281 static int u_big5(const unsigned int u, unsigned char *dest) {
282 int s, e;
283
284 if (u < 0x80) {
285 *dest = (unsigned char) u;
286 return 1;
287 }
288 /* Special case URO block sequential mappings (considerably lessens size of `big5_u[]` array) */
289 if (u >= 0x4E00 && u <= 0xFA0B) { /* 0xFA0C next used value >= 0x4E00 */
290 if (u >= 0x9FB0) {
291 return 0;
292 }
293 return eci_u_lookup_uro(u, big5_uro_u, big5_uro_mb_ind, big5_mb, dest);
294 }
295 if (u >= big5_u[0] && u <= big5_u[ARRAY_SIZE(big5_u) - 1]) {
296 s = 0;
297 e = ARRAY_SIZE(big5_u) - 1;
298 while (s <= e) {
299 const int m = (s + e) >> 1;
300 if (big5_u[m] < u) {
301 s = m + 1;
302 } else if (big5_u[m] > u) {
303 e = m - 1;
304 } else {
305 const unsigned short mb = big5_mb[u >= 0x4E00 ? m + 13061 : m]; /* Adjust for URO block */
306 dest[0] = (unsigned char) (mb >> 8);
307 dest[1] = (unsigned char) mb;
308 return 2;
309 }
310 }
311 }
312 return 0;
313 }
314
#ifdef ZINT_TEST /* Test builds only: non-static entry point onto `u_big5()` */
INTERNAL int u_big5_test(const unsigned int u, unsigned char *dest) {
    return u_big5(u, dest);
}
#endif
320
321 /* ECI 30 EUC-KR (KS X 1001, formerly KS C 5601) Korean */
322 static int u_ksx1001(const unsigned int u, unsigned char *dest) {
323 int s, e;
324
325 if (u < 0x80) {
326 *dest = (unsigned char) u;
327 return 1;
328 }
329 /* Special case URO block sequential mappings (considerably lessens size of `ksx1001_u[]` array) */
330 if (u >= 0x4E00 && u <= 0xABFF) { /* 0xAC00 next used value >= 0x4E00 */
331 if (u >= 0x9FA0) {
332 return 0;
333 }
334 return eci_u_lookup_uro(u, ksx1001_uro_u, ksx1001_uro_mb_ind, ksx1001_mb, dest);
335 }
336 if (u >= ksx1001_u[0] && u <= ksx1001_u[ARRAY_SIZE(ksx1001_u) - 1]) {
337 s = ksx1001_u_ind[(u - ksx1001_u[0]) >> 8];
338 e = s + 0x100 > ARRAY_SIZE(ksx1001_u) ? ARRAY_SIZE(ksx1001_u) - 1 : s + 0x100 - 1;
339 while (s <= e) {
340 const int m = (s + e) >> 1;
341 if (ksx1001_u[m] < u) {
342 s = m + 1;
343 } else if (ksx1001_u[m] > u) {
344 e = m - 1;
345 } else {
346 const unsigned short mb = ksx1001_mb[u >= 0x4E00 ? m + 4620 : m]; /* Adjust for URO block */
347 dest[0] = (unsigned char) (mb >> 8);
348 dest[1] = (unsigned char) mb;
349 return 2;
350 }
351 }
352 }
353 return 0;
354 }
355
#ifdef ZINT_TEST /* Test builds only: non-static entry point onto `u_ksx1001()` */
INTERNAL int u_ksx1001_test(const unsigned int u, unsigned char *dest) {
    return u_ksx1001(u, dest);
}
#endif
361
362 /* ECI 29 GB 2312 Chinese (PRC) */
363 static int u_gb2312_int(const unsigned int u, unsigned int *d) {
364 int s, e;
365
366 if (u < 0x80) {
367 *d = u;
368 return 1;
369 }
370 /* Special case URO block sequential mappings (considerably lessens size of `gb2312_u[]` array) */
371 if (u >= 0x4E00 && u <= 0x9E1E) { /* 0x9E1F next used value >= 0x4E00 */
372 if (u >= 0x9CF0) {
373 return 0;
374 }
375 return eci_u_lookup_uro_int(u, gb2312_uro_u, gb2312_uro_mb_ind, gb2312_mb, d);
376 }
377 if (u >= gb2312_u[0] && u <= gb2312_u[ARRAY_SIZE(gb2312_u) - 1]) {
378 s = gb2312_u_ind[(u - gb2312_u[0]) >> 8];
379 e = s + 0x100 > ARRAY_SIZE(gb2312_u) ? ARRAY_SIZE(gb2312_u) - 1 : s + 0x100 - 1;
380 while (s <= e) {
381 const int m = (s + e) >> 1;
382 if (gb2312_u[m] < u) {
383 s = m + 1;
384 } else if (gb2312_u[m] > u) {
385 e = m - 1;
386 } else {
387 *d = gb2312_mb[u > 0x4E00 ? m + 6627 : m]; /* Adjust for URO block */
388 return 2;
389 }
390 }
391 }
392 return 0;
393 }
394
#ifdef ZINT_TEST /* Test builds only: non-static entry point onto `u_gb2312_int()` */
INTERNAL int u_gb2312_int_test(const unsigned int u, unsigned int *d) {
    return u_gb2312_int(u, d);
}
#endif
400
/* Byte-output flavour of `u_gb2312_int()`, for use by `utf8_to_eci()`: stores 1 byte, or 2 bytes big-endian,
   in `dest` according to the length returned. Returns that length (0 if no mapping, `dest` untouched) */
static int u_gb2312(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int len = u_gb2312_int(u, &mb);

    if (len == 0) {
        return len;
    }
    if (len == 1) {
        dest[0] = (unsigned char) mb;
        return len;
    }
    dest[0] = (unsigned char) (mb >> 8);
    dest[1] = (unsigned char) mb;
    return len;
}
415
416 /* ECI 31 GBK Chinese */
417 static int u_gbk_int(const unsigned int u, unsigned int *d) {
418 int s, e;
419
420 if (u < 0x80) {
421 *d = u;
422 return 1;
423 }
424
425 /* Check GB 2312 first */
426 if (u == 0x30FB) {
427 /* KATAKANA MIDDLE DOT, mapped by GB 2312 but not by GBK (U+00B7 MIDDLE DOT mapped to 0xA1A4 instead) */
428 return 0;
429 }
430 if (u == 0x2015) {
431 /* HORIZONTAL BAR, mapped to 0xA844 by GBK rather than 0xA1AA (U+2014 EM DASH mapped there instead) */
432 *d = 0xA844;
433 return 2;
434 }
435 if (u_gb2312_int(u, d)) { /* Includes the 2 GB 6345.1-86 corrections given in Table 3-22, Lunde, 2nd ed. */
436 return 2;
437 }
438
439 /* Special case URO block sequential mappings (considerably lessens size of `gbk_u[]` array) */
440 if (u >= 0x4E00 && u <= 0xF92B) { /* 0xF92C next used value >= 0x4E00 */
441 if (u >= 0x9FB0) {
442 return 0;
443 }
444 return eci_u_lookup_uro_int(u, gbk_uro_u, gbk_uro_mb_ind, gbk_mb, d);
445 }
446 if (u >= gbk_u[0] && u <= gbk_u[ARRAY_SIZE(gbk_u) - 1]) {
447 s = 0;
448 e = ARRAY_SIZE(gbk_u) - 1;
449 while (s <= e) {
450 const int m = (s + e) >> 1;
451 if (gbk_u[m] < u) {
452 s = m + 1;
453 } else if (gbk_u[m] > u) {
454 e = m - 1;
455 } else {
456 *d = gbk_mb[u >= 0x4E00 ? m + 14139 : m]; /* Adjust for URO block */
457 return 2;
458 }
459 }
460 }
461 return 0;
462 }
463
#ifdef ZINT_TEST /* Test builds only: non-static entry point onto `u_gbk_int()` */
INTERNAL int u_gbk_int_test(const unsigned int u, unsigned int *d) {
    return u_gbk_int(u, d);
}
#endif
469
/* Byte-output flavour of `u_gbk_int()`, for use by `utf8_to_eci()`: stores 1 byte, or 2 bytes big-endian,
   in `dest` according to the length returned. Returns that length (0 if no mapping, `dest` untouched) */
static int u_gbk(const unsigned int u, unsigned char *dest) {
    unsigned int mb;
    const int len = u_gbk_int(u, &mb);

    if (len == 1) {
        dest[0] = (unsigned char) mb;
    } else if (len != 0) {
        dest[0] = (unsigned char) (mb >> 8);
        dest[1] = (unsigned char) mb;
    }
    return len;
}
484
/* Helper for `u_gb18030_int()` to output 4-byte sequential blocks.
   Treats `u2` as a mixed-radix no. with digits (from least significant) of base 10, 126, 10 and the remainder,
   which become bytes 4 (0x30 + digit), 3 (0x81 + digit), 2 (0x30 + digit) and 1 (`mb_lead` + remainder).
   Bytes 1 & 2 go in `*d1`, bytes 3 & 4 in `*d2`. Always returns 4 */
static int u_gb18030_4_sequential_int(unsigned int u2, unsigned int mb_lead, unsigned int *d1, unsigned int *d2) {
    unsigned int byte1, byte2, byte3, byte4;

    byte4 = u2 % 10 + 0x30;
    u2 /= 10;
    byte3 = u2 % 126 + 0x81;
    u2 /= 126;
    byte2 = u2 % 10 + 0x30;
    byte1 = u2 / 10 + mb_lead;

    *d2 = (byte3 << 8) | byte4;
    *d1 = (byte1 << 8) | byte2;
    return 4;
}
499
500 /* ECI 32 GB 18030 Chinese - assumes valid Unicode */
501 static int u_gb18030_int(const unsigned int u, unsigned int *d1, unsigned int *d2) {
502 unsigned int u2, dv;
503 int s, e;
504
505 if (u < 0x80) {
506 *d1 = u;
507 return 1;
508 }
509
510 /* Check GBK first */
511 if (u_gbk_int(u, d1)) {
512 return 2;
513 }
514
515 if (u >= 0x10000) {
516 /* Non-PUA, non-BMP, see Table 3-37, Lunde, 2nd ed. */
517 if (u == 0x20087) {
518 *d1 = 0xFE51;
519 return 2;
520 }
521 if (u == 0x20089) {
522 *d1 = 0xFE52;
523 return 2;
524 }
525 if (u == 0x200CC) {
526 *d1 = 0xFE53;
527 return 2;
528 }
529 if (u == 0x215D7) {
530 *d1 = 0xFE6C;
531 return 2;
532 }
533 if (u == 0x2298F) {
534 *d1 = 0xFE76;
535 return 2;
536 }
537 if (u == 0x241FE) {
538 *d1 = 0xFE91;
539 return 2;
540 }
541 /* All other non-BMP U+10000-10FFFF */
542 return u_gb18030_4_sequential_int(u - 0x10000, 0x90, d1, d2);
543 }
544 if (u >= 0xE000 && u <= 0xE765) { /* PUA to user-defined */
545 if (u <= 0xE4C5) {
546 u2 = u - 0xE000;
547 dv = u2 / 94;
548 *d1 = ((dv + (dv < 6 ? 0xAA : 0xF2)) << 8) | (u2 - dv * 94 + 0xA1);
549 } else {
550 unsigned int md;
551 u2 = u - 0xE4C6;
552 dv = u2 / 96;
553 md = u2 - dv * 96;
554 *d1 = ((dv + 0xA1) << 8) | (md + 0x40 + (md >= 0x3F));
555 }
556 return 2;
557 }
558 if (u >= gb18030_2_u[0] && u <= gb18030_2_u[ARRAY_SIZE(gb18030_2_u) - 1]) {
559 s = 0;
560 e = ARRAY_SIZE(gb18030_2_u) - 1;
561 while (s <= e) {
562 const int m = (s + e) >> 1;
563 if (gb18030_2_u[m] < u) {
564 s = m + 1;
565 } else if (gb18030_2_u[m] > u) {
566 e = m - 1;
567 } else {
568 *d1 = gb18030_2_mb[m];
569 return 2;
570 }
571 }
572 }
573 /* All other BMP U+0080-FFFF */
574 if (u == 0xE7C7) { /* PUA change to non-PUA, see Table 3-39, Lunde, 2nd ed. */
575 *d1 = 0x8135;
576 *d2 = 0xF437;
577 return 4;
578 }
579 s = 0;
580 e = ARRAY_SIZE(gb18030_4_u_e) - 1;
581 while (s < e) { /* Lower bound */
582 const int m = (s + e) >> 1;
583 if (gb18030_4_u_e[m] < u) {
584 s = m + 1;
585 } else {
586 e = m;
587 }
588 }
589 assert(s < ARRAY_SIZE(gb18030_4_u_e));
590 return u_gb18030_4_sequential_int(u - gb18030_4_mb_o[s] - 0x80, 0x81, d1, d2);
591 }
592
#ifdef ZINT_TEST /* Test builds only: non-static entry point onto `u_gb18030_int()` */
INTERNAL int u_gb18030_int_test(const unsigned int u, unsigned int *d1, unsigned int *d2) {
    return u_gb18030_int(u, d1, d2);
}
#endif
598
/* Byte-output flavour of `u_gb18030_int()`, for use by `utf8_to_eci()`: stores 1, 2 or 4 bytes in `dest`
   (each 16-bit half big-endian) according to the length returned. Returns that length */
static int u_gb18030(const unsigned int u, unsigned char *dest) {
    unsigned int first, second;
    const int len = u_gb18030_int(u, &first, &second);

    if (len == 0) {
        return len;
    }
    if (len == 1) {
        dest[0] = (unsigned char) first;
        return len;
    }
    dest[0] = (unsigned char) (first >> 8);
    dest[1] = (unsigned char) first;
    if (len == 4) {
        dest[2] = (unsigned char) (second >> 8);
        dest[3] = (unsigned char) second;
    }
    return len;
}
617
618 /* Main ECI stuff */
619
/* Helper returning how many of the first `length` bytes of `string` lie in the inclusive range `c1` to `c2` */
static int chr_range_cnt(const unsigned char string[], const int length, const unsigned char c1,
            const unsigned char c2) {
    const unsigned char *p = string;
    const unsigned char *const end = string + (length > 0 ? length : 0);
    int matches = 0;

    for (; p < end; p++) {
        if (*p >= c1 && *p <= c2) {
            matches++;
        }
    }
    return matches;
}
640
641 /* Is ECI convertible from UTF-8? */
642 INTERNAL int is_eci_convertible(const int eci) {
643 if (eci == 26 || (eci > 35 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */
644 /* UTF-8 (26) or 8-bit binary data (899) or undefined (> 35 and < 899) or not character set (> 899) */
645 return 0;
646 }
647 return 1;
648 }
649
650 /* Are any of the ECIs in the segments convertible from UTF-8?
651 Sets `convertible[]` for each, which must be at least `seg_count` in size */
652 INTERNAL int is_eci_convertible_segs(const struct zint_seg segs[], const int seg_count, int convertible[]) {
653 int ret = 0;
654 int i;
655 for (i = 0; i < seg_count; i++) {
656 convertible[i] = is_eci_convertible(segs[i].eci);
657 ret |= convertible[i];
658 }
659 return ret;
660 }
661
662 /* Calculate length required to convert UTF-8 to (double-byte) encoding */
663 INTERNAL int get_eci_length(const int eci, const unsigned char source[], int length) {
664 if (eci == 20) { /* Shift JIS */
665 /* Only ASCII backslash (reverse solidus) exceeds UTF-8 length */
666 length += chr_cnt(source, length, '\\');
667
668 } else if (eci == 25 || eci == 33) { /* UTF-16 */
669 /* All ASCII chars take 2 bytes */
670 length += chr_range_cnt(source, length, 0, 0x7F);
671 /* Surrogate pairs are 4 UTF-8 bytes long so fit */
672
673 } else if (eci == 32) { /* GB 18030 */
674 /* Allow for GB 18030 4 byters */
675 length *= 2;
676
677 } else if (eci == 34 || eci == 35) { /* UTF-32 */
678 /* Quadruple-up ASCII and double-up non-ASCII */
679 length += chr_range_cnt(source, length, 0, 0x7F) * 2 + length;
680 }
681
682 /* Big5, GB 2312, EUC-KR and GBK fit in UTF-8 length */
683
684 return length;
685 }
686
687 /* Call `get_eci_length()` for each segment, returning total */
688 INTERNAL int get_eci_length_segs(const struct zint_seg segs[], const int seg_count) {
689 int length = 0;
690 int i;
691
692 for (i = 0; i < seg_count; i++) {
693 length += get_eci_length(segs[i].eci, segs[i].source, segs[i].length);
694 }
695
696 return length;
697 }
698
699 /* Convert UTF-8 to other character encodings */
700 typedef int (*eci_func_t)(const unsigned int u, unsigned char *dest);
701 INTERNAL int utf8_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *p_length) {
702
703 static const eci_func_t eci_funcs[36] = {
704 NULL, NULL, NULL, NULL, u_iso8859_2, /*0-4*/
705 u_iso8859_3, u_iso8859_4, u_iso8859_5, u_iso8859_6, u_iso8859_7, /*5-9*/
706 u_iso8859_8, u_iso8859_9, u_iso8859_10, u_iso8859_11, NULL, /*10-14*/
707 u_iso8859_13, u_iso8859_14, u_iso8859_15, u_iso8859_16, NULL, /*15-19*/
708 u_sjis, u_cp1250, u_cp1251, u_cp1252, u_cp1256, /*20-24*/
709 u_utf16be, NULL, u_ascii, u_big5, u_gb2312, /*25-29*/
710 u_ksx1001, u_gbk, u_gb18030, u_utf16le, u_utf32be, /*30-34*/
711 u_utf32le,
712 };
713 eci_func_t eci_func;
714 unsigned int codepoint, state = 0;
715 int in_posn = 0;
716 int out_posn = 0;
717 int length = *p_length;
718
719 /* Special case ISO/IEC 8859-1 */
720 if (eci == 0 || eci == 3) { /* Default ECI 0 to ISO/IEC 8859-1 */
721 while (in_posn < length) {
722 do {
723 decode_utf8(&state, &codepoint, source[in_posn++]);
724 } while (in_posn < length && state != 0 && state != 12);
725 if (state != 0) {
726 return ZINT_ERROR_INVALID_DATA;
727 }
728 if (codepoint >= 0x80 && (codepoint < 0xA0 || codepoint >= 0x100)) {
729 return ZINT_ERROR_INVALID_DATA;
730 }
731 dest[out_posn++] = (unsigned char) codepoint;
732 }
733 dest[out_posn] = '\0';
734 *p_length = out_posn;
735 return 0;
736 }
737
738 if (eci == 170) { /* ASCII Invariant (archaic subset) */
739 eci_func = u_ascii_inv;
740 } else {
741 eci_func = eci_funcs[eci];
742 if (eci_func == NULL) {
743 return ZINT_ERROR_INVALID_DATA;
744 }
745 }
746
747 while (in_posn < length) {
748 int incr;
749 do {
750 decode_utf8(&state, &codepoint, source[in_posn++]);
751 } while (in_posn < length && state != 0 && state != 12);
752 if (state != 0) {
753 return ZINT_ERROR_INVALID_DATA;
754 }
755 incr = (*eci_func)(codepoint, dest + out_posn);
756 if (incr == 0) {
757 return ZINT_ERROR_INVALID_DATA;
758 }
759 out_posn += incr;
760 }
761 dest[out_posn] = '\0';
762 *p_length = out_posn;
763
764 return 0;
765 }
766
767 /* Find the lowest single-byte ECI mode which will encode a given set of Unicode text, assuming valid UTF-8 */
768 INTERNAL int get_best_eci(const unsigned char source[], int length) {
769 int eci = 3;
770 /* Note: attempting single-byte conversions only, so get_eci_length() unnecessary */
771 unsigned char *local_source = (unsigned char *) z_alloca(length + 1);
772
773 do {
774 if (eci == 14) { /* Reserved */
775 eci = 15;
776 } else if (eci == 19) { /* Reserved */
777 eci = 21; /* Skip 20 Shift JIS */
778 }
779 if (utf8_to_eci(eci, source, local_source, &length) == 0) {
780 return eci;
781 }
782 eci++;
783 } while (eci < 25);
784
785 assert(is_valid_utf8(source, length));
786
787 return 26; /* If all of these fail, use UTF-8! */
788 }
789
790 /* Call `get_best_eci()` for each segment, assuming valid UTF-8. Returns 0 on failure, first ECI set on success */
791 INTERNAL int get_best_eci_segs(struct zint_symbol *symbol, struct zint_seg segs[], const int seg_count) {
792 const int default_eci = symbol->symbology == BARCODE_GRIDMATRIX ? 29 : symbol->symbology == BARCODE_UPNQR ? 4 : 3;
793 int first_eci_set = 0;
794 int i;
795
796 for (i = 0; i < seg_count; i++) {
797 if (segs[i].eci == 0) {
798 const int eci = get_best_eci(segs[i].source, segs[i].length);
799 if (eci == default_eci) {
800 if (i != 0 && segs[i - 1].eci != 0 && segs[i - 1].eci != default_eci) {
801 segs[i].eci = eci;
802 if (first_eci_set == 0) {
803 first_eci_set = eci;
804 }
805 }
806 } else {
807 segs[i].eci = eci;
808 if (first_eci_set == 0) {
809 first_eci_set = eci;
810 if (i == 0) {
811 symbol->eci = eci;
812 }
813 }
814 }
815 }
816 }
817
818 return first_eci_set;
819 }
820
821 /* QRCODE Shift JIS helpers */
822
823 /* Convert UTF-8 string to Shift JIS and place in array of ints */
824 INTERNAL int sjis_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
825 unsigned int *ddata) {
826 int error_number;
827 unsigned int i, length;
828 unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
829
830 error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/);
831 if (error_number != 0) {
832 return error_number;
833 }
834
835 for (i = 0, length = *p_length; i < length; i++) {
836 if (!u_sjis_int(utfdata[i], ddata + i)) {
837 return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 800, "Invalid character in input");
838 }
839 }
840
841 return 0;
842 }
843
844 /* If `full_multibyte` set, copy byte input stream to array of ints, putting double-bytes that match QR Kanji mode in
845 * a single entry. If `full_multibyte` not set, do a straight copy */
846 INTERNAL void sjis_cpy(const unsigned char source[], int *p_length, unsigned int *ddata, const int full_multibyte) {
847 unsigned int i, j, length;
848 unsigned char c1, c2;
849
850 if (full_multibyte) {
851 for (i = 0, j = 0, length = *p_length; i < length; i++, j++) {
852 c1 = source[i];
853 /* Now using stricter interpretation of standard, and excluding certain trailing bytes */
854 if (((c1 >= 0x81 && c1 <= 0x9F) || (c1 >= 0xE0 && c1 <= 0xEB)) && length - i >= 2) {
855 c2 = source[i + 1];
856 if ((c2 >= 0x40 && c2 <= 0xFC) && c2 != 0x7F && (c1 != 0xEB || c2 <= 0xBF)) {
857 /* This may or may not be valid Shift JIS, but don't care as long as it can be encoded in
858 * QR Kanji mode */
859 ddata[j] = (c1 << 8) | c2;
860 i++;
861 } else {
862 ddata[j] = c1;
863 }
864 } else {
865 ddata[j] = c1;
866 }
867 }
868 *p_length = j;
869 } else {
870 /* Straight copy */
871 for (i = 0, length = *p_length; i < length; i++) {
872 ddata[i] = source[i];
873 }
874 }
875 }
876
877 /* Call `sjis_cpy()` for each segment */
878 INTERNAL void sjis_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
879 const int full_multibyte) {
880 int i;
881 unsigned int *dd = ddata;
882
883 for (i = 0; i < seg_count; i++) {
884 sjis_cpy(segs[i].source, &segs[i].length, dd, full_multibyte);
885 dd += segs[i].length;
886 }
887 }
888
889 /* Convert UTF-8 string to ECI and place in array of ints using `sjis_cpy()` */
890 INTERNAL int sjis_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
891 const int full_multibyte) {
892
893 if (is_eci_convertible(eci)) {
894 int error_number;
895 const int eci_length = get_eci_length(eci, source, *p_length);
896 unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1);
897
898 error_number = utf8_to_eci(eci, source, converted, p_length);
899 if (error_number != 0) {
900 /* Note not setting `symbol->errtxt`, up to caller */
901 return error_number;
902 }
903
904 sjis_cpy(converted, p_length, ddata, full_multibyte || eci == 20);
905 } else {
906 sjis_cpy(source, p_length, ddata, full_multibyte);
907 }
908
909 return 0;
910 }
911
912 /* GRIDMATRIX GB 2312 helpers */
913
914 /* Convert UTF-8 string to GB 2312 (EUC-CN) and place in array of ints */
915 INTERNAL int gb2312_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
916 unsigned int *ddata) {
917 int error_number;
918 unsigned int i, length;
919 unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
920
921 error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/);
922 if (error_number != 0) {
923 return error_number;
924 }
925
926 for (i = 0, length = *p_length; i < length; i++) {
927 if (utfdata[i] < 0x80) {
928 ddata[i] = utfdata[i];
929 } else {
930 if (!u_gb2312_int(utfdata[i], ddata + i)) {
931 return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 810, "Invalid character in input");
932 }
933 }
934 }
935
936 return 0;
937 }
938
/* Copy byte input stream to array of ints. If `full_multibyte` not set, this is a straight byte-per-entry copy
 * and `*p_length` is unchanged. If set, double-bytes that match GRIDMATRIX Chinese mode are combined into a
 * single entry and `*p_length` is updated to the no. of entries written */
static void gb2312_cpy(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    const unsigned int length = *p_length;
    unsigned int in = 0, out = 0;

    if (!full_multibyte) {
        /* Straight copy */
        for (; in < length; in++) {
            ddata[in] = source[in];
        }
        return;
    }

    while (in < length) {
        const unsigned char c1 = source[in++];
        unsigned int entry = c1;
        if (in < length && ((c1 >= 0xA1 && c1 <= 0xA9) || (c1 >= 0xB0 && c1 <= 0xF7))) {
            const unsigned char c2 = source[in];
            if (c2 >= 0xA1 && c2 <= 0xFE) {
                /* This may or may not be valid GB 2312 (EUC-CN), but don't care as long as it can be encoded in
                 * GRIDMATRIX Chinese mode */
                entry = (c1 << 8) | c2;
                in++;
            }
        }
        ddata[out++] = entry;
    }
    *p_length = out;
}
971
#ifdef ZINT_TEST /* Test builds only: non-static entry point onto `gb2312_cpy()` */
INTERNAL void gb2312_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    gb2312_cpy(source, p_length, ddata, full_multibyte);
}
#endif
978
979 /* Call `gb2312_cpy()` for each segment */
980 INTERNAL void gb2312_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
981 const int full_multibyte) {
982 int i;
983 unsigned int *dd = ddata;
984
985 for (i = 0; i < seg_count; i++) {
986 gb2312_cpy(segs[i].source, &segs[i].length, dd, full_multibyte);
987 dd += segs[i].length;
988 }
989 }
990
991 /* Convert UTF-8 string to ECI and place in array of ints using `gb2312_cpy()` */
992 INTERNAL int gb2312_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
993 const int full_multibyte) {
994
995 if (is_eci_convertible(eci)) {
996 int error_number;
997 const int eci_length = get_eci_length(eci, source, *p_length);
998 unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1);
999
1000 error_number = utf8_to_eci(eci, source, converted, p_length);
1001 if (error_number != 0) {
1002 /* Note not setting `symbol->errtxt`, up to caller */
1003 return error_number;
1004 }
1005
1006 gb2312_cpy(converted, p_length, ddata, full_multibyte || eci == 29);
1007 } else {
1008 gb2312_cpy(source, p_length, ddata, full_multibyte);
1009 }
1010
1011 return 0;
1012 }
1013
1014 /* HANXIN GB 18030 helpers */
1015
1016 /* Convert UTF-8 string to GB 18030 and place in array of ints */
1017 INTERNAL int gb18030_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
1018 unsigned int *ddata) {
1019 int error_number, ret;
1020 unsigned int i, j, length;
1021 unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
1022
1023 error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 0 /*disallow_4byte*/);
1024 if (error_number != 0) {
1025 return error_number;
1026 }
1027
1028 for (i = 0, j = 0, length = *p_length; i < length; i++, j++) {
1029 if (utfdata[i] < 0x80) {
1030 ddata[j] = utfdata[i];
1031 } else {
1032 ret = u_gb18030_int(utfdata[i], ddata + j, ddata + j + 1);
1033 if (ret == 0) { /* Should never happen, as GB 18030 is a UTF i.e. maps all Unicode codepoints */
1034 return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 820, "Invalid character in input"); /* Not reached */
1035 }
1036 if (ret == 4) {
1037 j++;
1038 }
1039 }
1040 }
1041
1042 *p_length = j;
1043
1044 return 0;
1045 }
1046
/* Copy byte input stream to array of ints. If `full_multibyte` not set, this is a straight byte-per-entry copy
 * and `*p_length` is unchanged. If set, double-bytes that match HANXIN Chinese mode are combined into a single
 * entry, and quad-bytes into 2 entries, and `*p_length` is updated to the no. of entries written */
static void gb18030_cpy(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    const unsigned int length = *p_length;
    unsigned int in = 0, out = 0;

    if (!full_multibyte) {
        /* Straight copy */
        for (; in < length; in++) {
            ddata[in] = source[in];
        }
        return;
    }

    while (in < length) {
        const unsigned int remaining = length - in;
        const unsigned char c1 = source[in];

        if (remaining >= 2 && c1 >= 0x81 && c1 <= 0xFE) {
            const unsigned char c2 = source[in + 1];
            if (c2 >= 0x40 && c2 <= 0xFE && c2 != 0x7F) { /* Double-byte */
                ddata[out++] = (c1 << 8) | c2;
                in += 2;
                continue;
            }
            if (remaining >= 4 && c2 >= 0x30 && c2 <= 0x39) { /* Possible quad-byte */
                const unsigned char c3 = source[in + 2];
                const unsigned char c4 = source[in + 3];
                if (c3 >= 0x81 && c3 <= 0xFE && c4 >= 0x30 && c4 <= 0x39) {
                    ddata[out++] = (c1 << 8) | c2;
                    ddata[out++] = (c3 << 8) | c4;
                    in += 4;
                    continue;
                }
            }
        }
        /* Anything else goes in on its own */
        ddata[out++] = c1;
        in++;
    }
    *p_length = out;
}
1090
#ifdef ZINT_TEST /* Test builds only: non-static entry point onto `gb18030_cpy()` */
INTERNAL void gb18030_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
                const int full_multibyte) {
    gb18030_cpy(source, p_length, ddata, full_multibyte);
}
#endif
1097
1098 /* Call `gb18030_cpy()` for each segment */
1099 INTERNAL void gb18030_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
1100 const int full_multibyte) {
1101 int i;
1102 unsigned int *dd = ddata;
1103
1104 for (i = 0; i < seg_count; i++) {
1105 gb18030_cpy(segs[i].source, &segs[i].length, dd, full_multibyte);
1106 dd += segs[i].length;
1107 }
1108 }
1109
1110 /* Convert UTF-8 string to ECI and place in array of ints using `gb18030_cpy()` */
1111 INTERNAL int gb18030_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
1112 const int full_multibyte) {
1113
1114 if (is_eci_convertible(eci)) {
1115 int error_number;
1116 const int eci_length = get_eci_length(eci, source, *p_length);
1117 unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1);
1118
1119 error_number = utf8_to_eci(eci, source, converted, p_length);
1120 if (error_number != 0) {
1121 /* Note not setting `symbol->errtxt`, up to caller */
1122 return error_number;
1123 }
1124
1125 /* GB 18030 (ECI 32) superset of GB 2312 (ECI 29) and GBK (ECI 31) */
1126 gb18030_cpy(converted, p_length, ddata, full_multibyte || eci == 32 || eci == 29 || eci == 31);
1127 } else {
1128 gb18030_cpy(source, p_length, ddata, full_multibyte);
1129 }
1130
1131 return 0;
1132 }
1133
1134 /* vim: set ts=4 sw=4 et : */