Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/zint/backend/eci.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/zint/backend/eci.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1134 @@ +/* eci.c - Extended Channel Interpretations */ +/* + libzint - the open source barcode library + Copyright (C) 2009-2024 Robin Stuart <rstuart114@gmail.com> + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. Neither the name of the project nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + */ +/* SPDX-License-Identifier: BSD-3-Clause */ + +#include <assert.h> +#include "common.h" +#include "eci.h" +#include "eci_sb.h" +#include "big5.h" +#include "gb18030.h" +#include "gb2312.h" +#include "gbk.h" +#include "ksx1001.h" +#include "sjis.h" + +/* Single-byte stuff */ + +/* Base ISO/IEC 8859 routine to convert Unicode codepoint `u` */ +static int u_iso8859(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u, + const unsigned char *tab_sb, int e, unsigned char *dest) { + int s; + if (u < 0xA0) { + if (u >= 0x80) { /* U+0080-9F fail */ + return 0; + } + *dest = (unsigned char) u; + return 1; + } + if (u <= 0xFF) { + const unsigned int u2 = u - 0xA0; + if (tab_s[u2 >> 4] & ((unsigned short) 1 << (u2 & 0xF))) { + *dest = (unsigned char) u; /* Straight-thru */ + return 1; + } + } + + s = 0; + while (s <= e) { + const int m = (s + e) >> 1; + if (tab_u[m] < u) { + s = m + 1; + } else if (tab_u[m] > u) { + e = m - 1; + } else { + *dest = tab_sb[m]; + return 1; + } + } + return 0; +} + +/* Base Windows-125x routine to convert Unicode codepoint `u` */ +static int u_cp125x(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u, + const unsigned char *tab_sb, int e, unsigned char *dest) { + int s; + if (u < 0x80) { + *dest = (unsigned char) u; + return 1; + } + if (u <= 0xFF && u >= 0xA0) { + const unsigned int u2 = u - 0xA0; + if (tab_s[u2 >> 4] & ((unsigned short) 1 << (u2 & 0xF))) { + *dest = (unsigned char) u; /* Straight-thru */ + return 1; + } + } + + s = 0; + while (s <= e) { + const int m = (s + e) >> 1; + if (tab_u[m] < u) { + s = m + 1; + } else if (tab_u[m] > u) { + e = m - 1; + } else { + *dest = tab_sb[m]; + return 1; + } + } + return 0; +} + +/* ECI 27 ASCII (ISO/IEC 646:1991 IRV (US)) */ +static int u_ascii(const unsigned int u, unsigned char *dest) { + if (u < 0x80) { + *dest = (unsigned char) u; + return 1; + } + return 0; +} + +/* ECI 170 ASCII subset (ISO/IEC 646:1991 Invariant), excludes 12 chars that historically had national variants, + namely "#$@[\]^`{|}~" */ +static int u_ascii_inv(const unsigned int u, unsigned char *dest) { + if (u == 0x7F || (u <= 'z' && u != '#' && u != '$' && u != '@' && (u <= 'Z' || u == '_' || u >= 'a'))) { + *dest = (unsigned char) u; + return 1; + } + return 0; +} + +/* ECI 25 UTF-16 Big Endian (ISO/IEC 10646) - assumes valid Unicode */ +static int u_utf16be(const unsigned int u, unsigned char *dest) { + unsigned int u2, v; + if (u < 0x10000) { + dest[0] = (unsigned char) (u >> 8); + dest[1] = (unsigned char) u; + return 2; + } + u2 = u - 0x10000; + v = u2 >> 10; + dest[0] = (unsigned char) (0xD8 + (v >> 8)); + dest[1] = (unsigned char) v; + v = u2 & 0x3FF; + dest[2] = (unsigned char) (0xDC + (v >> 8)); + dest[3] = (unsigned char) v; + return 4; +} + +/* ECI 33 UTF-16 Little Endian (ISO/IEC 10646) - assumes valid Unicode */ +static int u_utf16le(const unsigned int u, unsigned char *dest) { + unsigned int u2, v; + if (u < 0x10000) { + dest[0] = (unsigned char) u; + dest[1] = (unsigned char) (u >> 8); + return 2; + } + u2 = u - 0x10000; + v = u2 >> 10; + dest[0] = (unsigned char) v; + dest[1] = (unsigned char) (0xD8 + (v >> 8)); + v = u2 & 0x3FF; + dest[2] = (unsigned char) v; + dest[3] = (unsigned char) (0xDC + (v >> 8)); + return 4; +} + +/* ECI 34 UTF-32 Big Endian (ISO/IEC 10646) - assumes valid Unicode */ +static int u_utf32be(const unsigned int u, unsigned char *dest) { + dest[0] = 0; + dest[1] = (unsigned char) (u >> 16); + dest[2] = (unsigned char) (u >> 8); + dest[3] = (unsigned char) u; + return 4; +} + +/* ECI 35 UTF-32 Little Endian (ISO/IEC 10646) - assumes valid Unicode */ +static int u_utf32le(const unsigned int u, unsigned char *dest) { + dest[0] = (unsigned char) u; + dest[1] = (unsigned char) (u >> 8); + dest[2] = (unsigned char) (u >> 16); + dest[3] = 0; + return 4; +} + +/* Multibyte stuff */ + +/* Acknowledgements to Bruno Haible <bruno@clisp.org> for a no. of techniques used here */ + +/* Helper to lookup Unicode codepoint `u` in the URO (Unified Repertoire and Ordering) block (U+4E00-9FFF) */ +static int eci_u_lookup_uro_int(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind, + const unsigned short *tab_mb, unsigned int *d) { + unsigned int u2 = (u - 0x4E00) >> 4; /* Blocks of 16 */ + unsigned int v = (unsigned int) 1 << (u & 0xF); + if ((tab_u[u2] & v) == 0) { + return 0; + } + v = tab_u[u2] & (v - 1); /* Mask to bits prior to this one */ + /* Count bits set (http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel) */ + v = v - ((v >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + v = (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; + *d = tab_mb[tab_mb_ind[u2] + v]; + return 2; +} + +/* Version of `eci_u_lookup_uro_int()` taking unsigned char destination */ +static int eci_u_lookup_uro(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind, + const unsigned short *tab_mb, unsigned char *dest) { + unsigned int d; + int ret = eci_u_lookup_uro_int(u, tab_u, tab_mb_ind, tab_mb, &d); + if (ret) { + dest[0] = (unsigned char) (d >> 8); + dest[1] = (unsigned char) d; + } + return ret; +} + +/* ECI 20 Shift JIS */ +static int u_sjis_int(const unsigned int u, unsigned int *d) { + unsigned int u2, dv, md; + int s, e; + + if (u < 0x80 && u != 0x5C && u != 0x7E) { /* Backslash & tilde re-mapped according to JIS X 0201 Roman */ + *d = u; + return 1; + } + /* Special case URO block sequential mappings (considerably lessens size of `sjis_u[]` array) */ + if (u >= 0x4E00 && u <= 0xDFFF) { /* 0xE000 next used value >= 0x4E00 */ + if (u >= 0x9FB0) { + return 0; + } + return eci_u_lookup_uro_int(u, sjis_uro_u, sjis_uro_mb_ind, sjis_mb, d); + } + /* PUA to user-defined (Table 4-86, Lunde, 2nd ed.) */ + if (u >= 0xE000 && u <= 0xE757) { + u2 = u - 0xE000; + dv = u2 / 188; + md = u2 - dv * 188; + *d = ((dv + 0xF0) << 8) | (md + 0x40 + (md >= 0x3F)); + return 2; + } + if (u >= sjis_u[0] && u <= sjis_u[ARRAY_SIZE(sjis_u) - 1]) { + s = 0; + e = ARRAY_SIZE(sjis_u) - 1; + while (s <= e) { + const int m = (s + e) >> 1; + if (sjis_u[m] < u) { + s = m + 1; + } else if (sjis_u[m] > u) { + e = m - 1; + } else { + *d = sjis_mb[u >= 0x4E00 ? m + 6356 : m]; /* Adjust for URO block */ + return 1 + (*d > 0xFF); + } + } + } + return 0; +} + +#ifdef ZINT_TEST /* Wrapper for direct testing */ +INTERNAL int u_sjis_int_test(const unsigned int u, unsigned int *d) { + return u_sjis_int(u, d); +} +#endif + +/* Version of `u_sjis_int()` taking unsigned char destination, for use by `utf8_to_eci()` */ +static int u_sjis(const unsigned int u, unsigned char *dest) { + unsigned int d; + int ret = u_sjis_int(u, &d); + if (ret) { + if (ret == 1) { + dest[0] = (unsigned char) d; + } else { + dest[0] = (unsigned char) (d >> 8); + dest[1] = (unsigned char) d; + } + } + return ret; +} + +/* ECI 28 Big5 Chinese (Taiwan) */ +static int u_big5(const unsigned int u, unsigned char *dest) { + int s, e; + + if (u < 0x80) { + *dest = (unsigned char) u; + return 1; + } + /* Special case URO block sequential mappings (considerably lessens size of `big5_u[]` array) */ + if (u >= 0x4E00 && u <= 0xFA0B) { /* 0xFA0C next used value >= 0x4E00 */ + if (u >= 0x9FB0) { + return 0; + } + return eci_u_lookup_uro(u, big5_uro_u, big5_uro_mb_ind, big5_mb, dest); + } + if (u >= big5_u[0] && u <= big5_u[ARRAY_SIZE(big5_u) - 1]) { + s = 0; + e = ARRAY_SIZE(big5_u) - 1; + while (s <= e) { + const int m = (s + e) >> 1; + if (big5_u[m] < u) { + s = m + 1; + } else if (big5_u[m] > u) { + e = m - 1; + } else { + const unsigned short mb = big5_mb[u >= 0x4E00 ? m + 13061 : m]; /* Adjust for URO block */ + dest[0] = (unsigned char) (mb >> 8); + dest[1] = (unsigned char) mb; + return 2; + } + } + } + return 0; +} + +#ifdef ZINT_TEST /* Wrapper for direct testing */ +INTERNAL int u_big5_test(const unsigned int u, unsigned char *dest) { + return u_big5(u, dest); +} +#endif + +/* ECI 30 EUC-KR (KS X 1001, formerly KS C 5601) Korean */ +static int u_ksx1001(const unsigned int u, unsigned char *dest) { + int s, e; + + if (u < 0x80) { + *dest = (unsigned char) u; + return 1; + } + /* Special case URO block sequential mappings (considerably lessens size of `ksx1001_u[]` array) */ + if (u >= 0x4E00 && u <= 0xABFF) { /* 0xAC00 next used value >= 0x4E00 */ + if (u >= 0x9FA0) { + return 0; + } + return eci_u_lookup_uro(u, ksx1001_uro_u, ksx1001_uro_mb_ind, ksx1001_mb, dest); + } + if (u >= ksx1001_u[0] && u <= ksx1001_u[ARRAY_SIZE(ksx1001_u) - 1]) { + s = ksx1001_u_ind[(u - ksx1001_u[0]) >> 8]; + e = s + 0x100 > ARRAY_SIZE(ksx1001_u) ? ARRAY_SIZE(ksx1001_u) - 1 : s + 0x100 - 1; + while (s <= e) { + const int m = (s + e) >> 1; + if (ksx1001_u[m] < u) { + s = m + 1; + } else if (ksx1001_u[m] > u) { + e = m - 1; + } else { + const unsigned short mb = ksx1001_mb[u >= 0x4E00 ? m + 4620 : m]; /* Adjust for URO block */ + dest[0] = (unsigned char) (mb >> 8); + dest[1] = (unsigned char) mb; + return 2; + } + } + } + return 0; +} + +#ifdef ZINT_TEST /* Wrapper for direct testing */ +INTERNAL int u_ksx1001_test(const unsigned int u, unsigned char *dest) { + return u_ksx1001(u, dest); +} +#endif + +/* ECI 29 GB 2312 Chinese (PRC) */ +static int u_gb2312_int(const unsigned int u, unsigned int *d) { + int s, e; + + if (u < 0x80) { + *d = u; + return 1; + } + /* Special case URO block sequential mappings (considerably lessens size of `gb2312_u[]` array) */ + if (u >= 0x4E00 && u <= 0x9E1E) { /* 0x9E1F next used value >= 0x4E00 */ + if (u >= 0x9CF0) { + return 0; + } + return eci_u_lookup_uro_int(u, gb2312_uro_u, gb2312_uro_mb_ind, gb2312_mb, d); + } + if (u >= gb2312_u[0] && u <= gb2312_u[ARRAY_SIZE(gb2312_u) - 1]) { + s = gb2312_u_ind[(u - gb2312_u[0]) >> 8]; + e = s + 0x100 > ARRAY_SIZE(gb2312_u) ? ARRAY_SIZE(gb2312_u) - 1 : s + 0x100 - 1; + while (s <= e) { + const int m = (s + e) >> 1; + if (gb2312_u[m] < u) { + s = m + 1; + } else if (gb2312_u[m] > u) { + e = m - 1; + } else { + *d = gb2312_mb[u > 0x4E00 ? m + 6627 : m]; /* Adjust for URO block */ + return 2; + } + } + } + return 0; +} + +#ifdef ZINT_TEST /* Wrapper for direct testing */ +INTERNAL int u_gb2312_int_test(const unsigned int u, unsigned int *d) { + return u_gb2312_int(u, d); +} +#endif + +/* Version of `u_gb2312_int()` taking unsigned char destination, for use by `utf8_to_eci()` */ +static int u_gb2312(const unsigned int u, unsigned char *dest) { + unsigned int d; + int ret = u_gb2312_int(u, &d); + if (ret) { + if (ret == 1) { + dest[0] = (unsigned char) d; + } else { + dest[0] = (unsigned char) (d >> 8); + dest[1] = (unsigned char) d; + } + } + return ret; +} + +/* ECI 31 GBK Chinese */ +static int u_gbk_int(const unsigned int u, unsigned int *d) { + int s, e; + + if (u < 0x80) { + *d = u; + return 1; + } + + /* Check GB 2312 first */ + if (u == 0x30FB) { + /* KATAKANA MIDDLE DOT, mapped by GB 2312 but not by GBK (U+00B7 MIDDLE DOT mapped to 0xA1A4 instead) */ + return 0; + } + if (u == 0x2015) { + /* HORIZONTAL BAR, mapped to 0xA844 by GBK rather than 0xA1AA (U+2014 EM DASH mapped there instead) */ + *d = 0xA844; + return 2; + } + if (u_gb2312_int(u, d)) { /* Includes the 2 GB 6345.1-86 corrections given in Table 3-22, Lunde, 2nd ed. */ + return 2; + } + + /* Special case URO block sequential mappings (considerably lessens size of `gbk_u[]` array) */ + if (u >= 0x4E00 && u <= 0xF92B) { /* 0xF92C next used value >= 0x4E00 */ + if (u >= 0x9FB0) { + return 0; + } + return eci_u_lookup_uro_int(u, gbk_uro_u, gbk_uro_mb_ind, gbk_mb, d); + } + if (u >= gbk_u[0] && u <= gbk_u[ARRAY_SIZE(gbk_u) - 1]) { + s = 0; + e = ARRAY_SIZE(gbk_u) - 1; + while (s <= e) { + const int m = (s + e) >> 1; + if (gbk_u[m] < u) { + s = m + 1; + } else if (gbk_u[m] > u) { + e = m - 1; + } else { + *d = gbk_mb[u >= 0x4E00 ? m + 14139 : m]; /* Adjust for URO block */ + return 2; + } + } + } + return 0; +} + +#ifdef ZINT_TEST /* Wrapper for direct testing */ +INTERNAL int u_gbk_int_test(const unsigned int u, unsigned int *d) { + return u_gbk_int(u, d); +} +#endif + +/* Version of `u_gbk_int()` taking unsigned char destination, for use by `utf8_to_eci()` */ +static int u_gbk(const unsigned int u, unsigned char *dest) { + unsigned int d; + int ret = u_gbk_int(u, &d); + if (ret) { + if (ret == 1) { + dest[0] = (unsigned char) d; + } else { + dest[0] = (unsigned char) (d >> 8); + dest[1] = (unsigned char) d; + } + } + return ret; +} + +/* Helper for `u_gb18030_int()` to output 4-byte sequential blocks */ +static int u_gb18030_4_sequential_int(unsigned int u2, unsigned int mb_lead, unsigned int *d1, unsigned int *d2) { + unsigned int dv; + + dv = u2 / 10; + *d2 = u2 - dv * 10 + 0x30; + u2 = dv; + dv = u2 / 126; + *d2 |= (u2 - dv * 126 + 0x81) << 8; + u2 = dv; + dv = u2 / 10; + *d1 = ((dv + mb_lead) << 8) | (u2 - dv * 10 + 0x30); + return 4; +} + +/* ECI 32 GB 18030 Chinese - assumes valid Unicode */ +static int u_gb18030_int(const unsigned int u, unsigned int *d1, unsigned int *d2) { + unsigned int u2, dv; + int s, e; + + if (u < 0x80) { + *d1 = u; + return 1; + } + + /* Check GBK first */ + if (u_gbk_int(u, d1)) { + return 2; + } + + if (u >= 0x10000) { + /* Non-PUA, non-BMP, see Table 3-37, Lunde, 2nd ed. */ + if (u == 0x20087) { + *d1 = 0xFE51; + return 2; + } + if (u == 0x20089) { + *d1 = 0xFE52; + return 2; + } + if (u == 0x200CC) { + *d1 = 0xFE53; + return 2; + } + if (u == 0x215D7) { + *d1 = 0xFE6C; + return 2; + } + if (u == 0x2298F) { + *d1 = 0xFE76; + return 2; + } + if (u == 0x241FE) { + *d1 = 0xFE91; + return 2; + } + /* All other non-BMP U+10000-10FFFF */ + return u_gb18030_4_sequential_int(u - 0x10000, 0x90, d1, d2); + } + if (u >= 0xE000 && u <= 0xE765) { /* PUA to user-defined */ + if (u <= 0xE4C5) { + u2 = u - 0xE000; + dv = u2 / 94; + *d1 = ((dv + (dv < 6 ? 0xAA : 0xF2)) << 8) | (u2 - dv * 94 + 0xA1); + } else { + unsigned int md; + u2 = u - 0xE4C6; + dv = u2 / 96; + md = u2 - dv * 96; + *d1 = ((dv + 0xA1) << 8) | (md + 0x40 + (md >= 0x3F)); + } + return 2; + } + if (u >= gb18030_2_u[0] && u <= gb18030_2_u[ARRAY_SIZE(gb18030_2_u) - 1]) { + s = 0; + e = ARRAY_SIZE(gb18030_2_u) - 1; + while (s <= e) { + const int m = (s + e) >> 1; + if (gb18030_2_u[m] < u) { + s = m + 1; + } else if (gb18030_2_u[m] > u) { + e = m - 1; + } else { + *d1 = gb18030_2_mb[m]; + return 2; + } + } + } + /* All other BMP U+0080-FFFF */ + if (u == 0xE7C7) { /* PUA change to non-PUA, see Table 3-39, Lunde, 2nd ed. */ + *d1 = 0x8135; + *d2 = 0xF437; + return 4; + } + s = 0; + e = ARRAY_SIZE(gb18030_4_u_e) - 1; + while (s < e) { /* Lower bound */ + const int m = (s + e) >> 1; + if (gb18030_4_u_e[m] < u) { + s = m + 1; + } else { + e = m; + } + } + assert(s < ARRAY_SIZE(gb18030_4_u_e)); + return u_gb18030_4_sequential_int(u - gb18030_4_mb_o[s] - 0x80, 0x81, d1, d2); +} + +#ifdef ZINT_TEST /* Wrapper for direct testing */ +INTERNAL int u_gb18030_int_test(const unsigned int u, unsigned int *d1, unsigned int *d2) { + return u_gb18030_int(u, d1, d2); +} +#endif + +/* Version of `u_gb18030_int()` taking unsigned char destination, for use by `utf8_to_eci()` */ +static int u_gb18030(const unsigned int u, unsigned char *dest) { + unsigned int d1, d2; + int ret = u_gb18030_int(u, &d1, &d2); + if (ret) { + if (ret == 1) { + dest[0] = (unsigned char) d1; + } else { + dest[0] = (unsigned char) (d1 >> 8); + dest[1] = (unsigned char) d1; + if (ret == 4) { + dest[2] = (unsigned char) (d2 >> 8); + dest[3] = (unsigned char) d2; + } + } + } + return ret; +} + +/* Main ECI stuff */ + +/* Helper to count the number of chars in a string within a range */ +static int chr_range_cnt(const unsigned char string[], const int length, const unsigned char c1, + const unsigned char c2) { + int count = 0; + int i; + if (c1) { + for (i = 0; i < length; i++) { + if (string[i] >= c1 && string[i] <= c2) { + count++; + } + } + } else { + for (i = 0; i < length; i++) { + if (string[i] <= c2) { + count++; + } + } + } + return count; +} + +/* Is ECI convertible from UTF-8? */ +INTERNAL int is_eci_convertible(const int eci) { + if (eci == 26 || (eci > 35 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */ + /* UTF-8 (26) or 8-bit binary data (899) or undefined (> 35 and < 899) or not character set (> 899) */ + return 0; + } + return 1; +} + +/* Are any of the ECIs in the segments convertible from UTF-8? + Sets `convertible[]` for each, which must be at least `seg_count` in size */ +INTERNAL int is_eci_convertible_segs(const struct zint_seg segs[], const int seg_count, int convertible[]) { + int ret = 0; + int i; + for (i = 0; i < seg_count; i++) { + convertible[i] = is_eci_convertible(segs[i].eci); + ret |= convertible[i]; + } + return ret; +} + +/* Calculate length required to convert UTF-8 to (double-byte) encoding */ +INTERNAL int get_eci_length(const int eci, const unsigned char source[], int length) { + if (eci == 20) { /* Shift JIS */ + /* Only ASCII backslash (reverse solidus) exceeds UTF-8 length */ + length += chr_cnt(source, length, '\\'); + + } else if (eci == 25 || eci == 33) { /* UTF-16 */ + /* All ASCII chars take 2 bytes */ + length += chr_range_cnt(source, length, 0, 0x7F); + /* Surrogate pairs are 4 UTF-8 bytes long so fit */ + + } else if (eci == 32) { /* GB 18030 */ + /* Allow for GB 18030 4 byters */ + length *= 2; + + } else if (eci == 34 || eci == 35) { /* UTF-32 */ + /* Quadruple-up ASCII and double-up non-ASCII */ + length += chr_range_cnt(source, length, 0, 0x7F) * 2 + length; + } + + /* Big5, GB 2312, EUC-KR and GBK fit in UTF-8 length */ + + return length; +} + +/* Call `get_eci_length()` for each segment, returning total */ +INTERNAL int get_eci_length_segs(const struct zint_seg segs[], const int seg_count) { + int length = 0; + int i; + + for (i = 0; i < seg_count; i++) { + length += get_eci_length(segs[i].eci, segs[i].source, segs[i].length); + } + + return length; +} + +/* Convert UTF-8 to other character encodings */ +typedef int (*eci_func_t)(const unsigned int u, unsigned char *dest); +INTERNAL int utf8_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *p_length) { + + static const eci_func_t eci_funcs[36] = { + NULL, NULL, NULL, NULL, u_iso8859_2, /*0-4*/ + u_iso8859_3, u_iso8859_4, u_iso8859_5, u_iso8859_6, u_iso8859_7, /*5-9*/ + u_iso8859_8, u_iso8859_9, u_iso8859_10, u_iso8859_11, NULL, /*10-14*/ + u_iso8859_13, u_iso8859_14, u_iso8859_15, u_iso8859_16, NULL, /*15-19*/ + u_sjis, u_cp1250, u_cp1251, u_cp1252, u_cp1256, /*20-24*/ + u_utf16be, NULL, u_ascii, u_big5, u_gb2312, /*25-29*/ + u_ksx1001, u_gbk, u_gb18030, u_utf16le, u_utf32be, /*30-34*/ + u_utf32le, + }; + eci_func_t eci_func; + unsigned int codepoint, state = 0; + int in_posn = 0; + int out_posn = 0; + int length = *p_length; + + /* Special case ISO/IEC 8859-1 */ + if (eci == 0 || eci == 3) { /* Default ECI 0 to ISO/IEC 8859-1 */ + while (in_posn < length) { + do { + decode_utf8(&state, &codepoint, source[in_posn++]); + } while (in_posn < length && state != 0 && state != 12); + if (state != 0) { + return ZINT_ERROR_INVALID_DATA; + } + if (codepoint >= 0x80 && (codepoint < 0xA0 || codepoint >= 0x100)) { + return ZINT_ERROR_INVALID_DATA; + } + dest[out_posn++] = (unsigned char) codepoint; + } + dest[out_posn] = '\0'; + *p_length = out_posn; + return 0; + } + + if (eci == 170) { /* ASCII Invariant (archaic subset) */ + eci_func = u_ascii_inv; + } else { + eci_func = eci_funcs[eci]; + if (eci_func == NULL) { + return ZINT_ERROR_INVALID_DATA; + } + } + + while (in_posn < length) { + int incr; + do { + decode_utf8(&state, &codepoint, source[in_posn++]); + } while (in_posn < length && state != 0 && state != 12); + if (state != 0) { + return ZINT_ERROR_INVALID_DATA; + } + incr = (*eci_func)(codepoint, dest + out_posn); + if (incr == 0) { + return ZINT_ERROR_INVALID_DATA; + } + out_posn += incr; + } + dest[out_posn] = '\0'; + *p_length = out_posn; + + return 0; +} + +/* Find the lowest single-byte ECI mode which will encode a given set of Unicode text, assuming valid UTF-8 */ +INTERNAL int get_best_eci(const unsigned char source[], int length) { + int eci = 3; + /* Note: attempting single-byte conversions only, so get_eci_length() unnecessary */ + unsigned char *local_source = (unsigned char *) z_alloca(length + 1); + + do { + if (eci == 14) { /* Reserved */ + eci = 15; + } else if (eci == 19) { /* Reserved */ + eci = 21; /* Skip 20 Shift JIS */ + } + if (utf8_to_eci(eci, source, local_source, &length) == 0) { + return eci; + } + eci++; + } while (eci < 25); + + assert(is_valid_utf8(source, length)); + + return 26; /* If all of these fail, use UTF-8! */ +} + +/* Call `get_best_eci()` for each segment, assuming valid UTF-8. Returns 0 on failure, first ECI set on success */ +INTERNAL int get_best_eci_segs(struct zint_symbol *symbol, struct zint_seg segs[], const int seg_count) { + const int default_eci = symbol->symbology == BARCODE_GRIDMATRIX ? 29 : symbol->symbology == BARCODE_UPNQR ? 4 : 3; + int first_eci_set = 0; + int i; + + for (i = 0; i < seg_count; i++) { + if (segs[i].eci == 0) { + const int eci = get_best_eci(segs[i].source, segs[i].length); + if (eci == default_eci) { + if (i != 0 && segs[i - 1].eci != 0 && segs[i - 1].eci != default_eci) { + segs[i].eci = eci; + if (first_eci_set == 0) { + first_eci_set = eci; + } + } + } else { + segs[i].eci = eci; + if (first_eci_set == 0) { + first_eci_set = eci; + if (i == 0) { + symbol->eci = eci; + } + } + } + } + } + + return first_eci_set; +} + +/* QRCODE Shift JIS helpers */ + +/* Convert UTF-8 string to Shift JIS and place in array of ints */ +INTERNAL int sjis_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length, + unsigned int *ddata) { + int error_number; + unsigned int i, length; + unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1)); + + error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/); + if (error_number != 0) { + return error_number; + } + + for (i = 0, length = *p_length; i < length; i++) { + if (!u_sjis_int(utfdata[i], ddata + i)) { + return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 800, "Invalid character in input"); + } + } + + return 0; +} + +/* If `full_multibyte` set, copy byte input stream to array of ints, putting double-bytes that match QR Kanji mode in + * a single entry. If `full_multibyte` not set, do a straight copy */ +INTERNAL void sjis_cpy(const unsigned char source[], int *p_length, unsigned int *ddata, const int full_multibyte) { + unsigned int i, j, length; + unsigned char c1, c2; + + if (full_multibyte) { + for (i = 0, j = 0, length = *p_length; i < length; i++, j++) { + c1 = source[i]; + /* Now using stricter interpretation of standard, and excluding certain trailing bytes */ + if (((c1 >= 0x81 && c1 <= 0x9F) || (c1 >= 0xE0 && c1 <= 0xEB)) && length - i >= 2) { + c2 = source[i + 1]; + if ((c2 >= 0x40 && c2 <= 0xFC) && c2 != 0x7F && (c1 != 0xEB || c2 <= 0xBF)) { + /* This may or may not be valid Shift JIS, but don't care as long as it can be encoded in + * QR Kanji mode */ + ddata[j] = (c1 << 8) | c2; + i++; + } else { + ddata[j] = c1; + } + } else { + ddata[j] = c1; + } + } + *p_length = j; + } else { + /* Straight copy */ + for (i = 0, length = *p_length; i < length; i++) { + ddata[i] = source[i]; + } + } +} + +/* Call `sjis_cpy()` for each segment */ +INTERNAL void sjis_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata, + const int full_multibyte) { + int i; + unsigned int *dd = ddata; + + for (i = 0; i < seg_count; i++) { + sjis_cpy(segs[i].source, &segs[i].length, dd, full_multibyte); + dd += segs[i].length; + } +} + +/* Convert UTF-8 string to ECI and place in array of ints using `sjis_cpy()` */ +INTERNAL int sjis_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata, + const int full_multibyte) { + + if (is_eci_convertible(eci)) { + int error_number; + const int eci_length = get_eci_length(eci, source, *p_length); + unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1); + + error_number = utf8_to_eci(eci, source, converted, p_length); + if (error_number != 0) { + /* Note not setting `symbol->errtxt`, up to caller */ + return error_number; + } + + sjis_cpy(converted, p_length, ddata, full_multibyte || eci == 20); + } else { + sjis_cpy(source, p_length, ddata, full_multibyte); + } + + return 0; +} + +/* GRIDMATRIX GB 2312 helpers */ + +/* Convert UTF-8 string to GB 2312 (EUC-CN) and place in array of ints */ +INTERNAL int gb2312_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length, + unsigned int *ddata) { + int error_number; + unsigned int i, length; + unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1)); + + error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/); + if (error_number != 0) { + return error_number; + } + + for (i = 0, length = *p_length; i < length; i++) { + if (utfdata[i] < 0x80) { + ddata[i] = utfdata[i]; + } else { + if (!u_gb2312_int(utfdata[i], ddata + i)) { + return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 810, "Invalid character in input"); + } + } + } + + return 0; +} + +/* If `full_multibyte` set, copy byte input stream to array of ints, putting double-bytes that match GRIDMATRIX + * Chinese mode in a single entry. If `full_multibyte` not set, do a straight copy */ +static void gb2312_cpy(const unsigned char source[], int *p_length, unsigned int *ddata, + const int full_multibyte) { + unsigned int i, j, length; + unsigned char c1, c2; + + if (full_multibyte) { + for (i = 0, j = 0, length = *p_length; i < length; i++, j++) { + if (length - i >= 2) { + c1 = source[i]; + c2 = source[i + 1]; + if (((c1 >= 0xA1 && c1 <= 0xA9) || (c1 >= 0xB0 && c1 <= 0xF7)) && c2 >= 0xA1 && c2 <= 0xFE) { + /* This may or may not be valid GB 2312 (EUC-CN), but don't care as long as it can be encoded in + * GRIDMATRIX Chinese mode */ + ddata[j] = (c1 << 8) | c2; + i++; + } else { + ddata[j] = c1; + } + } else { + ddata[j] = source[i]; + } + } + *p_length = j; + } else { + /* Straight copy */ + for (i = 0, length = *p_length; i < length; i++) { + ddata[i] = source[i]; + } + } +} + +#ifdef ZINT_TEST /* Wrapper for direct testing */ +INTERNAL void gb2312_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata, + const int full_multibyte) { + gb2312_cpy(source, p_length, ddata, full_multibyte); +} +#endif + +/* Call `gb2312_cpy()` for each segment */ +INTERNAL void gb2312_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata, + const int full_multibyte) { + int i; + unsigned int *dd = ddata; + + for (i = 0; i < seg_count; i++) { + gb2312_cpy(segs[i].source, &segs[i].length, dd, full_multibyte); + dd += segs[i].length; + } +} + +/* Convert UTF-8 string to ECI and place in array of ints using `gb2312_cpy()` */ +INTERNAL int gb2312_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata, + const int full_multibyte) { + + if (is_eci_convertible(eci)) { + int error_number; + const int eci_length = get_eci_length(eci, source, *p_length); + unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1); + + error_number = utf8_to_eci(eci, source, converted, p_length); + if (error_number != 0) { + /* Note not setting `symbol->errtxt`, up to caller */ + return error_number; + } + + gb2312_cpy(converted, p_length, ddata, full_multibyte || eci == 29); + } else { + gb2312_cpy(source, p_length, ddata, full_multibyte); + } + + return 0; +} + +/* HANXIN GB 18030 helpers */ + +/* Convert UTF-8 string to GB 18030 and place in array of ints */ +INTERNAL int gb18030_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length, + unsigned int *ddata) { + int error_number, ret; + unsigned int i, j, length; + unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1)); + + error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 0 /*disallow_4byte*/); + if (error_number != 0) { + return error_number; + } + + for (i = 0, j = 0, length = *p_length; i < length; i++, j++) { + if (utfdata[i] < 0x80) { + ddata[j] = utfdata[i]; + } else { + ret = u_gb18030_int(utfdata[i], ddata + j, ddata + j + 1); + if (ret == 0) { /* Should never happen, as GB 18030 is a UTF i.e. maps all Unicode codepoints */ + return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 820, "Invalid character in input"); /* Not reached */ + } + if (ret == 4) { + j++; + } + } + } + + *p_length = j; + + return 0; +} + +/* If `full_multibyte` set, copy byte input stream to array of ints, putting double-bytes that match HANXIN + * Chinese mode in single entry, and quad-bytes in 2 entries. If `full_multibyte` not set, do a straight copy */ +static void gb18030_cpy(const unsigned char source[], int *p_length, unsigned int *ddata, + const int full_multibyte) { + unsigned int i, j, length; + int done; + unsigned char c1, c2, c3, c4; + + if (full_multibyte) { + for (i = 0, j = 0, length = *p_length; i < length; i++, j++) { + done = 0; + c1 = source[i]; + if (length - i >= 2) { + if (c1 >= 0x81 && c1 <= 0xFE) { + c2 = source[i + 1]; + if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0x80 && c2 <= 0xFE)) { + ddata[j] = (c1 << 8) | c2; + i++; + done = 1; + } else if (length - i >= 4 && (c2 >= 0x30 && c2 <= 0x39)) { + c3 = source[i + 2]; + c4 = source[i + 3]; + if ((c3 >= 0x81 && c3 <= 0xFE) && (c4 >= 0x30 && c4 <= 0x39)) { + ddata[j++] = (c1 << 8) | c2; + ddata[j] = (c3 << 8) | c4; + i += 3; + done = 1; + } + } + } + } + if (!done) { + ddata[j] = c1; + } + } + *p_length = j; + } else { + /* Straight copy */ + for (i = 0, length = *p_length; i < length; i++) { + ddata[i] = source[i]; + } + } +} + +#ifdef ZINT_TEST /* Wrapper for direct testing */ +INTERNAL void gb18030_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata, + const int full_multibyte) { + gb18030_cpy(source, p_length, ddata, full_multibyte); +} +#endif + +/* Call `gb18030_cpy()` for each segment */ +INTERNAL void gb18030_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata, + const int full_multibyte) { + int i; + unsigned int *dd = ddata; + + for (i = 0; i < seg_count; i++) { + gb18030_cpy(segs[i].source, &segs[i].length, dd, full_multibyte); + dd += segs[i].length; + } +} + +/* Convert UTF-8 string to ECI and place in array of ints using `gb18030_cpy()` */ +INTERNAL int gb18030_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata, + const int full_multibyte) { + + if (is_eci_convertible(eci)) { + int error_number; + const int eci_length = get_eci_length(eci, source, *p_length); + unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1); + + error_number = utf8_to_eci(eci, source, converted, p_length); + if (error_number != 0) { + /* Note not setting `symbol->errtxt`, up to caller */ + return error_number; + } + + /* GB 18030 (ECI 32) superset of GB 2312 (ECI 29) and GBK (ECI 31) */ + gb18030_cpy(converted, p_length, ddata, full_multibyte || eci == 32 || eci == 29 || eci == 31); + } else { + gb18030_cpy(source, p_length, ddata, full_multibyte); + } + + return 0; +} + +/* vim: set ts=4 sw=4 et : */
