Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/zint/backend/eci.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/zint/backend/eci.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,1134 @@
+/*  eci.c - Extended Channel Interpretations */
+/*
+    libzint - the open source barcode library
+    Copyright (C) 2009-2024 Robin Stuart <rstuart114@gmail.com>
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions
+    are met:
+
+    1. Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+    3. Neither the name of the project nor the names of its contributors
+       may be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+    OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+    OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+    SUCH DAMAGE.
+ */
+/* SPDX-License-Identifier: BSD-3-Clause */
+
+#include <assert.h>
+#include "common.h"
+#include "eci.h"
+#include "eci_sb.h"
+#include "big5.h"
+#include "gb18030.h"
+#include "gb2312.h"
+#include "gbk.h"
+#include "ksx1001.h"
+#include "sjis.h"
+
+/* Single-byte stuff */
+
+/* Common ISO/IEC 8859 part converter for Unicode codepoint `u`.
+   `tab_s` is a bit set (16 codepoints per entry) flagging which of U+00A0-FF map to the identical byte value;
+   `tab_u` and `tab_sb` are parallel arrays (binary searched on `tab_u`, last index `e`) giving the other mappings.
+   Stores the resulting byte in `*dest` and returns 1, or returns 0 if `u` cannot be represented */
+static int u_iso8859(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u,
+            const unsigned char *tab_sb, int e, unsigned char *dest) {
+    int lo = 0;
+
+    if (u < 0x80) { /* ASCII maps to itself */
+        *dest = (unsigned char) u;
+        return 1;
+    }
+    if (u < 0xA0) { /* U+0080-9F fail */
+        return 0;
+    }
+    if (u <= 0xFF) {
+        const unsigned int offset = u - 0xA0;
+        if ((tab_s[offset >> 4] >> (offset & 0xF)) & 1) { /* Straight-thru */
+            *dest = (unsigned char) u;
+            return 1;
+        }
+    }
+
+    /* Binary search of the remaining mappings */
+    while (lo <= e) {
+        const int mid = lo + ((e - lo) >> 1);
+        const unsigned int cand = tab_u[mid];
+        if (cand == u) {
+            *dest = tab_sb[mid];
+            return 1;
+        }
+        if (cand < u) {
+            lo = mid + 1;
+        } else {
+            e = mid - 1;
+        }
+    }
+    return 0;
+}
+
+/* Common Windows-125x converter for Unicode codepoint `u`.
+   Same table layout as the ISO/IEC 8859 routine, except that U+0080-9F are not rejected outright but are looked
+   up in `tab_u`/`tab_sb` like any other non-straight-thru codepoint.
+   Stores the resulting byte in `*dest` and returns 1, or returns 0 if `u` cannot be represented */
+static int u_cp125x(const unsigned int u, const unsigned short *tab_s, const unsigned short *tab_u,
+            const unsigned char *tab_sb, int e, unsigned char *dest) {
+    int first = 0;
+
+    if (u < 0x80) { /* ASCII maps to itself */
+        *dest = (unsigned char) u;
+        return 1;
+    }
+    if (u >= 0xA0 && u <= 0xFF) {
+        const unsigned int word = tab_s[(u - 0xA0) >> 4];
+        const unsigned int mask = (unsigned int) 1 << ((u - 0xA0) & 0xF);
+        if (word & mask) { /* Straight-thru */
+            *dest = (unsigned char) u;
+            return 1;
+        }
+    }
+
+    /* Binary search of the remaining mappings */
+    for (; first <= e; ) {
+        const int mid = (first + e) / 2;
+        const unsigned int cand = tab_u[mid];
+        if (cand > u) {
+            e = mid - 1;
+        } else if (cand < u) {
+            first = mid + 1;
+        } else {
+            *dest = tab_sb[mid];
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/* ECI 27 ASCII (ISO/IEC 646:1991 IRV (US)).
+   Codepoints below U+0080 are stored unchanged in `*dest` (returns 1); anything else is rejected (returns 0) */
+static int u_ascii(const unsigned int u, unsigned char *dest) {
+    if (u >= 0x80) {
+        return 0;
+    }
+    *dest = (unsigned char) u;
+    return 1;
+}
+
+/* ECI 170 ASCII subset (ISO/IEC 646:1991 Invariant): 7-bit ASCII minus the 12 chars that historically had
+    national variants, namely "#$@[\]^`{|}~". Stores `u` in `*dest` and returns 1 if allowed, else returns 0 */
+static int u_ascii_inv(const unsigned int u, unsigned char *dest) {
+    if (u > 0x7F) {
+        return 0;
+    }
+    switch (u) {
+        case '#': case '$': case '@':
+        case '[': case '\\': case ']': case '^': case '`':
+        case '{': case '|': case '}': case '~':
+            return 0; /* National variant positions */
+        default:
+            break;
+    }
+    *dest = (unsigned char) u;
+    return 1;
+}
+
+/* ECI 25 UTF-16 Big Endian (ISO/IEC 10646) - assumes valid Unicode.
+   Writes one 16-bit unit (returns 2) for codepoints below U+10000, otherwise a surrogate pair (returns 4),
+   most significant byte of each unit first */
+static int u_utf16be(const unsigned int u, unsigned char *dest) {
+    if (u >= 0x10000) {
+        const unsigned int offset = u - 0x10000;
+        const unsigned int lead = 0xD800 + (offset >> 10); /* Top bits into the lead surrogate */
+        const unsigned int trail = 0xDC00 + (offset & 0x3FF); /* Low 10 bits into the trail surrogate */
+        dest[0] = (unsigned char) (lead >> 8);
+        dest[1] = (unsigned char) lead;
+        dest[2] = (unsigned char) (trail >> 8);
+        dest[3] = (unsigned char) trail;
+        return 4;
+    }
+    dest[0] = (unsigned char) (u >> 8);
+    dest[1] = (unsigned char) u;
+    return 2;
+}
+
+/* ECI 33 UTF-16 Little Endian (ISO/IEC 10646) - assumes valid Unicode.
+   Writes one 16-bit unit (returns 2) for codepoints below U+10000, otherwise a surrogate pair (returns 4),
+   least significant byte of each unit first */
+static int u_utf16le(const unsigned int u, unsigned char *dest) {
+    if (u >= 0x10000) {
+        const unsigned int offset = u - 0x10000;
+        const unsigned int lead = 0xD800 + (offset >> 10); /* Top bits into the lead surrogate */
+        const unsigned int trail = 0xDC00 + (offset & 0x3FF); /* Low 10 bits into the trail surrogate */
+        dest[0] = (unsigned char) lead;
+        dest[1] = (unsigned char) (lead >> 8);
+        dest[2] = (unsigned char) trail;
+        dest[3] = (unsigned char) (trail >> 8);
+        return 4;
+    }
+    dest[0] = (unsigned char) u;
+    dest[1] = (unsigned char) (u >> 8);
+    return 2;
+}
+
+/* ECI 34 UTF-32 Big Endian (ISO/IEC 10646) - assumes valid Unicode.
+   Always writes 4 bytes, most significant first; the top byte is always written as zero. Returns 4 */
+static int u_utf32be(const unsigned int u, unsigned char *dest) {
+    unsigned int rest = u;
+    int i;
+
+    /* Fill the low 3 bytes from the end backwards */
+    for (i = 3; i > 0; i--) {
+        dest[i] = (unsigned char) rest;
+        rest >>= 8;
+    }
+    dest[0] = 0;
+    return 4;
+}
+
+/* ECI 35 UTF-32 Little Endian (ISO/IEC 10646) - assumes valid Unicode.
+   Always writes 4 bytes, least significant first; the top byte is always written as zero. Returns 4 */
+static int u_utf32le(const unsigned int u, unsigned char *dest) {
+    unsigned int rest = u;
+    int i;
+
+    /* Fill the low 3 bytes from the start forwards */
+    for (i = 0; i < 3; i++) {
+        dest[i] = (unsigned char) rest;
+        rest >>= 8;
+    }
+    dest[3] = 0;
+    return 4;
+}
+
+/* Multibyte stuff */
+
+/* Acknowledgements to Bruno Haible <bruno@clisp.org> for a number of the techniques used here */
+
+/* Helper to lookup Unicode codepoint `u` in the URO (Unified Repertoire and Ordering) block (U+4E00-9FFF).
+   `tab_u` holds one 16-bit presence mask per run of 16 codepoints, `tab_mb_ind` the index into `tab_mb` of each
+   run's first mapped entry. If `u` is mapped, sets `*d` to its multibyte value and returns 2; otherwise returns 0
+   leaving `*d` untouched */
+static int eci_u_lookup_uro_int(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind,
+            const unsigned short *tab_mb, unsigned int *d) {
+    const unsigned int block = (u - 0x4E00) >> 4; /* Blocks of 16 */
+    const unsigned int present = tab_u[block];
+    const unsigned int bit = (unsigned int) 1 << (u & 0xF);
+    unsigned int earlier, rank;
+
+    if (!(present & bit)) {
+        return 0;
+    }
+    /* Position within the block's entries = number of mapped codepoints prior to this one */
+    earlier = present & (bit - 1);
+    for (rank = 0; earlier != 0; rank++) {
+        earlier &= earlier - 1; /* Clear lowest set bit */
+    }
+    *d = tab_mb[tab_mb_ind[block] + rank];
+    return 2;
+}
+
+/* Version of `eci_u_lookup_uro_int()` taking unsigned char destination.
+   On a hit writes the value to `dest` high byte first and passes on the helper's return value; on a miss returns
+   0 without touching `dest` */
+static int eci_u_lookup_uro(const unsigned int u, const unsigned short *tab_u, const unsigned short *tab_mb_ind,
+            const unsigned short *tab_mb, unsigned char *dest) {
+    unsigned int mb;
+    const int len = eci_u_lookup_uro_int(u, tab_u, tab_mb_ind, tab_mb, &mb);
+
+    if (len == 0) {
+        return 0;
+    }
+    dest[0] = (unsigned char) (mb >> 8);
+    dest[1] = (unsigned char) mb;
+    return len;
+}
+
+/* ECI 20 Shift JIS.
+   Converts Unicode codepoint `u`, placing the Shift JIS value in `*d`. Returns the number of bytes that value
+   occupies (1 or 2), or 0 if `u` has no mapping (in which case `*d` is not set by this function) */
+static int u_sjis_int(const unsigned int u, unsigned int *d) {
+    unsigned int u2, dv, md;
+    int s, e;
+
+    if (u < 0x80 && u != 0x5C && u != 0x7E) { /* Backslash & tilde re-mapped according to JIS X 0201 Roman */
+        *d = u;
+        return 1;
+    }
+    /* Special case URO block sequential mappings (considerably lessens size of `sjis_u[]` array) */
+    if (u >= 0x4E00 && u <= 0xDFFF) { /* 0xE000 next used value >= 0x4E00 */
+        if (u >= 0x9FB0) {
+            return 0;
+        }
+        return eci_u_lookup_uro_int(u, sjis_uro_u, sjis_uro_mb_ind, sjis_mb, d);
+    }
+    /* PUA to user-defined (Table 4-86, Lunde, 2nd ed.) */
+    if (u >= 0xE000 && u <= 0xE757) {
+        u2 = u - 0xE000;
+        dv = u2 / 188; /* 188 codepoints per lead byte, lead bytes starting at 0xF0 */
+        md = u2 - dv * 188;
+        *d = ((dv + 0xF0) << 8) | (md + 0x40 + (md >= 0x3F)); /* Trail bytes start at 0x40 and skip 0x7F */
+        return 2;
+    }
+    if (u >= sjis_u[0] && u <= sjis_u[ARRAY_SIZE(sjis_u) - 1]) {
+        s = 0; /* Binary search of `sjis_u[]` for everything else */
+        e = ARRAY_SIZE(sjis_u) - 1;
+        while (s <= e) {
+            const int m = (s + e) >> 1;
+            if (sjis_u[m] < u) {
+                s = m + 1;
+            } else if (sjis_u[m] > u) {
+                e = m - 1;
+            } else {
+                *d = sjis_mb[u >= 0x4E00 ? m + 6356 : m]; /* Adjust for URO block */
+                return 1 + (*d > 0xFF); /* Values above 0xFF take 2 bytes */
+            }
+        }
+    }
+    return 0;
+}
+
+#ifdef ZINT_TEST /* Wrapper for direct testing: only compiled when `ZINT_TEST` is defined, it makes the
+                    file-static `u_sjis_int()` callable as `u_sjis_int_test()`, passing `u` and `d` straight
+                    through and returning its result unchanged */
+INTERNAL int u_sjis_int_test(const unsigned int u, unsigned int *d) {
+    return u_sjis_int(u, d);
+}
+#endif
+
+/* Version of `u_sjis_int()` taking unsigned char destination, for use by `utf8_to_eci()`.
+   Writes 1 byte, or 2 bytes high byte first, according to the length `u_sjis_int()` reports, and returns that
+   length; returns 0 without writing if `u` is unmapped */
+static int u_sjis(const unsigned int u, unsigned char *dest) {
+    unsigned int mb;
+    const int len = u_sjis_int(u, &mb);
+
+    if (len == 0) {
+        return 0;
+    }
+    if (len != 1) { /* Double-byte: lead byte goes first */
+        *dest++ = (unsigned char) (mb >> 8);
+    }
+    *dest = (unsigned char) mb;
+    return len;
+}
+
+/* ECI 28 Big5 Chinese (Taiwan).
+   Converts Unicode codepoint `u` into `dest`. Returns 1 (ASCII, single byte), 2 (double-byte, high byte first),
+   or 0 if `u` has no mapping (nothing is then written by this function) */
+static int u_big5(const unsigned int u, unsigned char *dest) {
+    int s, e;
+
+    if (u < 0x80) {
+        *dest = (unsigned char) u;
+        return 1;
+    }
+    /* Special case URO block sequential mappings (considerably lessens size of `big5_u[]` array) */
+    if (u >= 0x4E00 && u <= 0xFA0B) { /* 0xFA0C next used value >= 0x4E00 */
+        if (u >= 0x9FB0) {
+            return 0;
+        }
+        return eci_u_lookup_uro(u, big5_uro_u, big5_uro_mb_ind, big5_mb, dest);
+    }
+    if (u >= big5_u[0] && u <= big5_u[ARRAY_SIZE(big5_u) - 1]) {
+        s = 0; /* Binary search of `big5_u[]` for everything else */
+        e = ARRAY_SIZE(big5_u) - 1;
+        while (s <= e) {
+            const int m = (s + e) >> 1;
+            if (big5_u[m] < u) {
+                s = m + 1;
+            } else if (big5_u[m] > u) {
+                e = m - 1;
+            } else {
+                const unsigned short mb = big5_mb[u >= 0x4E00 ? m + 13061 : m]; /* Adjust for URO block */
+                dest[0] = (unsigned char) (mb >> 8);
+                dest[1] = (unsigned char) mb;
+                return 2;
+            }
+        }
+    }
+    return 0;
+}
+
+#ifdef ZINT_TEST /* Wrapper for direct testing: only compiled when `ZINT_TEST` is defined, it makes the
+                    file-static `u_big5()` callable as `u_big5_test()`, passing `u` and `dest` straight through
+                    and returning its result unchanged */
+INTERNAL int u_big5_test(const unsigned int u, unsigned char *dest) {
+    return u_big5(u, dest);
+}
+#endif
+
+/* ECI 30 EUC-KR (KS X 1001, formerly KS C 5601) Korean.
+   Converts Unicode codepoint `u` into `dest`. Returns 1 (ASCII, single byte), 2 (double-byte, high byte first),
+   or 0 if `u` has no mapping (nothing is then written by this function) */
+static int u_ksx1001(const unsigned int u, unsigned char *dest) {
+    int s, e;
+
+    if (u < 0x80) {
+        *dest = (unsigned char) u;
+        return 1;
+    }
+    /* Special case URO block sequential mappings (considerably lessens size of `ksx1001_u[]` array) */
+    if (u >= 0x4E00 && u <= 0xABFF) { /* 0xAC00 next used value >= 0x4E00 */
+        if (u >= 0x9FA0) {
+            return 0;
+        }
+        return eci_u_lookup_uro(u, ksx1001_uro_u, ksx1001_uro_mb_ind, ksx1001_mb, dest);
+    }
+    if (u >= ksx1001_u[0] && u <= ksx1001_u[ARRAY_SIZE(ksx1001_u) - 1]) {
+        s = ksx1001_u_ind[(u - ksx1001_u[0]) >> 8]; /* Start index looked up per 0x100 codepoints */
+        e = s + 0x100 > ARRAY_SIZE(ksx1001_u) ? ARRAY_SIZE(ksx1001_u) - 1 : s + 0x100 - 1; /* Window of at most
+                                                                            0x100 entries, clamped to the array */
+        while (s <= e) {
+            const int m = (s + e) >> 1;
+            if (ksx1001_u[m] < u) {
+                s = m + 1;
+            } else if (ksx1001_u[m] > u) {
+                e = m - 1;
+            } else {
+                const unsigned short mb = ksx1001_mb[u >= 0x4E00 ? m + 4620 : m]; /* Adjust for URO block */
+                dest[0] = (unsigned char) (mb >> 8);
+                dest[1] = (unsigned char) mb;
+                return 2;
+            }
+        }
+    }
+    return 0;
+}
+
+#ifdef ZINT_TEST /* Wrapper for direct testing: only compiled when `ZINT_TEST` is defined, it makes the
+                    file-static `u_ksx1001()` callable as `u_ksx1001_test()`, passing `u` and `dest` straight
+                    through and returning its result unchanged */
+INTERNAL int u_ksx1001_test(const unsigned int u, unsigned char *dest) {
+    return u_ksx1001(u, dest);
+}
+#endif
+
+/* ECI 29 GB 2312 Chinese (PRC).
+   Converts Unicode codepoint `u`, placing the GB 2312 value in `*d`. Returns 1 (ASCII), 2 (double-byte), or 0
+   if `u` has no mapping (in which case `*d` is not set by this function) */
+static int u_gb2312_int(const unsigned int u, unsigned int *d) {
+    int s, e;
+
+    if (u < 0x80) {
+        *d = u;
+        return 1;
+    }
+    /* Special case URO block sequential mappings (considerably lessens size of `gb2312_u[]` array) */
+    if (u >= 0x4E00 && u <= 0x9E1E) { /* 0x9E1F next used value >= 0x4E00 */
+        if (u >= 0x9CF0) {
+            return 0;
+        }
+        return eci_u_lookup_uro_int(u, gb2312_uro_u, gb2312_uro_mb_ind, gb2312_mb, d);
+    }
+    if (u >= gb2312_u[0] && u <= gb2312_u[ARRAY_SIZE(gb2312_u) - 1]) {
+        s = gb2312_u_ind[(u - gb2312_u[0]) >> 8]; /* Start index looked up per 0x100 codepoints */
+        e = s + 0x100 > ARRAY_SIZE(gb2312_u) ? ARRAY_SIZE(gb2312_u) - 1 : s + 0x100 - 1; /* Window of at most
+                                                                            0x100 entries, clamped to the array */
+        while (s <= e) {
+            const int m = (s + e) >> 1;
+            if (gb2312_u[m] < u) {
+                s = m + 1;
+            } else if (gb2312_u[m] > u) {
+                e = m - 1;
+            } else {
+                *d = gb2312_mb[u > 0x4E00 ? m + 6627 : m]; /* Adjust for URO block (U+4E00 itself is always
+                                                              handled by the URO branch above, so `>` here acts
+                                                              the same as `>=` would) */
+                return 2;
+            }
+        }
+    }
+    return 0;
+}
+
+#ifdef ZINT_TEST /* Wrapper for direct testing: only compiled when `ZINT_TEST` is defined, it makes the
+                    file-static `u_gb2312_int()` callable as `u_gb2312_int_test()`, passing `u` and `d` straight
+                    through and returning its result unchanged */
+INTERNAL int u_gb2312_int_test(const unsigned int u, unsigned int *d) {
+    return u_gb2312_int(u, d);
+}
+#endif
+
+/* Version of `u_gb2312_int()` taking unsigned char destination, for use by `utf8_to_eci()`.
+   Returns the length reported by `u_gb2312_int()`: 0 (nothing written), 1 (single byte written) or otherwise
+   2 bytes written high byte first */
+static int u_gb2312(const unsigned int u, unsigned char *dest) {
+    unsigned int mb;
+    const int len = u_gb2312_int(u, &mb);
+
+    switch (len) {
+        case 0: /* No mapping */
+            break;
+        case 1:
+            dest[0] = (unsigned char) mb;
+            break;
+        default:
+            dest[0] = (unsigned char) (mb >> 8);
+            dest[1] = (unsigned char) mb;
+            break;
+    }
+    return len;
+}
+
+/* ECI 31 GBK Chinese.
+   Converts Unicode codepoint `u`, placing the GBK value in `*d`. Returns 1 (ASCII), 2 (double-byte), or 0 if
+   `u` has no mapping. GB 2312 mappings are tried first (after 2 codepoints that GBK treats differently), then
+   the GBK-only tables */
+static int u_gbk_int(const unsigned int u, unsigned int *d) {
+    int s, e;
+
+    if (u < 0x80) {
+        *d = u;
+        return 1;
+    }
+
+    /* Check GB 2312 first */
+    if (u == 0x30FB) {
+        /* KATAKANA MIDDLE DOT, mapped by GB 2312 but not by GBK (U+00B7 MIDDLE DOT mapped to 0xA1A4 instead) */
+        return 0;
+    }
+    if (u == 0x2015) {
+        /* HORIZONTAL BAR, mapped to 0xA844 by GBK rather than 0xA1AA (U+2014 EM DASH mapped there instead) */
+        *d = 0xA844;
+        return 2;
+    }
+    if (u_gb2312_int(u, d)) { /* Includes the 2 GB 6345.1-86 corrections given in Table 3-22, Lunde, 2nd ed. */
+        return 2; /* ASCII already dealt with above so any hit here is double-byte */
+    }
+
+    /* Special case URO block sequential mappings (considerably lessens size of `gbk_u[]` array) */
+    if (u >= 0x4E00 && u <= 0xF92B) { /* 0xF92C next used value >= 0x4E00 */
+        if (u >= 0x9FB0) {
+            return 0;
+        }
+        return eci_u_lookup_uro_int(u, gbk_uro_u, gbk_uro_mb_ind, gbk_mb, d);
+    }
+    if (u >= gbk_u[0] && u <= gbk_u[ARRAY_SIZE(gbk_u) - 1]) {
+        s = 0; /* Binary search of `gbk_u[]` for everything else */
+        e = ARRAY_SIZE(gbk_u) - 1;
+        while (s <= e) {
+            const int m = (s + e) >> 1;
+            if (gbk_u[m] < u) {
+                s = m + 1;
+            } else if (gbk_u[m] > u) {
+                e = m - 1;
+            } else {
+                *d = gbk_mb[u >= 0x4E00 ? m + 14139 : m]; /* Adjust for URO block */
+                return 2;
+            }
+        }
+    }
+    return 0;
+}
+
+#ifdef ZINT_TEST /* Wrapper for direct testing: only compiled when `ZINT_TEST` is defined, it makes the
+                    file-static `u_gbk_int()` callable as `u_gbk_int_test()`, passing `u` and `d` straight
+                    through and returning its result unchanged */
+INTERNAL int u_gbk_int_test(const unsigned int u, unsigned int *d) {
+    return u_gbk_int(u, d);
+}
+#endif
+
+/* Version of `u_gbk_int()` taking unsigned char destination, for use by `utf8_to_eci()`.
+   Returns the length reported by `u_gbk_int()`, having written that value to `dest` (single byte if 1, else
+   2 bytes high byte first); nothing is written if the length is 0 */
+static int u_gbk(const unsigned int u, unsigned char *dest) {
+    unsigned int mb;
+    const int len = u_gbk_int(u, &mb);
+
+    if (len == 1) {
+        dest[0] = (unsigned char) mb;
+    } else if (len != 0) {
+        dest[0] = (unsigned char) (mb >> 8);
+        dest[1] = (unsigned char) mb;
+    }
+    return len;
+}
+
+/* Helper for `u_gb18030_int()` to output 4-byte sequential blocks.
+   Treats `u2` as a mixed-radix number (10, 126, 10 from the last byte backwards): the 4th and 2nd bytes are
+   digits offset from 0x30, the 3rd byte is offset from 0x81, and what remains is added to `mb_lead` to give the
+   1st byte. Bytes 1-2 are placed in `*d1`, bytes 3-4 in `*d2`. Always returns 4 */
+static int u_gb18030_4_sequential_int(unsigned int u2, unsigned int mb_lead, unsigned int *d1, unsigned int *d2) {
+    unsigned int byte2, byte3, byte4;
+
+    byte4 = u2 % 10 + 0x30;
+    u2 /= 10;
+    byte3 = u2 % 126 + 0x81;
+    u2 /= 126;
+    byte2 = u2 % 10 + 0x30;
+    u2 /= 10;
+
+    *d2 = (byte3 << 8) | byte4;
+    *d1 = ((u2 + mb_lead) << 8) | byte2;
+    return 4;
+}
+
+/* ECI 32 GB 18030 Chinese - assumes valid Unicode.
+   Converts Unicode codepoint `u`. Returns the length in bytes of the result: 1 (ASCII) or 2, with the value in
+   `*d1`; or 4, with the first 2 bytes in `*d1` and the last 2 bytes in `*d2` (`*d2` is only set for 4-byte
+   results). Anything not resolved by the ASCII, GBK, special-case or 2-byte table lookups falls through to a
+   4-byte sequential encoding, so every return path gives 1, 2 or 4 */
+static int u_gb18030_int(const unsigned int u, unsigned int *d1, unsigned int *d2) {
+    unsigned int u2, dv;
+    int s, e;
+
+    if (u < 0x80) {
+        *d1 = u;
+        return 1;
+    }
+
+    /* Check GBK first */
+    if (u_gbk_int(u, d1)) {
+        return 2; /* ASCII already dealt with above so any hit here is double-byte */
+    }
+
+    if (u >= 0x10000) {
+        /* Non-PUA, non-BMP, see Table 3-37, Lunde, 2nd ed. */
+        if (u == 0x20087) {
+            *d1 = 0xFE51;
+            return 2;
+        }
+        if (u == 0x20089) {
+            *d1 = 0xFE52;
+            return 2;
+        }
+        if (u == 0x200CC) {
+            *d1 = 0xFE53;
+            return 2;
+        }
+        if (u == 0x215D7) {
+            *d1 = 0xFE6C;
+            return 2;
+        }
+        if (u == 0x2298F) {
+            *d1 = 0xFE76;
+            return 2;
+        }
+        if (u == 0x241FE) {
+            *d1 = 0xFE91;
+            return 2;
+        }
+        /* All other non-BMP U+10000-10FFFF */
+        return u_gb18030_4_sequential_int(u - 0x10000, 0x90, d1, d2);
+    }
+    if (u >= 0xE000 && u <= 0xE765) { /* PUA to user-defined */
+        if (u <= 0xE4C5) {
+            u2 = u - 0xE000;
+            dv = u2 / 94; /* 94 codepoints per lead byte */
+            *d1 = ((dv + (dv < 6 ? 0xAA : 0xF2)) << 8) | (u2 - dv * 94 + 0xA1); /* 1st 6 leads from 0xAA, the
+                                                                                  rest from 0xF8 (6 + 0xF2) */
+        } else {
+            unsigned int md;
+            u2 = u - 0xE4C6;
+            dv = u2 / 96; /* 96 codepoints per lead byte, lead bytes starting at 0xA1 */
+            md = u2 - dv * 96;
+            *d1 = ((dv + 0xA1) << 8) | (md + 0x40 + (md >= 0x3F)); /* Trail bytes start at 0x40 and skip 0x7F */
+        }
+        return 2;
+    }
+    if (u >= gb18030_2_u[0] && u <= gb18030_2_u[ARRAY_SIZE(gb18030_2_u) - 1]) {
+        s = 0; /* Binary search of the remaining 2-byte mappings */
+        e = ARRAY_SIZE(gb18030_2_u) - 1;
+        while (s <= e) {
+            const int m = (s + e) >> 1;
+            if (gb18030_2_u[m] < u) {
+                s = m + 1;
+            } else if (gb18030_2_u[m] > u) {
+                e = m - 1;
+            } else {
+                *d1 = gb18030_2_mb[m];
+                return 2;
+            }
+        }
+    }
+    /* All other BMP U+0080-FFFF */
+    if (u == 0xE7C7) { /* PUA change to non-PUA, see Table 3-39, Lunde, 2nd ed. */
+        *d1 = 0x8135;
+        *d2 = 0xF437;
+        return 4;
+    }
+    s = 0;
+    e = ARRAY_SIZE(gb18030_4_u_e) - 1;
+    while (s < e) { /* Lower bound */
+        const int m = (s + e) >> 1;
+        if (gb18030_4_u_e[m] < u) {
+            s = m + 1;
+        } else {
+            e = m;
+        }
+    }
+    assert(s < ARRAY_SIZE(gb18030_4_u_e));
+    return u_gb18030_4_sequential_int(u - gb18030_4_mb_o[s] - 0x80, 0x81, d1, d2); /* `s` selects the offset to
+                                                                        subtract for the range containing `u` */
+}
+
+#ifdef ZINT_TEST /* Wrapper for direct testing: only compiled when `ZINT_TEST` is defined, it makes the
+                    file-static `u_gb18030_int()` callable as `u_gb18030_int_test()`, passing `u`, `d1` and `d2`
+                    straight through and returning its result unchanged */
+INTERNAL int u_gb18030_int_test(const unsigned int u, unsigned int *d1, unsigned int *d2) {
+    return u_gb18030_int(u, d1, d2);
+}
+#endif
+
+/* Version of `u_gb18030_int()` taking unsigned char destination, for use by `utf8_to_eci()`.
+   Returns the length reported by `u_gb18030_int()`, having written that many bytes to `dest`: a single byte if 1,
+   otherwise the first value high byte first, followed by the second value likewise if the length is 4 */
+static int u_gb18030(const unsigned int u, unsigned char *dest) {
+    unsigned int head, tail;
+    const int len = u_gb18030_int(u, &head, &tail);
+
+    switch (len) {
+        case 0: /* No mapping */
+            break;
+        case 1:
+            dest[0] = (unsigned char) head;
+            break;
+        case 4:
+            dest[2] = (unsigned char) (tail >> 8);
+            dest[3] = (unsigned char) tail;
+            /* Fallthrough to write the first 2 bytes */
+        default:
+            dest[0] = (unsigned char) (head >> 8);
+            dest[1] = (unsigned char) head;
+            break;
+    }
+    return len;
+}
+
+/* Main ECI stuff */
+
+/* Helper to count the number of chars in the first `length` bytes of `string` that lie within the inclusive
+   range `c1` to `c2` */
+static int chr_range_cnt(const unsigned char string[], const int length, const unsigned char c1,
+            const unsigned char c2) {
+    int count = 0;
+    int i = 0;
+
+    while (i < length) {
+        const unsigned char ch = string[i++];
+        /* The lower bound test is trivially true when `c1` is zero */
+        if (ch >= c1 && ch <= c2) {
+            count++;
+        }
+    }
+    return count;
+}
+
+/* Is ECI convertible from UTF-8? Returns 1 if so, 0 if not */
+INTERNAL int is_eci_convertible(const int eci) {
+    if (eci == 170) { /* ASCII Invariant is the one convertible ECI above 35 */
+        return 1;
+    }
+    /* Not convertible: UTF-8 (26) or 8-bit binary data (899) or undefined (> 35 and < 899) or not character set
+       (> 899) */
+    return eci != 26 && eci <= 35;
+}
+
+/* Are any of the ECIs in the segments convertible from UTF-8?
+   Sets `convertible[]` for each, which must be at least `seg_count` in size.
+   Returns non-zero if at least one segment is convertible */
+INTERNAL int is_eci_convertible_segs(const struct zint_seg segs[], const int seg_count, int convertible[]) {
+    int any = 0;
+    int i = 0;
+
+    while (i < seg_count) {
+        const int ok = is_eci_convertible(segs[i].eci);
+        convertible[i] = ok;
+        any |= ok;
+        i++;
+    }
+    return any;
+}
+
+/* Calculate length required to convert UTF-8 to (double-byte) encoding.
+   Returns `length` increased as needed for those ECIs whose output can be longer than the UTF-8 input, and
+   `length` unchanged for all others */
+INTERNAL int get_eci_length(const int eci, const unsigned char source[], int length) {
+    switch (eci) {
+        case 20: /* Shift JIS */
+            /* Only ASCII backslash (reverse solidus) exceeds UTF-8 length */
+            length += chr_cnt(source, length, '\\');
+            break;
+
+        case 25: /* UTF-16 Big Endian */
+        case 33: /* UTF-16 Little Endian */
+            /* All ASCII chars take 2 bytes */
+            length += chr_range_cnt(source, length, 0, 0x7F);
+            /* Surrogate pairs are 4 UTF-8 bytes long so fit */
+            break;
+
+        case 32: /* GB 18030 */
+            /* Allow for GB 18030 4 byters */
+            length *= 2;
+            break;
+
+        case 34: /* UTF-32 Big Endian */
+        case 35: /* UTF-32 Little Endian */
+            /* Quadruple-up ASCII and double-up non-ASCII */
+            length += chr_range_cnt(source, length, 0, 0x7F) * 2 + length;
+            break;
+
+        default:
+            /* Big5, GB 2312, EUC-KR and GBK fit in UTF-8 length */
+            break;
+    }
+
+    return length;
+}
+
+/* Call `get_eci_length()` for each segment, returning total */
+INTERNAL int get_eci_length_segs(const struct zint_seg segs[], const int seg_count) {
+    int total = 0;
+    int i = 0;
+
+    while (i < seg_count) {
+        const struct zint_seg *const seg = &segs[i++];
+        total += get_eci_length(seg->eci, seg->source, seg->length);
+    }
+
+    return total;
+}
+
+/* Convert UTF-8 to other character encodings.
+   Decodes `source` (length `*p_length`) and writes the equivalent in the encoding selected by `eci` to `dest`,
+   NUL-terminating it and setting `*p_length` to the number of bytes written (excluding the NUL). `dest` must be
+   large enough for the result plus NUL; this function does no size checking of its own (see `get_eci_length()`).
+   Returns 0 on success, or `ZINT_ERROR_INVALID_DATA` if the UTF-8 is invalid/truncated, if a codepoint cannot be
+   represented in the encoding, or if `eci` has no converter (including any `eci` outside the conversion table).
+   On error `*p_length` is left unchanged, though `dest` may have been partially written */
+typedef int (*eci_func_t)(const unsigned int u, unsigned char *dest);
+INTERNAL int utf8_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *p_length) {
+
+    static const eci_func_t eci_funcs[36] = {
+                NULL,         NULL,         NULL,         NULL,  u_iso8859_2, /*0-4*/
+         u_iso8859_3,  u_iso8859_4,  u_iso8859_5,  u_iso8859_6,  u_iso8859_7, /*5-9*/
+         u_iso8859_8,  u_iso8859_9, u_iso8859_10, u_iso8859_11,         NULL, /*10-14*/
+        u_iso8859_13, u_iso8859_14, u_iso8859_15, u_iso8859_16,         NULL, /*15-19*/
+              u_sjis,     u_cp1250,     u_cp1251,     u_cp1252,     u_cp1256, /*20-24*/
+           u_utf16be,         NULL,      u_ascii,       u_big5,     u_gb2312, /*25-29*/
+           u_ksx1001,        u_gbk,    u_gb18030,    u_utf16le,    u_utf32be, /*30-34*/
+           u_utf32le,
+    };
+    eci_func_t eci_func;
+    unsigned int codepoint, state = 0;
+    int in_posn = 0;
+    int out_posn = 0;
+    int length = *p_length;
+
+    /* Special case ISO/IEC 8859-1 */
+    if (eci == 0 || eci == 3) { /* Default ECI 0 to ISO/IEC 8859-1 */
+        while (in_posn < length) {
+            /* Feed bytes until a codepoint completes (state 0), the decoder rejects (state 12) or input ends */
+            do {
+                decode_utf8(&state, &codepoint, source[in_posn++]);
+            } while (in_posn < length && state != 0 && state != 12);
+            if (state != 0) {
+                return ZINT_ERROR_INVALID_DATA;
+            }
+            /* Only U+0000-7F and U+00A0-FF allowed, which map straight-thru */
+            if (codepoint >= 0x80 && (codepoint < 0xA0 || codepoint >= 0x100)) {
+                return ZINT_ERROR_INVALID_DATA;
+            }
+            dest[out_posn++] = (unsigned char) codepoint;
+        }
+        dest[out_posn] = '\0';
+        *p_length = out_posn;
+        return 0;
+    }
+
+    if (eci == 170) { /* ASCII Invariant (archaic subset) */
+        eci_func = u_ascii_inv;
+    } else {
+        /* Guard the table lookup: an ECI outside the table has no converter and must not be used as an index */
+        if (eci < 0 || eci >= (int) ARRAY_SIZE(eci_funcs)) {
+            return ZINT_ERROR_INVALID_DATA;
+        }
+        eci_func = eci_funcs[eci];
+        if (eci_func == NULL) { /* Reserved, or UTF-8 (26) */
+            return ZINT_ERROR_INVALID_DATA;
+        }
+    }
+
+    while (in_posn < length) {
+        int incr;
+        /* Feed bytes until a codepoint completes (state 0), the decoder rejects (state 12) or input ends */
+        do {
+            decode_utf8(&state, &codepoint, source[in_posn++]);
+        } while (in_posn < length && state != 0 && state != 12);
+        if (state != 0) {
+            return ZINT_ERROR_INVALID_DATA;
+        }
+        incr = (*eci_func)(codepoint, dest + out_posn); /* No. of bytes written, 0 if not representable */
+        if (incr == 0) {
+            return ZINT_ERROR_INVALID_DATA;
+        }
+        out_posn += incr;
+    }
+    dest[out_posn] = '\0';
+    *p_length = out_posn;
+
+    return 0;
+}
+
+/* Find the lowest single-byte ECI mode which will encode a given set of Unicode text, assuming valid UTF-8.
+   Tries ECI 3 upwards through 24 using `utf8_to_eci()` on a scratch buffer, skipping 14 and 19 (reserved) and
+   20 (Shift JIS), and returns the first ECI that converts without error. Returns 26 (UTF-8) if none do */
+INTERNAL int get_best_eci(const unsigned char source[], int length) {
+    int eci = 3;
+    /* Note: attempting single-byte conversions only, so get_eci_length() unnecessary */
+    unsigned char *local_source = (unsigned char *) z_alloca(length + 1);
+
+    do {
+        if (eci == 14) { /* Reserved */
+            eci = 15;
+        } else if (eci == 19) { /* Reserved */
+            eci = 21; /* Skip 20 Shift JIS */
+        }
+        if (utf8_to_eci(eci, source, local_source, &length) == 0) { /* Converted output itself is not used */
+            return eci;
+        }
+        eci++;
+    } while (eci < 25);
+
+    assert(is_valid_utf8(source, length));
+
+    return 26; /* If all of these fail, use UTF-8! */
+}
+
+/* Call `get_best_eci()` for each segment, assuming valid UTF-8. Returns 0 on failure, first ECI set on success.
+   Only segments whose ECI is 0 are considered. A segment whose best ECI is the symbology's default is left at 0
+   unless it follows a segment with a different, non-zero ECI. `symbol->eci` is only updated when the 1st ECI set
+   is on the 1st segment */
+INTERNAL int get_best_eci_segs(struct zint_symbol *symbol, struct zint_seg segs[], const int seg_count) {
+    int default_eci = 3;
+    int first_eci_set = 0;
+    int i;
+
+    if (symbol->symbology == BARCODE_GRIDMATRIX) {
+        default_eci = 29;
+    } else if (symbol->symbology == BARCODE_UPNQR) {
+        default_eci = 4;
+    }
+
+    for (i = 0; i < seg_count; i++) {
+        int eci;
+
+        if (segs[i].eci != 0) { /* Already has an ECI */
+            continue;
+        }
+        eci = get_best_eci(segs[i].source, segs[i].length);
+
+        /* No need to set the default explicitly unless the previous segment switched away from it */
+        if (eci == default_eci && (i == 0 || segs[i - 1].eci == 0 || segs[i - 1].eci == default_eci)) {
+            continue;
+        }
+
+        segs[i].eci = eci;
+        if (first_eci_set == 0) {
+            first_eci_set = eci;
+            if (i == 0) { /* Can only get here for the 1st segment if non-default */
+                symbol->eci = eci;
+            }
+        }
+    }
+
+    return first_eci_set;
+}
+
+/* QRCODE Shift JIS helpers */
+
+/* Convert UTF-8 string to Shift JIS and place in array of ints.
+   Decodes `source` to codepoints via `utf8_to_unicode()` (4-byte UTF-8 disallowed), returning its error code if
+   it fails, then converts each codepoint with `u_sjis_int()` into the matching entry of `ddata`.
+   Returns 0 on success, or `ZINT_ERROR_INVALID_DATA` with error text 800 set if a codepoint has no Shift JIS
+   mapping. NOTE(review): `*p_length` is presumably updated by `utf8_to_unicode()` to the number of codepoints,
+   as it is re-read for the conversion loop - confirm against that function */
+INTERNAL int sjis_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
+                unsigned int *ddata) {
+    int error_number;
+    unsigned int i, length;
+    unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
+
+    error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/);
+    if (error_number != 0) {
+        return error_number;
+    }
+
+    for (i = 0, length = *p_length; i < length; i++) {
+        if (!u_sjis_int(utfdata[i], ddata + i)) { /* Returns 0 if no mapping */
+            return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 800, "Invalid character in input");
+        }
+    }
+
+    return 0;
+}
+
+/* If `full_multibyte` set, copy byte input stream to array of ints, putting double-bytes that match QR Kanji mode in
+ * a single entry. If `full_multibyte` not set, do a straight copy.
+ * `*p_length` is the no. of bytes in `source` on entry; when `full_multibyte` is set it is updated to the no. of
+ * entries placed in `ddata`, otherwise it is left as is */
+INTERNAL void sjis_cpy(const unsigned char source[], int *p_length, unsigned int *ddata, const int full_multibyte) {
+    const unsigned int length = *p_length;
+    unsigned int in = 0, out = 0;
+
+    if (!full_multibyte) {
+        /* Straight copy */
+        for (; in < length; in++) {
+            ddata[in] = source[in];
+        }
+        return;
+    }
+
+    while (in < length) {
+        const unsigned int lead = source[in++];
+        unsigned int entry = lead;
+
+        /* Now using stricter interpretation of standard, and excluding certain trailing bytes */
+        if (((lead >= 0x81 && lead <= 0x9F) || (lead >= 0xE0 && lead <= 0xEB)) && in < length) {
+            const unsigned int trail = source[in];
+            if (trail >= 0x40 && trail <= 0xFC && trail != 0x7F && (lead != 0xEB || trail <= 0xBF)) {
+                /* This may or may not be valid Shift JIS, but don't care as long as it can be encoded in
+                 * QR Kanji mode */
+                entry = (lead << 8) | trail;
+                in++;
+            }
+        }
+        ddata[out++] = entry;
+    }
+    *p_length = out;
+}
+
+/* Call `sjis_cpy()` for each segment, placing each segment's output in `ddata` directly after the previous one's */
+INTERNAL void sjis_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
+                const int full_multibyte) {
+    unsigned int *out = ddata;
+    int i = 0;
+
+    while (i < seg_count) {
+        struct zint_seg *const seg = &segs[i++];
+        sjis_cpy(seg->source, &seg->length, out, full_multibyte);
+        out += seg->length; /* Length as left by `sjis_cpy()` */
+    }
+}
+
+/* Convert UTF-8 string to ECI and place in array of ints using `sjis_cpy()`.
+   If `eci` is convertible (see `is_eci_convertible()`), `source` is first converted with `utf8_to_eci()` into a
+   scratch buffer sized by `get_eci_length()`, and that is what gets copied; otherwise `source` is copied as is.
+   Returns 0 on success, or the error code from `utf8_to_eci()` if the conversion fails */
+INTERNAL int sjis_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
+                const int full_multibyte) {
+
+    if (is_eci_convertible(eci)) {
+        int error_number;
+        const int eci_length = get_eci_length(eci, source, *p_length);
+        unsigned char *converted = (unsigned char *) z_alloca(eci_length + 1);
+
+        error_number = utf8_to_eci(eci, source, converted, p_length);
+        if (error_number != 0) {
+            /* Note not setting `symbol->errtxt`, up to caller */
+            return error_number;
+        }
+
+        sjis_cpy(converted, p_length, ddata, full_multibyte || eci == 20); /* Double-bytes always combined for
+                                                                              Shift JIS (ECI 20) */
+    } else {
+        sjis_cpy(source, p_length, ddata, full_multibyte);
+    }
+
+    return 0;
+}
+
+/* GRIDMATRIX GB 2312 helpers */
+
+/* Convert UTF-8 string to GB 2312 (EUC-CN) and place in array of ints.
+   Decodes `source` to codepoints via `utf8_to_unicode()` (4-byte UTF-8 disallowed), returning its error code if
+   it fails, then fills the matching entry of `ddata` for each codepoint: ASCII is copied directly, anything else
+   is converted with `u_gb2312_int()`.
+   Returns 0 on success, or `ZINT_ERROR_INVALID_DATA` with error text 810 set if a codepoint has no GB 2312
+   mapping. NOTE(review): `*p_length` is presumably updated by `utf8_to_unicode()` to the number of codepoints,
+   as it is re-read for the conversion loop - confirm against that function */
+INTERNAL int gb2312_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
+                unsigned int *ddata) {
+    int error_number;
+    unsigned int i, length;
+    unsigned int *utfdata = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
+
+    error_number = utf8_to_unicode(symbol, source, utfdata, p_length, 1 /*disallow_4byte*/);
+    if (error_number != 0) {
+        return error_number;
+    }
+
+    for (i = 0, length = *p_length; i < length; i++) {
+        if (utfdata[i] < 0x80) { /* ASCII shortcut (`u_gb2312_int()` would give the same result) */
+            ddata[i] = utfdata[i];
+        } else {
+            if (!u_gb2312_int(utfdata[i], ddata + i)) { /* Returns 0 if no mapping */
+                return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 810, "Invalid character in input");
+            }
+        }
+    }
+
+    return 0;
+}
+
+/* Copy `*p_length` bytes of `source` into `ddata`, one byte per entry. If `full_multibyte` is set, each byte pair
+ * that fits GRIDMATRIX Chinese mode is instead merged into a single entry (first byte in the high 8 bits) and
+ * `*p_length` is updated to the number of entries written */
+static void gb2312_cpy(const unsigned char source[], int *p_length, unsigned int *ddata,
+                const int full_multibyte) {
+    const unsigned int src_len = *p_length;
+    unsigned int in_pos = 0, out_pos = 0;
+
+    if (!full_multibyte) {
+        /* Straight copy, `*p_length` unchanged */
+        for (; in_pos < src_len; in_pos++) {
+            ddata[in_pos] = source[in_pos];
+        }
+        return;
+    }
+
+    while (in_pos < src_len) {
+        const unsigned char lead = source[in_pos++];
+        unsigned int entry = lead;
+
+        if (in_pos < src_len && ((lead >= 0xA1 && lead <= 0xA9) || (lead >= 0xB0 && lead <= 0xF7))) {
+            const unsigned char trail = source[in_pos];
+            if (trail >= 0xA1 && trail <= 0xFE) {
+                /* Needn't be valid GB 2312 (EUC-CN), just encodable in GRIDMATRIX Chinese mode */
+                entry = (lead << 8) | trail;
+                in_pos++;
+            }
+        }
+        ddata[out_pos++] = entry;
+    }
+    *p_length = out_pos;
+}
+
+#ifdef ZINT_TEST /* Test-only wrapper so that static `gb2312_cpy()` can be called directly */
+INTERNAL void gb2312_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
+                const int full_multibyte) {
+    gb2312_cpy(source, p_length, ddata, full_multibyte); /* Arguments passed through unchanged */
+}
+#endif /* ZINT_TEST */
+
+/* Run `gb2312_cpy()` over each segment in turn, placing their entries one after another in `ddata` */
+INTERNAL void gb2312_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
+                const int full_multibyte) {
+    struct zint_seg *seg = segs;
+    int remaining; /* Segments still to copy */
+
+    for (remaining = seg_count; remaining > 0; remaining--, seg++) {
+        gb2312_cpy(seg->source, &seg->length, ddata, full_multibyte);
+        ddata += seg->length; /* Step over the entries just written (`gb2312_cpy()` may have reduced `length`) */
+    }
+}
+
+/* Convert UTF-8 `source` to the charset of `eci` if `is_eci_convertible()`, then fill `ddata` via `gb2312_cpy()`.
+ * Returns 0, or the `utf8_to_eci()` error code (in which case setting `symbol->errtxt` is up to the caller) */
+INTERNAL int gb2312_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
+                const int full_multibyte) {
+    unsigned char *eci_buf;
+    int ret;
+
+    if (!is_eci_convertible(eci)) {
+        /* No conversion for this ECI, so copy the input bytes as given */
+        gb2312_cpy(source, p_length, ddata, full_multibyte);
+        return 0;
+    }
+
+    eci_buf = (unsigned char *) z_alloca(get_eci_length(eci, source, *p_length) + 1);
+    ret = utf8_to_eci(eci, source, eci_buf, p_length);
+    if (ret != 0) {
+        return ret;
+    }
+    /* Converted ECI 29 data is always copied as full multibyte, whatever the caller's flag */
+    gb2312_cpy(eci_buf, p_length, ddata, eci == 29 || full_multibyte);
+    return 0;
+}
+
+/* HANXIN GB 18030 helpers */
+
+/* Convert UTF-8 string to Unicode codepoints, then place each in `ddata` as GB 18030: ASCII as is, all else via
+ * `u_gb18030_int()`, which is given 2 entries to fill. Sets `*p_length` to the number of entries used */
+INTERNAL int gb18030_utf8(struct zint_symbol *symbol, const unsigned char source[], int *p_length,
+                unsigned int *ddata) {
+    unsigned int *const codepoints = (unsigned int *) z_alloca(sizeof(unsigned int) * (*p_length + 1));
+    unsigned int *out = ddata; /* Next entry to fill */
+    unsigned int idx, count;
+    const int ret = utf8_to_unicode(symbol, source, codepoints, p_length, 0 /*disallow_4byte*/);
+
+    if (ret != 0) {
+        return ret;
+    }
+
+    count = *p_length; /* Entry count as left by `utf8_to_unicode()` */
+    for (idx = 0; idx < count; idx++, out++) {
+        const unsigned int u = codepoints[idx];
+        const int gb_len = u < 0x80 ? 1 : u_gb18030_int(u, out, out + 1);
+        if (gb_len == 0) { /* Should never happen, as GB 18030 is a UTF i.e. maps all Unicode codepoints */
+            return errtxt(ZINT_ERROR_INVALID_DATA, symbol, 820, "Invalid character in input"); /* Not reached */
+        }
+        if (u < 0x80) {
+            *out = u;
+        } else if (gb_len == 4) {
+            out++; /* 4-byte result occupies a second entry */
+        }
+    }
+
+    *p_length = (int) (out - ddata);
+    return 0;
+}
+
+/* Copy `*p_length` bytes of `source` into `ddata`, one byte per entry. If `full_multibyte` is set, byte sequences
+ * that match HANXIN double-byte or quad-byte patterns are instead packed 2 bytes per entry (so 1 or 2 entries,
+ * first byte of each pair in the high 8 bits) and `*p_length` is updated to the number of entries written */
+static void gb18030_cpy(const unsigned char source[], int *p_length, unsigned int *ddata,
+                const int full_multibyte) {
+    const unsigned int src_len = *p_length;
+    unsigned int in_pos = 0, out_pos = 0;
+
+    if (!full_multibyte) {
+        /* Straight copy, `*p_length` unchanged */
+        for (; in_pos < src_len; in_pos++) {
+            ddata[in_pos] = source[in_pos];
+        }
+        return;
+    }
+
+    while (in_pos < src_len) {
+        const unsigned char *const p = source + in_pos;
+        const unsigned int avail = src_len - in_pos; /* Bytes left, counting `p[0]` */
+        unsigned int used = 1; /* Bytes consumed this round: 1, 2 or 4 */
+
+        /* Lead byte 0x81-0xFE may start a double-byte (trail 0x40-0x7E, 0x80-0xFE) or quad-byte sequence */
+        if (avail >= 2 && p[0] >= 0x81 && p[0] <= 0xFE) {
+            if ((p[1] >= 0x40 && p[1] <= 0x7E) || (p[1] >= 0x80 && p[1] <= 0xFE)) {
+                used = 2;
+            } else if (avail >= 4 && p[1] >= 0x30 && p[1] <= 0x39
+                        && p[2] >= 0x81 && p[2] <= 0xFE && p[3] >= 0x30 && p[3] <= 0x39) {
+                used = 4;
+            }
+        }
+
+        if (used == 1) {
+            ddata[out_pos++] = p[0];
+        } else {
+            ddata[out_pos++] = (p[0] << 8) | p[1];
+            if (used == 4) {
+                ddata[out_pos++] = (p[2] << 8) | p[3];
+            }
+        }
+        in_pos += used;
+    }
+    *p_length = out_pos;
+}
+
+#ifdef ZINT_TEST /* Test-only wrapper so that static `gb18030_cpy()` can be called directly */
+INTERNAL void gb18030_cpy_test(const unsigned char source[], int *p_length, unsigned int *ddata,
+                const int full_multibyte) {
+    gb18030_cpy(source, p_length, ddata, full_multibyte); /* Arguments passed through unchanged */
+}
+#endif /* ZINT_TEST */
+
+/* Run `gb18030_cpy()` over each segment in turn, placing their entries one after another in `ddata` */
+INTERNAL void gb18030_cpy_segs(struct zint_seg segs[], const int seg_count, unsigned int *ddata,
+                const int full_multibyte) {
+    struct zint_seg *seg = segs;
+    int remaining; /* Segments still to copy */
+
+    for (remaining = seg_count; remaining > 0; remaining--, seg++) {
+        gb18030_cpy(seg->source, &seg->length, ddata, full_multibyte);
+        ddata += seg->length; /* Step over the entries just written (`gb18030_cpy()` may have reduced `length`) */
+    }
+}
+
+/* Convert UTF-8 `source` to the charset of `eci` if `is_eci_convertible()`, then fill `ddata` via `gb18030_cpy()`.
+ * Returns 0, or the `utf8_to_eci()` error code (in which case setting `symbol->errtxt` is up to the caller) */
+INTERNAL int gb18030_utf8_to_eci(const int eci, const unsigned char source[], int *p_length, unsigned int *ddata,
+                const int full_multibyte) {
+    unsigned char *eci_buf;
+    int ret;
+
+    if (!is_eci_convertible(eci)) {
+        /* No conversion for this ECI, so copy the input bytes as given */
+        gb18030_cpy(source, p_length, ddata, full_multibyte);
+        return 0;
+    }
+
+    eci_buf = (unsigned char *) z_alloca(get_eci_length(eci, source, *p_length) + 1);
+    ret = utf8_to_eci(eci, source, eci_buf, p_length);
+    if (ret != 0) {
+        return ret;
+    }
+    /* GB 18030 (ECI 32) superset of GB 2312 (ECI 29) and GBK (ECI 31), so all three force full multibyte */
+    gb18030_cpy(eci_buf, p_length, ddata, eci == 29 || eci == 31 || eci == 32 || full_multibyte);
+
+    return 0;
+}
+
+/* vim: set ts=4 sw=4 et : */
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children