Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/zint/backend/tools/gen_eci_mb_h.php @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/zint/backend/tools/gen_eci_mb_h.php Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,720 @@ +<?php +/* Generate ECI multibyte tables from unicode.org mapping files */ +/* + libzint - the open source barcode library + Copyright (C) 2022-2023 Robin Stuart <rstuart114@gmail.com> +*/ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * To create "backend/eci_big5/gb18030/gb2312/gbk/ksx1001/sjis.h" (from project root directory): + * + * php backend/tools/gen_eci_mb_h.php + * + * NOTE: backend/tools/data/GB18030.TXT will have to be downloaded first from the tarball + * https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2 + * using the version jdk-1.4.2/GB18030.TXT + * + * NOTE: tools/data/GB2312.TXT will have to be downloaded first from the tarball + * https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2 + * using the version unicode.org-mappings/EASTASIA/GB/GB2312.TXT + */ +// 'zend.assertions' should set to 1 in php.ini + +$copyright_text = <<<'EOD' + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. Neither the name of the project nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + */ +/* SPDX-License-Identifier: BSD-3-Clause */ + +EOD; + +$basename = basename(__FILE__); +$dirname = dirname(__FILE__); + +$opts = getopt('d:o:'); +$data_dirname = isset($opts['d']) ? $opts['d'] : ($dirname . '/data'); // Where to load file from. +$out_dirname = isset($opts['o']) ? $opts['o'] : ($dirname . '/..'); // Where to put output. + +$year = 2022; + +function out_header(&$out, $name, $descr, $file, $start_year = 0, $extra_comment = '') { + global $copyright_text, $basename, $year; + $caps = strtoupper($name); + $out[] = '/* ' . $name . '.h - tables for Unicode to ' . $descr . ', generated by "backend/tools/' . $basename . '"'; + if ($extra_comment !== '') { + $out[] = ' from "' . $file . '"'; + $out[] = ' ' . $extra_comment . ' */'; + } else { + $out[] = ' from "' . $file . '" */'; + } + $out[] = '/*'; + $out[] = ' libzint - the open source barcode library'; + if ($start_year && $start_year != $year) { + $out[] = ' Copyright (C) ' . $start_year . '-' . $year . ' Robin Stuart <rstuart114@gmail.com>'; + } else { + $out[] = ' Copyright (C) ' . $year . ' Robin Stuart <rstuart114@gmail.com>'; + } + $out = array_merge($out, explode("\n", $copyright_text)); + $out[] = '#ifndef Z_' . $caps . '_H'; + $out[] = '#define Z_' . $caps . '_H'; +} + +/* Output a block of table entries to `$out` array */ +function out_tab_entries(&$out, $arr, $cnt, $not_hex = false) { + $line = ' '; + for ($i = 0; $i < $cnt; $i++) { + if ($i && $i % 8 === 0) { + $out[] = $line; + $line = ' '; + } + if ($not_hex) { + $line .= sprintf(' %5d,', $arr[$i]); + } else { + $line .= sprintf(' 0x%04X,', $arr[$i]); + } + } + if ($line !== ' ') { + $out[] = $line; + } +} + +/* Output tables to `$out` array */ +function out_tabs(&$out, $name, $sort, $mb, $no_u_ind = false, $u_comment = '', $mb_comment = '') { + if ($u_comment == '') $u_comment = 'Unicode codepoints sorted'; + $cnt_sort = count($sort); + $out[] = ''; + $out[] = '/* ' . $u_comment . ' */'; + $out[] = 'static const unsigned short ' . $name . '_u[' . $cnt_sort . '] = {'; + out_tab_entries($out, $sort, $cnt_sort); + $out[] = '};'; + + if (!empty($mb)) { + if ($mb_comment == '') $mb_comment = 'Multibyte values sorted in Unicode order'; + $cnt = count($mb); + $out[] = ''; + $out[] = '/* ' . $mb_comment . ' */'; + $out[] = 'static const unsigned short ' . $name . '_mb[' . $cnt . '] = {'; + $line = ' '; + out_tab_entries($out, $mb, $cnt); + $out[] = '};'; + } + if (!$no_u_ind) { + $ind_cnt = ($sort[$cnt_sort - 1] >> 8) + 1; + $out[] = ''; + $out[] = '/* Indexes into Unicode `' . $name . '_u[]` array in blocks of 0x100 */'; + $ind_idx = count($out); + $out[] = 'static const unsigned short ' . $name . '_u_ind[] = {'; + $line = ' '; + $i = 0; + foreach ($sort as $ind => $u) { + $div = ($u - $sort[0]) >> 8; + while ($div >= $i) { + if ($i && $i % 8 === 0) { + $out[] = $line; + $line = ' '; + } + $line .= sprintf(' %5d,', $ind); + $i++; + } + } + if ($line !== ' ') { + $out[] = $line; + $line = ' '; + } + $out[] = '};'; + $out[$ind_idx] = 'static const unsigned short ' . $name . '_u_ind[' . $i . '] = {'; + } +} + +/* Helper to output special-case URO (Unified Repertoire and Ordering) block (U+4E00-U+9FFF) tables */ +function out_uro_tabs(&$out, $name, $tab_uro_u, $tab_uro_mb_ind) { + $cnt = count($tab_uro_u); + $out[] = ''; + $out[] = '/* Unicode usage bit-flags for URO (U+4E00-U+9FFF) block */'; + $out[] = 'static const unsigned short ' . $name . '_uro_u[' . $cnt . '] = {'; + out_tab_entries($out, $tab_uro_u, $cnt); + $cnt = count($tab_uro_mb_ind); + $out[] = '};'; + $out[] = ''; + $out[] = '/* Multibyte indexes for URO (U+4E00-U+9FFF) block */'; + $out[] = 'static const unsigned short ' . $name . '_uro_mb_ind[' . $cnt . '] = {'; + out_tab_entries($out, $tab_uro_mb_ind, $cnt, true /*not_hex*/); + $out[] = '};'; +} + +// BIG5 + +$out = array(); + +out_header($out, 'big5', 'Big5', 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT', 2021); + +$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT'; + +// Read the file. + +if (($get = file_get_contents($file)) === false) { + error_log($error = "$basename: ERROR: Could not read mapping file \"$file\""); + exit($error . PHP_EOL); +} + +$lines = explode("\n", $get); + +// Parse the file. + +$sort = array(); +$mb = array(); +foreach ($lines as $line) { + $line = trim($line); + if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) { + continue; + } + $matches = array(); + if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) { + $d = hexdec($matches[1]); + $u = hexdec($matches[2]); + $sort[] = $u; + $mb[] = $d; + } +} + +array_multisort($sort, $mb); + +// Calculate URO (U+4E00-U+9FFF) table +for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++); + +$start_u_i = $u_i; +$big5_uro_u = $big5_uro_mb_ind = array(); +$sort_search = array_flip($sort); +for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) { + $used = 0; + $next_u_i = $u_i; + for ($j = 0; $j < 16; $j++) { + if (isset($sort_search[$u + $j])) { + $i = $sort_search[$u + $j]; + $used |= 1 << $j; + $next_u_i = $i + 1; + $end_u_i = $i; + } + } + $big5_uro_u[] = $used; + $big5_uro_mb_ind[] = $u_i; + $u_i = $next_u_i; +} + +// Output URO tables +out_uro_tabs($out, 'big5', $big5_uro_u, $big5_uro_mb_ind); + +// Remove URO block from Unicode table +array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1); + +// Output Big5 tables + +out_tabs($out, 'big5', $sort, $mb, true /*no_ind*/); + +$out[] = ''; +$out[] = '#endif /* Z_BIG5_H */'; + +file_put_contents($out_dirname . '/big5.h', implode("\n", $out) . "\n"); + +// EUC-KR (KS X 1001) + +$out = array(); + +out_header($out, 'ksx1001', 'EUC-KR (KS X 1001)', + 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT', 2021); + +$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT'; + +// Read the file. + +if (($get = file_get_contents($file)) === false) { + error_log($error = "$basename: ERROR: Could not read mapping file \"$file\""); + exit($error . PHP_EOL); +} + +$lines = explode("\n", $get); + +// Parse the file. + +$sort = array(); +$mb = array(); +foreach ($lines as $line) { + $line = trim($line); + if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) { + continue; + } + $matches = array(); + if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) { + $d = hexdec($matches[1]) + 0x8080; // Convert to EUC-KR + $u = hexdec($matches[2]); + $sort[] = $u; + $mb[] = $d; + } +} + +// Add some characters defined later than in KSX1001.TXT + +$sort[] = 0x20AC; // Euro sign added KS X 1001:1998 +$mb[] = 0x2266 + 0x8080; + +$sort[] = 0xAE; // Registered trademark added KS X 1001:1998 +$mb[] = 0x2267 + 0x8080; + +$sort[] = 0x327E; // Korean postal code symbol added KS X 1001:2002 +$mb[]= 0x2268 + 0x8080; + +array_multisort($sort, $mb); + +// Calculate URO (U+4E00-U+9FFF) table +for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++); + +$start_u_i = $u_i; +$ksx1001_uro_u = $ksx1001_uro_mb_ind = array(); +$sort_search = array_flip($sort); +for ($u = 0x4E00; $u <= 0x9F9F; $u += 16) { + $used = 0; + $next_u_i = $u_i; + for ($j = 0; $j < 16; $j++) { + if (isset($sort_search[$u + $j])) { + $i = $sort_search[$u + $j]; + $used |= 1 << $j; + $next_u_i = $i + 1; + $end_u_i = $i; + } + } + $ksx1001_uro_u[] = $used; + $ksx1001_uro_mb_ind[] = $u_i; + $u_i = $next_u_i; +} + +// Output URO tables +out_uro_tabs($out, 'ksx1001', $ksx1001_uro_u, $ksx1001_uro_mb_ind); + +// Remove URO block from Unicode table +array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1); + +// Output KS X 1001 tables +out_tabs($out, 'ksx1001', $sort, $mb); + +$out[] = ''; +$out[] = '#endif /* Z_KSX1001_H */'; + +file_put_contents($out_dirname . '/ksx1001.h', implode("\n", $out) . "\n"); + +// Shift JIS + +$out = array(); + +out_header($out, 'sjis', 'Shift JIS', 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT', 2009); + +$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT'; + +// Read the file. + +if (($get = file_get_contents($file)) === false) { + error_log($error = "$basename: ERROR: Could not read mapping file \"$file\""); + exit($error . PHP_EOL); +} + +$lines = explode("\n", $get); + +// Parse the file. + +$sort = array(); +$mb = array(); +foreach ($lines as $line) { + $line = trim($line); + if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) { + continue; + } + $matches = array(); + if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) { + $d = hexdec($matches[1]); + if ($d < 0x80 && $d != 0x5C && $d != 0x7E) { + continue; + } + $u = hexdec($matches[2]); + // PUA characters (user-defined range), dealt with programatically by `u_sjis()` + // See CJKV Information Processing by Ken Lunde, 2nd ed., Table 4-86, p.286 + // https://file.allitebooks.com/20160708/CJKV%20Information%20Processing.pdf + if ($u >= 0xE000 && $u <= 0xE757) { + continue; + } + $sort[] = $u; + $mb[] = $d; + } +} + +array_multisort($sort, $mb); + +// Calculate URO (U+4E00-U+9FFF) table +for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++); + +$start_u_i = $u_i; +$sjis_uro_u = $sjis_uro_mb_ind = array(); +$sort_search = array_flip($sort); +for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) { + $used = 0; + $next_u_i = $u_i; + for ($j = 0; $j < 16; $j++) { + if (isset($sort_search[$u + $j])) { + $i = $sort_search[$u + $j]; + $used |= 1 << $j; + $next_u_i = $i + 1; + $end_u_i = $i; + } + } + $sjis_uro_u[] = $used; + $sjis_uro_mb_ind[] = $u_i; + $u_i = $next_u_i; +} + +// Output URO tables +out_uro_tabs($out, 'sjis', $sjis_uro_u, $sjis_uro_mb_ind); + +// Remove URO block from Unicode table +array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1); + +// Output Shift JIS tables +out_tabs($out, 'sjis', $sort, $mb, true /*no_ind*/); + +$out[] = ''; +$out[] = '#endif /* Z_SJIS_H */'; + +file_put_contents($out_dirname . '/sjis.h', implode("\n", $out) . "\n"); + +// GB 2312 + +$out = array(); + +out_header($out, 'gb2312', 'GB 2312-1980 (EUC-CN)', + 'unicode.org-mappings/EASTASIA/GB/GB2312.TXT', 2009, + '(see https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2)'); + +$file = $data_dirname . '/' . 'GB2312.TXT'; + +// Read the file. + +if (($get = file_get_contents($file)) === false) { + error_log($error = "$basename: ERROR: Could not read mapping file \"$file\""); + exit($error . PHP_EOL); +} + +$lines = explode("\n", $get); + +// Parse the file. + +$sort = array(); +$mb = array(); +$in_gb2312 = array(); +foreach ($lines as $line) { + $line = trim($line); + if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) { + continue; + } + $matches = array(); + if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) { + $d = hexdec($matches[1]); + if ($d < 0x80) { + continue; + } + $u = hexdec($matches[2]); + $sort[] = $u; + $mb[] = $d + 0x8080; // Convert to EUC-CN + $in_gb2312[$u] = true; + } +} + +array_multisort($sort, $mb); + +// Calculate URO (U+4E00-U+9FFF) table +for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++); + +$start_u_i = $u_i; +$gb2312_uro_u = $gb2312_uro_mb_ind = array(); +$sort_search = array_flip($sort); +for ($u = 0x4E00; $u <= 0x9CEF; $u += 16) { + $used = 0; + $next_u_i = $u_i; + for ($j = 0; $j < 16; $j++) { + if (isset($sort_search[$u + $j])) { + $i = $sort_search[$u + $j]; + $used |= 1 << $j; + $next_u_i = $i + 1; + $end_u_i = $i; + } + } + $gb2312_uro_u[] = $used; + $gb2312_uro_mb_ind[] = $u_i; + $u_i = $next_u_i; +} + +// Output URO tables +out_uro_tabs($out, 'gb2312', $gb2312_uro_u, $gb2312_uro_mb_ind); + +// Remove URO block from Unicode table +array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1); + +// Output GB 2312 tables +out_tabs($out, 'gb2312', $sort, $mb); + +$out[] = ''; +$out[] = '#endif /* Z_GB2312_H */'; + +file_put_contents($out_dirname . '/gb2312.h', implode("\n", $out) . "\n"); + +// GBK + +$out = array(); + +out_header($out, 'gbk', 'GBK, excluding mappings in GB 2312', + 'https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'); + +// Note this has weird 0x80 mapping to U+20AC (EURO SIGN) which needs to be ignored +$file = 'https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'; + +// Read the file. + +if (($get = file_get_contents($file)) === false) { + error_log($error = "$basename: ERROR: Could not read mapping file \"$file\""); + exit($error . PHP_EOL); +} + +$lines = explode("\n", $get); + +// Parse the file. + +$sort = array(); +$mb = array(); +$in_gbk = array(); +foreach ($lines as $line) { + $line = trim($line); + if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) { + continue; + } + $matches = array(); + if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) { + $d = hexdec($matches[1]); + if ($d <= 0x80) { // Ignore weird 0x80 mapping to U+20AC (EURO SIGN) if any (present in Unicode Public mapping file) + continue; + } + $u = hexdec($matches[2]); + $in_gbk[$u] = true; + if ($u != 0x2015 && isset($in_gb2312[$u])) { // U+2015 mapped differently by GBK + continue; + } + $sort[] = $u; + $mb[] = $d; + } +} + +array_multisort($sort, $mb); + +// Calculate URO (U+4E00-U+9FFF) table +for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++); + +$start_u_i = $u_i; +$gbk_uro_u = $gbk_uro_mb_ind = array(); +$sort_search = array_flip($sort); +for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) { + $used = 0; + $next_u_i = $u_i; + for ($j = 0; $j < 16; $j++) { + if (isset($sort_search[$u + $j])) { + $i = $sort_search[$u + $j]; + $used |= 1 << $j; + $next_u_i = $i + 1; + $end_u_i = $i; + } + } + $gbk_uro_u[] = $used; + $gbk_uro_mb_ind[] = $u_i; + $u_i = $next_u_i; +} + +// Output URO tables +out_uro_tabs($out, 'gbk', $gbk_uro_u, $gbk_uro_mb_ind); + +// Remove URO block from Unicode table +array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1); + +// Output GBK tables +out_tabs($out, 'gbk', $sort, $mb, true /*no_ind*/); + +$out[] = ''; +$out[] = '#endif /* Z_GBK_H */'; + +file_put_contents($out_dirname . '/gbk.h', implode("\n", $out) . "\n"); + +// GB 18030 + +$out = array(); + +out_header($out, 'gb18030', 'GB 18030-2005', 'jdk-1.4.2/GB18030.TXT', 2016, + '(see https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2)'); + +$file = $data_dirname . '/' . 'GB18030.TXT'; + +// Read the file. + +if (($get = file_get_contents($file)) === false) { + error_log($error = "$basename: ERROR: Could not read mapping file \"$file\""); + exit($error . PHP_EOL); +} + +$lines = explode("\n", $get); + +// Parse the file. + +$sort2 = array(); +$mb2 = array(); +$sort4 = array(); +$mb4 = array(); + +foreach ($lines as $line) { + $line = trim($line); + if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) { + continue; + } + if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{5})/', $line)) { // Exclude U+10000..10FFFF to save space + continue; + } + $matches = array(); + if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{4}).*$/', $line, $matches)) { + $d = hexdec($matches[1]); + if ($d < 0x80) { + continue; + } + $u = hexdec($matches[2]); + // 2-byte extension GB 18030-2005 change, were PUA, see Table 3-37, p.109, Lunde 2nd ed. + if (($u >= 0x9FB4 && $u <= 0x9FBB) || ($u >= 0xFE10 && $u <= 0xFE19)) { + //continue; + } + // 4-byte extension change, PUA + if ($u == 0xE7C7) { + continue; + } + if ($d < 0x10000) { + if (isset($in_gbk[$u])) { + continue; + } + // User-defined, dealt with programatically by `u_gb18030()` + if ($u >= 0xE000 && $u <= 0xE765) { + continue; + } + $sort2[] = $u; + $mb2[] = $d; + } else if ($u < 0x10000) { + $sort4[] = $u; + $mb4[] = $d; + } + } +} + +/* 2-byte extension GB 18030-2005 change, was PUA U+E7C7 below, see Table 3-39, p.111, Lunde 2nd ed. */ +$sort2[] = 0x1E3F; $mb2[] = 0xA8BC; + +/* 2-byte extension GB 18030-2005 change, were PUA, see Table 3-37, p.109, Lunde 2nd ed. */ +$sort2[] = 0x9FB4; $mb2[] = 0xFE59; +$sort2[] = 0x9FB5; $mb2[] = 0xFE61; +$sort2[] = 0x9FB6; $mb2[] = 0xFE66; +$sort2[] = 0x9FB7; $mb2[] = 0xFE67; +$sort2[] = 0x9FB8; $mb2[] = 0xFE6D; +$sort2[] = 0x9FB9; $mb2[] = 0xFE7E; +$sort2[] = 0x9FBA; $mb2[] = 0xFE90; +$sort2[] = 0x9FBB; $mb2[] = 0xFEA0; + +$sort2[] = 0xFE10; $mb2[] = 0xA6D9; +$sort2[] = 0xFE11; $mb2[] = 0xA6DB; +$sort2[] = 0xFE12; $mb2[] = 0xA6DA; +$sort2[] = 0xFE13; $mb2[] = 0xA6DC; +$sort2[] = 0xFE14; $mb2[] = 0xA6DD; +$sort2[] = 0xFE15; $mb2[] = 0xA6DE; +$sort2[] = 0xFE16; $mb2[] = 0xA6DF; +$sort2[] = 0xFE17; $mb2[] = 0xA6EC; +$sort2[] = 0xFE18; $mb2[] = 0xA6ED; +$sort2[] = 0xFE19; $mb2[] = 0xA6F3; + +/* 4-byte extension PUA */ +// Dealt with by `u_gb18030()` +//$sort4[] = 0xE7C7; +//$mb4[] = 0x8135F437; + +// Calculate Unicode start/end codepoints mapping to consecutive 4-byte blocks + +array_multisort($sort4, $mb4); + +$gb18030_4_u_b = array(); +$gb18030_4_u_e = array(); +$gb18030_4_mb_o = array(); + +// Start/end points +$prev_u = $begin_u = $sort4[0]; +for ($i = 1, $cnt = count($sort4); $i < $cnt; $i++) { + $u = $sort4[$i]; + if ($u === $prev_u + 1) { + $prev_u++; + continue; + } + $gb18030_4_u_b[] = $begin_u; + $gb18030_4_u_e[] = $prev_u; + $begin_u = $prev_u = $u; +} +$gb18030_4_u_b[] = $begin_u; +$gb18030_4_u_e[] = $prev_u; + +// Gaps between blocks +$gb18030_4_mb_o[] = 0; +for ($i = 1, $cnt = count($gb18030_4_u_b); $i < $cnt; $i++) { + $gb18030_4_mb_o[] = $gb18030_4_u_b[$i] - ($gb18030_4_u_e[$i - 1] + 1) + $gb18030_4_mb_o[count($gb18030_4_mb_o) - 1]; +} + +// Output GB 18030 tables + +array_multisort($sort2, $mb2); +out_tabs($out, 'gb18030_2', $sort2, $mb2, true /*no_ind*/); + +// Start codepoints `gb18030_4_u_b` array not needed by `u_gb18030()` +$cnt = count($gb18030_4_u_e); +$out[] = ''; +$out[] = '/* End Unicode codepoints of blocks mapping consecutively to 4-byte multibyte blocks */'; +$out[] = 'static const unsigned short gb18030_4_u_e[' . $cnt .'] = {'; +out_tab_entries($out, $gb18030_4_u_e, $cnt); +$out[] = '};'; +$cnt = count($gb18030_4_mb_o); +$out[] = ''; +$out[] = '/* Cumulative gaps between Unicode blocks mapping consecutively to 4-byte multibyte blocks,'; +$out[] = ' used to adjust multibyte offsets */'; +$out[] = 'static const unsigned short gb18030_4_mb_o[' . $cnt .'] = {'; +out_tab_entries($out, $gb18030_4_mb_o, $cnt, true /*not_hex*/); +$out[] = '};'; + +$out[] = ''; +$out[] = '#endif /* Z_GB18030_H */'; + +file_put_contents($out_dirname . '/gb18030.h', implode("\n", $out) . "\n"); + +/* vim: set ts=4 sw=4 et : */
