Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/zint/backend/tools/gen_eci_mb_h.php @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 <?php | |
| 2 /* Generate ECI multibyte tables from unicode.org mapping files */ | |
| 3 /* | |
| 4 libzint - the open source barcode library | |
| 5 Copyright (C) 2022-2023 Robin Stuart <rstuart114@gmail.com> | |
| 6 */ | |
| 7 /* SPDX-License-Identifier: BSD-3-Clause */ | |
| 8 /* | |
| 9 * To create "backend/big5.h", "gb18030.h", "gb2312.h", "gbk.h", "ksx1001.h" and "sjis.h" (from project root directory): | |
| 10 * | |
| 11 * php backend/tools/gen_eci_mb_h.php | |
| 12 * | |
| 13 * NOTE: backend/tools/data/GB18030.TXT will have to be downloaded first from the tarball | |
| 14 * https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2 | |
| 15 * using the version jdk-1.4.2/GB18030.TXT | |
| 16 * | |
| 17 * NOTE: backend/tools/data/GB2312.TXT will have to be downloaded first from the tarball | |
| 18 * https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2 | |
| 19 * using the version unicode.org-mappings/EASTASIA/GB/GB2312.TXT | |
| 20 */ | |
| 21 // 'zend.assertions' should be set to 1 in php.ini | |
| 22 | |
// Licence text merged into the comment header of every generated file by
// `out_header()`; the text itself contains the `*/` that closes that comment.
$copyright_text = <<<'EOD'

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the project nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
/* SPDX-License-Identifier: BSD-3-Clause */

EOD;

// File name and directory of this script.
$basename = basename(__FILE__);
$dirname = __DIR__;

// Command line options: `-d <dir>` and `-o <dir>`.
$opts = getopt('d:o:');

// Where to load file from.
if (isset($opts['d'])) {
    $data_dirname = $opts['d'];
} else {
    $data_dirname = $dirname . '/data';
}

// Where to put output.
if (isset($opts['o'])) {
    $out_dirname = $opts['o'];
} else {
    $out_dirname = $dirname . '/..';
}

// Final (or only) year of the copyright line written by `out_header()`.
$year = 2022;
| 62 | |
/*
 * Output the comment header and opening include guard of "<name>.h" to `$out` array
 *
 * `$descr` and `$file` describe the encoding and the mapping file in the title comment,
 * `$extra_comment` (if any) is added as a further line of that comment, and a non-zero
 * `$start_year` that differs from the global `$year` turns the copyright year into a range.
 */
function out_header(&$out, $name, $descr, $file, $start_year = 0, $extra_comment = '') {
    global $copyright_text, $basename, $year;

    // Title comment, closed on the line that ends it
    $from = ' from "' . $file . '"';
    $out[] = '/* ' . $name . '.h - tables for Unicode to ' . $descr . ', generated by "backend/tools/' . $basename . '"';
    if ($extra_comment === '') {
        $out[] = $from . ' */';
    } else {
        $out[] = $from;
        $out[] = ' ' . $extra_comment . ' */';
    }

    // Licence comment, opened here and closed by the text in `$copyright_text`
    $years = ($start_year && $start_year != $year) ? $start_year . '-' . $year : '' . $year;
    $out[] = '/*';
    $out[] = ' libzint - the open source barcode library';
    $out[] = ' Copyright (C) ' . $years . ' Robin Stuart <rstuart114@gmail.com>';
    $out = array_merge($out, explode("\n", $copyright_text));

    // Include guard
    $guard = 'Z_' . strtoupper($name) . '_H';
    $out[] = '#ifndef ' . $guard;
    $out[] = '#define ' . $guard;
}
| 84 | |
/*
 * Output a block of table entries to `$out` array
 *
 * The first `$cnt` values of `$arr` are written 8 to a line, as 4-digit hex
 * (`0x%04X`) or, if `$not_hex` is set, as right-aligned decimal (`%5d`).
 * Nothing is output when `$cnt` is zero.
 */
function out_tab_entries(&$out, $arr, $cnt, $not_hex = false) {
    $fmt = $not_hex ? ' %5d,' : ' 0x%04X,';
    $row = '';
    for ($idx = 0; $idx < $cnt; $idx++) {
        $row .= sprintf($fmt, $arr[$idx]);
        // Flush after every eighth entry and after the last one
        if ($idx % 8 === 7 || $idx + 1 >= $cnt) {
            $out[] = ' ' . $row;
            $row = '';
        }
    }
}
| 103 | |
/*
 * Output tables to `$out` array
 *
 * Appends up to three C array definitions:
 *   `<name>_u[]`      the values of `$sort`, commented with `$u_comment`
 *   `<name>_mb[]`     the values of `$mb`, commented with `$mb_comment` (skipped if `$mb` is empty)
 *   `<name>_u_ind[]`  for each block of 0x100 codepoints counted from `$sort[0]`, the index into
 *                     `<name>_u[]` of the first entry in that block or a later one
 *                     (skipped if `$no_u_ind` is set)
 * Empty comment arguments are replaced by default texts.
 */
function out_tabs(&$out, $name, $sort, $mb, $no_u_ind = false, $u_comment = '', $mb_comment = '') {
    if ($u_comment == '') $u_comment = 'Unicode codepoints sorted';
    $cnt_sort = count($sort);
    $out[] = '';
    $out[] = '/* ' . $u_comment . ' */';
    $out[] = 'static const unsigned short ' . $name . '_u[' . $cnt_sort . '] = {';
    out_tab_entries($out, $sort, $cnt_sort);
    $out[] = '};';

    if (!empty($mb)) {
        if ($mb_comment == '') $mb_comment = 'Multibyte values sorted in Unicode order';
        $cnt = count($mb);
        $out[] = '';
        $out[] = '/* ' . $mb_comment . ' */';
        $out[] = 'static const unsigned short ' . $name . '_mb[' . $cnt . '] = {';
        out_tab_entries($out, $mb, $cnt);
        $out[] = '};';
    }
    if (!$no_u_ind) {
        // Build the entry lines first: the array size in the declaration is only known afterwards
        $ind_lines = array();
        $line = ' ';
        $i = 0; // Number of index entries so far, i.e. the next 0x100 block needing an entry
        foreach ($sort as $ind => $u) {
            $div = ($u - $sort[0]) >> 8;
            // Every block up to and including this codepoint's block that has no entry yet points here
            while ($div >= $i) {
                if ($i && $i % 8 === 0) {
                    $ind_lines[] = $line;
                    $line = ' ';
                }
                $line .= sprintf(' %5d,', $ind);
                $i++;
            }
        }
        if ($line !== ' ') {
            $ind_lines[] = $line;
        }

        $out[] = '';
        $out[] = '/* Indexes into Unicode `' . $name . '_u[]` array in blocks of 0x100 */';
        $out[] = 'static const unsigned short ' . $name . '_u_ind[' . $i . '] = {';
        foreach ($ind_lines as $ind_line) {
            $out[] = $ind_line;
        }
        $out[] = '};';
    }
}
| 151 | |
/*
 * Helper to output special-case URO (Unified Repertoire and Ordering) block (U+4E00-U+9FFF) tables
 *
 * Appends `<name>_uro_u[]` (the values of `$tab_uro_u` in hex) followed by
 * `<name>_uro_mb_ind[]` (the values of `$tab_uro_mb_ind` in decimal) to `$out` array.
 */
function out_uro_tabs(&$out, $name, $tab_uro_u, $tab_uro_mb_ind) {
    $decl = 'static const unsigned short ' . $name . '_uro_';
    $cnt_u = count($tab_uro_u);
    $cnt_mb_ind = count($tab_uro_mb_ind);

    $out[] = '';
    $out[] = '/* Unicode usage bit-flags for URO (U+4E00-U+9FFF) block */';
    $out[] = $decl . 'u[' . $cnt_u . '] = {';
    out_tab_entries($out, $tab_uro_u, $cnt_u);
    $out[] = '};';

    $out[] = '';
    $out[] = '/* Multibyte indexes for URO (U+4E00-U+9FFF) block */';
    $out[] = $decl . 'mb_ind[' . $cnt_mb_ind . '] = {';
    out_tab_entries($out, $tab_uro_mb_ind, $cnt_mb_ind, true /*not_hex*/);
    $out[] = '};';
}
| 167 | |
// BIG5

$out = array();

$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT';

out_header($out, 'big5', 'Big5', $file, 2021);

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    // `exit()` with a string argument exits with status 0, so print the message and fail explicitly
    echo $error . PHP_EOL;
    exit(1);
}

$lines = explode("\n", $get);

// Parse the file: "0x<Big5> 0x<Unicode> ..." per mapping line.

$sort = array();
$mb = array();
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of the first codepoint at or above U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Removes nothing below unless a URO codepoint is found
$big5_uro_u = $big5_uro_mb_ind = array();
$sort_search = array_flip($sort);
// Per group of 16 codepoints: a bit-flag for each codepoint present, and the index of the group's first entry
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $big5_uro_u[] = $used;
    $big5_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'big5', $big5_uro_u, $big5_uro_mb_ind);

// Remove URO block from Unicode table (the multibyte table `$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output Big5 tables

out_tabs($out, 'big5', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_BIG5_H */';

$out_file = $out_dirname . '/big5.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
| 241 | |
// EUC-KR (KS X 1001)

$out = array();

$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT';

out_header($out, 'ksx1001', 'EUC-KR (KS X 1001)', $file, 2021);

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    // `exit()` with a string argument exits with status 0, so print the message and fail explicitly
    echo $error . PHP_EOL;
    exit(1);
}

$lines = explode("\n", $get);

// Parse the file: "0x<KS X 1001> 0x<Unicode> ..." per mapping line.

$sort = array();
$mb = array();
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]) + 0x8080; // Convert to EUC-KR
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Add some characters defined later than in KSX1001.TXT

$sort[] = 0x20AC; // Euro sign added KS X 1001:1998
$mb[] = 0x2266 + 0x8080;

$sort[] = 0xAE; // Registered trademark added KS X 1001:1998
$mb[] = 0x2267 + 0x8080;

$sort[] = 0x327E; // Korean postal code symbol added KS X 1001:2002
$mb[] = 0x2268 + 0x8080;

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of the first codepoint at or above U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Removes nothing below unless a URO codepoint is found
$ksx1001_uro_u = $ksx1001_uro_mb_ind = array();
$sort_search = array_flip($sort);
// Per group of 16 codepoints: a bit-flag for each codepoint present, and the index of the group's first entry
for ($u = 0x4E00; $u <= 0x9F9F; $u += 16) {
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $ksx1001_uro_u[] = $used;
    $ksx1001_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'ksx1001', $ksx1001_uro_u, $ksx1001_uro_mb_ind);

// Remove URO block from Unicode table (the multibyte table `$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output KS X 1001 tables
out_tabs($out, 'ksx1001', $sort, $mb);

$out[] = '';
$out[] = '#endif /* Z_KSX1001_H */';

$out_file = $out_dirname . '/ksx1001.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
| 326 | |
// Shift JIS

$out = array();

$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT';

out_header($out, 'sjis', 'Shift JIS', $file, 2009);

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    // `exit()` with a string argument exits with status 0, so print the message and fail explicitly
    echo $error . PHP_EOL;
    exit(1);
}

$lines = explode("\n", $get);

// Parse the file: "0x<Shift JIS> 0x<Unicode> ..." per mapping line.

$sort = array();
$mb = array();
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        // Of the values below 0x80 only 0x5C and 0x7E are kept
        if ($d < 0x80 && $d != 0x5C && $d != 0x7E) {
            continue;
        }
        $u = hexdec($matches[2]);
        // PUA characters (user-defined range), dealt with programmatically by `u_sjis()`
        // See CJKV Information Processing by Ken Lunde, 2nd ed., Table 4-86, p.286
        // https://file.allitebooks.com/20160708/CJKV%20Information%20Processing.pdf
        if ($u >= 0xE000 && $u <= 0xE757) {
            continue;
        }
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of the first codepoint at or above U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Removes nothing below unless a URO codepoint is found
$sjis_uro_u = $sjis_uro_mb_ind = array();
$sort_search = array_flip($sort);
// Per group of 16 codepoints: a bit-flag for each codepoint present, and the index of the group's first entry
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $sjis_uro_u[] = $used;
    $sjis_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'sjis', $sjis_uro_u, $sjis_uro_mb_ind);

// Remove URO block from Unicode table (the multibyte table `$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output Shift JIS tables
out_tabs($out, 'sjis', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_SJIS_H */';

$out_file = $out_dirname . '/sjis.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
| 408 | |
// GB 2312

$out = array();

out_header($out, 'gb2312', 'GB 2312-1980 (EUC-CN)',
    'unicode.org-mappings/EASTASIA/GB/GB2312.TXT', 2009,
    '(see https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2)');

$file = $data_dirname . '/' . 'GB2312.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    // `exit()` with a string argument exits with status 0, so print the message and fail explicitly
    echo $error . PHP_EOL;
    exit(1);
}

$lines = explode("\n", $get);

// Parse the file: "0x<GB 2312> 0x<Unicode> ..." per mapping line.

$sort = array();
$mb = array();
$in_gb2312 = array(); // Codepoints mapped by GB 2312, used by the GBK section to exclude them
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d < 0x80) {
            continue;
        }
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d + 0x8080; // Convert to EUC-CN
        $in_gb2312[$u] = true;
    }
}

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of the first codepoint at or above U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Removes nothing below unless a URO codepoint is found
$gb2312_uro_u = $gb2312_uro_mb_ind = array();
$sort_search = array_flip($sort);
// Per group of 16 codepoints: a bit-flag for each codepoint present, and the index of the group's first entry
for ($u = 0x4E00; $u <= 0x9CEF; $u += 16) {
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $gb2312_uro_u[] = $used;
    $gb2312_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'gb2312', $gb2312_uro_u, $gb2312_uro_mb_ind);

// Remove URO block from Unicode table (the multibyte table `$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output GB 2312 tables
out_tabs($out, 'gb2312', $sort, $mb);

$out[] = '';
$out[] = '#endif /* Z_GB2312_H */';

$out_file = $out_dirname . '/gb2312.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
| 488 | |
// GBK

$out = array();

// Note this has weird 0x80 mapping to U+20AC (EURO SIGN) which needs to be ignored
$file = 'https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT';

out_header($out, 'gbk', 'GBK, excluding mappings in GB 2312', $file);

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    // `exit()` with a string argument exits with status 0, so print the message and fail explicitly
    echo $error . PHP_EOL;
    exit(1);
}

$lines = explode("\n", $get);

// Parse the file: "0x<GBK> 0x<Unicode> ..." per mapping line.

$sort = array();
$mb = array();
$in_gbk = array(); // Codepoints mapped by GBK, used by the GB 18030 section to exclude them
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d <= 0x80) { // Ignore weird 0x80 mapping to U+20AC (EURO SIGN) if any (present in Unicode Public mapping file)
            continue;
        }
        $u = hexdec($matches[2]);
        $in_gbk[$u] = true;
        // Mappings already in GB 2312 (as recorded in `$in_gb2312`) are left out of the GBK tables
        if ($u != 0x2015 && isset($in_gb2312[$u])) { // U+2015 mapped differently by GBK
            continue;
        }
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of the first codepoint at or above U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Removes nothing below unless a URO codepoint is found
$gbk_uro_u = $gbk_uro_mb_ind = array();
$sort_search = array_flip($sort);
// Per group of 16 codepoints: a bit-flag for each codepoint present, and the index of the group's first entry
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $gbk_uro_u[] = $used;
    $gbk_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'gbk', $gbk_uro_u, $gbk_uro_mb_ind);

// Remove URO block from Unicode table (the multibyte table `$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output GBK tables
out_tabs($out, 'gbk', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_GBK_H */';

$out_file = $out_dirname . '/gbk.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
| 571 | |
// GB 18030

$out = array();

out_header($out, 'gb18030', 'GB 18030-2005', 'jdk-1.4.2/GB18030.TXT', 2016,
    '(see https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2)');

$file = $data_dirname . '/' . 'GB18030.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    // `exit()` with a string argument exits with status 0, so print the message and fail explicitly
    echo $error . PHP_EOL;
    exit(1);
}

$lines = explode("\n", $get);

// Parse the file, collecting 2-byte (`$sort2`/`$mb2`) and 4-byte (`$sort4`/`$mb4`) mappings separately.

$sort2 = array();
$mb2 = array();
$sort4 = array();
$mb4 = array();

foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{5})/', $line)) { // Exclude U+10000..10FFFF to save space
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{4}).*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d < 0x80) {
            continue;
        }
        $u = hexdec($matches[2]);
        // NOTE: U+9FB4..U+9FBB and U+FE10..U+FE19 (2-byte extension GB 18030-2005 change, were PUA,
        // see Table 3-37, p.109, Lunde 2nd ed.) are not skipped here; the skip was already disabled
        // 4-byte extension change, PUA
        if ($u == 0xE7C7) {
            continue;
        }
        if ($d < 0x10000) {
            if (isset($in_gbk[$u])) {
                continue;
            }
            // User-defined, dealt with programmatically by `u_gb18030()`
            if ($u >= 0xE000 && $u <= 0xE765) {
                continue;
            }
            $sort2[] = $u;
            $mb2[] = $d;
        } else if ($u < 0x10000) {
            $sort4[] = $u;
            $mb4[] = $d;
        }
    }
}

/* 2-byte extension GB 18030-2005 change, was PUA U+E7C7 below, see Table 3-39, p.111, Lunde 2nd ed. */
$sort2[] = 0x1E3F; $mb2[] = 0xA8BC;

/* 2-byte extension GB 18030-2005 change, were PUA, see Table 3-37, p.109, Lunde 2nd ed. */
$sort2[] = 0x9FB4; $mb2[] = 0xFE59;
$sort2[] = 0x9FB5; $mb2[] = 0xFE61;
$sort2[] = 0x9FB6; $mb2[] = 0xFE66;
$sort2[] = 0x9FB7; $mb2[] = 0xFE67;
$sort2[] = 0x9FB8; $mb2[] = 0xFE6D;
$sort2[] = 0x9FB9; $mb2[] = 0xFE7E;
$sort2[] = 0x9FBA; $mb2[] = 0xFE90;
$sort2[] = 0x9FBB; $mb2[] = 0xFEA0;

$sort2[] = 0xFE10; $mb2[] = 0xA6D9;
$sort2[] = 0xFE11; $mb2[] = 0xA6DB;
$sort2[] = 0xFE12; $mb2[] = 0xA6DA;
$sort2[] = 0xFE13; $mb2[] = 0xA6DC;
$sort2[] = 0xFE14; $mb2[] = 0xA6DD;
$sort2[] = 0xFE15; $mb2[] = 0xA6DE;
$sort2[] = 0xFE16; $mb2[] = 0xA6DF;
$sort2[] = 0xFE17; $mb2[] = 0xA6EC;
$sort2[] = 0xFE18; $mb2[] = 0xA6ED;
$sort2[] = 0xFE19; $mb2[] = 0xA6F3;

/* 4-byte extension PUA */
// Dealt with by `u_gb18030()`
//$sort4[] = 0xE7C7;
//$mb4[] = 0x8135F437;

// The block calculation below starts from the first 4-byte mapping, so there must be at least one
if (empty($sort4)) {
    error_log($error = "$basename: ERROR: No 4-byte mappings found in mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1);
}

// Calculate Unicode start/end codepoints mapping to consecutive 4-byte blocks

array_multisort($sort4, $mb4);

$gb18030_4_u_b = array();
$gb18030_4_u_e = array();
$gb18030_4_mb_o = array();

// Start/end points: a block is a run of codepoints each one greater than the previous
$prev_u = $begin_u = $sort4[0];
for ($i = 1, $cnt = count($sort4); $i < $cnt; $i++) {
    $u = $sort4[$i];
    if ($u === $prev_u + 1) {
        $prev_u++;
        continue;
    }
    $gb18030_4_u_b[] = $begin_u;
    $gb18030_4_u_e[] = $prev_u;
    $begin_u = $prev_u = $u;
}
$gb18030_4_u_b[] = $begin_u;
$gb18030_4_u_e[] = $prev_u;

// Gaps between blocks: each entry adds the gap before its block to the previous entry's total
$gb18030_4_mb_o[] = 0;
for ($i = 1, $cnt = count($gb18030_4_u_b); $i < $cnt; $i++) {
    $gb18030_4_mb_o[] = $gb18030_4_u_b[$i] - ($gb18030_4_u_e[$i - 1] + 1) + $gb18030_4_mb_o[count($gb18030_4_mb_o) - 1];
}

// Output GB 18030 tables

array_multisort($sort2, $mb2);
out_tabs($out, 'gb18030_2', $sort2, $mb2, true /*no_ind*/);

// Start codepoints `gb18030_4_u_b` array not needed by `u_gb18030()`
$cnt = count($gb18030_4_u_e);
$out[] = '';
$out[] = '/* End Unicode codepoints of blocks mapping consecutively to 4-byte multibyte blocks */';
$out[] = 'static const unsigned short gb18030_4_u_e[' . $cnt .'] = {';
out_tab_entries($out, $gb18030_4_u_e, $cnt);
$out[] = '};';
$cnt = count($gb18030_4_mb_o);
$out[] = '';
$out[] = '/* Cumulative gaps between Unicode blocks mapping consecutively to 4-byte multibyte blocks,';
$out[] = ' used to adjust multibyte offsets */';
$out[] = 'static const unsigned short gb18030_4_mb_o[' . $cnt .'] = {';
out_tab_entries($out, $gb18030_4_mb_o, $cnt, true /*not_hex*/);
$out[] = '};';

$out[] = '';
$out[] = '#endif /* Z_GB18030_H */';

$out_file = $out_dirname . '/gb18030.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
| 719 | |
| 720 /* vim: set ts=4 sw=4 et : */ |
