comparison mupdf-source/thirdparty/zint/backend/tools/gen_eci_mb_h.php @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 <?php
2 /* Generate ECI multibyte tables from unicode.org mapping files */
3 /*
4 libzint - the open source barcode library
5 Copyright (C) 2022-2023 Robin Stuart <rstuart114@gmail.com>
6 */
7 /* SPDX-License-Identifier: BSD-3-Clause */
8 /*
9 * To create "backend/{big5,gb18030,gb2312,gbk,ksx1001,sjis}.h" (from project root directory):
10 *
11 * php backend/tools/gen_eci_mb_h.php
12 *
13 * NOTE: backend/tools/data/GB18030.TXT will have to be downloaded first from the tarball
14 * https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2
15 * using the version jdk-1.4.2/GB18030.TXT
16 *
17 * NOTE: backend/tools/data/GB2312.TXT will have to be downloaded first from the tarball
18 * https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2
19 * using the version unicode.org-mappings/EASTASIA/GB/GB2312.TXT
20 */
21 // 'zend.assertions' should be set to 1 in php.ini
22
23 $copyright_text = <<<'EOD'
24
25 Redistribution and use in source and binary forms, with or without
26 modification, are permitted provided that the following conditions
27 are met:
28
29 1. Redistributions of source code must retain the above copyright
30 notice, this list of conditions and the following disclaimer.
31 2. Redistributions in binary form must reproduce the above copyright
32 notice, this list of conditions and the following disclaimer in the
33 documentation and/or other materials provided with the distribution.
34 3. Neither the name of the project nor the names of its contributors
35 may be used to endorse or promote products derived from this software
36 without specific prior written permission.
37
38 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
39 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
42 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 SUCH DAMAGE.
49 */
50 /* SPDX-License-Identifier: BSD-3-Clause */
51
52 EOD;
53
// Script name (used in messages and generated headers) and the directory it lives in.
$basename = basename(__FILE__);
$dirname = __DIR__;

// Command line options:
//   -d <dir>   where to load local mapping files from (default: "data" beside this script)
//   -o <dir>   where to put the generated headers (default: parent directory of this script)
$opts = getopt('d:o:');
$data_dirname = $opts['d'] ?? ($dirname . '/data');
$out_dirname = $opts['o'] ?? ($dirname . '/..');

// Final year used in the generated copyright lines.
$year = 2022;
62
/**
 * Append the leading comment, copyright notice and include guard of a generated header to `$out`.
 *
 * Reads the globals `$copyright_text` (licence text, split on "\n"), `$basename` (generator script
 * name) and `$year` (final copyright year).
 *
 * @param array  $out           Output lines, appended to in place
 * @param string $name          Table name; names the header ("<name>.h") and the guard ("Z_<NAME>_H")
 * @param string $descr         Human-readable description of the target character set
 * @param string $file          Mapping file the tables were generated from
 * @param int    $start_year    First copyright year; a "start-end" range is written only when this
 *                              is non-zero and differs from `$year`
 * @param string $extra_comment Optional extra line for the leading comment
 */
function out_header(&$out, $name, $descr, $file, $start_year = 0, $extra_comment = '') {
    global $copyright_text, $basename, $year;

    $guard = 'Z_' . strtoupper($name) . '_H';

    // Leading comment: what this header is and where it came from
    $out[] = '/* ' . $name . '.h - tables for Unicode to ' . $descr . ', generated by "backend/tools/' . $basename . '"';
    if ($extra_comment === '') {
        $out[] = ' from "' . $file . '" */';
    } else {
        $out[] = ' from "' . $file . '"';
        $out[] = ' ' . $extra_comment . ' */';
    }

    // Copyright notice, with a year range only when it spans more than one year
    $out[] = '/*';
    $out[] = ' libzint - the open source barcode library';
    $years = ($start_year && $start_year != $year) ? $start_year . '-' . $year : $year;
    $out[] = ' Copyright (C) ' . $years . ' Robin Stuart <rstuart114@gmail.com>';
    $out = array_merge($out, explode("\n", $copyright_text));

    // Include guard
    $out[] = '#ifndef ' . $guard;
    $out[] = '#define ' . $guard;
}
84
/**
 * Output a block of table entries to `$out` array, 8 entries per line.
 *
 * @param array $out     Output lines, appended to in place
 * @param array $arr     Values to output, read at indexes 0 .. `$cnt` - 1
 * @param int   $cnt     Number of entries to output
 * @param bool  $not_hex If set, entries are written as width-5 decimals instead of 4-digit hex
 */
function out_tab_entries(&$out, $arr, $cnt, $not_hex = false) {
    $indent = ' ';
    $fmt = $not_hex ? ' %5d,' : ' 0x%04X,';

    $row = $indent;
    $idx = 0;
    while ($idx < $cnt) {
        $row .= sprintf($fmt, $arr[$idx]);
        $idx++;
        // Row is full and more entries follow: flush it and start a new one
        if ($idx < $cnt && $idx % 8 === 0) {
            $out[] = $row;
            $row = $indent;
        }
    }
    // Flush the last (possibly partial) row
    if ($row !== $indent) {
        $out[] = $row;
    }
}
103
/**
 * Output tables to `$out` array: the Unicode table `<name>_u[]`, the multibyte table `<name>_mb[]`
 * (only if `$mb` is non-empty) and, unless suppressed, the block index table `<name>_u_ind[]`.
 *
 * @param array  $out        Output lines, appended to in place
 * @param string $name       Prefix of the generated C array names
 * @param array  $sort       Unicode codepoints; read as a 0-based list (`$sort[0]` is the base of
 *                           the index table)
 * @param array  $mb         Multibyte values
 * @param bool   $no_u_ind   If set, the `<name>_u_ind[]` index table is not output
 * @param string $u_comment  Comment for the Unicode table (a default is used if empty)
 * @param string $mb_comment Comment for the multibyte table (a default is used if empty)
 */
function out_tabs(&$out, $name, $sort, $mb, $no_u_ind = false, $u_comment = '', $mb_comment = '') {
    if ($u_comment == '') $u_comment = 'Unicode codepoints sorted';
    $cnt_sort = count($sort);
    $out[] = '';
    $out[] = '/* ' . $u_comment . ' */';
    $out[] = 'static const unsigned short ' . $name . '_u[' . $cnt_sort . '] = {';
    out_tab_entries($out, $sort, $cnt_sort);
    $out[] = '};';

    if (!empty($mb)) {
        if ($mb_comment == '') $mb_comment = 'Multibyte values sorted in Unicode order';
        $cnt = count($mb);
        $out[] = '';
        $out[] = '/* ' . $mb_comment . ' */';
        $out[] = 'static const unsigned short ' . $name . '_mb[' . $cnt . '] = {';
        out_tab_entries($out, $mb, $cnt);
        $out[] = '};';
    }

    if (!$no_u_ind) {
        // One entry per 0x100-sized block counted from the first codepoint: the index into `$sort`
        // of the first codepoint lying in that block or a later one. Collected first so that the
        // array size is known before the declaration is written.
        $inds = array();
        foreach ($sort as $ind => $u) {
            $div = ($u - $sort[0]) >> 8;
            while ($div >= count($inds)) {
                $inds[] = $ind;
            }
        }
        $ind_cnt = count($inds);
        $out[] = '';
        $out[] = '/* Indexes into Unicode `' . $name . '_u[]` array in blocks of 0x100 */';
        $out[] = 'static const unsigned short ' . $name . '_u_ind[' . $ind_cnt . '] = {';
        out_tab_entries($out, $inds, $ind_cnt, true /*not_hex*/);
        $out[] = '};';
    }
}
151
/**
 * Helper to output special-case URO (Unified Repertoire and Ordering) block (U+4E00-U+9FFF) tables:
 * `<name>_uro_u[]` (written as hex) followed by `<name>_uro_mb_ind[]` (written as decimal).
 *
 * @param array  $out            Output lines, appended to in place
 * @param string $name           Prefix of the generated C array names
 * @param array  $tab_uro_u      Usage bit-flags table
 * @param array  $tab_uro_mb_ind Multibyte indexes table
 */
function out_uro_tabs(&$out, $name, $tab_uro_u, $tab_uro_mb_ind) {
    $decl = 'static const unsigned short ' . $name;
    $flags_cnt = count($tab_uro_u);
    $inds_cnt = count($tab_uro_mb_ind);

    // Usage flags
    $out[] = '';
    $out[] = '/* Unicode usage bit-flags for URO (U+4E00-U+9FFF) block */';
    $out[] = $decl . '_uro_u[' . $flags_cnt . '] = {';
    out_tab_entries($out, $tab_uro_u, $flags_cnt);
    $out[] = '};';

    // Multibyte indexes
    $out[] = '';
    $out[] = '/* Multibyte indexes for URO (U+4E00-U+9FFF) block */';
    $out[] = $decl . '_uro_mb_ind[' . $inds_cnt . '] = {';
    out_tab_entries($out, $tab_uro_mb_ind, $inds_cnt, true /*not_hex*/);
    $out[] = '};';
}
167
// BIG5

$out = array();

$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT';

out_header($out, 'big5', 'Big5', $file, 2021);

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status (`exit()` with a string argument terminates with status 0)
}

// Parse the file into parallel arrays: Unicode codepoints (`$sort`) and Big5 values (`$mb`).

$sort = array();
$mb = array();
foreach (explode("\n", $get) as $line) {
    $line = trim($line);
    // Only mapped "0x..." data lines are of interest (this also skips blank lines)
    if (strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $sort[] = hexdec($matches[2]); // Unicode codepoint
        $mb[] = hexdec($matches[1]); // Big5 value
    }
}

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of first entry at or beyond U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing found yet, so that the removal below is a no-op if the block is empty
$big5_uro_u = $big5_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    // For each run of 16 codepoints: bit-flags of those present, and the index of the run's first entry
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $big5_uro_u[] = $used;
    $big5_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'big5', $big5_uro_u, $big5_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output Big5 tables

out_tabs($out, 'big5', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_BIG5_H */';

$out_file = $out_dirname . '/big5.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
241
// EUC-KR (KS X 1001)

$out = array();

$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT';

out_header($out, 'ksx1001', 'EUC-KR (KS X 1001)', $file, 2021);

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status (`exit()` with a string argument terminates with status 0)
}

// Parse the file into parallel arrays: Unicode codepoints (`$sort`) and EUC-KR values (`$mb`).

$sort = array();
$mb = array();
foreach (explode("\n", $get) as $line) {
    $line = trim($line);
    // Only mapped "0x..." data lines are of interest (this also skips blank lines)
    if (strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $sort[] = hexdec($matches[2]); // Unicode codepoint
        $mb[] = hexdec($matches[1]) + 0x8080; // Convert to EUC-KR
    }
}

// Add some characters defined later than in KSX1001.TXT

$sort[] = 0x20AC; // Euro sign added KS X 1001:1998
$mb[] = 0x2266 + 0x8080;

$sort[] = 0xAE; // Registered trademark added KS X 1001:1998
$mb[] = 0x2267 + 0x8080;

$sort[] = 0x327E; // Korean postal code symbol added KS X 1001:2002
$mb[] = 0x2268 + 0x8080;

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of first entry at or beyond U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing found yet, so that the removal below is a no-op if the block is empty
$ksx1001_uro_u = $ksx1001_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`
for ($u = 0x4E00; $u <= 0x9F9F; $u += 16) {
    // For each run of 16 codepoints: bit-flags of those present, and the index of the run's first entry
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $ksx1001_uro_u[] = $used;
    $ksx1001_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'ksx1001', $ksx1001_uro_u, $ksx1001_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output KS X 1001 tables
out_tabs($out, 'ksx1001', $sort, $mb);

$out[] = '';
$out[] = '#endif /* Z_KSX1001_H */';

$out_file = $out_dirname . '/ksx1001.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
326
// Shift JIS

$out = array();

$file = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT';

out_header($out, 'sjis', 'Shift JIS', $file, 2009);

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status (`exit()` with a string argument terminates with status 0)
}

// Parse the file into parallel arrays: Unicode codepoints (`$sort`) and Shift JIS values (`$mb`).

$sort = array();
$mb = array();
foreach (explode("\n", $get) as $line) {
    $line = trim($line);
    // Only mapped "0x..." data lines are of interest (this also skips blank lines)
    if (strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (!preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        continue;
    }
    $d = hexdec($matches[1]);
    // Values below 0x80 are skipped, except for 0x5C and 0x7E which are kept
    if ($d < 0x80 && $d != 0x5C && $d != 0x7E) {
        continue;
    }
    $u = hexdec($matches[2]);
    // PUA characters (user-defined range), dealt with programmatically by `u_sjis()`
    // See CJKV Information Processing by Ken Lunde, 2nd ed., Table 4-86, p.286
    // https://file.allitebooks.com/20160708/CJKV%20Information%20Processing.pdf
    if ($u >= 0xE000 && $u <= 0xE757) {
        continue;
    }
    $sort[] = $u;
    $mb[] = $d;
}

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of first entry at or beyond U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing found yet, so that the removal below is a no-op if the block is empty
$sjis_uro_u = $sjis_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    // For each run of 16 codepoints: bit-flags of those present, and the index of the run's first entry
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $sjis_uro_u[] = $used;
    $sjis_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'sjis', $sjis_uro_u, $sjis_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output Shift JIS tables
out_tabs($out, 'sjis', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_SJIS_H */';

$out_file = $out_dirname . '/sjis.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
408
// GB 2312

$out = array();

out_header($out, 'gb2312', 'GB 2312-1980 (EUC-CN)',
            'unicode.org-mappings/EASTASIA/GB/GB2312.TXT', 2009,
            '(see https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2)');

$file = $data_dirname . '/' . 'GB2312.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status (`exit()` with a string argument terminates with status 0)
}

// Parse the file into parallel arrays: Unicode codepoints (`$sort`) and EUC-CN values (`$mb`).
// `$in_gb2312` records every codepoint seen, for use by the GBK section below.

$sort = array();
$mb = array();
$in_gb2312 = array();
foreach (explode("\n", $get) as $line) {
    $line = trim($line);
    // Only mapped "0x..." data lines are of interest (this also skips blank lines)
    if (strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (!preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        continue;
    }
    $d = hexdec($matches[1]);
    if ($d < 0x80) {
        continue;
    }
    $u = hexdec($matches[2]);
    $sort[] = $u;
    $mb[] = $d + 0x8080; // Convert to EUC-CN
    $in_gb2312[$u] = true;
}

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of first entry at or beyond U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing found yet, so that the removal below is a no-op if the block is empty
$gb2312_uro_u = $gb2312_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`
for ($u = 0x4E00; $u <= 0x9CEF; $u += 16) {
    // For each run of 16 codepoints: bit-flags of those present, and the index of the run's first entry
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $gb2312_uro_u[] = $used;
    $gb2312_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'gb2312', $gb2312_uro_u, $gb2312_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output GB 2312 tables
out_tabs($out, 'gb2312', $sort, $mb);

$out[] = '';
$out[] = '#endif /* Z_GB2312_H */';

$out_file = $out_dirname . '/gb2312.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
488
// GBK

$out = array();

// Note this has weird 0x80 mapping to U+20AC (EURO SIGN) which needs to be ignored
$file = 'https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT';

out_header($out, 'gbk', 'GBK, excluding mappings in GB 2312', $file);

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status (`exit()` with a string argument terminates with status 0)
}

// Parse the file into parallel arrays: Unicode codepoints (`$sort`) and GBK values (`$mb`).
// `$in_gbk` records every codepoint seen (including those left out of the tables because they are
// already in GB 2312), for use by the GB 18030 section below.

$sort = array();
$mb = array();
$in_gbk = array();
foreach (explode("\n", $get) as $line) {
    $line = trim($line);
    // Only mapped "0x..." data lines are of interest (this also skips blank lines)
    if (strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (!preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        continue;
    }
    $d = hexdec($matches[1]);
    if ($d <= 0x80) { // Ignore weird 0x80 mapping to U+20AC (EURO SIGN) if any (present in Unicode Public mapping file)
        continue;
    }
    $u = hexdec($matches[2]);
    $in_gbk[$u] = true;
    if ($u != 0x2015 && isset($in_gb2312[$u])) { // U+2015 mapped differently by GBK
        continue;
    }
    $sort[] = $u;
    $mb[] = $d;
}

// Sort by Unicode codepoint, keeping the multibyte values in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Index of first entry at or beyond U+4E00
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing found yet, so that the removal below is a no-op if the block is empty
$gbk_uro_u = $gbk_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    // For each run of 16 codepoints: bit-flags of those present, and the index of the run's first entry
    $used = 0;
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $gbk_uro_u[] = $used;
    $gbk_uro_mb_ind[] = $u_i;
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'gbk', $gbk_uro_u, $gbk_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left complete)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output GBK tables
out_tabs($out, 'gbk', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_GBK_H */';

$out_file = $out_dirname . '/gbk.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
571
// GB 18030

$out = array();

out_header($out, 'gb18030', 'GB 18030-2005', 'jdk-1.4.2/GB18030.TXT', 2016,
            '(see https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2)');

$file = $data_dirname . '/' . 'GB18030.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status (`exit()` with a string argument terminates with status 0)
}

// Parse the file, splitting the mappings by multibyte value into 2-byte (`$sort2`/`$mb2`)
// and 4-byte (`$sort4`/`$mb4`) sets.

$sort2 = array();
$mb2 = array();
$sort4 = array();
$mb4 = array();

foreach (explode("\n", $get) as $line) {
    $line = trim($line);
    // Only mapped "0x..." data lines are of interest (this also skips blank lines)
    if (strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{5})/', $line)) { // Exclude U+10000..10FFFF to save space
        continue;
    }
    $matches = array();
    if (!preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{4}).*$/', $line, $matches)) {
        continue;
    }
    $d = hexdec($matches[1]);
    if ($d < 0x80) {
        continue;
    }
    $u = hexdec($matches[2]);
    // NOTE: file entries for U+9FB4..U+9FBB and U+FE10..U+FE19 (2-byte extension GB 18030-2005 change,
    // were PUA, see Table 3-37, p.109, Lunde 2nd ed.) are not skipped here, although their 2-byte
    // mappings are also appended explicitly below
    // 4-byte extension change, PUA
    if ($u == 0xE7C7) {
        continue;
    }
    if ($d < 0x10000) {
        if (isset($in_gbk[$u])) {
            continue;
        }
        // User-defined, dealt with programmatically by `u_gb18030()`
        if ($u >= 0xE000 && $u <= 0xE765) {
            continue;
        }
        $sort2[] = $u;
        $mb2[] = $d;
    } else if ($u < 0x10000) {
        $sort4[] = $u;
        $mb4[] = $d;
    }
}

/* 2-byte extension GB 18030-2005 change, was PUA U+E7C7 below, see Table 3-39, p.111, Lunde 2nd ed. */
$sort2[] = 0x1E3F; $mb2[] = 0xA8BC;

/* 2-byte extension GB 18030-2005 change, were PUA, see Table 3-37, p.109, Lunde 2nd ed. */
$sort2[] = 0x9FB4; $mb2[] = 0xFE59;
$sort2[] = 0x9FB5; $mb2[] = 0xFE61;
$sort2[] = 0x9FB6; $mb2[] = 0xFE66;
$sort2[] = 0x9FB7; $mb2[] = 0xFE67;
$sort2[] = 0x9FB8; $mb2[] = 0xFE6D;
$sort2[] = 0x9FB9; $mb2[] = 0xFE7E;
$sort2[] = 0x9FBA; $mb2[] = 0xFE90;
$sort2[] = 0x9FBB; $mb2[] = 0xFEA0;

$sort2[] = 0xFE10; $mb2[] = 0xA6D9;
$sort2[] = 0xFE11; $mb2[] = 0xA6DB;
$sort2[] = 0xFE12; $mb2[] = 0xA6DA;
$sort2[] = 0xFE13; $mb2[] = 0xA6DC;
$sort2[] = 0xFE14; $mb2[] = 0xA6DD;
$sort2[] = 0xFE15; $mb2[] = 0xA6DE;
$sort2[] = 0xFE16; $mb2[] = 0xA6DF;
$sort2[] = 0xFE17; $mb2[] = 0xA6EC;
$sort2[] = 0xFE18; $mb2[] = 0xA6ED;
$sort2[] = 0xFE19; $mb2[] = 0xA6F3;

/* 4-byte extension PUA */
// Dealt with by `u_gb18030()`
//$sort4[] = 0xE7C7;
//$mb4[] = 0x8135F437;

// Calculate Unicode start/end codepoints mapping to consecutive 4-byte blocks

array_multisort($sort4, $mb4);

// The block calculation below needs at least one 4-byte mapping
if (empty($sort4)) {
    error_log($error = "$basename: ERROR: No 4-byte mappings found in mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1);
}

$gb18030_4_u_b = array();
$gb18030_4_u_e = array();
$gb18030_4_mb_o = array();

// Start/end points: a new block begins wherever the codepoints stop being consecutive
$prev_u = $begin_u = $sort4[0];
for ($i = 1, $cnt = count($sort4); $i < $cnt; $i++) {
    $u = $sort4[$i];
    if ($u === $prev_u + 1) {
        $prev_u++;
        continue;
    }
    $gb18030_4_u_b[] = $begin_u;
    $gb18030_4_u_e[] = $prev_u;
    $begin_u = $prev_u = $u;
}
$gb18030_4_u_b[] = $begin_u;
$gb18030_4_u_e[] = $prev_u;

// Gaps between blocks, accumulated (each entry adds its gap to the previous entry)
$gb18030_4_mb_o[] = 0;
for ($i = 1, $cnt = count($gb18030_4_u_b); $i < $cnt; $i++) {
    $gb18030_4_mb_o[] = $gb18030_4_u_b[$i] - ($gb18030_4_u_e[$i - 1] + 1) + $gb18030_4_mb_o[$i - 1];
}

// Output GB 18030 tables

array_multisort($sort2, $mb2);
out_tabs($out, 'gb18030_2', $sort2, $mb2, true /*no_ind*/);

// Start codepoints `gb18030_4_u_b` array not needed by `u_gb18030()`
$cnt = count($gb18030_4_u_e);
$out[] = '';
$out[] = '/* End Unicode codepoints of blocks mapping consecutively to 4-byte multibyte blocks */';
$out[] = 'static const unsigned short gb18030_4_u_e[' . $cnt .'] = {';
out_tab_entries($out, $gb18030_4_u_e, $cnt);
$out[] = '};';
$cnt = count($gb18030_4_mb_o);
$out[] = '';
$out[] = '/* Cumulative gaps between Unicode blocks mapping consecutively to 4-byte multibyte blocks,';
$out[] = ' used to adjust multibyte offsets */';
$out[] = 'static const unsigned short gb18030_4_mb_o[' . $cnt .'] = {';
out_tab_entries($out, $gb18030_4_mb_o, $cnt, true /*not_hex*/);
$out[] = '};';

$out[] = '';
$out[] = '#endif /* Z_GB18030_H */';

$out_file = $out_dirname . '/gb18030.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
719
720 /* vim: set ts=4 sw=4 et : */