Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/fitz/text-decoder.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/fitz/text-decoder.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,236 @@ +// Copyright (C) 2024 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" + +static int simple_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n) +{ + return n * 4 + 1; +} + +static int simple_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n) +{ + const unsigned short *table = dec->table1; + unsigned char *e = s + n; + int len = 1; + while (s < e) + len += fz_runelen(table[*s++]); + return len; +} + +static void simple_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n) +{ + const unsigned short *table = dec->table1; + unsigned char *e = s + n; + while (s < e) + p += fz_runetochar(p, table[*s++]); + *p = 0; +} + +static int utf16be_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n) +{ + return n * 2 + 1; +} + +static int utf16le_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n) +{ + return n * 2 + 1; +} + +static int utf16be_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n) +{ + unsigned char *e = s + n; + int len = 1; + while (s + 1 < e) { + len += fz_runelen(s[0] << 8 | s[1]); + s += 2; + } + return len; +} + +static int utf16le_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n) +{ + unsigned char *e = s + n; + int len = 1; + while (s + 1 < e) { + len += fz_runelen(s[0] | s[1] << 8); + s += 2; + } + return len; +} + +static void utf16be_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n) +{ + unsigned char *e = s + n; + while (s + 1 < e) { + p += fz_runetochar(p, s[0] << 8 | s[1]); + s += 2; + } + *p = 0; +} + +static void utf16le_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n) +{ + unsigned char *e = s + n; + while (s + 1 < e) { + p += fz_runetochar(p, s[0] | s[1] << 8); + s += 2; + } + *p = 0; +} + +static int cjk_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n) +{ + return n * 4 + 1; +} + +static int cjk_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n) +{ + unsigned char *e = s + n; + pdf_cmap *to_cid = dec->table1; + pdf_cmap *to_uni = dec->table2; + unsigned int raw; + int cid, uni; + int len = 1; + while (s < e) { + s += pdf_decode_cmap(to_cid, s, e, &raw); + cid = pdf_lookup_cmap(to_cid, raw); + uni = pdf_lookup_cmap(to_uni, cid); + if (uni < 0) { + // ASCII control characters are missing in the CMaps + if (raw < 32) + uni = raw; + else + uni = FZ_REPLACEMENT_CHARACTER; + } + len += fz_runelen(uni); + } + return len; +} + +static void cjk_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n) +{ + unsigned char *e = s + n; + pdf_cmap *to_cid = dec->table1; + pdf_cmap *to_uni = dec->table2; + unsigned int raw; + int cid, uni; + while (s < e) { + s += pdf_decode_cmap(to_cid, s, e, &raw); + cid = pdf_lookup_cmap(to_cid, raw); + uni = pdf_lookup_cmap(to_uni, cid); + if (uni < 0) { + // ASCII control characters are missing in the CMaps + if (raw < 32) + uni = raw; + else + uni = FZ_REPLACEMENT_CHARACTER; + } + p += fz_runetochar(p, uni); + } + *p = 0; +} + +static void fz_init_simple_text_decoder(fz_context *ctx, fz_text_decoder *dec, const unsigned short *table) +{ + dec->decode_bound = simple_text_decode_bound; + dec->decode_size = simple_text_decode_size; + dec->decode = simple_text_decode; + dec->table1 = (void*)table; +} + +static void fz_init_utf16be_text_decoder(fz_context *ctx, fz_text_decoder *dec) +{ + dec->decode_bound = utf16be_text_decode_bound; + dec->decode_size = utf16be_text_decode_size; + dec->decode = utf16be_text_decode; +} + +static void fz_init_utf16le_text_decoder(fz_context *ctx, fz_text_decoder *dec) +{ + dec->decode_bound = utf16le_text_decode_bound; + dec->decode_size = utf16le_text_decode_size; + dec->decode = utf16le_text_decode; +} + +static void fz_init_cjk_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *to_cid, const char *to_uni) +{ + dec->decode_bound = cjk_text_decode_bound; + dec->decode_size = cjk_text_decode_size; + dec->decode = cjk_text_decode; + dec->table1 = pdf_load_builtin_cmap(ctx, to_cid); + if (!dec->table1) + fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_cid); + dec->table2 = pdf_load_builtin_cmap(ctx, to_uni); + if (!dec->table2) + fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_uni); +} + +void fz_init_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *enc) +{ + // Recognize IANA character set identifiers (case insensitive). + // https://www.iana.org/assignments/character-sets/character-sets.xhtml + + if (!fz_strcasecmp(enc, "utf-16")) + fz_init_utf16le_text_decoder(ctx, dec); + else if (!fz_strcasecmp(enc, "utf-16be")) + fz_init_utf16be_text_decoder(ctx, dec); + else if (!fz_strcasecmp(enc, "utf-16le")) + fz_init_utf16le_text_decoder(ctx, dec); + + else if (!fz_strcasecmp(enc, "euc-jp")) + fz_init_cjk_text_decoder(ctx, dec, "EUC-H", "Adobe-Japan1-UCS2"); + else if (!fz_strcasecmp(enc, "shift_jis") || !fz_strcasecmp(enc, "sjis")) + fz_init_cjk_text_decoder(ctx, dec, "90msp-H", "Adobe-Japan1-UCS2"); + + else if (!fz_strcasecmp(enc, "euc-kr")) + fz_init_cjk_text_decoder(ctx, dec, "KSCms-UHC-H", "Adobe-Korea1-UCS2"); + + else if (!fz_strcasecmp(enc, "euc-cn")) + fz_init_cjk_text_decoder(ctx, dec, "GB-EUC-H", "Adobe-GB1-UCS2"); + else if (!fz_strcasecmp(enc, "gbk") || !fz_strcasecmp(enc, "gb2312") || !fz_strcasecmp(enc, "gb18030")) + fz_init_cjk_text_decoder(ctx, dec, "GBK2K-H", "Adobe-GB1-UCS2"); + + else if (!fz_strcasecmp(enc, "euc-tw")) + fz_init_cjk_text_decoder(ctx, dec, "CNS-EUC-H", "Adobe-CNS1-UCS2"); + else if (!fz_strcasecmp(enc, "big5")) + fz_init_cjk_text_decoder(ctx, dec, "ETen-B5-H", "Adobe-CNS1-UCS2"); + else if (!fz_strcasecmp(enc, "big5-hkscs")) + fz_init_cjk_text_decoder(ctx, dec, "HKscs-B5-H", "Adobe-CNS1-UCS2"); + + else if (!fz_strcasecmp(enc, "iso-8859-1")) + fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_1); + else if (!fz_strcasecmp(enc, "iso-8859-7")) + fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_7); + else if (!fz_strcasecmp(enc, "koi8-r")) + fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_koi8u); + else if (!fz_strcasecmp(enc, "windows-1250")) + fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1250); + else if (!fz_strcasecmp(enc, "windows-1251")) + fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1251); + else if (!fz_strcasecmp(enc, "windows-1252")) + fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1252); + + else + fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown text encoding: %s", enc); +}
