diff mupdf-source/source/fitz/text-decoder.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/fitz/text-decoder.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,236 @@
+// Copyright (C) 2024 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+#include "mupdf/pdf.h"
+
+// Upper bound on the output buffer size for a single-byte encoding.
+//
+// Budgets four output bytes for each of the n input bytes, plus one byte for
+// the zero terminator that simple_text_decode() appends. Neither the decoder
+// state nor the input bytes themselves are inspected.
+static int simple_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
+{
+	int worst_case = 4 * n;
+	return worst_case + 1;
+}
+
+// Exact output size, in bytes, for decoding the n bytes at s with a
+// single-byte encoding.
+//
+// Each input byte is used as an index into the lookup table held in
+// dec->table1, and the fz_runelen() of the resulting value is accumulated.
+// The count starts at one to make room for the zero terminator.
+static int simple_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
+{
+	const unsigned short *to_unicode = dec->table1;
+	int total = 1; // zero terminator
+	int i;
+	for (i = 0; i < n; i++)
+		total += fz_runelen(to_unicode[s[i]]);
+	return total;
+}
+
+// Decode the n bytes at s with a single-byte encoding, writing the result
+// to p followed by a zero terminator.
+//
+// Each input byte is used as an index into the lookup table held in
+// dec->table1 and the resulting value is written with fz_runetochar().
+// The caller must supply a buffer large enough for the output; this function
+// performs no bounds checking on p.
+static void simple_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
+{
+	const unsigned short *to_unicode = dec->table1;
+	int i;
+	for (i = 0; i < n; i++)
+		p += fz_runetochar(p, to_unicode[s[i]]);
+	*p = 0;
+}
+
+// Upper bound on the output buffer size for big-endian UTF-16 input.
+//
+// Budgets two output bytes for each of the n input bytes, plus one byte for
+// the zero terminator. Neither the decoder state nor the input bytes
+// themselves are inspected.
+static int utf16be_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
+{
+	int worst_case = 2 * n;
+	return worst_case + 1;
+}
+
+// Upper bound on the output buffer size for little-endian UTF-16 input.
+//
+// Budgets two output bytes for each of the n input bytes, plus one byte for
+// the zero terminator. Neither the decoder state nor the input bytes
+// themselves are inspected.
+static int utf16le_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
+{
+	int worst_case = 2 * n;
+	return worst_case + 1;
+}
+
+// Read one Unicode code point from UTF-16 data.
+//
+// s points at the next code unit and e one past the end of the input; the
+// caller guarantees that at least two bytes are available. big_endian selects
+// the byte order of each 16-bit code unit.
+//
+// A high surrogate (D800..DBFF) immediately followed by a low surrogate
+// (DC00..DFFF) is combined into the supplementary-plane code point the pair
+// encodes. A surrogate without a matching partner is not a character, so it is
+// reported as FZ_REPLACEMENT_CHARACTER rather than passed through.
+//
+// Stores the code point in *rune and returns the number of input bytes
+// consumed (2 or 4).
+static int utf16_text_decode_rune(const unsigned char *s, const unsigned char *e, int big_endian, int *rune)
+{
+	int unit = big_endian ? (s[0] << 8 | s[1]) : (s[0] | s[1] << 8);
+
+	if (unit >= 0xD800 && unit <= 0xDBFF && e - s >= 4) {
+		int next = big_endian ? (s[2] << 8 | s[3]) : (s[2] | s[3] << 8);
+		if (next >= 0xDC00 && next <= 0xDFFF) {
+			*rune = 0x10000 + ((unit - 0xD800) << 10) + (next - 0xDC00);
+			return 4;
+		}
+	}
+
+	// Unpaired surrogate (either half): substitute instead of emitting it.
+	if (unit >= 0xD800 && unit <= 0xDFFF)
+		unit = FZ_REPLACEMENT_CHARACTER;
+	*rune = unit;
+	return 2;
+}
+
+// Shared size pass for both byte orders: sums fz_runelen() over every code
+// point in the n bytes at s, starting at one for the zero terminator.
+// A trailing odd byte is ignored. Must walk the input exactly like
+// utf16_text_decode() so that the reported size matches what is written.
+static int utf16_text_decode_size(unsigned char *s, int n, int big_endian)
+{
+	unsigned char *e = s + n;
+	int len = 1; // zero terminator
+	int rune;
+	while (e - s >= 2) {
+		s += utf16_text_decode_rune(s, e, big_endian, &rune);
+		len += fz_runelen(rune);
+	}
+	return len;
+}
+
+// Shared decode pass for both byte orders: writes every code point in the n
+// bytes at s to p with fz_runetochar(), then a zero terminator. A trailing odd
+// byte is ignored. No bounds checking is performed on p.
+//
+// NOTE(review): assuming fz_runetochar() emits standard UTF-8, a surrogate
+// pair turns 4 input bytes into 4 output bytes and any other code unit turns
+// 2 input bytes into at most 3, so the n * 2 + 1 bound still holds.
+static void utf16_text_decode(char *p, unsigned char *s, int n, int big_endian)
+{
+	unsigned char *e = s + n;
+	int rune;
+	while (e - s >= 2) {
+		s += utf16_text_decode_rune(s, e, big_endian, &rune);
+		p += fz_runetochar(p, rune);
+	}
+	*p = 0;
+}
+
+// Exact output size, including the zero terminator, for decoding the n bytes
+// of big-endian UTF-16 at s. dec is not used.
+static int utf16be_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
+{
+	return utf16_text_decode_size(s, n, 1);
+}
+
+// Exact output size, including the zero terminator, for decoding the n bytes
+// of little-endian UTF-16 at s. dec is not used.
+static int utf16le_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
+{
+	return utf16_text_decode_size(s, n, 0);
+}
+
+// Decode the n bytes of big-endian UTF-16 at s into the zero-terminated
+// string at p. dec is not used.
+static void utf16be_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
+{
+	utf16_text_decode(p, s, n, 1);
+}
+
+// Decode the n bytes of little-endian UTF-16 at s into the zero-terminated
+// string at p. dec is not used.
+static void utf16le_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
+{
+	utf16_text_decode(p, s, n, 0);
+}
+
+// Upper bound on the output buffer size for a CMap-based CJK encoding.
+//
+// Budgets four output bytes for each of the n input bytes, plus one byte for
+// the zero terminator. Neither the decoder state nor the input bytes
+// themselves are inspected.
+static int cjk_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
+{
+	int worst_case = 4 * n;
+	return worst_case + 1;
+}
+
+// Exact output size, including the zero terminator, for decoding the n bytes
+// at s with a CMap-based CJK encoding.
+//
+// dec->table1 is the CMap used both to split the input into character codes
+// and to map each code to a CID; dec->table2 maps the CID to Unicode. The
+// value returned by pdf_decode_cmap() is used as the number of input bytes to
+// advance. This walk must mirror cjk_text_decode() step for step.
+static int cjk_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
+{
+	pdf_cmap *code_to_cid = dec->table1;
+	pdf_cmap *cid_to_unicode = dec->table2;
+	unsigned char *end = s + n;
+	int total = 1; // zero terminator
+
+	while (s < end) {
+		unsigned int code;
+		int cid, rune;
+
+		s += pdf_decode_cmap(code_to_cid, s, end, &code);
+		cid = pdf_lookup_cmap(code_to_cid, code);
+		rune = pdf_lookup_cmap(cid_to_unicode, cid);
+		if (rune < 0) {
+			// The CMaps have no entries for ASCII control characters,
+			// so pass those through; anything else is unmappable.
+			rune = (code < 32) ? (int)code : FZ_REPLACEMENT_CHARACTER;
+		}
+		total += fz_runelen(rune);
+	}
+	return total;
+}
+
+// Decode the n bytes at s with a CMap-based CJK encoding, writing the result
+// to p followed by a zero terminator.
+//
+// dec->table1 is the CMap used both to split the input into character codes
+// and to map each code to a CID; dec->table2 maps the CID to Unicode. The
+// value returned by pdf_decode_cmap() is used as the number of input bytes to
+// advance. No bounds checking is performed on p.
+static void cjk_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
+{
+	pdf_cmap *code_to_cid = dec->table1;
+	pdf_cmap *cid_to_unicode = dec->table2;
+	unsigned char *end = s + n;
+
+	while (s < end) {
+		unsigned int code;
+		int cid, rune;
+
+		s += pdf_decode_cmap(code_to_cid, s, end, &code);
+		cid = pdf_lookup_cmap(code_to_cid, code);
+		rune = pdf_lookup_cmap(cid_to_unicode, cid);
+		if (rune < 0) {
+			// The CMaps have no entries for ASCII control characters,
+			// so pass those through; anything else is unmappable.
+			rune = (code < 32) ? (int)code : FZ_REPLACEMENT_CHARACTER;
+		}
+		p += fz_runetochar(p, rune);
+	}
+	*p = 0;
+}
+
+// Configure dec to decode a single-byte encoding.
+//
+// table is the lookup table the simple_text_decode* callbacks index by byte
+// value; it is stored in dec->table1. dec->table2 is left untouched and ctx
+// is not used.
+static void fz_init_simple_text_decoder(fz_context *ctx, fz_text_decoder *dec, const unsigned short *table)
+{
+	// table1 also holds pdf_cmap pointers for other decoders, hence the cast;
+	// the simple callbacks only read through it via a const pointer.
+	dec->table1 = (void*)table;
+
+	dec->decode = simple_text_decode;
+	dec->decode_size = simple_text_decode_size;
+	dec->decode_bound = simple_text_decode_bound;
+}
+
+// Configure dec to decode big-endian UTF-16.
+//
+// Only the three callbacks are installed; dec->table1 and dec->table2 are
+// left untouched because the UTF-16 callbacks do not read them. ctx is not
+// used.
+static void fz_init_utf16be_text_decoder(fz_context *ctx, fz_text_decoder *dec)
+{
+	dec->decode = utf16be_text_decode;
+	dec->decode_size = utf16be_text_decode_size;
+	dec->decode_bound = utf16be_text_decode_bound;
+}
+
+// Configure dec to decode little-endian UTF-16.
+//
+// Only the three callbacks are installed; dec->table1 and dec->table2 are
+// left untouched because the UTF-16 callbacks do not read them. ctx is not
+// used.
+static void fz_init_utf16le_text_decoder(fz_context *ctx, fz_text_decoder *dec)
+{
+	dec->decode = utf16le_text_decode;
+	dec->decode_size = utf16le_text_decode_size;
+	dec->decode_bound = utf16le_text_decode_bound;
+}
+
+// Configure dec to decode a CJK encoding through two built-in CMaps.
+//
+// to_cid names the CMap that maps character codes to CIDs (stored in
+// dec->table1); to_uni names the CMap that maps CIDs to Unicode (stored in
+// dec->table2).
+//
+// Throws FZ_ERROR_UNSUPPORTED if either CMap cannot be found. The callbacks,
+// and table1 when only the second lookup fails, have already been assigned at
+// that point, so dec must not be used after a throw.
+static void fz_init_cjk_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *to_cid, const char *to_uni)
+{
+	dec->decode = cjk_text_decode;
+	dec->decode_size = cjk_text_decode_size;
+	dec->decode_bound = cjk_text_decode_bound;
+
+	dec->table1 = pdf_load_builtin_cmap(ctx, to_cid);
+	if (!dec->table1) {
+		fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_cid);
+	}
+
+	dec->table2 = pdf_load_builtin_cmap(ctx, to_uni);
+	if (!dec->table2) {
+		fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_uni);
+	}
+}
+
+// Initialize dec to convert text in the named encoding.
+//
+// enc is matched case-insensitively against the names listed below. On
+// success the decode_bound, decode_size and decode callbacks of dec are set,
+// along with table1/table2 where the chosen decoder needs them.
+//
+// Throws FZ_ERROR_UNSUPPORTED if enc is not one of the recognized names, or
+// if a CMap required by a CJK encoding cannot be loaded.
+void fz_init_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *enc)
+{
+	// Recognize IANA character set identifiers (case insensitive).
+	// https://www.iana.org/assignments/character-sets/character-sets.xhtml
+
+	// Plain "utf-16" is treated as little-endian; the name alone selects the
+	// byte order here.
+	if (!fz_strcasecmp(enc, "utf-16"))
+		fz_init_utf16le_text_decoder(ctx, dec);
+	else if (!fz_strcasecmp(enc, "utf-16be"))
+		fz_init_utf16be_text_decoder(ctx, dec);
+	else if (!fz_strcasecmp(enc, "utf-16le"))
+		fz_init_utf16le_text_decoder(ctx, dec);
+
+	else if (!fz_strcasecmp(enc, "euc-jp"))
+		fz_init_cjk_text_decoder(ctx, dec, "EUC-H", "Adobe-Japan1-UCS2");
+	else if (!fz_strcasecmp(enc, "shift_jis") || !fz_strcasecmp(enc, "sjis"))
+		fz_init_cjk_text_decoder(ctx, dec, "90msp-H", "Adobe-Japan1-UCS2");
+
+	else if (!fz_strcasecmp(enc, "euc-kr"))
+		fz_init_cjk_text_decoder(ctx, dec, "KSCms-UHC-H", "Adobe-Korea1-UCS2");
+
+	else if (!fz_strcasecmp(enc, "euc-cn"))
+		fz_init_cjk_text_decoder(ctx, dec, "GB-EUC-H", "Adobe-GB1-UCS2");
+	else if (!fz_strcasecmp(enc, "gbk") || !fz_strcasecmp(enc, "gb2312") || !fz_strcasecmp(enc, "gb18030"))
+		fz_init_cjk_text_decoder(ctx, dec, "GBK2K-H", "Adobe-GB1-UCS2");
+
+	else if (!fz_strcasecmp(enc, "euc-tw"))
+		fz_init_cjk_text_decoder(ctx, dec, "CNS-EUC-H", "Adobe-CNS1-UCS2");
+	else if (!fz_strcasecmp(enc, "big5"))
+		fz_init_cjk_text_decoder(ctx, dec, "ETen-B5-H", "Adobe-CNS1-UCS2");
+	else if (!fz_strcasecmp(enc, "big5-hkscs"))
+		fz_init_cjk_text_decoder(ctx, dec, "HKscs-B5-H", "Adobe-CNS1-UCS2");
+
+	else if (!fz_strcasecmp(enc, "iso-8859-1"))
+		fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_1);
+	else if (!fz_strcasecmp(enc, "iso-8859-7"))
+		fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_7);
+	// The only KOI8 table available here is the KOI8-U one, so accept that
+	// name as well as "koi8-r".
+	// NOTE(review): KOI8-R and KOI8-U are understood to differ in a handful
+	// of code points; confirm that decoding "koi8-r" with the KOI8-U table is
+	// an intended approximation.
+	else if (!fz_strcasecmp(enc, "koi8-r") || !fz_strcasecmp(enc, "koi8-u"))
+		fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_koi8u);
+	else if (!fz_strcasecmp(enc, "windows-1250"))
+		fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1250);
+	else if (!fz_strcasecmp(enc, "windows-1251"))
+		fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1251);
+	else if (!fz_strcasecmp(enc, "windows-1252"))
+		fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1252);
+
+	else
+		fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown text encoding: %s", enc);
+}