comparison mupdf-source/source/fitz/text-decoder.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "mupdf/pdf.h"
25
26 static int simple_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
27 {
28 return n * 4 + 1;
29 }
30
31 static int simple_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
32 {
33 const unsigned short *table = dec->table1;
34 unsigned char *e = s + n;
35 int len = 1;
36 while (s < e)
37 len += fz_runelen(table[*s++]);
38 return len;
39 }
40
41 static void simple_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
42 {
43 const unsigned short *table = dec->table1;
44 unsigned char *e = s + n;
45 while (s < e)
46 p += fz_runetochar(p, table[*s++]);
47 *p = 0;
48 }
49
50 static int utf16be_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
51 {
52 return n * 2 + 1;
53 }
54
55 static int utf16le_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
56 {
57 return n * 2 + 1;
58 }
59
60 static int utf16be_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
61 {
62 unsigned char *e = s + n;
63 int len = 1;
64 while (s + 1 < e) {
65 len += fz_runelen(s[0] << 8 | s[1]);
66 s += 2;
67 }
68 return len;
69 }
70
71 static int utf16le_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
72 {
73 unsigned char *e = s + n;
74 int len = 1;
75 while (s + 1 < e) {
76 len += fz_runelen(s[0] | s[1] << 8);
77 s += 2;
78 }
79 return len;
80 }
81
82 static void utf16be_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
83 {
84 unsigned char *e = s + n;
85 while (s + 1 < e) {
86 p += fz_runetochar(p, s[0] << 8 | s[1]);
87 s += 2;
88 }
89 *p = 0;
90 }
91
92 static void utf16le_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
93 {
94 unsigned char *e = s + n;
95 while (s + 1 < e) {
96 p += fz_runetochar(p, s[0] | s[1] << 8);
97 s += 2;
98 }
99 *p = 0;
100 }
101
102 static int cjk_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
103 {
104 return n * 4 + 1;
105 }
106
107 static int cjk_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
108 {
109 unsigned char *e = s + n;
110 pdf_cmap *to_cid = dec->table1;
111 pdf_cmap *to_uni = dec->table2;
112 unsigned int raw;
113 int cid, uni;
114 int len = 1;
115 while (s < e) {
116 s += pdf_decode_cmap(to_cid, s, e, &raw);
117 cid = pdf_lookup_cmap(to_cid, raw);
118 uni = pdf_lookup_cmap(to_uni, cid);
119 if (uni < 0) {
120 // ASCII control characters are missing in the CMaps
121 if (raw < 32)
122 uni = raw;
123 else
124 uni = FZ_REPLACEMENT_CHARACTER;
125 }
126 len += fz_runelen(uni);
127 }
128 return len;
129 }
130
131 static void cjk_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
132 {
133 unsigned char *e = s + n;
134 pdf_cmap *to_cid = dec->table1;
135 pdf_cmap *to_uni = dec->table2;
136 unsigned int raw;
137 int cid, uni;
138 while (s < e) {
139 s += pdf_decode_cmap(to_cid, s, e, &raw);
140 cid = pdf_lookup_cmap(to_cid, raw);
141 uni = pdf_lookup_cmap(to_uni, cid);
142 if (uni < 0) {
143 // ASCII control characters are missing in the CMaps
144 if (raw < 32)
145 uni = raw;
146 else
147 uni = FZ_REPLACEMENT_CHARACTER;
148 }
149 p += fz_runetochar(p, uni);
150 }
151 *p = 0;
152 }
153
154 static void fz_init_simple_text_decoder(fz_context *ctx, fz_text_decoder *dec, const unsigned short *table)
155 {
156 dec->decode_bound = simple_text_decode_bound;
157 dec->decode_size = simple_text_decode_size;
158 dec->decode = simple_text_decode;
159 dec->table1 = (void*)table;
160 }
161
162 static void fz_init_utf16be_text_decoder(fz_context *ctx, fz_text_decoder *dec)
163 {
164 dec->decode_bound = utf16be_text_decode_bound;
165 dec->decode_size = utf16be_text_decode_size;
166 dec->decode = utf16be_text_decode;
167 }
168
169 static void fz_init_utf16le_text_decoder(fz_context *ctx, fz_text_decoder *dec)
170 {
171 dec->decode_bound = utf16le_text_decode_bound;
172 dec->decode_size = utf16le_text_decode_size;
173 dec->decode = utf16le_text_decode;
174 }
175
176 static void fz_init_cjk_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *to_cid, const char *to_uni)
177 {
178 dec->decode_bound = cjk_text_decode_bound;
179 dec->decode_size = cjk_text_decode_size;
180 dec->decode = cjk_text_decode;
181 dec->table1 = pdf_load_builtin_cmap(ctx, to_cid);
182 if (!dec->table1)
183 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_cid);
184 dec->table2 = pdf_load_builtin_cmap(ctx, to_uni);
185 if (!dec->table2)
186 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_uni);
187 }
188
189 void fz_init_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *enc)
190 {
191 // Recognize IANA character set identifiers (case insensitive).
192 // https://www.iana.org/assignments/character-sets/character-sets.xhtml
193
194 if (!fz_strcasecmp(enc, "utf-16"))
195 fz_init_utf16le_text_decoder(ctx, dec);
196 else if (!fz_strcasecmp(enc, "utf-16be"))
197 fz_init_utf16be_text_decoder(ctx, dec);
198 else if (!fz_strcasecmp(enc, "utf-16le"))
199 fz_init_utf16le_text_decoder(ctx, dec);
200
201 else if (!fz_strcasecmp(enc, "euc-jp"))
202 fz_init_cjk_text_decoder(ctx, dec, "EUC-H", "Adobe-Japan1-UCS2");
203 else if (!fz_strcasecmp(enc, "shift_jis") || !fz_strcasecmp(enc, "sjis"))
204 fz_init_cjk_text_decoder(ctx, dec, "90msp-H", "Adobe-Japan1-UCS2");
205
206 else if (!fz_strcasecmp(enc, "euc-kr"))
207 fz_init_cjk_text_decoder(ctx, dec, "KSCms-UHC-H", "Adobe-Korea1-UCS2");
208
209 else if (!fz_strcasecmp(enc, "euc-cn"))
210 fz_init_cjk_text_decoder(ctx, dec, "GB-EUC-H", "Adobe-GB1-UCS2");
211 else if (!fz_strcasecmp(enc, "gbk") || !fz_strcasecmp(enc, "gb2312") || !fz_strcasecmp(enc, "gb18030"))
212 fz_init_cjk_text_decoder(ctx, dec, "GBK2K-H", "Adobe-GB1-UCS2");
213
214 else if (!fz_strcasecmp(enc, "euc-tw"))
215 fz_init_cjk_text_decoder(ctx, dec, "CNS-EUC-H", "Adobe-CNS1-UCS2");
216 else if (!fz_strcasecmp(enc, "big5"))
217 fz_init_cjk_text_decoder(ctx, dec, "ETen-B5-H", "Adobe-CNS1-UCS2");
218 else if (!fz_strcasecmp(enc, "big5-hkscs"))
219 fz_init_cjk_text_decoder(ctx, dec, "HKscs-B5-H", "Adobe-CNS1-UCS2");
220
221 else if (!fz_strcasecmp(enc, "iso-8859-1"))
222 fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_1);
223 else if (!fz_strcasecmp(enc, "iso-8859-7"))
224 fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_7);
225 else if (!fz_strcasecmp(enc, "koi8-r"))
226 fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_koi8u);
227 else if (!fz_strcasecmp(enc, "windows-1250"))
228 fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1250);
229 else if (!fz_strcasecmp(enc, "windows-1251"))
230 fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1251);
231 else if (!fz_strcasecmp(enc, "windows-1252"))
232 fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1252);
233
234 else
235 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown text encoding: %s", enc);
236 }