comparison mupdf-source/source/pdf/pdf-cmap-parse.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2021 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "mupdf/pdf.h"
25
26 #include <string.h>
27
28 /*
29 * CMap parser
30 */
31
32 static int
33 is_keyword(pdf_token tok, pdf_lexbuf *buf, const char *word)
34 {
35 /* Ignore trailing garbage when matching keywords */
36 return (tok == PDF_TOK_KEYWORD && !strncmp(buf->scratch, word, strlen(word)));
37 }
38
39 static void
40 skip_to_keyword(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, const char *end, const char *warn)
41 {
42 fz_warn(ctx, "%s", warn);
43 for (;;)
44 {
45 pdf_token tok = pdf_lex(ctx, file, buf);
46 if (is_keyword(tok, buf, end))
47 return;
48 if (tok == PDF_TOK_ERROR)
49 return;
50 if (tok == PDF_TOK_EOF)
51 return;
52 }
53 }
54
55 static void
56 skip_to_token(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, pdf_token end, const char *warn)
57 {
58 fz_warn(ctx, "%s", warn);
59 for (;;)
60 {
61 pdf_token tok = pdf_lex(ctx, file, buf);
62 if (tok == end)
63 return;
64 if (tok == PDF_TOK_ERROR)
65 return;
66 if (tok == PDF_TOK_EOF)
67 return;
68 }
69 }
70
/*
 * Pack the 'len' bytes starting at 'buf' into an integer, most significant
 * byte first (big-endian).
 *
 * buf: bytes to read; they are only read, never written. Embedded zero
 *      bytes are treated as data, not as a terminator.
 * len: number of bytes to consume; 0 yields 0.
 *
 * The value is accumulated in an unsigned int, so when more bytes are
 * supplied than fit in that type the leading ones are shifted out. The
 * accumulated value is then converted to int for the return.
 */
static int
pdf_code_from_string(const char *buf, size_t len)
{
	/* Read through unsigned char so bytes >= 0x80 are not sign-extended. */
	const unsigned char *bytes = (const unsigned char *)buf;
	unsigned int code = 0;
	size_t i;

	for (i = 0; i < len; i++)
		code = (code << 8) | bytes[i];

	return (int)code;
}
79
80 static void
81 pdf_parse_cmap_name(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
82 {
83 pdf_token tok;
84
85 tok = pdf_lex(ctx, file, buf);
86
87 if (tok == PDF_TOK_NAME)
88 fz_strlcpy(cmap->cmap_name, buf->scratch, sizeof(cmap->cmap_name));
89 else
90 fz_warn(ctx, "expected name after CMapName in cmap");
91 }
92
93 static void
94 pdf_parse_wmode(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
95 {
96 pdf_token tok;
97
98 tok = pdf_lex(ctx, file, buf);
99
100 if (tok == PDF_TOK_INT)
101 pdf_set_cmap_wmode(ctx, cmap, buf->i);
102 else
103 fz_warn(ctx, "expected integer after WMode in cmap");
104 }
105
106 static void
107 pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
108 {
109 pdf_token tok;
110 int lo, hi;
111
112 while (1)
113 {
114 tok = pdf_lex(ctx, file, buf);
115
116 if (is_keyword(tok, buf, "endcodespacerange"))
117 return;
118
119 else if (tok == PDF_TOK_STRING)
120 {
121 lo = pdf_code_from_string(buf->scratch, buf->len);
122 tok = pdf_lex(ctx, file, buf);
123 if (tok == PDF_TOK_STRING)
124 {
125 hi = pdf_code_from_string(buf->scratch, buf->len);
126 pdf_add_codespace(ctx, cmap, lo, hi, buf->len);
127 }
128 else
129 {
130 skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
131 return;
132 }
133 }
134 else
135 {
136 skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
137 return;
138 }
139 }
140 }
141
142 static void
143 pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
144 {
145 pdf_token tok;
146 int lo, hi, dst;
147
148 while (1)
149 {
150 tok = pdf_lex(ctx, file, buf);
151
152 if (is_keyword(tok, buf, "endcidrange"))
153 return;
154
155 else if (tok != PDF_TOK_STRING)
156 {
157 skip_to_keyword(ctx, file, buf, "endcidrange", "expected string or endcidrange");
158 return;
159 }
160
161 lo = pdf_code_from_string(buf->scratch, buf->len);
162
163 tok = pdf_lex(ctx, file, buf);
164 if (tok != PDF_TOK_STRING)
165 {
166 skip_to_keyword(ctx, file, buf, "endcidrange", "expected string");
167 return;
168 }
169
170 hi = pdf_code_from_string(buf->scratch, buf->len);
171
172 tok = pdf_lex(ctx, file, buf);
173 if (tok != PDF_TOK_INT)
174 {
175 skip_to_keyword(ctx, file, buf, "endcidrange", "expected integer");
176 return;
177 }
178
179 dst = buf->i;
180
181 pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
182 }
183 }
184
185 static void
186 pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
187 {
188 pdf_token tok;
189 int src, dst;
190
191 while (1)
192 {
193 tok = pdf_lex(ctx, file, buf);
194
195 if (is_keyword(tok, buf, "endcidchar"))
196 return;
197
198 else if (tok != PDF_TOK_STRING)
199 {
200 skip_to_keyword(ctx, file, buf, "endcidchar", "expected string or endcidchar");
201 return;
202 }
203
204 src = pdf_code_from_string(buf->scratch, buf->len);
205
206 tok = pdf_lex(ctx, file, buf);
207 if (tok != PDF_TOK_INT)
208 {
209 skip_to_keyword(ctx, file, buf, "endcidchar", "expected integer");
210 return;
211 }
212
213 dst = buf->i;
214
215 pdf_map_range_to_range(ctx, cmap, src, src, dst);
216 }
217 }
218
219 static void
220 pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf, int lo, int hi)
221 {
222 pdf_token tok;
223 int dst[256];
224
225 while (1)
226 {
227 tok = pdf_lex(ctx, file, buf);
228
229 if (tok == PDF_TOK_CLOSE_ARRAY)
230 return;
231
232 /* Note: does not handle [ /Name /Name ... ] */
233 else if (tok != PDF_TOK_STRING)
234 {
235 skip_to_token(ctx, file, buf, PDF_TOK_CLOSE_ARRAY, "expected string or ]");
236 return;
237 }
238
239 if (buf->len / 2)
240 {
241 size_t i;
242 size_t len = fz_minz(buf->len / 2, nelem(dst));
243 for (i = 0; i < len; i++)
244 dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
245
246 pdf_map_one_to_many(ctx, cmap, lo, dst, i);
247 }
248
249 lo ++;
250 }
251 }
252
253 static void
254 pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
255 {
256 pdf_token tok;
257 int lo, hi, dst;
258
259 while (1)
260 {
261 tok = pdf_lex(ctx, file, buf);
262
263 if (is_keyword(tok, buf, "endbfrange"))
264 return;
265
266 else if (tok != PDF_TOK_STRING)
267 {
268 skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or endbfrange");
269 return;
270 }
271
272 lo = pdf_code_from_string(buf->scratch, buf->len);
273
274 tok = pdf_lex(ctx, file, buf);
275 if (tok != PDF_TOK_STRING)
276 {
277 skip_to_keyword(ctx, file, buf, "endbfrange", "expected string");
278 return;
279 }
280
281 hi = pdf_code_from_string(buf->scratch, buf->len);
282 if (lo < 0 || lo > 65535 || hi < 0 || hi > 65535 || lo > hi)
283 {
284 skip_to_keyword(ctx, file, buf, "endbfrange", "bfrange limits out of range");
285 return;
286 }
287
288 tok = pdf_lex(ctx, file, buf);
289
290 if (tok == PDF_TOK_STRING)
291 {
292 if (buf->len == 2)
293 {
294 dst = pdf_code_from_string(buf->scratch, buf->len);
295 pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
296 }
297 else
298 {
299 int dststr[256];
300 size_t i;
301
302 if (buf->len / 2)
303 {
304 size_t len = fz_minz(buf->len / 2, nelem(dststr));
305 for (i = 0; i < len; i++)
306 dststr[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
307
308 while (lo <= hi)
309 {
310 pdf_map_one_to_many(ctx, cmap, lo, dststr, i);
311 dststr[i-1] ++;
312 lo ++;
313 }
314 }
315 }
316 }
317
318 else if (tok == PDF_TOK_OPEN_ARRAY)
319 {
320 pdf_parse_bf_range_array(ctx, cmap, file, buf, lo, hi);
321 }
322
323 else
324 {
325 skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or array or endbfrange");
326 return;
327 }
328 }
329 }
330
331 static void
332 pdf_parse_bf_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
333 {
334 pdf_token tok;
335 int dst[256];
336 int src;
337
338 while (1)
339 {
340 tok = pdf_lex(ctx, file, buf);
341
342 if (is_keyword(tok, buf, "endbfchar"))
343 return;
344
345 else if (tok != PDF_TOK_STRING)
346 {
347 skip_to_keyword(ctx, file, buf, "endbfchar", "expected string or endbfchar");
348 return;
349 }
350
351 src = pdf_code_from_string(buf->scratch, buf->len);
352
353 tok = pdf_lex(ctx, file, buf);
354 /* Note: does not handle /dstName */
355 if (tok != PDF_TOK_STRING)
356 {
357 skip_to_keyword(ctx, file, buf, "endbfchar", "expected string");
358 return;
359 }
360
361 if (buf->len / 2)
362 {
363 size_t i;
364 size_t len = fz_minz(buf->len / 2, nelem(dst));
365 for (i = 0; i < len; i++)
366 dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
367 pdf_map_one_to_many(ctx, cmap, src, dst, i);
368 }
369 }
370 }
371
372 pdf_cmap *
373 pdf_load_cmap(fz_context *ctx, fz_stream *file)
374 {
375 pdf_cmap *cmap;
376 char key[64];
377 pdf_lexbuf buf;
378 pdf_token tok;
379
380 pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
381 cmap = pdf_new_cmap(ctx);
382
383 strcpy(key, ".notdef");
384
385 fz_try(ctx)
386 {
387 while (1)
388 {
389 tok = pdf_lex(ctx, file, &buf);
390
391 if (tok == PDF_TOK_EOF)
392 break;
393
394 else if (tok == PDF_TOK_NAME)
395 {
396 if (!strcmp(buf.scratch, "CMapName"))
397 pdf_parse_cmap_name(ctx, cmap, file, &buf);
398 else if (!strcmp(buf.scratch, "WMode"))
399 pdf_parse_wmode(ctx, cmap, file, &buf);
400 else
401 fz_strlcpy(key, buf.scratch, sizeof key);
402 }
403
404 else if (tok == PDF_TOK_KEYWORD)
405 {
406 if (is_keyword(tok, &buf, "endcmap"))
407 break;
408
409 else if (is_keyword(tok, &buf, "usecmap"))
410 fz_strlcpy(cmap->usecmap_name, key, sizeof(cmap->usecmap_name));
411
412 else if (is_keyword(tok, &buf, "begincodespacerange"))
413 pdf_parse_codespace_range(ctx, cmap, file, &buf);
414
415 else if (is_keyword(tok, &buf, "beginbfchar"))
416 pdf_parse_bf_char(ctx, cmap, file, &buf);
417
418 else if (is_keyword(tok, &buf, "begincidchar"))
419 pdf_parse_cid_char(ctx, cmap, file, &buf);
420
421 else if (is_keyword(tok, &buf, "beginbfrange"))
422 pdf_parse_bf_range(ctx, cmap, file, &buf);
423
424 else if (is_keyword(tok, &buf, "begincidrange"))
425 pdf_parse_cid_range(ctx, cmap, file, &buf);
426 }
427
428 /* ignore everything else */
429 }
430
431 pdf_sort_cmap(ctx, cmap);
432 }
433 fz_always(ctx)
434 {
435 pdf_lexbuf_fin(ctx, &buf);
436 }
437 fz_catch(ctx)
438 {
439 pdf_drop_cmap(ctx, cmap);
440 fz_rethrow(ctx);
441 }
442
443 return cmap;
444 }