comparison mupdf-source/source/pdf/pdf-parse.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2021 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "mupdf/pdf.h"
25
26 #include <string.h>
27 #include <time.h>
28
29 #ifdef _WIN32
30 #define timegm _mkgmtime
31 #endif
32
/*
 * Locale-independent test for an ASCII decimal digit.
 * The argument is parenthesized so that expressions containing operators of
 * lower precedence than the comparisons still expand correctly.
 * The argument is evaluated twice: do not pass expressions with side effects.
 */
#define isdigit(c) ((c) >= '0' && (c) <= '9')
34
35 fz_rect
36 pdf_to_rect(fz_context *ctx, pdf_obj *array)
37 {
38 if (!pdf_is_array(ctx, array))
39 return fz_empty_rect;
40 else
41 {
42 float a = pdf_array_get_real(ctx, array, 0);
43 float b = pdf_array_get_real(ctx, array, 1);
44 float c = pdf_array_get_real(ctx, array, 2);
45 float d = pdf_array_get_real(ctx, array, 3);
46 fz_rect r;
47 r.x0 = fz_min(a, c);
48 r.y0 = fz_min(b, d);
49 r.x1 = fz_max(a, c);
50 r.y1 = fz_max(b, d);
51 return r;
52 }
53 }
54
55 fz_quad
56 pdf_to_quad(fz_context *ctx, pdf_obj *array, int offset)
57 {
58 fz_quad q;
59 q.ul.x = pdf_array_get_real(ctx, array, offset+0);
60 q.ul.y = pdf_array_get_real(ctx, array, offset+1);
61 q.ur.x = pdf_array_get_real(ctx, array, offset+2);
62 q.ur.y = pdf_array_get_real(ctx, array, offset+3);
63 q.ll.x = pdf_array_get_real(ctx, array, offset+4);
64 q.ll.y = pdf_array_get_real(ctx, array, offset+5);
65 q.lr.x = pdf_array_get_real(ctx, array, offset+6);
66 q.lr.y = pdf_array_get_real(ctx, array, offset+7);
67 return q;
68 }
69
70 fz_point
71 pdf_to_point(fz_context *ctx, pdf_obj *array, int offset)
72 {
73 fz_point p;
74 p.x = pdf_array_get_real(ctx, array, offset+0);
75 p.y = pdf_array_get_real(ctx, array, offset+1);
76 return p;
77 }
78
79 fz_matrix
80 pdf_to_matrix(fz_context *ctx, pdf_obj *array)
81 {
82 if (!pdf_is_array(ctx, array))
83 return fz_identity;
84 else
85 {
86 fz_matrix m;
87 m.a = pdf_array_get_real(ctx, array, 0);
88 m.b = pdf_array_get_real(ctx, array, 1);
89 m.c = pdf_array_get_real(ctx, array, 2);
90 m.d = pdf_array_get_real(ctx, array, 3);
91 m.e = pdf_array_get_real(ctx, array, 4);
92 m.f = pdf_array_get_real(ctx, array, 5);
93 return m;
94 }
95 }
96
97 char *
98 pdf_format_date(fz_context *ctx, int64_t time, char *s, size_t n)
99 {
100 time_t secs = time;
101 #ifdef _POSIX_SOURCE
102 struct tm tmbuf, *tm = gmtime_r(&secs, &tmbuf);
103 #else
104 struct tm *tm = gmtime(&secs);
105 #endif
106 if (time < 0 || !tm || !strftime(s, n, "D:%Y%m%d%H%M%SZ", tm))
107 return NULL;
108 return s;
109 }
110
111 int64_t
112 pdf_parse_date(fz_context *ctx, const char *s)
113 {
114 int tz_sign, tz_hour, tz_min, tz_adj;
115 struct tm tm;
116 time_t utc;
117
118 if (!s[0])
119 return -1;
120
121 memset(&tm, 0, sizeof tm);
122 tm.tm_mday = 1;
123
124 tz_sign = 1;
125 tz_hour = 0;
126 tz_min = 0;
127
128 if (s[0] == 'D' && s[1] == ':')
129 s += 2;
130
131 if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2]) || !isdigit(s[3]))
132 {
133 fz_warn(ctx, "invalid date format (missing year)");
134 return -1;
135 }
136 tm.tm_year = (s[0]-'0')*1000 + (s[1]-'0')*100 + (s[2]-'0')*10 + (s[3]-'0') - 1900;
137 s += 4;
138
139 if (tm.tm_year < 70)
140 {
141 fz_warn(ctx, "invalid date (year out of range)");
142 return -1;
143 }
144
145 if (isdigit(s[0]) && isdigit(s[1]))
146 {
147 tm.tm_mon = (s[0]-'0')*10 + (s[1]-'0') - 1; /* month is 0-11 in struct tm */
148 s += 2;
149 if (isdigit(s[0]) && isdigit(s[1]))
150 {
151 tm.tm_mday = (s[0]-'0')*10 + (s[1]-'0');
152 s += 2;
153 if (isdigit(s[0]) && isdigit(s[1]))
154 {
155 tm.tm_hour = (s[0]-'0')*10 + (s[1]-'0');
156 s += 2;
157 if (isdigit(s[0]) && isdigit(s[1]))
158 {
159 tm.tm_min = (s[0]-'0')*10 + (s[1]-'0');
160 s += 2;
161 if (isdigit(s[0]) && isdigit(s[1]))
162 {
163 tm.tm_sec = (s[0]-'0')*10 + (s[1]-'0');
164 s += 2;
165 }
166 }
167 }
168 }
169 }
170
171 if (tm.tm_sec > 60 || tm.tm_min > 59 || tm.tm_hour > 23 || tm.tm_mday > 31 || tm.tm_mon > 11)
172 {
173 fz_warn(ctx, "invalid date (a field is out of range)");
174 return -1;
175 }
176
177 if (s[0] == 'Z')
178 {
179 if (s[1] == '0' && s[2] == '0')
180 {
181 s += 3;
182 if (s[0] == '\'' && s[1] == '0' && s[2] == '0')
183 {
184 s += 3;
185 if (s[0] == '\'')
186 s += 1;
187 }
188 }
189 else
190 {
191 s += 1;
192 }
193 }
194 else if ((s[0] == '-' || s[0] == '+') && isdigit(s[1]) && isdigit(s[2]))
195 {
196 tz_sign = (s[0] == '-') ? -1 : 1;
197 tz_hour = (s[1]-'0')*10 + (s[2]-'0');
198 s += 3;
199 if (s[0] == '\'' && isdigit(s[1]) && isdigit(s[2]))
200 {
201 tz_min = (s[1]-'0')*10 + (s[2]-'0');
202 s += 3;
203 if (s[0] == '\'')
204 s += 1;
205 }
206 }
207
208 /* PDF is based on ISO/IEC 8824 which limits time zones from -15 to +16. */
209 if (tz_sign < 0 && (tz_hour > 15 || (tz_hour == 15 && tz_min > 0)))
210 {
211 fz_warn(ctx, "invalid date format (time zone out of range)");
212 return -1;
213 }
214 if (tz_sign > 0 && (tz_hour > 16 || (tz_hour == 16 && tz_min > 0)))
215 {
216 fz_warn(ctx, "invalid date format (time zone out of range)");
217 return -1;
218 }
219
220 if (s[0] != 0)
221 fz_warn(ctx, "invalid date format (garbage at end)");
222
223 utc = timegm(&tm);
224 if (utc == (time_t)-1)
225 {
226 fz_warn(ctx, "date overflow error");
227 return -1;
228 }
229
230 tz_adj = tz_sign * (tz_hour * 3600 + tz_min * 60);
231 return utc - tz_adj;
232 }
233
234 int64_t
235 pdf_to_date(fz_context *ctx, pdf_obj *time)
236 {
237 return pdf_parse_date(ctx, pdf_to_str_buf(ctx, time));
238 }
239
240 static int
241 rune_from_utf16be(int *out, const unsigned char *s, const unsigned char *end)
242 {
243 if (s + 2 <= end)
244 {
245 int a = s[0] << 8 | s[1];
246 if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
247 {
248 int b = s[2] << 8 | s[3];
249 *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
250 return 4;
251 }
252 *out = a;
253 return 2;
254 }
255 *out = FZ_REPLACEMENT_CHARACTER;
256 return 1;
257 }
258
259 static int
260 rune_from_utf16le(int *out, const unsigned char *s, const unsigned char *end)
261 {
262 if (s + 2 <= end)
263 {
264 int a = s[1] << 8 | s[0];
265 if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
266 {
267 int b = s[3] << 8 | s[2];
268 *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
269 return 4;
270 }
271 *out = a;
272 return 2;
273 }
274 *out = FZ_REPLACEMENT_CHARACTER;
275 return 1;
276 }
277
/*
 * Detect a language escape sequence at byte offset 'i' of the little-endian
 * UTF-16 buffer 's' of length 'n'.
 *
 * An escape starts with the code unit U+001B (bytes 27, 0) and ends with
 * another U+001B either four bytes later (total 6 bytes) or six bytes later
 * (total 8 bytes). Returns the total length to skip, or 0 if no escape
 * sequence starts at 'i'.
 */
static size_t
skip_language_code_utf16le(const unsigned char *s, size_t n, size_t i)
{
	/* Both forms need at least six bytes and a leading ESC code unit. */
	if (i + 6 <= n && s[i] == 27 && s[i+1] == 0)
	{
		if (s[i+4] == 27 && s[i+5] == 0)
			return 6;
		if (i + 8 <= n && s[i+6] == 27 && s[i+7] == 0)
			return 8;
	}
	return 0;
}
288
/*
 * Detect a language escape sequence at byte offset 'i' of the big-endian
 * UTF-16 buffer 's' of length 'n'.
 *
 * An escape starts with the code unit U+001B (bytes 0, 27) and ends with
 * another U+001B either four bytes later (total 6 bytes) or six bytes later
 * (total 8 bytes). Returns the total length to skip, or 0 if no escape
 * sequence starts at 'i'.
 */
static size_t
skip_language_code_utf16be(const unsigned char *s, size_t n, size_t i)
{
	/* Both forms need at least six bytes and a leading ESC code unit. */
	if (i + 6 <= n && s[i] == 0 && s[i+1] == 27)
	{
		if (s[i+4] == 0 && s[i+5] == 27)
			return 6;
		if (i + 8 <= n && s[i+6] == 0 && s[i+7] == 27)
			return 8;
	}
	return 0;
}
299
/*
 * Detect a language escape sequence at byte offset 'i' of the UTF-8 buffer
 * 's' of length 'n'.
 *
 * In UTF-8 the escape character U+001B is the single byte 27, so an escape
 * sequence is either
 *   ESC + 2-byte language code + ESC                        (4 bytes), or
 *   ESC + 2-byte language code + 2-byte country code + ESC  (6 bytes).
 *
 * Returns the total number of bytes to skip (including the closing ESC), or
 * 0 if no escape sequence starts at 'i'. Every byte that is inspected is
 * bounds-checked against 'n' first.
 */
static size_t
skip_language_code_utf8(const unsigned char *s, size_t n, size_t i)
{
	if (i + 4 <= n && s[i] == 27 && s[i+3] == 27)
		return 4;
	else if (i + 6 <= n && s[i] == 27 && s[i+5] == 27)
		return 6;
	return 0;
}
310
/*
 * Structural UTF-8 check over the byte range [s, end).
 *
 * Each lead byte determines how many continuation bytes (10xxxxxx) must
 * follow: 0 for bytes below 0x80, 1 for C0..DF, 2 for E0..EF, 3 for F0..F4.
 * A stray continuation byte (80..BF) or a lead byte of F5 and above is
 * rejected, as is a sequence cut short by 'end'.
 *
 * Returns 1 if the whole range passes, 0 otherwise. An empty range passes.
 */
static int
is_valid_utf8(const unsigned char *s, const unsigned char *end)
{
	while (s < end)
	{
		unsigned char lead = *s++;
		int trail;

		if (lead < 0x80)
			trail = 0;
		else if (lead < 0xC0)
			return 0; /* continuation byte without a lead byte */
		else if (lead < 0xE0)
			trail = 1;
		else if (lead < 0xF0)
			trail = 2;
		else if (lead < 0xF5)
			trail = 3;
		else
			return 0;

		for (; trail > 0; --trail)
		{
			if (s >= end || (*s & 0xC0) != 0x80)
				return 0;
			++s;
		}
	}
	return 1;
}
325
326 char *
327 pdf_new_utf8_from_pdf_string(fz_context *ctx, const char *ssrcptr, size_t srclen)
328 {
329 const unsigned char *srcptr = (const unsigned char*)ssrcptr;
330 char *dstptr, *dst;
331 size_t dstlen = 0;
332 int ucs;
333 size_t i, n;
334
335 /* UTF-16BE */
336 if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
337 {
338 i = 2;
339 while (i + 2 <= srclen)
340 {
341 n = skip_language_code_utf16be(srcptr, srclen, i);
342 if (n)
343 i += n;
344 else
345 {
346 i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
347 dstlen += fz_runelen(ucs);
348 }
349 }
350
351 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16be");
352
353 i = 2;
354 while (i + 2 <= srclen)
355 {
356 n = skip_language_code_utf16be(srcptr, srclen, i);
357 if (n)
358 i += n;
359 else
360 {
361 i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
362 dstptr += fz_runetochar(dstptr, ucs);
363 }
364 }
365 }
366
367 /* UTF-16LE */
368 else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
369 {
370 i = 2;
371 while (i + 2 <= srclen)
372 {
373 n = skip_language_code_utf16le(srcptr, srclen, i);
374 if (n)
375 i += n;
376 else
377 {
378 i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
379 dstlen += fz_runelen(ucs);
380 }
381 }
382
383 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16le");
384
385 i = 2;
386 while (i + 2 <= srclen)
387 {
388 n = skip_language_code_utf16le(srcptr, srclen, i);
389 if (n)
390 i += n;
391 else
392 {
393 i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
394 dstptr += fz_runetochar(dstptr, ucs);
395 }
396 }
397 }
398
399 /* UTF-8 */
400 else if (srclen >= 3 && srcptr[0] == 239 && srcptr[1] == 187 && srcptr[2] == 191)
401 {
402 i = 3;
403 while (i < srclen)
404 {
405 n = skip_language_code_utf8(srcptr, srclen, i);
406 if (n)
407 i += n;
408 else
409 {
410 i += 1;
411 dstlen += 1;
412 }
413 }
414
415 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf8");
416
417 i = 3;
418 while (i < srclen)
419 {
420 n = skip_language_code_utf8(srcptr, srclen, i);
421 if (n)
422 i += n;
423 else
424 *dstptr++ = srcptr[i++];
425 }
426 }
427
428 /* Detect UTF-8 strings that aren't marked with a BOM */
429 else if (is_valid_utf8(srcptr, srcptr + srclen))
430 {
431 dst = Memento_label(fz_malloc(ctx, srclen + 1), "utf8_from_guess");
432 memcpy(dst, srcptr, srclen);
433 dstptr = dst + srclen;
434 }
435
436 /* PDFDocEncoding */
437 else
438 {
439 for (i = 0; i < srclen; i++)
440 dstlen += fz_runelen(fz_unicode_from_pdf_doc_encoding[srcptr[i]]);
441
442 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_pdfdocenc");
443
444 for (i = 0; i < srclen; i++)
445 {
446 ucs = fz_unicode_from_pdf_doc_encoding[srcptr[i]];
447 dstptr += fz_runetochar(dstptr, ucs);
448 }
449 }
450
451 *dstptr = 0;
452 return dst;
453 }
454
455 char *
456 pdf_new_utf8_from_pdf_string_obj(fz_context *ctx, pdf_obj *src)
457 {
458 const char *srcptr;
459 size_t srclen;
460 srcptr = pdf_to_string(ctx, src, &srclen);
461 return pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
462 }
463
464 char *
465 pdf_new_utf8_from_pdf_stream_obj(fz_context *ctx, pdf_obj *src)
466 {
467 fz_buffer *stmbuf;
468 char *srcptr;
469 size_t srclen;
470 char *dst = NULL;
471
472 stmbuf = pdf_load_stream(ctx, src);
473 srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr);
474 fz_try(ctx)
475 dst = pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
476 fz_always(ctx)
477 fz_drop_buffer(ctx, stmbuf);
478 fz_catch(ctx)
479 fz_rethrow(ctx);
480 return dst;
481 }
482
483 char *
484 pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src)
485 {
486 if (pdf_is_stream(ctx, src))
487 return pdf_new_utf8_from_pdf_stream_obj(ctx, src);
488 return pdf_new_utf8_from_pdf_string_obj(ctx, src);
489 }
490
491 static pdf_obj *
492 pdf_new_text_string_utf16be(fz_context *ctx, const char *s)
493 {
494 const char *ss;
495 int c, i, n, a, b;
496 unsigned char *p;
497 pdf_obj *obj;
498
499 ss = s;
500 n = 0;
501 while (*ss)
502 {
503 ss += fz_chartorune(&c, ss);
504 n += (c >= 0x10000) ? 2 : 1;
505 }
506
507 p = fz_malloc(ctx, n * 2 + 2);
508 i = 0;
509 p[i++] = 254;
510 p[i++] = 255;
511 while (*s)
512 {
513 s += fz_chartorune(&c, s);
514 if (c >= 0x10000)
515 {
516 a = (((c - 0x10000) >> 10) & 0x3ff) + 0xD800;
517 p[i++] = (a>>8) & 0xff;
518 p[i++] = (a) & 0xff;
519 b = (((c - 0x10000)) & 0x3ff) + 0xDC00;
520 p[i++] = (b>>8) & 0xff;
521 p[i++] = (b) & 0xff;
522 }
523 else
524 {
525 p[i++] = (c>>8) & 0xff;
526 p[i++] = (c) & 0xff;
527 }
528 }
529
530 fz_try(ctx)
531 obj = pdf_new_string(ctx, (char*)p, i);
532 fz_always(ctx)
533 fz_free(ctx, p);
534 fz_catch(ctx)
535 fz_rethrow(ctx);
536 return obj;
537 }
538
539 pdf_obj *
540 pdf_new_text_string(fz_context *ctx, const char *s)
541 {
542 int i = 0;
543 while (s[i] != 0)
544 {
545 if (((unsigned char)s[i]) >= 128)
546 return pdf_new_text_string_utf16be(ctx, s);
547 ++i;
548 }
549 return pdf_new_string(ctx, s, i);
550 }
551
552 pdf_obj *
553 pdf_parse_array(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
554 {
555 pdf_obj *ary = NULL;
556 pdf_obj *obj = NULL;
557 int64_t a = 0, b = 0, n = 0;
558 pdf_token tok;
559 pdf_obj *op = NULL;
560
561 fz_var(obj);
562
563 ary = pdf_new_array(ctx, doc, 4);
564
565 fz_try(ctx)
566 {
567 while (1)
568 {
569 tok = pdf_lex(ctx, file, buf);
570
571 if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
572 {
573 if (n > 0)
574 pdf_array_push_int(ctx, ary, a);
575 if (n > 1)
576 pdf_array_push_int(ctx, ary, b);
577 n = 0;
578 }
579
580 if (tok == PDF_TOK_INT && n == 2)
581 {
582 pdf_array_push_int(ctx, ary, a);
583 a = b;
584 n --;
585 }
586
587 switch (tok)
588 {
589 case PDF_TOK_EOF:
590 fz_throw(ctx, FZ_ERROR_SYNTAX, "array not closed before end of file");
591
592 case PDF_TOK_CLOSE_ARRAY:
593 op = ary;
594 goto end;
595
596 case PDF_TOK_INT:
597 if (n == 0)
598 a = buf->i;
599 if (n == 1)
600 b = buf->i;
601 n ++;
602 break;
603
604 case PDF_TOK_R:
605 if (n != 2)
606 fz_throw(ctx, FZ_ERROR_SYNTAX, "cannot parse indirect reference in array");
607 pdf_array_push_drop(ctx, ary, pdf_new_indirect(ctx, doc, a, b));
608 n = 0;
609 break;
610
611 case PDF_TOK_OPEN_ARRAY:
612 obj = pdf_parse_array(ctx, doc, file, buf);
613 pdf_array_push_drop(ctx, ary, obj);
614 break;
615
616 case PDF_TOK_OPEN_DICT:
617 obj = pdf_parse_dict(ctx, doc, file, buf);
618 pdf_array_push_drop(ctx, ary, obj);
619 break;
620
621 case PDF_TOK_NAME:
622 pdf_array_push_name(ctx, ary, buf->scratch);
623 break;
624 case PDF_TOK_REAL:
625 pdf_array_push_real(ctx, ary, buf->f);
626 break;
627 case PDF_TOK_STRING:
628 pdf_array_push_string(ctx, ary, buf->scratch, buf->len);
629 break;
630 case PDF_TOK_TRUE:
631 pdf_array_push_bool(ctx, ary, 1);
632 break;
633 case PDF_TOK_FALSE:
634 pdf_array_push_bool(ctx, ary, 0);
635 break;
636 case PDF_TOK_NULL:
637 pdf_array_push(ctx, ary, PDF_NULL);
638 break;
639
640 default:
641 pdf_array_push(ctx, ary, PDF_NULL);
642 break;
643 }
644 }
645 end:
646 {}
647 }
648 fz_catch(ctx)
649 {
650 pdf_drop_obj(ctx, ary);
651 fz_rethrow(ctx);
652 }
653 return op;
654 }
655
656 pdf_obj *
657 pdf_parse_dict(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
658 {
659 pdf_obj *dict;
660 pdf_obj *key = NULL;
661 pdf_obj *val = NULL;
662 pdf_token tok;
663 int64_t a, b;
664
665 dict = pdf_new_dict(ctx, doc, 8);
666
667 fz_var(key);
668 fz_var(val);
669
670 fz_try(ctx)
671 {
672 while (1)
673 {
674 tok = pdf_lex(ctx, file, buf);
675 skip:
676 if (tok == PDF_TOK_CLOSE_DICT)
677 break;
678
679 /* for BI .. ID .. EI in content streams */
680 if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
681 break;
682
683 if (tok != PDF_TOK_NAME)
684 fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid key in dict");
685
686 key = pdf_new_name(ctx, buf->scratch);
687
688 tok = pdf_lex(ctx, file, buf);
689
690 switch (tok)
691 {
692 case PDF_TOK_OPEN_ARRAY:
693 val = pdf_parse_array(ctx, doc, file, buf);
694 break;
695
696 case PDF_TOK_OPEN_DICT:
697 val = pdf_parse_dict(ctx, doc, file, buf);
698 break;
699
700 case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break;
701 case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break;
702 case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break;
703 case PDF_TOK_TRUE: val = PDF_TRUE; break;
704 case PDF_TOK_FALSE: val = PDF_FALSE; break;
705 case PDF_TOK_NULL: val = PDF_NULL; break;
706
707 case PDF_TOK_INT:
708 /* 64-bit to allow for numbers > INT_MAX and overflow */
709 a = buf->i;
710 tok = pdf_lex(ctx, file, buf);
711 if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
712 (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
713 {
714 pdf_dict_put_int(ctx, dict, key, a);
715 pdf_drop_obj(ctx, key);
716 key = NULL;
717 goto skip;
718 }
719 if (tok == PDF_TOK_INT)
720 {
721 b = buf->i;
722 tok = pdf_lex(ctx, file, buf);
723 if (tok == PDF_TOK_R)
724 {
725 val = pdf_new_indirect(ctx, doc, a, b);
726 break;
727 }
728 }
729 fz_warn(ctx, "invalid indirect reference in dict");
730 val = PDF_NULL;
731 break;
732
733 default:
734 val = PDF_NULL;
735 break;
736 }
737
738 pdf_dict_put(ctx, dict, key, val);
739 pdf_drop_obj(ctx, val);
740 val = NULL;
741 pdf_drop_obj(ctx, key);
742 key = NULL;
743 }
744 }
745 fz_catch(ctx)
746 {
747 pdf_drop_obj(ctx, dict);
748 pdf_drop_obj(ctx, key);
749 pdf_drop_obj(ctx, val);
750 fz_rethrow(ctx);
751 }
752 return dict;
753 }
754
755 pdf_obj *
756 pdf_parse_stm_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
757 {
758 pdf_token tok;
759
760 tok = pdf_lex(ctx, file, buf);
761
762 switch (tok)
763 {
764 case PDF_TOK_OPEN_ARRAY:
765 return pdf_parse_array(ctx, doc, file, buf);
766 case PDF_TOK_OPEN_DICT:
767 return pdf_parse_dict(ctx, doc, file, buf);
768 case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch);
769 case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f);
770 case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len);
771 case PDF_TOK_TRUE: return PDF_TRUE;
772 case PDF_TOK_FALSE: return PDF_FALSE;
773 case PDF_TOK_NULL: return PDF_NULL;
774 case PDF_TOK_INT: return pdf_new_int(ctx, buf->i);
775 default: fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown token in object stream");
776 }
777 }
778
779 pdf_obj *
780 pdf_parse_ind_obj_or_newobj(fz_context *ctx, pdf_document *doc, fz_stream *file,
781 int *onum, int *ogen, int64_t *ostmofs, int *try_repair, int *newobj)
782 {
783 pdf_obj *obj = NULL;
784 int num = 0, gen = 0;
785 int64_t stm_ofs;
786 pdf_token tok;
787 pdf_lexbuf *buf = &doc->lexbuf.base;
788 int64_t a, b;
789 int read_next_token = 1;
790
791 fz_var(obj);
792
793 tok = pdf_lex(ctx, file, buf);
794 if (tok != PDF_TOK_INT)
795 {
796 if (try_repair)
797 *try_repair = 1;
798 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected object number");
799 }
800 num = buf->i;
801 if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
802 fz_throw(ctx, FZ_ERROR_SYNTAX, "object number out of range");
803
804 tok = pdf_lex(ctx, file, buf);
805 if (tok != PDF_TOK_INT)
806 {
807 if (try_repair)
808 *try_repair = 1;
809 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected generation number (%d ? obj)", num);
810 }
811 gen = buf->i;
812 if (gen < 0 || gen >= 65536)
813 {
814 if (try_repair)
815 *try_repair = 1;
816 fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid generation number (%d)", gen);
817 }
818
819 tok = pdf_lex(ctx, file, buf);
820 if (tok == PDF_TOK_NEWOBJ && newobj)
821 {
822 *newobj = 1;
823 if (onum) *onum = num;
824 if (ogen) *ogen = gen;
825 if (ostmofs) *ostmofs = 0;
826 return NULL;
827 }
828 if (tok != PDF_TOK_OBJ)
829 {
830 if (try_repair)
831 *try_repair = 1;
832 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'obj' keyword (%d %d ?)", num, gen);
833 }
834
835 tok = pdf_lex(ctx, file, buf);
836
837 switch (tok)
838 {
839 case PDF_TOK_OPEN_ARRAY:
840 obj = pdf_parse_array(ctx, doc, file, buf);
841 break;
842
843 case PDF_TOK_OPEN_DICT:
844 obj = pdf_parse_dict(ctx, doc, file, buf);
845 break;
846
847 case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break;
848 case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break;
849 case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break;
850 case PDF_TOK_TRUE: obj = PDF_TRUE; break;
851 case PDF_TOK_FALSE: obj = PDF_FALSE; break;
852 case PDF_TOK_NULL: obj = PDF_NULL; break;
853
854 case PDF_TOK_INT:
855 a = buf->i;
856 tok = pdf_lex(ctx, file, buf);
857
858 if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
859 {
860 obj = pdf_new_int(ctx, a);
861 read_next_token = 0;
862 break;
863 }
864 else if (tok == PDF_TOK_INT)
865 {
866 b = buf->i;
867 tok = pdf_lex(ctx, file, buf);
868 if (tok == PDF_TOK_R)
869 {
870 obj = pdf_new_indirect(ctx, doc, a, b);
871 break;
872 }
873 }
874 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'R' keyword (%d %d R)", num, gen);
875
876 case PDF_TOK_ENDOBJ:
877 obj = PDF_NULL;
878 read_next_token = 0;
879 break;
880
881 default:
882 fz_throw(ctx, FZ_ERROR_SYNTAX, "syntax error in object (%d %d R)", num, gen);
883 }
884
885 fz_try(ctx)
886 {
887 if (read_next_token)
888 tok = pdf_lex(ctx, file, buf);
889
890 if (tok == PDF_TOK_STREAM)
891 {
892 int c = fz_read_byte(ctx, file);
893 while (c == ' ')
894 c = fz_read_byte(ctx, file);
895 if (c == '\r')
896 {
897 c = fz_peek_byte(ctx, file);
898 if (c != '\n')
899 fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
900 else
901 fz_read_byte(ctx, file);
902 }
903 stm_ofs = fz_tell(ctx, file);
904 }
905 else if (tok == PDF_TOK_ENDOBJ)
906 {
907 stm_ofs = 0;
908 }
909 else
910 {
911 fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
912 stm_ofs = 0;
913 }
914 }
915 fz_catch(ctx)
916 {
917 pdf_drop_obj(ctx, obj);
918 fz_rethrow(ctx);
919 }
920
921 if (onum) *onum = num;
922 if (ogen) *ogen = gen;
923 if (ostmofs) *ostmofs = stm_ofs;
924
925 return obj;
926 }
927
928 pdf_obj *
929 pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc, fz_stream *file,
930 int *onum, int *ogen, int64_t *ostmofs, int *try_repair)
931 {
932 return pdf_parse_ind_obj_or_newobj(ctx, doc, file, onum, ogen, ostmofs, try_repair, NULL);
933 }
934
935 pdf_obj *
936 pdf_parse_journal_obj(fz_context *ctx, pdf_document *doc, fz_stream *stm,
937 int *onum, fz_buffer **ostm, int *newobj)
938 {
939 pdf_obj *obj = NULL;
940 pdf_token tok;
941 pdf_lexbuf *buf = &doc->lexbuf.base;
942 int64_t stmofs;
943
944 *newobj = 0;
945 obj = pdf_parse_ind_obj_or_newobj(ctx, doc, stm, onum, NULL, &stmofs, NULL, newobj);
946 /* This will have consumed either the stream or the endobj keywords. */
947
948 *ostm = NULL;
949 if (stmofs)
950 {
951 fz_stream *stream = NULL;
952
953 fz_var(stream);
954
955 fz_try(ctx)
956 {
957 stream = fz_open_endstream_filter(ctx, stm, 0, stmofs);
958 *ostm = fz_read_all(ctx, stream, 32);
959 fz_drop_stream(ctx, stream);
960 stream = NULL;
961 fz_seek(ctx, stm, stmofs + (*ostm ? (*ostm)->len : 0), SEEK_SET);
962 tok = pdf_lex(ctx, stm, buf);
963 if (tok != PDF_TOK_ENDSTREAM)
964 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'endstream' keyword");
965 tok = pdf_lex(ctx, stm, buf);
966 if (tok != PDF_TOK_ENDOBJ)
967 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'endobj' keyword");
968 }
969 fz_always(ctx)
970 fz_drop_stream(ctx, stream);
971 fz_catch(ctx)
972 {
973 pdf_drop_obj(ctx, obj);
974 fz_rethrow(ctx);
975 }
976 }
977
978 return obj;
979 }