comparison mupdf-source/source/pdf/pdf-lex.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "mupdf/pdf.h"
25
26 #include <string.h>
27
/*
 * Character-class lists for use directly after the `case` keyword, e.g.
 *
 *	case IS_WHITE:
 *
 * Every macro expands to a run of case labels with the leading `case` and
 * the trailing colon left out, so the line above becomes one label per
 * member of the class.
 */

/* Contiguous digit/letter ranges. */
#define RANGE_0_7 \
	'0': case '1': case '2': case '3': case '4': case '5': case '6': case '7'
#define RANGE_0_9 \
	RANGE_0_7: case '8': case '9'
#define RANGE_a_f \
	'a': case 'b': case 'c': case 'd': case 'e': case 'f'
#define RANGE_A_F \
	'A': case 'B': case 'C': case 'D': case 'E': case 'F'

/* Bytes that may start a number: sign, decimal point, or decimal digit. */
#define IS_NUMBER \
	'+': case '-': case '.': case RANGE_0_9

/* Whitespace bytes: NUL, HT, LF, FF, CR and space. */
#define IS_WHITE \
	'\x00': case '\x09': case '\x0a': case '\x0c': case '\x0d': case '\x20'

/* Hexadecimal digits in either letter case. */
#define IS_HEX \
	RANGE_0_9: case RANGE_A_F: case RANGE_a_f

/* Delimiter bytes that end a name, number or keyword. */
#define IS_DELIM \
	'(': case ')': case '<': case '>': case '[': case ']': case '{': \
	case '}': case '/': case '%'
51
/* #define DUMP_LEXER_STREAM */
#ifndef DUMP_LEXER_STREAM
/* Normal builds: lex_byte is a plain alias for fz_read_byte. */
#define lex_byte(C,S) fz_read_byte(C,S)
#else
/*
 * Debug builds: read one byte from stm and echo it to the fz_stdout output
 * before returning it. Bytes in the range 32..127 are echoed verbatim, EOF
 * is shown as "<EOF>", and every other value as two hex digits in angle
 * brackets.
 */
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
	int ch = fz_read_byte(ctx, stm);

	if (ch >= 32 && ch < 128)
		fz_write_printf(ctx, fz_stdout(ctx), "%c", ch);
	else if (ch == EOF)
		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
	else
		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", ch);

	return ch;
}
#endif
69
/*
 * Return 1 if ch is one of the six whitespace bytes recognised by the lexer
 * (NUL, HT, LF, FF, CR, space), otherwise 0.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case 0x00: /* NUL */
	case 0x09: /* HT */
	case 0x0a: /* LF */
	case 0x0c: /* FF */
	case 0x0d: /* CR */
	case 0x20: /* space */
		return 1;
	default:
		return 0;
	}
}
80
/* Return 1 if ch lies in the range ' ' (space) to '~' inclusive, else 0. */
static inline int fz_isprint(int ch)
{
	if (ch < ' ')
		return 0;
	return ch <= '~';
}
85
/*
 * Convert one hexadecimal digit (either letter case) to its value 0..15.
 * Anything that is not a hex digit yields 0.
 */
static inline int unhex(int ch)
{
	int value = 0;

	if (ch >= 'a' && ch <= 'f')
		value = 10 + (ch - 'a');
	else if (ch >= 'A' && ch <= 'F')
		value = 10 + (ch - 'A');
	else if (ch >= '0' && ch <= '9')
		value = ch - '0';

	return value;
}
93
94 static void
95 lex_white(fz_context *ctx, fz_stream *f)
96 {
97 int c;
98 do {
99 c = lex_byte(ctx, f);
100 } while ((c <= 32) && (iswhite(c)));
101 if (c != EOF)
102 fz_unread_byte(ctx, f);
103 }
104
105 static void
106 lex_comment(fz_context *ctx, fz_stream *f)
107 {
108 int c;
109 do {
110 c = lex_byte(ctx, f);
111 } while ((c != '\012') && (c != '\015') && (c != EOF));
112 }
113
/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Accepts any number of leading '-' signs (one or more makes the result
 * negative), then any number of '+' signs, then decimal digits, optionally
 * followed by '.' and more decimal digits. Parsing stops at the first byte
 * that does not fit this pattern.
 *
 * The integer part deliberately wraps on overflow instead of saturating.
 * The wrap is performed in unsigned arithmetic so that it is well defined;
 * overflowing a signed int would be undefined behaviour.
 */
static float acrobat_compatible_atof(char *s)
{
	int neg = 0;
	unsigned int acc = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here.
		 * Tests show that Acrobat handles overflows in exactly the same way we do:
		 * 123450000000000000000678 is read as 678.
		 */
		acc = acc * 10u + (unsigned int)(*s - '0');
		++s;
	}

	if (*s == '.')
	{
		/* Reinterpret the wrapped value as a signed int, as before. */
		float v = (float)(int)acc;
		float n = 0;
		float d = 1;
		++s;
		while (*s >= '0' && *s <= '9')
		{
			n = 10 * n + (*s - '0');
			d = 10 * d;
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}

	/* Negate while still unsigned so that INT_MIN cannot overflow. */
	if (neg)
		acc = 0u - acc;
	return (float)(int)acc;
}
160
/*
 * Fast but inaccurate atoi.
 *
 * Accepts any number of leading '-' signs (one or more makes the result
 * negative), then any number of '+' signs, then decimal digits. Parsing
 * stops at the first non-digit.
 *
 * Overflow is deliberately not detected: the value wraps modulo 2^64. The
 * accumulation and negation are done in uint64_t so that the wrap is well
 * defined; overflowing an int64_t would be undefined behaviour, and would
 * otherwise be hit even by the valid input "-9223372036854775808".
 */
static int64_t fast_atoi(char *s)
{
	int neg = 0;
	uint64_t acc = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. */
		acc = acc * 10u + (uint64_t)(*s - '0');
		++s;
	}

	/* Negate while still unsigned so that INT64_MIN cannot overflow. */
	if (neg)
		acc = (uint64_t)0 - acc;
	return (int64_t)acc;
}
186
187 static int
188 lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
189 {
190 char *s = buf->scratch;
191 char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
192 char *isreal = (c == '.' ? s : NULL);
193 int neg = (c == '-');
194 int isbad = 0;
195
196 *s++ = c;
197
198 c = lex_byte(ctx, f);
199
200 /* skip extra '-' signs at start of number */
201 if (neg)
202 {
203 while (c == '-')
204 c = lex_byte(ctx, f);
205 }
206
207 while (s < e)
208 {
209 switch (c)
210 {
211 case IS_WHITE:
212 case IS_DELIM:
213 fz_unread_byte(ctx, f);
214 goto end;
215 case EOF:
216 goto end;
217 case '.':
218 if (isreal)
219 isbad = 1;
220 isreal = s;
221 *s++ = c;
222 break;
223 case '-':
224 /* Bug 703248: Some PDFs (particularly those
225 * generated by google docs) apparently have
226 * numbers like 0.000000000000-5684342 in them.
227 * We'll stop our interpretation at the -, but
228 * keep reading to skip over the trailing
229 * digits so they aren't parsed later. */
230 *s++ = '\0';
231 break;
232 case RANGE_0_9:
233 *s++ = c;
234 break;
235 default:
236 isbad = 1;
237 *s++ = c;
238 break;
239 }
240 c = lex_byte(ctx, f);
241 }
242
243 end:
244 *s = '\0';
245 if (isbad)
246 return PDF_TOK_KEYWORD;
247 if (isreal)
248 {
249 /* We'd like to use the fastest possible atof
250 * routine, but we'd rather match acrobats
251 * handling of broken numbers. As such, we
252 * spot common broken cases and call an
253 * acrobat compatible routine where required. */
254 if (neg > 1 || isreal - buf->scratch >= 10)
255 buf->f = acrobat_compatible_atof(buf->scratch);
256 else
257 buf->f = fz_atof(buf->scratch);
258 return PDF_TOK_REAL;
259 }
260 else
261 {
262 buf->i = fast_atoi(buf->scratch);
263 return PDF_TOK_INT;
264 }
265 }
266
267 static void
268 lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
269 {
270 char *s = lb->scratch;
271 char *e = s + fz_minz(127, lb->size);
272 int c;
273
274 while (1)
275 {
276 if (s == e)
277 {
278 if (e - lb->scratch < 127)
279 {
280 s += pdf_lexbuf_grow(ctx, lb);
281 e = lb->scratch + fz_minz(127, lb->size);
282 }
283 else
284 {
285 /* truncate names that are too long */
286 fz_warn(ctx, "name is too long");
287 *s = 0;
288 lb->len = s - lb->scratch;
289 s = NULL;
290 }
291 }
292 c = lex_byte(ctx, f);
293 switch (c)
294 {
295 case IS_WHITE:
296 case IS_DELIM:
297 fz_unread_byte(ctx, f);
298 goto end;
299 case EOF:
300 goto end;
301 case '#':
302 {
303 int hex[2];
304 int i;
305 for (i = 0; i < 2; i++)
306 {
307 c = fz_peek_byte(ctx, f);
308 switch (c)
309 {
310 case RANGE_0_9:
311 if (i == 1 && c == '0' && hex[0] == 0)
312 goto illegal;
313 hex[i] = lex_byte(ctx, f) - '0';
314 break;
315 case RANGE_a_f:
316 hex[i] = lex_byte(ctx, f) - 'a' + 10;
317 break;
318 case RANGE_A_F:
319 hex[i] = lex_byte(ctx, f) - 'A' + 10;
320 break;
321 default:
322 goto illegal;
323 case EOF:
324 goto illegal_eof;
325 }
326 }
327 if (s) *s++ = (hex[0] << 4) + hex[1];
328 break;
329 illegal:
330 if (i == 1)
331 fz_unread_byte(ctx, f);
332 illegal_eof:
333 if (s) *s++ = '#';
334 continue;
335 }
336 default:
337 if (s) *s++ = c;
338 break;
339 }
340 }
341 end:
342 if (s)
343 {
344 *s = '\0';
345 lb->len = s - lb->scratch;
346 }
347 }
348
349 static int
350 lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
351 {
352 char *s = lb->scratch;
353 char *e = s + lb->size;
354 int bal = 1;
355 int oct;
356 int c;
357
358 while (1)
359 {
360 if (s == e)
361 {
362 s += pdf_lexbuf_grow(ctx, lb);
363 e = lb->scratch + lb->size;
364 }
365 c = lex_byte(ctx, f);
366 switch (c)
367 {
368 case EOF:
369 return PDF_TOK_ERROR;
370 case '(':
371 bal++;
372 *s++ = c;
373 break;
374 case ')':
375 bal --;
376 if (bal == 0)
377 goto end;
378 *s++ = c;
379 break;
380 case '\\':
381 c = lex_byte(ctx, f);
382 switch (c)
383 {
384 case EOF:
385 return PDF_TOK_ERROR;
386 case 'n':
387 *s++ = '\n';
388 break;
389 case 'r':
390 *s++ = '\r';
391 break;
392 case 't':
393 *s++ = '\t';
394 break;
395 case 'b':
396 *s++ = '\b';
397 break;
398 case 'f':
399 *s++ = '\f';
400 break;
401 case '(':
402 *s++ = '(';
403 break;
404 case ')':
405 *s++ = ')';
406 break;
407 case '\\':
408 *s++ = '\\';
409 break;
410 case RANGE_0_7:
411 oct = c - '0';
412 c = lex_byte(ctx, f);
413 if (c >= '0' && c <= '7')
414 {
415 oct = oct * 8 + (c - '0');
416 c = lex_byte(ctx, f);
417 if (c >= '0' && c <= '7')
418 oct = oct * 8 + (c - '0');
419 else if (c != EOF)
420 fz_unread_byte(ctx, f);
421 }
422 else if (c != EOF)
423 fz_unread_byte(ctx, f);
424 *s++ = oct;
425 break;
426 case '\n':
427 break;
428 case '\r':
429 c = lex_byte(ctx, f);
430 if ((c != '\n') && (c != EOF))
431 fz_unread_byte(ctx, f);
432 break;
433 default:
434 *s++ = c;
435 }
436 break;
437 /* Bug 708256: PDF 32000-1 says that any occurence of \n, \r, or \r\n in a
438 * (unless escaped with a '\') should be interpreted as a single 0x0a byte. */
439 case '\n':
440 *s++ = 0x0a;
441 break;
442 case '\r':
443 *s++ = 0x0a;
444 c = lex_byte(ctx, f);
445 if ((c != '\n') && (c != EOF))
446 fz_unread_byte(ctx, f);
447 break;
448 default:
449 *s++ = c;
450 break;
451 }
452 }
453 end:
454 lb->len = s - lb->scratch;
455 return PDF_TOK_STRING;
456 }
457
458 static int
459 lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
460 {
461 char *s = lb->scratch;
462 char *e = s + lb->size;
463 int a = 0, x = 0;
464 int c;
465
466 while (1)
467 {
468 if (s == e)
469 {
470 s += pdf_lexbuf_grow(ctx, lb);
471 e = lb->scratch + lb->size;
472 }
473 c = lex_byte(ctx, f);
474 switch (c)
475 {
476 case IS_WHITE:
477 break;
478 default:
479 fz_warn(ctx, "invalid character in hex string");
480 /* fall through */
481 case IS_HEX:
482 if (x)
483 {
484 *s++ = a * 16 + unhex(c);
485 x = !x;
486 }
487 else
488 {
489 a = unhex(c);
490 x = !x;
491 }
492 break;
493 case '>':
494 if (x)
495 {
496 *s++ = a * 16; /* pad truncated string with '0' */
497 }
498 goto end;
499 case EOF:
500 return PDF_TOK_ERROR;
501 }
502 }
503 end:
504 lb->len = s - lb->scratch;
505 return PDF_TOK_STRING;
506 }
507
508 static pdf_token
509 pdf_token_from_keyword(char *key)
510 {
511 switch (*key)
512 {
513 case 'R':
514 if (!strcmp(key, "R")) return PDF_TOK_R;
515 break;
516 case 't':
517 if (!strcmp(key, "true")) return PDF_TOK_TRUE;
518 if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
519 break;
520 case 'f':
521 if (!strcmp(key, "false")) return PDF_TOK_FALSE;
522 break;
523 case 'n':
524 if (!strcmp(key, "null")) return PDF_TOK_NULL;
525 if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ;
526 break;
527 case 'o':
528 if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
529 break;
530 case 'e':
531 if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
532 if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
533 break;
534 case 's':
535 if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
536 if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
537 break;
538 case 'x':
539 if (!strcmp(key, "xref")) return PDF_TOK_XREF;
540 break;
541 }
542
543 while (*key)
544 {
545 if (!fz_isprint(*key))
546 return PDF_TOK_ERROR;
547 ++key;
548 }
549
550 return PDF_TOK_KEYWORD;
551 }
552
553 void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
554 {
555 lb->size = lb->base_size = size;
556 lb->len = 0;
557 lb->scratch = &lb->buffer[0];
558 }
559
560 void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
561 {
562 if (lb && lb->size != lb->base_size)
563 fz_free(ctx, lb->scratch);
564 }
565
566 ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
567 {
568 char *old = lb->scratch;
569 size_t newsize = lb->size * 2;
570 if (lb->size == lb->base_size)
571 {
572 lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf");
573 memcpy(lb->scratch, lb->buffer, lb->size);
574 }
575 else
576 {
577 lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
578 }
579 lb->size = newsize;
580 return lb->scratch - old;
581 }
582
583 pdf_token
584 pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
585 {
586 while (1)
587 {
588 int c = lex_byte(ctx, f);
589 switch (c)
590 {
591 case EOF:
592 return PDF_TOK_EOF;
593 case IS_WHITE:
594 lex_white(ctx, f);
595 break;
596 case '%':
597 lex_comment(ctx, f);
598 break;
599 case '/':
600 lex_name(ctx, f, buf);
601 return PDF_TOK_NAME;
602 case '(':
603 return lex_string(ctx, f, buf);
604 case ')':
605 return PDF_TOK_ERROR;
606 case '<':
607 c = lex_byte(ctx, f);
608 if (c == '<')
609 return PDF_TOK_OPEN_DICT;
610 if (c != EOF)
611 fz_unread_byte(ctx, f);
612 return lex_hex_string(ctx, f, buf);
613 case '>':
614 c = lex_byte(ctx, f);
615 if (c == '>')
616 return PDF_TOK_CLOSE_DICT;
617 if (c != EOF)
618 fz_unread_byte(ctx, f);
619 return PDF_TOK_ERROR;
620 case '[':
621 return PDF_TOK_OPEN_ARRAY;
622 case ']':
623 return PDF_TOK_CLOSE_ARRAY;
624 case '{':
625 return PDF_TOK_OPEN_BRACE;
626 case '}':
627 return PDF_TOK_CLOSE_BRACE;
628 case IS_NUMBER:
629 return lex_number(ctx, f, buf, c);
630 default: /* isregular: !isdelim && !iswhite && c != EOF */
631 fz_unread_byte(ctx, f);
632 lex_name(ctx, f, buf);
633 return pdf_token_from_keyword(buf->scratch);
634 }
635 }
636 }
637
638 pdf_token
639 pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
640 {
641 while (1)
642 {
643 int c = lex_byte(ctx, f);
644 switch (c)
645 {
646 case EOF:
647 return PDF_TOK_EOF;
648 case IS_WHITE:
649 lex_white(ctx, f);
650 break;
651 case '%':
652 lex_comment(ctx, f);
653 break;
654 case '/':
655 lex_name(ctx, f, buf);
656 return PDF_TOK_NAME;
657 case '(':
658 return PDF_TOK_ERROR; /* no strings allowed */
659 case ')':
660 return PDF_TOK_ERROR; /* no strings allowed */
661 case '<':
662 c = lex_byte(ctx, f);
663 if (c == '<')
664 return PDF_TOK_OPEN_DICT;
665 if (c != EOF)
666 fz_unread_byte(ctx, f);
667 return PDF_TOK_ERROR; /* no strings allowed */
668 case '>':
669 c = lex_byte(ctx, f);
670 if (c == '>')
671 return PDF_TOK_CLOSE_DICT;
672 if (c != EOF)
673 fz_unread_byte(ctx, f);
674 return PDF_TOK_ERROR;
675 case '[':
676 return PDF_TOK_OPEN_ARRAY;
677 case ']':
678 return PDF_TOK_CLOSE_ARRAY;
679 case '{':
680 return PDF_TOK_OPEN_BRACE;
681 case '}':
682 return PDF_TOK_CLOSE_BRACE;
683 case IS_NUMBER:
684 return lex_number(ctx, f, buf, c);
685 default: /* isregular: !isdelim && !iswhite && c != EOF */
686 fz_unread_byte(ctx, f);
687 lex_name(ctx, f, buf);
688 return pdf_token_from_keyword(buf->scratch);
689 }
690 }
691 }
692
693 void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
694 {
695 switch (tok)
696 {
697 case PDF_TOK_NAME:
698 fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
699 break;
700 case PDF_TOK_STRING:
701 if (buf->len >= buf->size)
702 pdf_lexbuf_grow(ctx, buf);
703 buf->scratch[buf->len] = 0;
704 fz_append_pdf_string(ctx, fzbuf, buf->scratch);
705 break;
706 case PDF_TOK_OPEN_DICT:
707 fz_append_string(ctx, fzbuf, "<<");
708 break;
709 case PDF_TOK_CLOSE_DICT:
710 fz_append_string(ctx, fzbuf, ">>");
711 break;
712 case PDF_TOK_OPEN_ARRAY:
713 fz_append_byte(ctx, fzbuf, '[');
714 break;
715 case PDF_TOK_CLOSE_ARRAY:
716 fz_append_byte(ctx, fzbuf, ']');
717 break;
718 case PDF_TOK_OPEN_BRACE:
719 fz_append_byte(ctx, fzbuf, '{');
720 break;
721 case PDF_TOK_CLOSE_BRACE:
722 fz_append_byte(ctx, fzbuf, '}');
723 break;
724 case PDF_TOK_INT:
725 fz_append_printf(ctx, fzbuf, "%ld", buf->i);
726 break;
727 case PDF_TOK_REAL:
728 fz_append_printf(ctx, fzbuf, "%g", buf->f);
729 break;
730 default:
731 fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
732 break;
733 }
734 }