Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/pdf/pdf-lex.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/pdf/pdf-lex.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,734 @@ +// Copyright (C) 2004-2024 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. 
+ +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" + +#include <string.h> + +#define IS_NUMBER \ + '+':case'-':case'.':case'0':case'1':case'2':case'3':\ + case'4':case'5':case'6':case'7':case'8':case'9' +#define IS_WHITE \ + '\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20' +#define IS_HEX \ + '0':case'1':case'2':case'3':case'4':case'5':case'6':\ + case'7':case'8':case'9':case'A':case'B':case'C':\ + case'D':case'E':case'F':case'a':case'b':case'c':\ + case'd':case'e':case'f' +#define IS_DELIM \ + '(':case')':case'<':case'>':case'[':case']':case'{':\ + case'}':case'/':case'%' + +#define RANGE_0_9 \ + '0':case'1':case'2':case'3':case'4':case'5':\ + case'6':case'7':case'8':case'9' +#define RANGE_a_f \ + 'a':case'b':case'c':case'd':case'e':case'f' +#define RANGE_A_F \ + 'A':case'B':case'C':case'D':case'E':case'F' +#define RANGE_0_7 \ + '0':case'1':case'2':case'3':case'4':case'5':case'6':case'7' + +/* #define DUMP_LEXER_STREAM */ +#ifdef DUMP_LEXER_STREAM +static inline int lex_byte(fz_context *ctx, fz_stream *stm) +{ + int c = fz_read_byte(ctx, stm); + + if (c == EOF) + fz_write_printf(ctx, fz_stdout(ctx), "<EOF>"); + else if (c >= 32 && c < 128) + fz_write_printf(ctx, fz_stdout(ctx), "%c", c); + else + fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c); + return c; +} +#else +#define lex_byte(C,S) fz_read_byte(C,S) +#endif + +static inline int iswhite(int ch) +{ + return + ch == '\000' || + ch == '\011' || + ch == '\012' || + ch == '\014' || + ch == '\015' || + ch == '\040'; +} + +static inline int fz_isprint(int ch) +{ + return ch >= ' ' && ch <= '~'; +} + +static inline int unhex(int ch) +{ + if (ch >= '0' && ch <= '9') return ch - '0'; + if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA; + if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA; + return 0; +} + +static void +lex_white(fz_context *ctx, fz_stream *f) +{ + int c; + do { + c = lex_byte(ctx, f); + } while ((c <= 32) && (iswhite(c))); + if (c != EOF) + fz_unread_byte(ctx, f); +} + +static void 
+lex_comment(fz_context *ctx, fz_stream *f) +{ + int c; + do { + c = lex_byte(ctx, f); + } while ((c != '\012') && (c != '\015') && (c != EOF)); +} + +/* Fast(ish) but inaccurate strtof, with Adobe overflow handling. */ +static float acrobat_compatible_atof(char *s) +{ + int neg = 0; + int i = 0; + + while (*s == '-') + { + neg = 1; + ++s; + } + while (*s == '+') + { + ++s; + } + + while (*s >= '0' && *s <= '9') + { + /* We deliberately ignore overflow here. + * Tests show that Acrobat handles * overflows in exactly the same way we do: + * 123450000000000000000678 is read as 678. + */ + i = i * 10 + (*s - '0'); + ++s; + } + + if (*s == '.') + { + float v = i; + float n = 0; + float d = 1; + ++s; + while (*s >= '0' && *s <= '9') + { + n = 10 * n + (*s - '0'); + d = 10 * d; + ++s; + } + v += n / d; + return neg ? -v : v; + } + else + { + return neg ? -i : i; + } +} + +/* Fast but inaccurate atoi. */ +static int64_t fast_atoi(char *s) +{ + int neg = 0; + int64_t i = 0; + + while (*s == '-') + { + neg = 1; + ++s; + } + while (*s == '+') + { + ++s; + } + + while (*s >= '0' && *s <= '9') + { + /* We deliberately ignore overflow here. */ + i = i * 10 + (*s - '0'); + ++s; + } + + return neg ? -i : i; +} + +static int +lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c) +{ + char *s = buf->scratch; + char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */ + char *isreal = (c == '.' ? 
s : NULL); + int neg = (c == '-'); + int isbad = 0; + + *s++ = c; + + c = lex_byte(ctx, f); + + /* skip extra '-' signs at start of number */ + if (neg) + { + while (c == '-') + c = lex_byte(ctx, f); + } + + while (s < e) + { + switch (c) + { + case IS_WHITE: + case IS_DELIM: + fz_unread_byte(ctx, f); + goto end; + case EOF: + goto end; + case '.': + if (isreal) + isbad = 1; + isreal = s; + *s++ = c; + break; + case '-': + /* Bug 703248: Some PDFs (particularly those + * generated by google docs) apparently have + * numbers like 0.000000000000-5684342 in them. + * We'll stop our interpretation at the -, but + * keep reading to skip over the trailing + * digits so they aren't parsed later. */ + *s++ = '\0'; + break; + case RANGE_0_9: + *s++ = c; + break; + default: + isbad = 1; + *s++ = c; + break; + } + c = lex_byte(ctx, f); + } + +end: + *s = '\0'; + if (isbad) + return PDF_TOK_KEYWORD; + if (isreal) + { + /* We'd like to use the fastest possible atof + * routine, but we'd rather match acrobats + * handling of broken numbers. As such, we + * spot common broken cases and call an + * acrobat compatible routine where required. 
*/ + if (neg > 1 || isreal - buf->scratch >= 10) + buf->f = acrobat_compatible_atof(buf->scratch); + else + buf->f = fz_atof(buf->scratch); + return PDF_TOK_REAL; + } + else + { + buf->i = fast_atoi(buf->scratch); + return PDF_TOK_INT; + } +} + +static void +lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) +{ + char *s = lb->scratch; + char *e = s + fz_minz(127, lb->size); + int c; + + while (1) + { + if (s == e) + { + if (e - lb->scratch < 127) + { + s += pdf_lexbuf_grow(ctx, lb); + e = lb->scratch + fz_minz(127, lb->size); + } + else + { + /* truncate names that are too long */ + fz_warn(ctx, "name is too long"); + *s = 0; + lb->len = s - lb->scratch; + s = NULL; + } + } + c = lex_byte(ctx, f); + switch (c) + { + case IS_WHITE: + case IS_DELIM: + fz_unread_byte(ctx, f); + goto end; + case EOF: + goto end; + case '#': + { + int hex[2]; + int i; + for (i = 0; i < 2; i++) + { + c = fz_peek_byte(ctx, f); + switch (c) + { + case RANGE_0_9: + if (i == 1 && c == '0' && hex[0] == 0) + goto illegal; + hex[i] = lex_byte(ctx, f) - '0'; + break; + case RANGE_a_f: + hex[i] = lex_byte(ctx, f) - 'a' + 10; + break; + case RANGE_A_F: + hex[i] = lex_byte(ctx, f) - 'A' + 10; + break; + default: + goto illegal; + case EOF: + goto illegal_eof; + } + } + if (s) *s++ = (hex[0] << 4) + hex[1]; + break; +illegal: + if (i == 1) + fz_unread_byte(ctx, f); +illegal_eof: + if (s) *s++ = '#'; + continue; + } + default: + if (s) *s++ = c; + break; + } + } +end: + if (s) + { + *s = '\0'; + lb->len = s - lb->scratch; + } +} + +static int +lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) +{ + char *s = lb->scratch; + char *e = s + lb->size; + int bal = 1; + int oct; + int c; + + while (1) + { + if (s == e) + { + s += pdf_lexbuf_grow(ctx, lb); + e = lb->scratch + lb->size; + } + c = lex_byte(ctx, f); + switch (c) + { + case EOF: + return PDF_TOK_ERROR; + case '(': + bal++; + *s++ = c; + break; + case ')': + bal --; + if (bal == 0) + goto end; + *s++ = c; + break; + case '\\': + c = 
lex_byte(ctx, f); + switch (c) + { + case EOF: + return PDF_TOK_ERROR; + case 'n': + *s++ = '\n'; + break; + case 'r': + *s++ = '\r'; + break; + case 't': + *s++ = '\t'; + break; + case 'b': + *s++ = '\b'; + break; + case 'f': + *s++ = '\f'; + break; + case '(': + *s++ = '('; + break; + case ')': + *s++ = ')'; + break; + case '\\': + *s++ = '\\'; + break; + case RANGE_0_7: + oct = c - '0'; + c = lex_byte(ctx, f); + if (c >= '0' && c <= '7') + { + oct = oct * 8 + (c - '0'); + c = lex_byte(ctx, f); + if (c >= '0' && c <= '7') + oct = oct * 8 + (c - '0'); + else if (c != EOF) + fz_unread_byte(ctx, f); + } + else if (c != EOF) + fz_unread_byte(ctx, f); + *s++ = oct; + break; + case '\n': + break; + case '\r': + c = lex_byte(ctx, f); + if ((c != '\n') && (c != EOF)) + fz_unread_byte(ctx, f); + break; + default: + *s++ = c; + } + break; + /* Bug 708256: PDF 32000-1 says that any occurence of \n, \r, or \r\n in a + * (unless escaped with a '\') should be interpreted as a single 0x0a byte. */ + case '\n': + *s++ = 0x0a; + break; + case '\r': + *s++ = 0x0a; + c = lex_byte(ctx, f); + if ((c != '\n') && (c != EOF)) + fz_unread_byte(ctx, f); + break; + default: + *s++ = c; + break; + } + } +end: + lb->len = s - lb->scratch; + return PDF_TOK_STRING; +} + +static int +lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) +{ + char *s = lb->scratch; + char *e = s + lb->size; + int a = 0, x = 0; + int c; + + while (1) + { + if (s == e) + { + s += pdf_lexbuf_grow(ctx, lb); + e = lb->scratch + lb->size; + } + c = lex_byte(ctx, f); + switch (c) + { + case IS_WHITE: + break; + default: + fz_warn(ctx, "invalid character in hex string"); + /* fall through */ + case IS_HEX: + if (x) + { + *s++ = a * 16 + unhex(c); + x = !x; + } + else + { + a = unhex(c); + x = !x; + } + break; + case '>': + if (x) + { + *s++ = a * 16; /* pad truncated string with '0' */ + } + goto end; + case EOF: + return PDF_TOK_ERROR; + } + } +end: + lb->len = s - lb->scratch; + return PDF_TOK_STRING; +} + 
+static pdf_token +pdf_token_from_keyword(char *key) +{ + switch (*key) + { + case 'R': + if (!strcmp(key, "R")) return PDF_TOK_R; + break; + case 't': + if (!strcmp(key, "true")) return PDF_TOK_TRUE; + if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER; + break; + case 'f': + if (!strcmp(key, "false")) return PDF_TOK_FALSE; + break; + case 'n': + if (!strcmp(key, "null")) return PDF_TOK_NULL; + if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ; + break; + case 'o': + if (!strcmp(key, "obj")) return PDF_TOK_OBJ; + break; + case 'e': + if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ; + if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM; + break; + case 's': + if (!strcmp(key, "stream")) return PDF_TOK_STREAM; + if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF; + break; + case 'x': + if (!strcmp(key, "xref")) return PDF_TOK_XREF; + break; + } + + while (*key) + { + if (!fz_isprint(*key)) + return PDF_TOK_ERROR; + ++key; + } + + return PDF_TOK_KEYWORD; +} + +void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size) +{ + lb->size = lb->base_size = size; + lb->len = 0; + lb->scratch = &lb->buffer[0]; +} + +void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb) +{ + if (lb && lb->size != lb->base_size) + fz_free(ctx, lb->scratch); +} + +ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb) +{ + char *old = lb->scratch; + size_t newsize = lb->size * 2; + if (lb->size == lb->base_size) + { + lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf"); + memcpy(lb->scratch, lb->buffer, lb->size); + } + else + { + lb->scratch = fz_realloc(ctx, lb->scratch, newsize); + } + lb->size = newsize; + return lb->scratch - old; +} + +pdf_token +pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) +{ + while (1) + { + int c = lex_byte(ctx, f); + switch (c) + { + case EOF: + return PDF_TOK_EOF; + case IS_WHITE: + lex_white(ctx, f); + break; + case '%': + lex_comment(ctx, f); + break; + case '/': + lex_name(ctx, f, buf); + return PDF_TOK_NAME; + case 
'(': + return lex_string(ctx, f, buf); + case ')': + return PDF_TOK_ERROR; + case '<': + c = lex_byte(ctx, f); + if (c == '<') + return PDF_TOK_OPEN_DICT; + if (c != EOF) + fz_unread_byte(ctx, f); + return lex_hex_string(ctx, f, buf); + case '>': + c = lex_byte(ctx, f); + if (c == '>') + return PDF_TOK_CLOSE_DICT; + if (c != EOF) + fz_unread_byte(ctx, f); + return PDF_TOK_ERROR; + case '[': + return PDF_TOK_OPEN_ARRAY; + case ']': + return PDF_TOK_CLOSE_ARRAY; + case '{': + return PDF_TOK_OPEN_BRACE; + case '}': + return PDF_TOK_CLOSE_BRACE; + case IS_NUMBER: + return lex_number(ctx, f, buf, c); + default: /* isregular: !isdelim && !iswhite && c != EOF */ + fz_unread_byte(ctx, f); + lex_name(ctx, f, buf); + return pdf_token_from_keyword(buf->scratch); + } + } +} + +pdf_token +pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) +{ + while (1) + { + int c = lex_byte(ctx, f); + switch (c) + { + case EOF: + return PDF_TOK_EOF; + case IS_WHITE: + lex_white(ctx, f); + break; + case '%': + lex_comment(ctx, f); + break; + case '/': + lex_name(ctx, f, buf); + return PDF_TOK_NAME; + case '(': + return PDF_TOK_ERROR; /* no strings allowed */ + case ')': + return PDF_TOK_ERROR; /* no strings allowed */ + case '<': + c = lex_byte(ctx, f); + if (c == '<') + return PDF_TOK_OPEN_DICT; + if (c != EOF) + fz_unread_byte(ctx, f); + return PDF_TOK_ERROR; /* no strings allowed */ + case '>': + c = lex_byte(ctx, f); + if (c == '>') + return PDF_TOK_CLOSE_DICT; + if (c != EOF) + fz_unread_byte(ctx, f); + return PDF_TOK_ERROR; + case '[': + return PDF_TOK_OPEN_ARRAY; + case ']': + return PDF_TOK_CLOSE_ARRAY; + case '{': + return PDF_TOK_OPEN_BRACE; + case '}': + return PDF_TOK_CLOSE_BRACE; + case IS_NUMBER: + return lex_number(ctx, f, buf, c); + default: /* isregular: !isdelim && !iswhite && c != EOF */ + fz_unread_byte(ctx, f); + lex_name(ctx, f, buf); + return pdf_token_from_keyword(buf->scratch); + } + } +} + +void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int 
tok, pdf_lexbuf *buf) +{ + switch (tok) + { + case PDF_TOK_NAME: + fz_append_printf(ctx, fzbuf, "/%s", buf->scratch); + break; + case PDF_TOK_STRING: + if (buf->len >= buf->size) + pdf_lexbuf_grow(ctx, buf); + buf->scratch[buf->len] = 0; + fz_append_pdf_string(ctx, fzbuf, buf->scratch); + break; + case PDF_TOK_OPEN_DICT: + fz_append_string(ctx, fzbuf, "<<"); + break; + case PDF_TOK_CLOSE_DICT: + fz_append_string(ctx, fzbuf, ">>"); + break; + case PDF_TOK_OPEN_ARRAY: + fz_append_byte(ctx, fzbuf, '['); + break; + case PDF_TOK_CLOSE_ARRAY: + fz_append_byte(ctx, fzbuf, ']'); + break; + case PDF_TOK_OPEN_BRACE: + fz_append_byte(ctx, fzbuf, '{'); + break; + case PDF_TOK_CLOSE_BRACE: + fz_append_byte(ctx, fzbuf, '}'); + break; + case PDF_TOK_INT: + fz_append_printf(ctx, fzbuf, "%ld", buf->i); + break; + case PDF_TOK_REAL: + fz_append_printf(ctx, fzbuf, "%g", buf->f); + break; + default: + fz_append_data(ctx, fzbuf, buf->scratch, buf->len); + break; + } +}
