diff mupdf-source/source/pdf/pdf-lex.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/pdf/pdf-lex.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,734 @@
+// Copyright (C) 2004-2024 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+#include "mupdf/pdf.h"
+
+#include <string.h>
+
+/*
+ * Case-label helpers. Each macro expands to a list of character constants
+ * joined by ":case", so it is written as "case IS_xxx:" inside a switch.
+ */
+/* Characters that can begin a number token: sign, decimal point or digit. */
+#define IS_NUMBER \
+	'+':case'-':case'.':case'0':case'1':case'2':case'3':\
+	case'4':case'5':case'6':case'7':case'8':case'9'
+/* NUL, TAB, LF, FF, CR and SPACE (the same six bytes iswhite() accepts). */
+#define IS_WHITE \
+	'\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
+/* Hexadecimal digits in either letter case. */
+#define IS_HEX \
+	'0':case'1':case'2':case'3':case'4':case'5':case'6':\
+	case'7':case'8':case'9':case'A':case'B':case'C':\
+	case'D':case'E':case'F':case'a':case'b':case'c':\
+	case'd':case'e':case'f'
+/* Delimiter characters; like IS_WHITE, these end a name or a number. */
+#define IS_DELIM \
+	'(':case')':case'<':case'>':case'[':case']':case'{':\
+	case'}':case'/':case'%'
+
+/* Contiguous character ranges used when decoding digits. */
+#define RANGE_0_9 \
+	'0':case'1':case'2':case'3':case'4':case'5':\
+	case'6':case'7':case'8':case'9'
+#define RANGE_a_f \
+	'a':case'b':case'c':case'd':case'e':case'f'
+#define RANGE_A_F \
+	'A':case'B':case'C':case'D':case'E':case'F'
+#define RANGE_0_7 \
+	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
+
+/* Debugging aid: define DUMP_LEXER_STREAM to echo every byte the lexer
+ * reads to fz_stdout(). Bytes 32..127 are written as characters, other
+ * bytes as <xx> in hex, and end of input as <EOF>. */
+/* #define DUMP_LEXER_STREAM */
+#ifdef DUMP_LEXER_STREAM
+static inline int lex_byte(fz_context *ctx, fz_stream *stm)
+{
+	int c = fz_read_byte(ctx, stm);
+
+	if (c == EOF)
+		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
+	else if (c >= 32 && c < 128)
+		fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
+	else
+		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
+	return c;
+}
+#else
+/* Normal build: lex_byte is simply fz_read_byte. */
+#define lex_byte(C,S) fz_read_byte(C,S)
+#endif
+
+/* Return 1 if ch is NUL, TAB, LF, FF, CR or SPACE, otherwise 0. */
+static inline int iswhite(int ch)
+{
+	switch (ch)
+	{
+	case 0x00:
+	case 0x09:
+	case 0x0a:
+	case 0x0c:
+	case 0x0d:
+	case 0x20:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Return 1 if ch lies in the range ' ' .. '~' inclusive, otherwise 0. */
+static inline int fz_isprint(int ch)
+{
+	return !(ch < ' ' || ch > '~');
+}
+
+/* Map a hexadecimal digit character to its value 0..15.
+ * Any other character maps to 0. */
+static inline int unhex(int ch)
+{
+	int value = 0;
+
+	if ('0' <= ch && ch <= '9')
+		value = ch - '0';
+	else if ('a' <= ch && ch <= 'f')
+		value = 10 + (ch - 'a');
+	else if ('A' <= ch && ch <= 'F')
+		value = 10 + (ch - 'A');
+	return value;
+}
+
+/* Consume a run of whitespace bytes. The first non-whitespace byte is
+ * pushed back onto the stream; nothing is pushed back at EOF. */
+static void
+lex_white(fz_context *ctx, fz_stream *f)
+{
+	int ch;
+
+	for (;;)
+	{
+		ch = lex_byte(ctx, f);
+		if (ch > 32 || !iswhite(ch))
+			break;
+	}
+	if (ch == EOF)
+		return;
+	fz_unread_byte(ctx, f);
+}
+
+/* Discard the rest of a comment: read up to and including the next LF or
+ * CR, or until EOF. The terminating byte is consumed, not pushed back. */
+static void
+lex_comment(fz_context *ctx, fz_stream *f)
+{
+	for (;;)
+	{
+		int ch = lex_byte(ctx, f);
+		if (ch == '\012' || ch == '\015' || ch == EOF)
+			return;
+	}
+}
+
+/*
+ * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
+ *
+ * Accepted syntax: any run of '-' (one or more makes the result negative),
+ * then any run of '+', then decimal digits, optionally followed by '.' and
+ * more decimal digits. Parsing stops at the first character that does not
+ * fit; exponents are not recognised.
+ *
+ * The integer part wraps around on overflow instead of saturating. The
+ * wrap is performed in unsigned arithmetic, where it is well defined,
+ * rather than by overflowing a signed int (undefined behaviour). The
+ * wrapped value is then converted back to int; for values above INT_MAX
+ * that conversion is implementation-defined, and the usual two's
+ * complement reinterpretation is assumed.
+ */
+static float acrobat_compatible_atof(char *s)
+{
+	int neg = 0;
+	unsigned int u = 0;
+
+	while (*s == '-')
+	{
+		neg = 1;
+		++s;
+	}
+	while (*s == '+')
+	{
+		++s;
+	}
+
+	while (*s >= '0' && *s <= '9')
+	{
+		/* We deliberately ignore overflow here.
+		 * Tests show that Acrobat handles overflows in exactly the same way we do:
+		 * 123450000000000000000678 is read as 678.
+		 */
+		u = u * 10u + (unsigned int)(*s - '0');
+		++s;
+	}
+
+	if (*s == '.')
+	{
+		float v = (float)(int)u;
+		float n = 0;
+		float d = 1;
+		++s;
+		while (*s >= '0' && *s <= '9')
+		{
+			n = 10 * n + (*s - '0');
+			d = 10 * d;
+			++s;
+		}
+		v += n / d;
+		return neg ? -v : v;
+	}
+	else
+	{
+		/* Negate while still unsigned so that the most negative
+		 * value does not overflow. */
+		if (neg)
+			u = 0u - u;
+		return (float)(int)u;
+	}
+}
+
+/*
+ * Fast but inaccurate atoi.
+ *
+ * Accepted syntax: any run of '-' (one or more makes the result negative),
+ * then any run of '+', then decimal digits. Parsing stops at the first
+ * non-digit. Values that do not fit in 64 bits wrap around; the wrap and
+ * the negation are done in unsigned arithmetic, where they are well
+ * defined, instead of overflowing a signed integer (undefined behaviour).
+ */
+static int64_t fast_atoi(char *s)
+{
+	int neg = 0;
+	uint64_t u = 0;
+
+	while (*s == '-')
+	{
+		neg = 1;
+		++s;
+	}
+	while (*s == '+')
+	{
+		++s;
+	}
+
+	while (*s >= '0' && *s <= '9')
+	{
+		/* We deliberately ignore overflow here. */
+		u = u * 10u + (uint64_t)(*s - '0');
+		++s;
+	}
+
+	if (neg)
+		u = (uint64_t)0 - u;
+	return (int64_t)u;
+}
+
+/*
+ * Lex a numeric token whose first character, c, has already been read.
+ *
+ * Characters are copied into buf->scratch until whitespace, a delimiter or
+ * EOF is seen, or until the scratch buffer is full. A terminating
+ * whitespace/delimiter byte, or the byte that no longer fits, is pushed
+ * back onto the stream so that the next token starts with it.
+ *
+ * Returns:
+ *   PDF_TOK_INT      value stored in buf->i
+ *   PDF_TOK_REAL     a '.' was seen; value stored in buf->f
+ *   PDF_TOK_KEYWORD  the text is not a number (a second '.', or a
+ *                    character other than a digit, '.' or '-'); the raw
+ *                    text is left in buf->scratch
+ *
+ * buf->len is not updated by this function.
+ */
+static int
+lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
+{
+	char *s = buf->scratch;
+	char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
+	char *isreal = (c == '.' ? s : NULL);
+	int neg = (c == '-');
+	int isbad = 0;
+
+	*s++ = c;
+
+	c = lex_byte(ctx, f);
+
+	/* skip extra '-' signs at start of number */
+	if (neg)
+	{
+		while (c == '-')
+			c = lex_byte(ctx, f);
+	}
+
+	while (s < e)
+	{
+		switch (c)
+		{
+		case IS_WHITE:
+		case IS_DELIM:
+			fz_unread_byte(ctx, f);
+			goto end;
+		case EOF:
+			goto end;
+		case '.':
+			if (isreal)
+				isbad = 1;
+			isreal = s;
+			*s++ = c;
+			break;
+		case '-':
+			/* Bug 703248: Some PDFs (particularly those
+			 * generated by google docs) apparently have
+			 * numbers like 0.000000000000-5684342 in them.
+			 * We'll stop our interpretation at the -, but
+			 * keep reading to skip over the trailing
+			 * digits so they aren't parsed later. */
+			*s++ = '\0';
+			break;
+		case RANGE_0_9:
+			*s++ = c;
+			break;
+		default:
+			isbad = 1;
+			*s++ = c;
+			break;
+		}
+		c = lex_byte(ctx, f);
+	}
+
+	/* The scratch buffer is full. 'c' has been read but not stored, so
+	 * push it back rather than silently dropping it. */
+	if (c != EOF)
+		fz_unread_byte(ctx, f);
+
+end:
+	*s = '\0';
+	if (isbad)
+		return PDF_TOK_KEYWORD;
+	if (isreal)
+	{
+		/* We'd like to use the fastest possible atof
+		 * routine, but we'd rather match acrobats
+		 * handling of broken numbers. As such, we
+		 * spot common broken cases and call an
+		 * acrobat compatible routine where required.
+		 * The broken case detected here is ten or more
+		 * characters before the decimal point. */
+		if (isreal - buf->scratch >= 10)
+			buf->f = acrobat_compatible_atof(buf->scratch);
+		else
+			buf->f = fz_atof(buf->scratch);
+		return PDF_TOK_REAL;
+	}
+	else
+	{
+		buf->i = fast_atoi(buf->scratch);
+		return PDF_TOK_INT;
+	}
+}
+
+/*
+ * Read a name (or bare keyword) into lb->scratch.
+ *
+ * Input is consumed until whitespace, a delimiter or EOF; a terminating
+ * whitespace/delimiter byte is pushed back. "#xx" hex escapes are decoded
+ * to a single byte. At most 127 bytes are stored: a longer name triggers a
+ * warning and is truncated, but the rest of it is still consumed.
+ * On return the stored text is NUL-terminated and lb->len holds its length.
+ */
+static void
+lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
+{
+	char *s = lb->scratch;
+	char *e = s + fz_minz(127, lb->size);
+	int c;
+
+	while (1)
+	{
+		if (s == e)
+		{
+			/* Out of room: either the buffer is smaller than the
+			 * 127 byte limit (grow it) or the limit was reached. */
+			if (e - lb->scratch < 127)
+			{
+				s += pdf_lexbuf_grow(ctx, lb);
+				e = lb->scratch + fz_minz(127, lb->size);
+			}
+			else
+			{
+				/* truncate names that are too long */
+				fz_warn(ctx, "name is too long");
+				*s = 0;
+				lb->len = s - lb->scratch;
+				/* From here on s == NULL means: keep consuming
+				 * input but store nothing. */
+				s = NULL;
+			}
+		}
+		c = lex_byte(ctx, f);
+		switch (c)
+		{
+		case IS_WHITE:
+		case IS_DELIM:
+			fz_unread_byte(ctx, f);
+			goto end;
+		case EOF:
+			goto end;
+		case '#':
+		{
+			/* Decode a #xx escape. Each digit is peeked before it is
+			 * consumed, so a non-hex byte stays in the stream. */
+			int hex[2];
+			int i;
+			for (i = 0; i < 2; i++)
+			{
+				c = fz_peek_byte(ctx, f);
+				switch (c)
+				{
+				case RANGE_0_9:
+					/* "#00" is not accepted as an escape. */
+					if (i == 1 && c == '0' && hex[0] == 0)
+						goto illegal;
+					hex[i] = lex_byte(ctx, f) - '0';
+					break;
+				case RANGE_a_f:
+					hex[i] = lex_byte(ctx, f) - 'a' + 10;
+					break;
+				case RANGE_A_F:
+					hex[i] = lex_byte(ctx, f) - 'A' + 10;
+					break;
+				default:
+					goto illegal;
+				case EOF:
+					goto illegal_eof;
+				}
+			}
+			if (s) *s++ = (hex[0] << 4) + hex[1];
+			break;
+illegal:
+			/* Invalid escape: if the first digit was already consumed,
+			 * push it back so it is re-read as an ordinary character. */
+			if (i == 1)
+				fz_unread_byte(ctx, f);
+illegal_eof:
+			/* NOTE(review): when EOF follows a single hex digit, that
+			 * digit has been consumed and is not restored here. */
+			/* Keep the '#' itself as a literal character. */
+			if (s) *s++ = '#';
+			continue;
+		}
+		default:
+			if (s) *s++ = c;
+			break;
+		}
+	}
+end:
+	/* If the name was truncated, the terminator and lb->len were already
+	 * written at the point of truncation. */
+	if (s)
+	{
+		*s = '\0';
+		lb->len = s - lb->scratch;
+	}
+}
+
+/*
+ * Read a literal string; the opening '(' has already been consumed.
+ *
+ * The decoded bytes are stored in lb->scratch (grown as needed) and their
+ * count in lb->len; no terminator is appended. Nested, unescaped
+ * parentheses are kept and balanced. Returns PDF_TOK_STRING, or
+ * PDF_TOK_ERROR if EOF is reached before the closing ')'.
+ */
+static int
+lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
+{
+	char *out = lb->scratch;
+	char *limit = out + lb->size;
+	int depth = 1;
+
+	for (;;)
+	{
+		int emit = -1; /* byte to append on this pass; -1 means none */
+		int ch, next;
+
+		/* Every pass appends at most one byte, so one check suffices. */
+		if (out == limit)
+		{
+			out += pdf_lexbuf_grow(ctx, lb);
+			limit = lb->scratch + lb->size;
+		}
+
+		ch = lex_byte(ctx, f);
+		if (ch == EOF)
+			return PDF_TOK_ERROR;
+
+		if (ch == ')')
+		{
+			if (--depth == 0)
+				break;
+			emit = ch;
+		}
+		else if (ch == '(')
+		{
+			++depth;
+			emit = ch;
+		}
+		else if (ch == '\n' || ch == '\r')
+		{
+			/* Bug 708256: PDF 32000-1 says that any occurrence of \n, \r,
+			 * or \r\n in a string (unless escaped with a '\') should be
+			 * interpreted as a single 0x0a byte. */
+			emit = 0x0a;
+			if (ch == '\r')
+			{
+				next = lex_byte(ctx, f);
+				if (next != '\n' && next != EOF)
+					fz_unread_byte(ctx, f);
+			}
+		}
+		else if (ch != '\\')
+		{
+			emit = ch;
+		}
+		else
+		{
+			/* Backslash escape. */
+			ch = lex_byte(ctx, f);
+			switch (ch)
+			{
+			case EOF:
+				return PDF_TOK_ERROR;
+			case 'n':
+				emit = '\n';
+				break;
+			case 'r':
+				emit = '\r';
+				break;
+			case 't':
+				emit = '\t';
+				break;
+			case 'b':
+				emit = '\b';
+				break;
+			case 'f':
+				emit = '\f';
+				break;
+			case RANGE_0_7:
+				/* One to three octal digits. */
+				emit = ch - '0';
+				next = lex_byte(ctx, f);
+				if (next >= '0' && next <= '7')
+				{
+					emit = emit * 8 + (next - '0');
+					next = lex_byte(ctx, f);
+					if (next >= '0' && next <= '7')
+						emit = emit * 8 + (next - '0');
+					else if (next != EOF)
+						fz_unread_byte(ctx, f);
+				}
+				else if (next != EOF)
+					fz_unread_byte(ctx, f);
+				break;
+			case '\n':
+				/* Escaped end of line: contributes nothing. */
+				break;
+			case '\r':
+				/* Escaped CR or CRLF: contributes nothing. */
+				next = lex_byte(ctx, f);
+				if (next != '\n' && next != EOF)
+					fz_unread_byte(ctx, f);
+				break;
+			default:
+				/* '(' , ')' , '\\' and any other character stand
+				 * for themselves. */
+				emit = ch;
+				break;
+			}
+		}
+
+		if (emit >= 0)
+			*out++ = emit;
+	}
+
+	lb->len = out - lb->scratch;
+	return PDF_TOK_STRING;
+}
+
+/*
+ * Read a hexadecimal string; the opening '<' has already been consumed.
+ *
+ * Pairs of hex digits are packed into bytes in lb->scratch (grown as
+ * needed) and the byte count is stored in lb->len. Whitespace is skipped.
+ * Any other non-hex character produces a warning and is treated as the
+ * digit 0. Returns PDF_TOK_STRING on '>', or PDF_TOK_ERROR on EOF.
+ */
+static int
+lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
+{
+	char *out = lb->scratch;
+	char *limit = out + lb->size;
+	int high = 0;      /* pending high nibble */
+	int have_high = 0; /* non-zero while a high nibble is pending */
+
+	for (;;)
+	{
+		int ch;
+
+		if (out == limit)
+		{
+			out += pdf_lexbuf_grow(ctx, lb);
+			limit = lb->scratch + lb->size;
+		}
+
+		ch = lex_byte(ctx, f);
+		switch (ch)
+		{
+		case EOF:
+			return PDF_TOK_ERROR;
+		case '>':
+			if (have_high)
+				*out++ = high * 16; /* pad truncated string with '0' */
+			lb->len = out - lb->scratch;
+			return PDF_TOK_STRING;
+		case IS_WHITE:
+			continue;
+		case IS_HEX:
+			break;
+		default:
+			fz_warn(ctx, "invalid character in hex string");
+			break;
+		}
+
+		if (have_high)
+			*out++ = high * 16 + unhex(ch);
+		else
+			high = unhex(ch);
+		have_high = !have_high;
+	}
+}
+
+/*
+ * Classify a NUL-terminated keyword.
+ *
+ * Reserved words map to their dedicated tokens. Anything else is
+ * PDF_TOK_KEYWORD, unless it contains a character outside ' ' .. '~',
+ * in which case it is PDF_TOK_ERROR.
+ */
+static pdf_token
+pdf_token_from_keyword(char *key)
+{
+	static const struct { const char *word; pdf_token tok; } reserved[] = {
+		{ "R", PDF_TOK_R },
+		{ "true", PDF_TOK_TRUE },
+		{ "trailer", PDF_TOK_TRAILER },
+		{ "false", PDF_TOK_FALSE },
+		{ "null", PDF_TOK_NULL },
+		{ "newobj", PDF_TOK_NEWOBJ },
+		{ "obj", PDF_TOK_OBJ },
+		{ "endobj", PDF_TOK_ENDOBJ },
+		{ "endstream", PDF_TOK_ENDSTREAM },
+		{ "stream", PDF_TOK_STREAM },
+		{ "startxref", PDF_TOK_STARTXREF },
+		{ "xref", PDF_TOK_XREF },
+	};
+	size_t i;
+	char *p;
+
+	for (i = 0; i < sizeof reserved / sizeof reserved[0]; i++)
+	{
+		/* Cheap first-character test before the full comparison. */
+		if (reserved[i].word[0] == *key && strcmp(key, reserved[i].word) == 0)
+			return reserved[i].tok;
+	}
+
+	for (p = key; *p; ++p)
+	{
+		if (!fz_isprint(*p))
+			return PDF_TOK_ERROR;
+	}
+
+	return PDF_TOK_KEYWORD;
+}
+
+/* Prepare a lex buffer for use: scratch points at the embedded buffer,
+ * which is taken to hold 'size' bytes, and the stored length is zero. */
+void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
+{
+	lb->scratch = lb->buffer;
+	lb->len = 0;
+	lb->base_size = size;
+	lb->size = lb->base_size;
+}
+
+/* Release a lex buffer. The scratch area is freed only if its size
+ * differs from the base size, i.e. after it has been grown. A NULL lb is
+ * ignored. */
+void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
+{
+	if (!lb)
+		return;
+	if (lb->size == lb->base_size)
+		return;
+	fz_free(ctx, lb->scratch);
+}
+
+/*
+ * Double the size of the scratch area.
+ *
+ * The first growth allocates a new block and copies the embedded buffer
+ * into it; later growths reallocate that block. Returns the distance
+ * between the new and old scratch addresses, which callers add to their
+ * own pointers into the scratch area.
+ */
+ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
+{
+	char *before = lb->scratch;
+	size_t doubled = lb->size * 2;
+	char *after;
+
+	if (lb->size != lb->base_size)
+	{
+		after = fz_realloc(ctx, lb->scratch, doubled);
+	}
+	else
+	{
+		after = Memento_label(fz_malloc(ctx, doubled), "pdf_lexbuf");
+		memcpy(after, lb->buffer, lb->size);
+	}
+	lb->scratch = after;
+	lb->size = doubled;
+	return lb->scratch - before;
+}
+
+/*
+ * Read the next token from the stream.
+ *
+ * Whitespace and comments are skipped. Names, strings, numbers and
+ * keywords leave their text or value in buf. Returns PDF_TOK_EOF at end
+ * of input, and PDF_TOK_ERROR for a stray ')' or a '>' that is not
+ * followed by another '>'.
+ */
+pdf_token
+pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
+{
+	for (;;)
+	{
+		int ch = lex_byte(ctx, f);
+		int next;
+
+		switch (ch)
+		{
+		case IS_WHITE:
+			lex_white(ctx, f);
+			continue;
+		case '%':
+			lex_comment(ctx, f);
+			continue;
+
+		case EOF:
+			return PDF_TOK_EOF;
+
+		case '[':
+			return PDF_TOK_OPEN_ARRAY;
+		case ']':
+			return PDF_TOK_CLOSE_ARRAY;
+		case '{':
+			return PDF_TOK_OPEN_BRACE;
+		case '}':
+			return PDF_TOK_CLOSE_BRACE;
+
+		case '<':
+			/* "<<" opens a dictionary; a single '<' starts a hex string. */
+			next = lex_byte(ctx, f);
+			if (next == '<')
+				return PDF_TOK_OPEN_DICT;
+			if (next != EOF)
+				fz_unread_byte(ctx, f);
+			return lex_hex_string(ctx, f, buf);
+		case '>':
+			/* Only ">>" is valid here. */
+			next = lex_byte(ctx, f);
+			if (next == '>')
+				return PDF_TOK_CLOSE_DICT;
+			if (next != EOF)
+				fz_unread_byte(ctx, f);
+			return PDF_TOK_ERROR;
+
+		case '(':
+			return lex_string(ctx, f, buf);
+		case ')':
+			return PDF_TOK_ERROR;
+
+		case '/':
+			lex_name(ctx, f, buf);
+			return PDF_TOK_NAME;
+
+		case IS_NUMBER:
+			return lex_number(ctx, f, buf, ch);
+
+		default: /* isregular: !isdelim && !iswhite && c != EOF */
+			/* Re-read the first character as part of the keyword. */
+			fz_unread_byte(ctx, f);
+			lex_name(ctx, f, buf);
+			return pdf_token_from_keyword(buf->scratch);
+		}
+	}
+}
+
+/*
+ * Read the next token from the stream, rejecting strings.
+ *
+ * Behaves like the ordinary lexer except that '(' , ')' and a single '<'
+ * (the start of a hex string) all yield PDF_TOK_ERROR instead of being
+ * lexed as strings.
+ */
+pdf_token
+pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
+{
+	for (;;)
+	{
+		int ch = lex_byte(ctx, f);
+		int next;
+
+		switch (ch)
+		{
+		case IS_WHITE:
+			lex_white(ctx, f);
+			continue;
+		case '%':
+			lex_comment(ctx, f);
+			continue;
+
+		case EOF:
+			return PDF_TOK_EOF;
+
+		case '[':
+			return PDF_TOK_OPEN_ARRAY;
+		case ']':
+			return PDF_TOK_CLOSE_ARRAY;
+		case '{':
+			return PDF_TOK_OPEN_BRACE;
+		case '}':
+			return PDF_TOK_CLOSE_BRACE;
+
+		case '(':
+		case ')':
+			return PDF_TOK_ERROR; /* no strings allowed */
+
+		case '<':
+			next = lex_byte(ctx, f);
+			if (next == '<')
+				return PDF_TOK_OPEN_DICT;
+			if (next != EOF)
+				fz_unread_byte(ctx, f);
+			return PDF_TOK_ERROR; /* no strings allowed */
+		case '>':
+			next = lex_byte(ctx, f);
+			if (next == '>')
+				return PDF_TOK_CLOSE_DICT;
+			if (next != EOF)
+				fz_unread_byte(ctx, f);
+			return PDF_TOK_ERROR;
+
+		case '/':
+			lex_name(ctx, f, buf);
+			return PDF_TOK_NAME;
+
+		case IS_NUMBER:
+			return lex_number(ctx, f, buf, ch);
+
+		default: /* isregular: !isdelim && !iswhite && c != EOF */
+			/* Re-read the first character as part of the keyword. */
+			fz_unread_byte(ctx, f);
+			lex_name(ctx, f, buf);
+			return pdf_token_from_keyword(buf->scratch);
+		}
+	}
+}
+
+/*
+ * Append the textual form of a token to fzbuf.
+ *
+ * Names, strings and numbers are formatted from the text or value held in
+ * buf; brackets, braces and dictionary markers are written literally. Any
+ * other token is written as the raw buf->len bytes of buf->scratch.
+ */
+void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
+{
+	switch (tok)
+	{
+	case PDF_TOK_OPEN_ARRAY:
+		fz_append_byte(ctx, fzbuf, '[');
+		return;
+	case PDF_TOK_CLOSE_ARRAY:
+		fz_append_byte(ctx, fzbuf, ']');
+		return;
+	case PDF_TOK_OPEN_BRACE:
+		fz_append_byte(ctx, fzbuf, '{');
+		return;
+	case PDF_TOK_CLOSE_BRACE:
+		fz_append_byte(ctx, fzbuf, '}');
+		return;
+	case PDF_TOK_OPEN_DICT:
+		fz_append_string(ctx, fzbuf, "<<");
+		return;
+	case PDF_TOK_CLOSE_DICT:
+		fz_append_string(ctx, fzbuf, ">>");
+		return;
+	case PDF_TOK_INT:
+		fz_append_printf(ctx, fzbuf, "%ld", buf->i);
+		return;
+	case PDF_TOK_REAL:
+		fz_append_printf(ctx, fzbuf, "%g", buf->f);
+		return;
+	case PDF_TOK_NAME:
+		fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
+		return;
+	case PDF_TOK_STRING:
+		/* The string is stored without a terminator; make room for
+		 * one if the scratch area is full, then add it. */
+		if (buf->len >= buf->size)
+			pdf_lexbuf_grow(ctx, buf);
+		buf->scratch[buf->len] = 0;
+		fz_append_pdf_string(ctx, fzbuf, buf->scratch);
+		return;
+	default:
+		break;
+	}
+
+	fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
+}