view mupdf-source/source/html/html-doc.c @ 7:5ab937c03c27

Apply full RELRO to all generated binaries. Also strip the generated binaries.
author Franz Glasner <fzglas.hg@dom66.de>
date Tue, 16 Sep 2025 12:37:32 +0200
parents b50eed0cc0ef
children
line wrap: on
line source

// Copyright (C) 2004-2024 Artifex Software, Inc.
//
// This file is part of MuPDF.
//
// MuPDF is free software: you can redistribute it and/or modify it under the
// terms of the GNU Affero General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
// details.
//
// You should have received a copy of the GNU Affero General Public License
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
//
// Alternative licensing terms are available from the licensor.
// For commercial licensing, see <https://www.artifex.com/> or contact
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
// CA 94129, USA, for further information.

#include "mupdf/fitz.h"
#include "html-imp.h"

#include <string.h>
#include <math.h>

/* Indices into the four-entry page_margin array of an fz_html (used by
 * htdoc_bound_page). Presumably top, right, bottom, left - the bounding box
 * code adds L and R to the width and T and B to the height. */
enum { T, R, B, L };

/* Document state shared by all the handlers in this file. The fz_document
 * base is the first member, and the code in this file casts between
 * fz_document* and html_document*. */
typedef struct
{
	fz_document super;
	fz_archive *zip; /* reference kept on the archive passed in at open time */
	fz_html_font_set *set; /* font set created at open time */
	fz_html *html; /* parsed content, produced by generic_parse */
	fz_outline *outline; /* outline loaded from html at open time */
	const fz_htdoc_format_t *format; /* format descriptor; not released by htdoc_drop_document */
} html_document;

/* A single page of an html_document. The fz_page base is the first member,
 * and the page callbacks cast fz_page* to html_page*. */
typedef struct
{
	fz_page super;
	html_document *doc; /* owning document; assigned in htdoc_load_page without taking a reference */
	int number; /* page number as passed to htdoc_load_page */
} html_page;

/* drop_document callback: release the archive, the parsed HTML, the font set
 * and the outline held by the document. */
static void
htdoc_drop_document(fz_context *ctx, fz_document *doc_)
{
	html_document *self = (html_document*)doc_;

	fz_drop_archive(ctx, self->zip);
	fz_drop_html(ctx, self->html);
	fz_drop_html_font_set(ctx, self->set);
	fz_drop_outline(ctx, self->outline);
}

/* resolve_link_dest callback.
 *
 * Looks for a non-empty fragment after the first '#' in dest and asks the
 * HTML tree for the vertical position of that target. If one is found
 * (position >= 0) the result addresses the page containing it and the offset
 * within that page; in every other case a "none" destination is returned. */
static fz_link_dest
htdoc_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest)
{
	html_document *self = (html_document*)doc_;
	const char *hash = strchr(dest, '#');
	float target_y;
	int page_index;

	/* No fragment at all, or an empty one. */
	if (hash == NULL || hash[1] == 0)
		return fz_make_link_dest_none();

	target_y = fz_find_html_target(ctx, self->html, hash + 1);
	if (!(target_y >= 0))
		return fz_make_link_dest_none();

	/* Split the document-wide position into page index + offset in page. */
	page_index = target_y / self->html->page_h;
	return fz_make_link_dest_xyz(0, page_index, 0, target_y - page_index * self->html->page_h, 0);
}

/* count_pages callback: the laid-out height of the root box divided by the
 * page height, rounded up. A document whose root box has no positive bottom
 * edge still counts as one page. The chapter argument is not used. */
static int
htdoc_count_pages(fz_context *ctx, fz_document *doc_, int chapter)
{
	fz_html *html = ((html_document*)doc_)->html;

	if (!(html->tree.root->s.layout.b > 0))
		return 1;
	return ceilf(html->tree.root->s.layout.b / html->page_h);
}

/* Re-resolve the destination of every outline entry reachable from node:
 * siblings are walked through ->next, children recursively through ->down.
 * Each entry's page, x and y are overwritten with the freshly resolved
 * destination of its uri. */
static void
htdoc_update_outline(fz_context *ctx, fz_document *doc, fz_outline *node)
{
	fz_outline *entry;

	for (entry = node; entry != NULL; entry = entry->next)
	{
		fz_link_dest target = htdoc_resolve_link(ctx, doc, entry->uri);

		entry->page = target.loc;
		entry->x = target.x;
		entry->y = target.y;

		htdoc_update_outline(ctx, doc, entry->down);
	}
}

/* layout callback: lay the HTML out for the given page size and em size,
 * then refresh the outline destinations, which are resolved against the
 * layout. */
static void
htdoc_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em)
{
	html_document *self = (html_document*)doc_;

	fz_layout_html(ctx, self->html, w, h, em);
	htdoc_update_outline(ctx, doc_, self->outline);
}

/* drop_page callback. Intentionally empty: htdoc_load_page stores only a
 * plain pointer and a number in the page, so this callback has nothing of
 * its own to release. */
static void
htdoc_drop_page(fz_context *ctx, fz_page *page_)
{
}

/* bound_page callback: every page has the same box, anchored at the origin,
 * whose size is the layout page size plus the margins on each side. The box
 * argument is not used. */
static fz_rect
htdoc_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box)
{
	html_page *self = (html_page*)page_;
	fz_html *html = self->doc->html;
	fz_rect bounds;

	bounds.x0 = 0;
	bounds.y0 = 0;
	bounds.x1 = html->page_w + html->page_margin[L] + html->page_margin[R];
	bounds.y1 = html->page_h + html->page_margin[T] + html->page_margin[B];

	return bounds;
}

/* run_page_contents callback: draw this page's slice of the document's HTML
 * to the device under the given transform. The cookie is not used. */
static void
htdoc_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie)
{
	html_page *self = (html_page*)page_;
	fz_html *html = self->doc->html;

	fz_draw_html(ctx, dev, ctm, html, self->number);
}

/* load_links callback: return the links that fz_load_html_links finds for
 * this page number, passing an empty string as its final argument. */
static fz_link *
htdoc_load_links(fz_context *ctx, fz_page *page_)
{
	html_page *self = (html_page*)page_;
	fz_html *html = self->doc->html;

	return fz_load_html_links(ctx, html, self->number, "");
}

/* make_bookmark callback: create a bookmark for the page part of loc. The
 * chapter part of loc is ignored. */
static fz_bookmark
htdoc_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc)
{
	html_document *self = (html_document*)doc_;
	fz_bookmark mark = fz_make_html_bookmark(ctx, self->html, loc.page);

	return mark;
}

/* lookup_bookmark callback: map a bookmark back to a location. The chapter
 * of the returned location is always 0. */
static fz_location
htdoc_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark)
{
	html_document *self = (html_document*)doc_;
	int page = fz_lookup_html_bookmark(ctx, self->html, mark);

	return fz_make_location(0, page);
}

/* load_page callback: allocate an html_page, remember the document and page
 * number in it, and install the page callbacks. The chapter argument is not
 * used. */
static fz_page *
htdoc_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number)
{
	html_page *pg = fz_new_derived_page(ctx, html_page, doc_);

	/* Page identity. */
	pg->doc = (html_document*)doc_;
	pg->number = number;

	/* Page callbacks. */
	pg->super.drop_page = htdoc_drop_page;
	pg->super.bound_page = htdoc_bound_page;
	pg->super.load_links = htdoc_load_links;
	pg->super.run_page_contents = htdoc_run_page;

	return (fz_page*)pg;
}

/* load_outline callback: hand out a new reference to the outline that was
 * loaded when the document was opened. */
static fz_outline *
htdoc_load_outline(fz_context *ctx, fz_document *doc_)
{
	html_document *self = (html_document*)doc_;
	fz_outline *outline = self->outline;

	return fz_keep_outline(ctx, outline);
}

/* lookup_metadata callback.
 *
 * Supports two keys: FZ_META_FORMAT (the format name from the format
 * descriptor) and FZ_META_INFO_TITLE (the HTML title, if there is one).
 * The value is copied into buf (at most size bytes) and the function returns
 * 1 plus the value returned by fz_strlcpy. Any other key, or a missing
 * title, yields -1. */
static int
htdoc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size)
{
	html_document *self = (html_document *)doc_;

	if (strcmp(key, FZ_META_FORMAT) == 0)
		return 1 + (int)fz_strlcpy(buf, self->format->format_name, size);

	if (strcmp(key, FZ_META_INFO_TITLE) != 0 || !self->html->title)
		return -1;

	return 1 + (int)fz_strlcpy(buf, self->html->title, size);
}

/* Turn the raw input buffer into a parsed fz_html.
 *
 * If the format supplies a convert_to_html hook it is run first to produce
 * the HTML text; otherwise the input buffer is parsed directly. The
 * intermediate buffer is released before returning, whether or not parsing
 * succeeded. buffer_in is not consumed: when it is used directly, an extra
 * reference is taken and dropped again here.
 *
 * Returns the parsed HTML. On failure the exception is rethrown after any
 * partially built result has been dropped. */
static fz_html *
generic_parse(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buffer_in, const char *user_css, const fz_htdoc_format_t *format)
{
	fz_buffer *buffer_html = NULL;
	fz_html *html = NULL;

	/* Both locals are assigned inside fz_try and read in fz_always/fz_catch.
	 * Without fz_var their values are indeterminate after an exception
	 * (setjmp/longjmp), which could leak or drop a stale pointer. */
	fz_var(buffer_html);
	fz_var(html);

	fz_try(ctx)
	{
		if (format->convert_to_html)
			buffer_html = format->convert_to_html(ctx, set, buffer_in, zip, user_css);
		else
			buffer_html = fz_keep_buffer(ctx, buffer_in);
		html = fz_parse_html(ctx, set, zip, base_uri, buffer_html, user_css, format->try_xml, format->try_html5, format->patch_mobi);
	}
	fz_always(ctx)
	{
		fz_drop_buffer(ctx, buffer_html);
	}
	fz_catch(ctx)
	{
		fz_drop_html(ctx, html);
		fz_rethrow(ctx);
	}
	return html;
}

/* Open an HTML-family document from an in-memory buffer.
 *
 * ctx:    the context.
 * dir:    archive handed to the parser; a new reference is kept by the
 *         document, the caller keeps its own.
 * buf:    the document data. Ownership passes to this function: the buffer
 *         is dropped before returning, on success and on failure alike.
 * format: format descriptor; the pointer is stored in the document.
 *
 * Returns the new document. On failure the partially constructed document is
 * dropped and the exception is rethrown. */
fz_document *
fz_htdoc_open_document_with_buffer(fz_context *ctx, fz_archive *dir, fz_buffer *buf, const fz_htdoc_format_t *format)
{
	html_document *doc = NULL;

	fz_var(doc);
	fz_var(dir);

	fz_try(ctx)
	{
		doc = fz_new_derived_document(ctx, html_document);
		doc->super.drop_document = htdoc_drop_document;
		doc->super.layout = htdoc_layout;
		doc->super.load_outline = htdoc_load_outline;
		doc->super.resolve_link_dest = htdoc_resolve_link;
		doc->super.make_bookmark = htdoc_make_bookmark;
		doc->super.lookup_bookmark = htdoc_lookup_bookmark;
		doc->super.count_pages = htdoc_count_pages;
		doc->super.load_page = htdoc_load_page;
		doc->super.lookup_metadata = htdoc_lookup_metadata;
		doc->super.is_reflowable = 1;

		doc->zip = fz_keep_archive(ctx, dir);
		doc->format = format;
		doc->set = fz_new_html_font_set(ctx);
		doc->html = generic_parse(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx), format);
		doc->outline = fz_load_html_outline(ctx, doc->html);
	}
	fz_always(ctx)
		fz_drop_buffer(ctx, buf);
	fz_catch(ctx)
	{
		/* doc is still NULL if the allocation itself threw, so use a plain
		 * cast here rather than &doc->super, which would be a member access
		 * through a null pointer. */
		fz_drop_document(ctx, (fz_document*)doc);
		fz_rethrow(ctx);
	}

	return (fz_document*)doc;
}

/* Open an HTML-family document from a stream.
 *
 * The whole stream is read into a buffer, which is then handed to
 * fz_htdoc_open_document_with_buffer. If stm is NULL, a NULL buffer is
 * passed on instead. */
fz_document *
fz_htdoc_open_document_with_stream_and_dir(fz_context *ctx, fz_stream *stm, fz_archive *dir, const fz_htdoc_format_t *format)
{
	fz_buffer *contents = stm ? fz_read_all(ctx, stm, 0) : NULL;

	return fz_htdoc_open_document_with_buffer(ctx, dir, contents, format);
}

/* Variant specific functions */

/* Generic HTML document handler */

/* Return 1 if c is one of the whitespace characters the recognizer skips
 * (space, tab, line feed, form feed, carriage return), 0 otherwise. */
static int isws(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\n':
	case '\f':
	case '\r':
		return 1;
	default:
		return 0;
	}
}

/* Sniff the start of a stream to decide whether it looks like HTML or XHTML.
 *
 * Up to 4096 bytes are read and the stream is then rewound to offset 0. A
 * UTF-8 BOM is skipped; a UTF-16 BOM (either byte order) switches the scanner
 * to 16-bit code units. The scanner then skips whitespace, "<?...>" blocks
 * and "<!-- ... -->" comments, and expects the first real tag to be
 * "<!doctype html" or "<html" (case insensitive). The remainder of that tag
 * is searched for the text "xhtml".
 *
 * handler and dir are not used. *hstate and *free_state, when the pointers
 * are non-NULL, are always set to NULL.
 *
 * xhtml selects which handler is asking: 0 for the HTML handler, non-zero
 * for the XHTML handler.
 *
 * Returns 0 if the content is not recognized (including a NULL or empty
 * stream), 75 if it matches the asking handler's flavour, and 25 if it
 * matches the other flavour. */
static int recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state, int xhtml)
{
	uint8_t buffer[4096];
	size_t i, n;
	/* Progress counter; its meaning depends on the current state (characters
	 * of a keyword matched so far, or a run of '-' inside a comment). */
	size_t m = 0;
	enum {
		state_top,
		state_open,
		state_pling,
		state_query,
		state_maybe_doctype,
		state_maybe_doctype_ws,
		state_maybe_doctype_html,
		state_maybe_doctype_html_xhtml,
		state_maybe_comment,
		state_maybe_html,
		state_maybe_html_xhtml,
		state_comment
	};
	int state = state_top;
	int type = 0;

	if (hstate)
		*hstate = NULL;
	if (free_state)
		*free_state = NULL;

	if (stream == NULL)
		return 0;

	/* Simple state machine. Search for "<!doctype html" or "<html" in the first
	 * 4K of the file, allowing for comments and whitespace and case insensitivity. */

	n = fz_read(ctx, stream, buffer, sizeof(buffer));
	fz_seek(ctx, stream, 0, SEEK_SET);
	if (n == 0)
		return 0;

	i = 0;
	if (n >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
	{
		/* UTF-8 encoded BOM. Just skip it. */
		i = 3;
	}
	else if (n >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
	{
		/* UTF-16, big endian. */
		type = 1;
		i = 2;
		/* Round down to a whole number of 16-bit units. */
		n &= ~(size_t)1;
	}
	else if (n >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
	{
		/* UTF-16, little endian. */
		i = 2;
		type = 2;
		/* Round down to a whole number of 16-bit units. */
		n &= ~(size_t)1;
	}

	while (i < n)
	{
		int c = 0;

		switch (type)
		{
		case 0: /* UTF-8 */
			c = buffer[i++];
			break;
		case 1: /* UTF-16 - big endian */
			c = buffer[i++] << 8;
			c |= buffer[i++];
			break;
		case 2: /* UTF-16 - little endian */
			c = buffer[i++];
			c |= buffer[i++] << 8;
			break;
		}

		switch (state)
		{
		case state_top:
			if (isws(c))
				continue; /* whitespace */
			if (c == '<')
				state = state_open;
			else
				return 0; /* Non whitespace found at the top level prior to a known tag. Fail. */
			break;
		case state_open:
			if (isws(c))
				continue; /* whitespace */
			if (c == '!')
				state = state_pling;
			else if (c == '?')
				state = state_query;
			else if (c == 'h' || c == 'H')
				state = state_maybe_html;
			else
				return 0; /* Not an acceptable opening tag. */
			m = 0;
			break;
		case state_query:
			if (c == '>')
				state = state_top;
			break;
		case state_pling:
			if (isws(c))
				continue; /* whitespace */
			else if (c == '-')
				state = state_maybe_comment;
			else if (c == 'd' || c == 'D')
				state = state_maybe_doctype;
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_comment:
			if (c == '-')
				state = state_comment;
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_comment:
			/* m counts consecutive '-'; "-->" (two or more dashes, then '>')
			 * ends the comment. */
			if (c == '-')
			{
				m++;
			}
			else if (c == '>' && m >= 2)
			{
				state = state_top;
			}
			else
				m = 0;
			break;
		case state_maybe_doctype:
			if (c == "octype"[m] || c == "OCTYPE"[m])
			{
				m++;
				if (m == 6)
				{
					state = state_maybe_doctype_ws;
					m = 0;
				}
			}
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_doctype_ws:
			/* At least one whitespace character must separate "doctype"
			 * from "html". */
			if (isws(c))
				m++;
			else if (m > 0 && (c == 'h' || c == 'H'))
			{
				state = state_maybe_doctype_html;
				m = 0;
			}
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_doctype_html:
			if (c == "tml"[m] || c == "TML"[m])
			{
				m++;
				if (m == 3)
				{
					state = state_maybe_doctype_html_xhtml;
					m = 0;
				}
			}
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_html:
			if (c == "tml"[m] || c == "TML"[m])
			{
				m++;
				if (m == 3)
				{
					state = state_maybe_html_xhtml;
					m = 0;
				}
			}
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_doctype_html_xhtml:
		case state_maybe_html_xhtml:
			/* Both tags are scanned the same way from here on: look for
			 * "xhtml" anywhere before the closing '>'. */
			if (c == '>')
			{
				/* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
				 * but we'd rather the html one did (75). */
				return xhtml ? 25 : 75;
			}
			if (c >= 'A'  && c <= 'Z')
				c += 'a'-'A';
			if (c == "xhtml"[m])
			{
				m++;
				if (m == 5)
				{
					/* xhtml - the xhtml agent would be better (75) than the html
					 * agent (25). */
					return xhtml ? 75 : 25;
				}
			}
			else
			{
				/* Restart matching, but recheck c against the start, so that
				 * an 'x' directly before "xhtml" does not hide the match. */
				m = (c == 'x');
			}
			break;
		}
	}

	return 0;
}

/* recognize_content entry point for the HTML handler: run the shared
 * recognizer with the xhtml flag cleared. */
int htdoc_recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state)
{
	const int asking_as_xhtml = 0;

	return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, asking_as_xhtml);
}

/* Format descriptor used by the generic HTML handler.
 * NOTE(review): positional initializer; the struct is declared outside this
 * file. Judging by how the fields are read in this file, the values are
 * presumably format_name, convert_to_html, try_xml, try_html5, patch_mobi -
 * confirm against the declaration. */
static const fz_htdoc_format_t fz_htdoc_html5 =
{
	"HTML5",
	NULL,
	0, 1, 0
};

/* open callback for the HTML handler: open the stream using the HTML5 format
 * descriptor. handler, accel and state are not used. */
static fz_document *
htdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
{
	const fz_htdoc_format_t *fmt = &fz_htdoc_html5;

	return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, fmt);
}

/* File extensions claimed by the HTML handler (NULL terminated). */
static const char *htdoc_extensions[] =
{
	"htm",
	"html",
	NULL
};

/* MIME types claimed by the HTML handler (NULL terminated). */
static const char *htdoc_mimetypes[] =
{
	"text/html",
	NULL
};

/* Handler record for HTML documents.
 * NOTE(review): positional initializer; fz_document_handler is declared
 * outside this file, so the meaning of the leading NULL and the trailing 1
 * cannot be read off here - confirm against its declaration. */
fz_document_handler html_document_handler =
{
	NULL,
	htdoc_open_document,
	htdoc_extensions,
	htdoc_mimetypes,
	htdoc_recognize_html_content,
	1
};

/* XHTML document handler */

/* Format descriptor used by the XHTML handler.
 * NOTE(review): positional initializer; presumably format_name,
 * convert_to_html, try_xml, try_html5, patch_mobi - confirm against the
 * struct declaration. */
static const fz_htdoc_format_t fz_htdoc_xhtml =
{
	"XHTML",
	NULL,
	1, 1, 0
};

/* open callback for the XHTML handler: open the stream using the XHTML
 * format descriptor. handler, accel and state are not used. */
static fz_document *
xhtdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
{
	const fz_htdoc_format_t *fmt = &fz_htdoc_xhtml;

	return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, fmt);
}

/* recognize_content entry point for the XHTML handler: run the shared
 * recognizer with the xhtml flag set. */
int xhtdoc_recognize_xhtml_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state)
{
	const int asking_as_xhtml = 1;

	return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, asking_as_xhtml);
}

/* File extensions claimed by the XHTML handler (NULL terminated). */
static const char *xhtdoc_extensions[] =
{
	"xhtml",
	NULL
};

/* MIME types claimed by the XHTML handler (NULL terminated). */
static const char *xhtdoc_mimetypes[] =
{
	"application/xhtml+xml",
	NULL
};

/* Handler record for XHTML documents.
 * NOTE(review): positional initializer; the meaning of the leading NULL and
 * the trailing 1 depends on the fz_document_handler declaration, which is
 * outside this file - confirm there. */
fz_document_handler xhtml_document_handler =
{
	NULL,
	xhtdoc_open_document,
	xhtdoc_extensions,
	xhtdoc_mimetypes,
	xhtdoc_recognize_xhtml_content,
	1
};

/* FB2 document handler */

/* Format descriptor used by the FictionBook2 handler.
 * NOTE(review): positional initializer; presumably format_name,
 * convert_to_html, try_xml, try_html5, patch_mobi - confirm against the
 * struct declaration. */
static const fz_htdoc_format_t fz_htdoc_fb2 =
{
	"FictionBook2",
	NULL,
	1, 0, 0
};

/* open callback for the FB2 handler: open the stream using the FictionBook2
 * format descriptor. handler, accel and state are not used. */
static fz_document *
fb2doc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
{
	const fz_htdoc_format_t *fmt = &fz_htdoc_fb2;

	return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, fmt);
}

/* recognize_content callback for FB2.
 *
 * Reads at most 4096 bytes, one at a time, from the stream's current
 * position and looks for the literal text "<FictionBook". Returns 100 as
 * soon as it is found; returns 0 on end of stream, when the byte budget is
 * used up, or when stream is NULL. *state and *free_state, when the pointers
 * are non-NULL, are set to NULL. handler and dir are not used. */
static int
fb2doc_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
{
	static const char needle[] = "<FictionBook";
	const int needle_len = (int)(sizeof(needle) - 1);
	int matched = 0;
	int budget;

	if (state)
		*state = NULL;
	if (free_state)
		*free_state = NULL;

	if (stream == NULL)
		return 0;

	for (budget = 4096; budget > 0; budget--)
	{
		int c = fz_read_byte(ctx, stream);

		if (c == EOF)
			return 0;

		if (c != needle[matched])
		{
			/* Restart matching, but recheck c against the start. */
			matched = (c == needle[0]);
			continue;
		}

		if (++matched == needle_len)
			return 100;
	}

	return 0;
}

/* File extensions claimed by the FB2 handler (NULL terminated). */
static const char *fb2doc_extensions[] =
{
	"fb2",
	"xml",
	NULL
};

/* MIME types claimed by the FB2 handler (NULL terminated). */
static const char *fb2doc_mimetypes[] =
{
	"application/x-fictionbook",
	"application/xml",
	"text/xml",
	NULL
};

/* Handler record for FictionBook2 documents.
 * NOTE(review): positional initializer. Unlike the HTML and XHTML records in
 * this file, there is no sixth value here, so any further members are zero
 * initialized - confirm that is intended against the fz_document_handler
 * declaration. */
fz_document_handler fb2_document_handler =
{
	NULL,
	fb2doc_open_document,
	fb2doc_extensions,
	fb2doc_mimetypes,
	fb2doc_recognize_content
};

/* Mobi document handler */

/* Format descriptor used by the MOBI handler.
 * NOTE(review): positional initializer; presumably format_name,
 * convert_to_html, try_xml, try_html5, patch_mobi - confirm against the
 * struct declaration. */
static const fz_htdoc_format_t fz_htdoc_mobi =
{
	"MOBI",
	NULL,
	1, 1, 1
};

/* Open a MOBI document held in a buffer.
 *
 * The buffer is converted into an archive, the "index.html" entry is read
 * from it, and that entry is opened as an HTML document with the archive as
 * its directory. Ownership of mobi passes to this function: it is dropped
 * before returning, whether or not opening succeeded, as is the local
 * reference to the archive. Errors are rethrown. */
static fz_document *
mobi_open_document_with_buffer(fz_context *ctx, fz_buffer *mobi)
{
	fz_archive *archive = NULL;
	fz_buffer *index_html = NULL;
	fz_document *result = NULL;

	fz_var(archive);

	fz_try(ctx)
	{
		archive = fz_extract_html_from_mobi(ctx, mobi);
		index_html = fz_read_archive_entry(ctx, archive, "index.html");
		/* The callee takes over index_html and drops it itself. */
		result = fz_htdoc_open_document_with_buffer(ctx, archive, index_html, &fz_htdoc_mobi);
	}
	fz_always(ctx)
	{
		fz_drop_buffer(ctx, mobi);
		fz_drop_archive(ctx, archive);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}

	return result;
}

/* recognize_content callback for MOBI.
 *
 * Seeks to byte offset 60 (32 + 28) and reads 8 bytes. Returns 100 if they
 * spell "BOOKMOBI" or "TEXtREAd", and 0 otherwise (including a short read or
 * a NULL stream). *state and *free_state, when the pointers are non-NULL,
 * are set to NULL. handler and dir are not used. */
static int
mobi_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
{
	unsigned char magic[8];

	if (state)
		*state = NULL;
	if (free_state)
		*free_state = NULL;

	if (stream == NULL)
		return 0;

	fz_seek(ctx, stream, 32 + 28, SEEK_SET);
	if (fz_read(ctx, stream, magic, sizeof(magic)) != sizeof(magic))
		return 0;

	if (memcmp(magic, "BOOKMOBI", 8) == 0 || memcmp(magic, "TEXtREAd", 8) == 0)
		return 100;

	return 0;
}

/* open callback for the MOBI handler: read the whole file into a buffer and
 * open it from there. handler, accel, dir and state are not used. */
static fz_document *
mobi_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
{
	fz_buffer *raw = fz_read_all(ctx, file, 0);

	return mobi_open_document_with_buffer(ctx, raw);
}

/* File extensions claimed by the MOBI handler (NULL terminated). */
static const char *mobi_extensions[] =
{
	"mobi",
	"prc",
	"pdb",
	NULL
};

/* MIME types claimed by the MOBI handler (NULL terminated). */
static const char *mobi_mimetypes[] =
{
	"application/x-mobipocket-ebook",
	NULL
};

/* Handler record for MOBI documents.
 * NOTE(review): positional initializer. Unlike the HTML and XHTML records in
 * this file, there is no sixth value here, so any further members are zero
 * initialized - confirm that is intended against the fz_document_handler
 * declaration. */
fz_document_handler mobi_document_handler =
{
	NULL,
	mobi_open_document,
	mobi_extensions,
	mobi_mimetypes,
	mobi_recognize_content
};