diff mupdf-source/source/html/html-doc.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/html/html-doc.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,751 @@
+// Copyright (C) 2004-2024 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+#include "html-imp.h"
+
+#include <string.h>
+#include <math.h>
+
+enum { T, R, B, L };
+
+typedef struct
+{
+	fz_document super;
+	fz_archive *zip;
+	fz_html_font_set *set;
+	fz_html *html;
+	fz_outline *outline;
+	const fz_htdoc_format_t *format;
+} html_document;
+
+typedef struct
+{
+	fz_page super;
+	html_document *doc;
+	int number;
+} html_page;
+
+static void
+htdoc_drop_document(fz_context *ctx, fz_document *doc_)
+{
+	html_document *doc = (html_document*)doc_;
+	fz_drop_archive(ctx, doc->zip);
+	fz_drop_html(ctx, doc->html);
+	fz_drop_html_font_set(ctx, doc->set);
+	fz_drop_outline(ctx, doc->outline);
+}
+
+static fz_link_dest
+htdoc_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest)
+{
+	html_document *doc = (html_document*)doc_;
+	const char *s = strchr(dest, '#');
+	if (s && s[1] != 0)
+	{
+		float y = fz_find_html_target(ctx, doc->html, s+1);
+		if (y >= 0)
+		{
+			int page = y / doc->html->page_h;
+			return fz_make_link_dest_xyz(0, page, 0, y - page * doc->html->page_h, 0);
+		}
+	}
+
+	return fz_make_link_dest_none();
+}
+
+static int
+htdoc_count_pages(fz_context *ctx, fz_document *doc_, int chapter)
+{
+	html_document *doc = (html_document*)doc_;
+	if (doc->html->tree.root->s.layout.b > 0)
+		return ceilf(doc->html->tree.root->s.layout.b / doc->html->page_h);
+	return 1;
+}
+
+static void
+htdoc_update_outline(fz_context *ctx, fz_document *doc, fz_outline *node)
+{
+	while (node)
+	{
+		fz_link_dest dest = htdoc_resolve_link(ctx, doc, node->uri);
+		node->page = dest.loc;
+		node->x = dest.x;
+		node->y = dest.y;
+		htdoc_update_outline(ctx, doc, node->down);
+		node = node->next;
+	}
+}
+
+static void
+htdoc_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em)
+{
+	html_document *doc = (html_document*)doc_;
+
+	fz_layout_html(ctx, doc->html, w, h, em);
+
+	htdoc_update_outline(ctx, doc_, doc->outline);
+}
+
+static void
+htdoc_drop_page(fz_context *ctx, fz_page *page_)
+{
+}
+
+static fz_rect
+htdoc_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box)
+{
+	html_page *page = (html_page*)page_;
+	html_document *doc = page->doc;
+	fz_rect bbox;
+	bbox.x0 = 0;
+	bbox.y0 = 0;
+	bbox.x1 = doc->html->page_w + doc->html->page_margin[L] + doc->html->page_margin[R];
+	bbox.y1 = doc->html->page_h + doc->html->page_margin[T] + doc->html->page_margin[B];
+	return bbox;
+}
+
+static void
+htdoc_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie)
+{
+	html_page *page = (html_page*)page_;
+	html_document *doc = page->doc;
+	fz_draw_html(ctx, dev, ctm, doc->html, page->number);
+}
+
+static fz_link *
+htdoc_load_links(fz_context *ctx, fz_page *page_)
+{
+	html_page *page = (html_page*)page_;
+	html_document *doc = page->doc;
+	return fz_load_html_links(ctx, doc->html, page->number, "");
+}
+
+static fz_bookmark
+htdoc_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc)
+{
+	html_document *doc = (html_document*)doc_;
+	return fz_make_html_bookmark(ctx, doc->html, loc.page);
+}
+
+static fz_location
+htdoc_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark)
+{
+	html_document *doc = (html_document*)doc_;
+	return fz_make_location(0, fz_lookup_html_bookmark(ctx, doc->html, mark));
+}
+
+static fz_page *
+htdoc_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number)
+{
+	html_document *doc = (html_document*)doc_;
+	html_page *page = fz_new_derived_page(ctx, html_page, doc_);
+	page->super.bound_page = htdoc_bound_page;
+	page->super.run_page_contents = htdoc_run_page;
+	page->super.load_links = htdoc_load_links;
+	page->super.drop_page = htdoc_drop_page;
+	page->doc = doc;
+	page->number = number;
+	return (fz_page*)page;
+}
+
+static fz_outline *
+htdoc_load_outline(fz_context *ctx, fz_document *doc_)
+{
+	html_document *doc = (html_document*)doc_;
+	return fz_keep_outline(ctx, doc->outline);
+}
+
+static int
+htdoc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size)
+{
+	html_document *doc = (html_document *)doc_;
+	if (!strcmp(key, FZ_META_FORMAT))
+		return 1 + (int)fz_strlcpy(buf, doc->format->format_name, size);
+	if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title)
+		return 1 + (int)fz_strlcpy(buf, doc->html->title, size);
+	return -1;
+}
+
+static fz_html *
+generic_parse(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buffer_in, const char *user_css, const fz_htdoc_format_t *format)
+{
+	fz_buffer *buffer_html = NULL;
+	fz_html *html = NULL;
+
+	fz_try(ctx)
+	{
+		if (format->convert_to_html)
+			buffer_html = format->convert_to_html(ctx, set, buffer_in, zip, user_css);
+		else
+			buffer_html = fz_keep_buffer(ctx, buffer_in);
+		html = fz_parse_html(ctx, set, zip, base_uri, buffer_html, user_css, format->try_xml, format->try_html5, format->patch_mobi);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer_html);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_html(ctx, html);
+		fz_rethrow(ctx);
+	}
+	return html;
+}
+
+fz_document *
+fz_htdoc_open_document_with_buffer(fz_context *ctx, fz_archive *dir, fz_buffer *buf, const fz_htdoc_format_t *format)
+{
+	html_document *doc = NULL;
+
+	fz_var(doc);
+	fz_var(dir);
+
+	fz_try(ctx)
+	{
+		doc = fz_new_derived_document(ctx, html_document);
+		doc->super.drop_document = htdoc_drop_document;
+		doc->super.layout = htdoc_layout;
+		doc->super.load_outline = htdoc_load_outline;
+		doc->super.resolve_link_dest = htdoc_resolve_link;
+		doc->super.make_bookmark = htdoc_make_bookmark;
+		doc->super.lookup_bookmark = htdoc_lookup_bookmark;
+		doc->super.count_pages = htdoc_count_pages;
+		doc->super.load_page = htdoc_load_page;
+		doc->super.lookup_metadata = htdoc_lookup_metadata;
+		doc->super.is_reflowable = 1;
+
+		doc->zip = fz_keep_archive(ctx, dir);
+		doc->format = format;
+		doc->set = fz_new_html_font_set(ctx);
+		doc->html = generic_parse(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx), format);
+		doc->outline = fz_load_html_outline(ctx, doc->html);
+	}
+	fz_always(ctx)
+		fz_drop_buffer(ctx, buf);
+	fz_catch(ctx)
+	{
+		fz_drop_document(ctx, &doc->super);
+		fz_rethrow(ctx);
+	}
+
+	return (fz_document*)doc;
+}
+
+fz_document *
+fz_htdoc_open_document_with_stream_and_dir(fz_context *ctx, fz_stream *stm, fz_archive *dir, const fz_htdoc_format_t *format)
+{
+	fz_buffer *buf = NULL;
+
+	if (stm)
+		buf = fz_read_all(ctx, stm, 0);
+
+	return fz_htdoc_open_document_with_buffer(ctx, dir, buf, format);
+}
+
+/* Variant specific functions */
+
+/* Generic HTML document handler */
+
+static int isws(int c)
+{
+	return c == 32 || c == 9 || c == 10 || c == 13 || c == 12;
+}
+
+static int recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state, int xhtml)
+{
+	uint8_t buffer[4096];
+	size_t i, n, m;
+	enum {
+		state_top,
+		state_open,
+		state_pling,
+		state_query,
+		state_maybe_doctype,
+		state_maybe_doctype_ws,
+		state_maybe_doctype_html,
+		state_maybe_doctype_html_xhtml,
+		state_maybe_comment,
+		state_maybe_html,
+		state_maybe_html_xhtml,
+		state_comment
+	};
+	int state = state_top;
+	int type = 0;
+
+	if (hstate)
+		*hstate = NULL;
+	if (free_state)
+		*free_state = NULL;
+
+	if (stream == NULL)
+		return 0;
+
+	/* Simple state machine. Search for "<!doctype html" or "<html" in the first
+	 * 4K of the file, allowing for comments and whitespace and case insensitivity. */
+
+	n = fz_read(ctx, stream, buffer, sizeof(buffer));
+	fz_seek(ctx, stream, 0, SEEK_SET);
+	if (n == 0)
+		return 0;
+
+	i = 0;
+	if (n >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
+	{
+		/* UTF-8 encoded BOM. Just skip it. */
+		i = 3;
+	}
+	else if (n >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
+	{
+		/* UTF-16, big endian. */
+		type = 1;
+		i = 2;
+		n &= ~1;
+	}
+	else if (n >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
+	{
+		/* UTF-16, little endian. */
+		i = 2;
+		type = 2;
+		n &= ~1;
+	}
+
+	while (i < n)
+	{
+		int c;
+
+		switch (type)
+		{
+		case 0: /* UTF-8 */
+			c = buffer[i++];
+			break;
+		case 1: /* UTF-16 - big endian */
+			c = buffer[i++] << 8;
+			c |= buffer[i++];
+			break;
+		case 2: /* UTF-16 - little endian */
+			c = buffer[i++];
+			c |= buffer[i++] << 8;
+			break;
+		}
+
+		switch (state)
+		{
+		case state_top:
+			if (isws(c))
+				continue; /* whitespace */
+			if (c == '<')
+				state = state_open;
+			else
+				return 0; /* Non whitespace found at the top level prior to a known tag. Fail. */
+			break;
+		case state_open:
+			if (isws(c))
+				continue; /* whitespace */
+			if (c == '!')
+				state = state_pling;
+			else if (c == '?')
+				state = state_query;
+			else if (c == 'h' || c == 'H')
+				state = state_maybe_html;
+			else
+				return 0; /* Not an acceptable opening tag. */
+			m = 0;
+			break;
+		case state_query:
+			if (c == '>')
+				state = state_top;
+			break;
+		case state_pling:
+			if (isws(c))
+				continue; /* whitespace */
+			else if (c == '-')
+				state = state_maybe_comment;
+			else if (c == 'd' || c == 'D')
+				state = state_maybe_doctype;
+			else
+				return 0; /* Not an acceptable opening tag. */
+			break;
+		case state_maybe_comment:
+			if (c == '-')
+				state = state_comment;
+			else
+				return 0; /* Not an acceptable opening tag. */
+			break;
+		case state_comment:
+			if (c == '-')
+			{
+				m++;
+			}
+			else if (c == '>' && m >= 2)
+			{
+				state = state_top;
+			}
+			else
+				m = 0;
+			break;
+		case state_maybe_doctype:
+			if (c == "octype"[m] || c == "OCTYPE"[m])
+			{
+				m++;
+				if (m == 6)
+				{
+					state = state_maybe_doctype_ws;
+					m = 0;
+				}
+			}
+			else
+				return 0; /* Not an acceptable opening tag. */
+			break;
+		case state_maybe_doctype_ws:
+			if (isws(c))
+				m++;
+			else if (m > 0 && (c == 'h' || c == 'H'))
+			{
+				state = state_maybe_doctype_html;
+				m = 0;
+			}
+			else
+				return 0; /* Not an acceptable opening tag. */
+			break;
+		case state_maybe_doctype_html:
+			if (c == "tml"[m] || c == "TML"[m])
+			{
+				m++;
+				if (m == 3)
+				{
+					state = state_maybe_doctype_html_xhtml;
+					m = 0;
+				}
+			}
+			else
+				return 0; /* Not an acceptable opening tag. */
+			break;
+		case state_maybe_doctype_html_xhtml:
+			if (c == '>')
+			{
+				/* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
+				 * but we'd rather the html one did (75). */
+				return xhtml ? 25 : 75;
+			}
+			if (c >= 'A'  && c <= 'Z')
+				c += 'a'-'A';
+			if (c == "xhtml"[m])
+			{
+				m++;
+				if (m == 5)
+				{
+					/* xhtml - the xhtml agent would be better (75) than the html
+					 * agent (25). */
+					return xhtml ? 75 : 25;
+				}
+			}
+			else
+				m = 0;
+			break;
+		case state_maybe_html:
+			if (c == "tml"[m] || c == "TML"[m])
+			{
+				m++;
+				if (m == 3)
+				{
+					state = state_maybe_html_xhtml;
+					m = 0;
+				}
+			}
+			else
+				return 0; /* Not an acceptable opening tag. */
+			break;
+		case state_maybe_html_xhtml:
+			if (c == '>')
+			{
+				/* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
+				 * but we'd rather the html one did (75). */
+				return xhtml ? 25 : 75;
+			}
+			if (c >= 'A'  && c <= 'Z')
+				c += 'a'-'A';
+			if (c == "xhtml"[m])
+			{
+				m++;
+				if (m == 5)
+				{
+					/* xhtml - the xhtml agent would be better (75) than the html
+					 * agent (25). */
+					return xhtml ? 75 : 25;
+				}
+			}
+			else
+				m = 0;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+int htdoc_recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state)
+{
+	return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 0);
+}
+
+static const fz_htdoc_format_t fz_htdoc_html5 =
+{
+	"HTML5",
+	NULL,
+	0, 1, 0
+};
+
+static fz_document *
+htdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
+{
+	return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_html5);
+}
+
+static const char *htdoc_extensions[] =
+{
+	"htm",
+	"html",
+	NULL
+};
+
+static const char *htdoc_mimetypes[] =
+{
+	"text/html",
+	NULL
+};
+
+fz_document_handler html_document_handler =
+{
+	NULL,
+	htdoc_open_document,
+	htdoc_extensions,
+	htdoc_mimetypes,
+	htdoc_recognize_html_content,
+	1
+};
+
+/* XHTML document handler */
+
+static const fz_htdoc_format_t fz_htdoc_xhtml =
+{
+	"XHTML",
+	NULL,
+	1, 1, 0
+};
+
+static fz_document *
+xhtdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
+{
+	return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_xhtml);
+}
+
+int xhtdoc_recognize_xhtml_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state)
+{
+	return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 1);
+}
+
+static const char *xhtdoc_extensions[] =
+{
+	"xhtml",
+	NULL
+};
+
+static const char *xhtdoc_mimetypes[] =
+{
+	"application/xhtml+xml",
+	NULL
+};
+
+fz_document_handler xhtml_document_handler =
+{
+	NULL,
+	xhtdoc_open_document,
+	xhtdoc_extensions,
+	xhtdoc_mimetypes,
+	xhtdoc_recognize_xhtml_content,
+	1
+};
+
+/* FB2 document handler */
+
+static const fz_htdoc_format_t fz_htdoc_fb2 =
+{
+	"FictionBook2",
+	NULL,
+	1, 0, 0
+};
+
+static fz_document *
+fb2doc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
+{
+	return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_fb2);
+}
+
+static int
+fb2doc_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
+{
+	const char *match = "<FictionBook";
+	int pos = 0;
+	int n = 4096;
+	int c;
+
+	if (state)
+		*state = NULL;
+	if (free_state)
+		*free_state = NULL;
+
+	if (stream == NULL)
+		return 0;
+
+	do
+	{
+		c = fz_read_byte(ctx, stream);
+		if (c == EOF)
+			return 0;
+		if (c == match[pos])
+		{
+			pos++;
+			if (pos == 12)
+				return 100;
+		}
+		else
+		{
+			/* Restart matching, but recheck c against the start. */
+			pos = (c == match[0]);
+		}
+	}
+	while (--n > 0);
+
+	return 0;
+}
+
+static const char *fb2doc_extensions[] =
+{
+	"fb2",
+	"xml",
+	NULL
+};
+
+static const char *fb2doc_mimetypes[] =
+{
+	"application/x-fictionbook",
+	"application/xml",
+	"text/xml",
+	NULL
+};
+
+fz_document_handler fb2_document_handler =
+{
+	NULL,
+	fb2doc_open_document,
+	fb2doc_extensions,
+	fb2doc_mimetypes,
+	fb2doc_recognize_content
+};
+
+/* Mobi document handler */
+
+static const fz_htdoc_format_t fz_htdoc_mobi =
+{
+	"MOBI",
+	NULL,
+	1, 1, 1
+};
+
+static fz_document *
+mobi_open_document_with_buffer(fz_context *ctx, fz_buffer *mobi)
+{
+	fz_archive *dir = NULL;
+	fz_buffer *html;
+	fz_document *doc;
+	fz_var(dir);
+	fz_try(ctx)
+	{
+		dir = fz_extract_html_from_mobi(ctx, mobi);
+		html = fz_read_archive_entry(ctx, dir, "index.html");
+		doc = fz_htdoc_open_document_with_buffer(ctx, dir, html, &fz_htdoc_mobi);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, mobi);
+		fz_drop_archive(ctx, dir);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+	return doc;
+}
+
+static int
+mobi_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
+{
+	char text[8];
+
+	if (state)
+		*state = NULL;
+	if (free_state)
+		*free_state = NULL;
+
+	if (stream == NULL)
+		return 0;
+
+	fz_seek(ctx, stream, 32 + 28, SEEK_SET);
+	if (fz_read(ctx, stream, (unsigned char *)text, 8) != 8)
+		return 0;
+	if (memcmp(text, "BOOKMOBI", 8) == 0)
+		return 100;
+	if (memcmp(text, "TEXtREAd", 8) == 0)
+		return 100;
+
+	return 0;
+}
+
+static fz_document *
+mobi_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
+{
+	return mobi_open_document_with_buffer(ctx, fz_read_all(ctx, file, 0));
+}
+
+static const char *mobi_extensions[] =
+{
+	"mobi",
+	"prc",
+	"pdb",
+	NULL
+};
+
+static const char *mobi_mimetypes[] =
+{
+	"application/x-mobipocket-ebook",
+	NULL
+};
+
+fz_document_handler mobi_document_handler =
+{
+	NULL,
+	mobi_open_document,
+	mobi_extensions,
+	mobi_mimetypes,
+	mobi_recognize_content
+};