Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/source/html/office.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/html/office.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,1343 @@
+// Copyright (C) 2023-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+#include "html-imp.h"
+
+#undef DEBUG_OFFICE_TO_HTML
+
+/* Defaults are all 0's. FIXME: Very subject to change. Possibly might be removed entirely. */
+typedef struct
+{
+	int output_page_numbers;
+	int output_sheet_names;
+	int output_cell_markers;
+	int output_cell_row_markers;
+	int output_cell_names;
+	int output_formatting;
+	int output_filenames;
+	int output_errors;
+}
+fz_office_to_html_opts;
+
+typedef struct
+{
+	fz_office_to_html_opts opts;
+
+	fz_output *out;
+
+	int page;
+
+	/* State for if we are parsing a sheet. */
+	/* The last column label we have to send. */
+	char *label;
+	/* Columns are numbered from 1. */
+	/* The column we are at. */
+	int col_at;
+	/* The column we last signalled. If this is 0, then we haven't
+	 * even started a row yet. */
+	int col_signalled;
+
+	/* If we are currently processing a spreadsheet, store the current
+	 * sheets name here. */
+	const char *sheet_name;
+
+	int shared_string_max;
+	int shared_string_len;
+	char **shared_strings;
+
+	int footnotes_max;
+	char **footnotes;
+
+	char *title;
+} doc_info;
+
+static void
+doc_escape(fz_context *ctx, fz_output *output, const char *str_)
+{
+	const unsigned char *str = (const unsigned char *)str_;
+	int c;
+
+	if (!str)
+		return;
+
+	while ((c = *str++) != 0)
+	{
+		if (c == '&')
+		{
+			fz_write_string(ctx, output, "&amp;");
+		}
+		else if (c == '<')
+		{
+			fz_write_string(ctx, output, "&lt;");
+		}
+		else if (c == '>')
+		{
+			fz_write_string(ctx, output, "&gt;");
+		}
+		else
+		{
+			/* We get utf-8 in, just parrot it out again. */
+			fz_write_byte(ctx, output, c);
+		}
+	}
+}
+
+static void
+show_text(fz_context *ctx, fz_xml *top, doc_info *info)
+{
+	fz_xml *pos = top;
+	fz_xml *next;
+
+	while (pos)
+	{
+		doc_escape(ctx, info->out, fz_xml_text(pos));
+
+		if (fz_xml_is_tag(pos, "lineBreak"))
+		{
+			fz_write_string(ctx, info->out, "\n");
+		}
+		else if (fz_xml_is_tag(pos, "tab"))
+		{
+			fz_write_string(ctx, info->out, "\t");
+		}
+		else if (fz_xml_is_tag(pos, "lastRenderedPageBreak"))
+		{
+			info->page++;
+		}
+
+		/* Always try to move down. */
+		next = fz_xml_down(pos);
+		if (next)
+		{
+			/* We can move down, easy! */
+			pos = next;
+			continue;
+		}
+
+		if (pos == top)
+			break;
+
+		/* We can't move down, try moving to next. */
+		next = fz_xml_next(pos);
+		if (next)
+		{
+			/* We can move to next, easy! */
+			pos = next;
+			continue;
+		}
+
+		/* If we can't go down, or next, pop up until we
+		 * find somewhere we can go next from. */
+		while (1)
+		{
+			/* OK. So move up. */
+			pos = fz_xml_up(pos);
+			/* Check for hitting the top. */
+			if (pos == top)
+				pos = NULL;
+			if (pos == NULL)
+				break;
+			/* We've returned to a node. See if it's a 'p'. */
+			if (fz_xml_is_tag(pos, "p"))
+			{
+				fz_write_string(ctx, info->out, "\n");
+			}
+			next = fz_xml_next(pos);
+			if (next)
+			{
+				pos = next;
+				break;
+			}
+		}
+	}
+}
+
+static void
+show_footnote(fz_context *ctx, fz_xml *v, doc_info *info)
+{
+	int n = fz_atoi(fz_xml_att(v, "w:id"));
+
+	if (n < 0 || n >= info->footnotes_max)
+		return;
+
+	if (info->footnotes[n] == NULL ||
+		info->footnotes[n][0] == 0)
+		return;
+
+	/* Then send the strings. */
+	doc_escape(ctx, info->out, info->footnotes[n]);
+}
+
+static void
+process_doc_stream(fz_context *ctx, fz_xml *xml, doc_info *info, int do_pages)
+{
+	fz_xml *pos;
+	fz_xml *next;
+	const char *paragraph_style = NULL;
+	const char *inline_style = NULL;
+
+#ifdef DEBUG_OFFICE_TO_HTML
+	fz_write_printf(ctx, fz_stddbg(ctx), "process_doc_stream:\n");
+	fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
+#endif
+
+	/* First off, see if we can do page numbers. */
+	if (do_pages)
+	{
+		pos = fz_xml_find_dfs(xml, "lastRenderedPageBreak", NULL, NULL);
+		if (pos)
+		{
+			/* We *can* do page numbers, so start here. */
+			fz_write_string(ctx, info->out, "<div id=\"page1\">\n");
+			info->page = 1;
+		}
+	}
+
+	/* Now walk the tree for real. */
+	pos = xml;
+	while (pos)
+	{
+		/* When we arrive on a node, check if it's a 't'. */
+		if (fz_xml_is_tag(pos, "t"))
+		{
+			show_text(ctx, pos, info);
+			/* Do NOT go down, we've already dealt with that. */
+		}
+		else if (fz_xml_is_tag(pos, "br"))
+		{
+			if (paragraph_style && strcmp(paragraph_style, "pre"))
+			{
+				fz_write_printf(ctx, info->out, "<br/>\n");
+			}
+			else
+			{
+				fz_write_printf(ctx, info->out, "\n");
+			}
+		}
+		else if (fz_xml_is_tag(pos, "footnoteReference"))
+		{
+			show_footnote(ctx, pos, info);
+			/* Do NOT go down, we've already dealt with that. */
+		}
+		else if (fz_xml_is_tag(pos, "tabs"))
+		{
+			/* Don't walk through tabs, or we will hit lots of 'tab' entries and
+			 * output incorrect information. */
+		}
+		else if (fz_xml_is_tag(pos, "pStyle"))
+		{
+			/* Should prob fix fz_xml_*() to strip namespace prefix
+			from attributes, to match what it does for tag names.
+			*/
+			paragraph_style = fz_xml_att(pos, "w:val");
+			if (paragraph_style)
+			{
+				if (!strcmp(paragraph_style, "BodyText"))
+					paragraph_style = NULL;
+				else if (!strcmp(paragraph_style, "Heading1"))
+					paragraph_style = "h1";
+				else if (!strcmp(paragraph_style, "Heading2"))
+					paragraph_style = "h2";
+				else if (!strcmp(paragraph_style, "Heading3"))
+					paragraph_style = "h3";
+				else if (!strcmp(paragraph_style, "Heading4"))
+					paragraph_style = "h4";
+				else if (!strcmp(paragraph_style, "Heading5"))
+					paragraph_style = "h5";
+				else if (!strcmp(paragraph_style, "Heading6"))
+					paragraph_style = "h6";
+				else if (!strcmp(paragraph_style, "SourceCode"))
+					paragraph_style = "pre";
+				else
+					paragraph_style = NULL;
+
+				if (paragraph_style)
+					fz_write_printf(ctx, info->out, "<%s>", paragraph_style);
+			}
+		}
+		else if (fz_xml_is_tag(pos, "rStyle"))
+		{
+			inline_style = fz_xml_att(pos, "w:val");
+			if (inline_style)
+			{
+				if (!strcmp(inline_style, "VerbatimChar"))
+					inline_style = "tt";
+				else
+				{
+					if (0)
+						fz_write_printf(ctx, info->out, "<!-- %s -->", inline_style);
+					inline_style = NULL;
+				}
+				if (inline_style)
+					fz_write_printf(ctx, info->out, "<%s>", inline_style);
+			}
+		}
+		else
+		{
+			fz_xml *down;
+			if (fz_xml_is_tag(pos, "lineBreak"))
+			{
+				fz_write_string(ctx, info->out, "\n");
+			}
+			else if (fz_xml_is_tag(pos, "p"))
+			{
+				fz_write_string(ctx, info->out, "<p>");
+			}
+			else if (fz_xml_is_tag(pos, "tab"))
+			{
+				fz_write_string(ctx, info->out, "\t");
+			}
+			else if (do_pages && fz_xml_is_tag(pos, "lastRenderedPageBreak"))
+			{
+				if (info->page)
+					fz_write_string(ctx, info->out, "\n</div>\n");
+				info->page++;
+				fz_write_printf(ctx, info->out, "<div id=\"page%d\">\n", info->page);
+			}
+			/* Try to move down. */
+			down = fz_xml_down(pos);
+			if (down)
+			{
+				/* We can move down, easy! */
+				pos = down;
+				continue;
+			}
+		}
+		/* Try moving to next. */
+		next = fz_xml_next(pos);
+		if (next)
+		{
+			/* We can move to next, easy! */
+			pos = next;
+			continue;
+		}
+
+		/* If we can't go down, or next, pop up until we
+		 * find somewhere we can go next from. */
+		while (1)
+		{
+			/* OK. So move up. */
+			pos = fz_xml_up(pos);
+			/* Check for hitting the top. */
+			if (pos == NULL)
+				break;
+			/* We've returned to a node. See if it's a 'p'. */
+			if (fz_xml_is_tag(pos, "p"))
+			{
+				if (paragraph_style)
+				{
+					fz_write_printf(ctx, info->out, "</%s>", paragraph_style);
+					paragraph_style = NULL;
+				}
+				fz_write_string(ctx, info->out, "</p>\n");
+			}
+			else if (fz_xml_is_tag(pos, "r"))
+			{
+				/* Seems to be pseudo-close for rStyle. */
+				if (inline_style)
+				{
+					fz_write_printf(ctx, info->out, "</%s>", inline_style);
+					inline_style = NULL;
+				}
+			}
+			next = fz_xml_next(pos);
+			if (next)
+			{
+				pos = next;
+				break;
+			}
+		}
+	}
+
+	if (do_pages && info->page)
+		fz_write_string(ctx, info->out, "\n</div>\n");
+}
+
+static void
+process_item(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info, int do_pages)
+{
+	fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
+
+	fz_try(ctx)
+		process_doc_stream(ctx, xml, info, do_pages);
+	fz_always(ctx)
+		fz_drop_xml(ctx, xml);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static void
+process_rootfile(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
+{
+	fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 0);
+
+	fz_try(ctx)
+	{
+		/* FIXME: Should really search for these just inside 'spine'. */
+		fz_xml *pos = fz_xml_find_dfs(xml, "itemref", NULL, NULL);
+		while (pos)
+		{
+			char *idref = fz_xml_att(pos, "idref");
+			fz_xml *item = fz_xml_find_dfs(xml, "item", "id", idref);
+			while (item)
+			{
+				char *type = fz_xml_att(item, "media-type");
+				char *href = fz_xml_att(item, "href");
+				if (type && href && !strcmp(type, "application/xml"))
+				{
+					process_item(ctx, arch, href, info, 1);
+				}
+				item = fz_xml_find_next_dfs(pos, "item", "id", idref);
+			}
+			pos = fz_xml_find_next_dfs(pos, "itemref", NULL, NULL);
+		}
+	}
+	fz_always(ctx)
+		fz_drop_xml(ctx, xml);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+/* XLSX support */
+static char *
+make_rel_name(fz_context *ctx, const char *file)
+{
+	size_t z = strlen(file);
+	char *s = fz_malloc(ctx, z + 12);
+	char *t;
+	const char *p;
+	const char *slash = file;
+
+	for (p = file; *p != 0; p++)
+		if (*p == '/')
+			slash = p+1;
+
+	t = s;
+	if (slash != file)
+	{
+		memcpy(t, file, slash - file);
+		t += slash - file;
+	}
+	memcpy(t, "_rels/", 6);
+	t += 6;
+	memcpy(t, file + (slash - file), z - (slash - file));
+	t += z - (slash - file);
+	memcpy(t, ".rels", 6);
+
+	return s;
+}
+
+static char *lookup_rel(fz_context *ctx, fz_xml *rels, const char *id)
+{
+	fz_xml *pos;
+
+	if (id == NULL)
+		return NULL;
+
+	pos = fz_xml_find_dfs(rels, "Relationship", NULL, NULL);
+	while (pos)
+	{
+		char *id2 = fz_xml_att(pos, "Id");
+
+		if (id2 && !strcmp(id, id2))
+			return fz_xml_att(pos, "Target");
+
+		pos = fz_xml_find_next_dfs(pos, "Relationship", NULL, NULL);
+	}
+
+	return NULL;
+}
+
+static void
+send_cell_formatting(fz_context *ctx, doc_info *info)
+{
+	if (info->col_signalled == 0)
+	{
+		fz_write_string(ctx, info->out, "<tr>\n");
+		info->col_signalled = 1;
+		if (info->col_at > 1)
+			fz_write_string(ctx, info->out, "<td>");
+	}
+
+	/* Send the label */
+	while (info->col_signalled < info->col_at)
+	{
+		fz_write_string(ctx, info->out, "</td>");
+		info->col_signalled++;
+		if (info->col_signalled < info->col_at)
+			fz_write_string(ctx, info->out, "<td>");
+	}
+	if (info->sheet_name && info->sheet_name[0])
+		fz_write_printf(ctx, info->out, "<td id=\"%s!%s\">", info->sheet_name, info->label);
+	else
+		fz_write_printf(ctx, info->out, "<td id=\"%s\">", info->label);
+}
+
+static void
+show_shared_string(fz_context *ctx, fz_xml *v, doc_info *info)
+{
+	const char *t = fz_xml_text(fz_xml_down(v));
+	int n = fz_atoi(t);
+
+	if (n < 0 || n >= info->shared_string_len)
+		return;
+
+	if (info->shared_strings[n] == NULL ||
+		info->shared_strings[n][0] == 0)
+		return;
+
+	send_cell_formatting(ctx, info);
+	/* Then send the strings. */
+	doc_escape(ctx, info->out, info->shared_strings[n]);
+}
+
+static int
+col_from_label(const char *label)
+{
+	int col = 0;
+	int len = 26;
+	int base = 0;
+
+	/* If we can't read the column, return 0. */
+	if (label == NULL || *label < 'A' || *label > 'Z')
+		return 0;
+
+	/*	Each section (A-Z, AA-ZZ, AAA-ZZZ etc) is of len 'len', and starts
+	 *	at base index 'base'. Each section is 26 times as long, and starts
+	 *	at base + len from the previous section.
+	 *
+	 *	A:	col = 26 * 0 + 0 + 0
+	 *	AA:	col = (26 * 0 + 0 + 0) * 26 + 0 + 26 = 26
+	 *	AAA:	col = (((26 * 0 + 0 + 0) * 26 + 0 + 26)*26 + 0 + 26*26 = 26 + 26 * 26
+	 */
+	do
+	{
+		col = 26 * col + (*label++) - 'A' + base;
+		base += len;
+		len *= 26;
+	}
+	while (*label >= 'A' && *label <= 'Z');
+
+	return col+1;
+}
+
+static void
+show_cell_text(fz_context *ctx, fz_xml *top, doc_info *info)
+{
+	fz_xml *pos = top;
+	fz_xml *next;
+
+	while (pos)
+	{
+		char *text = fz_xml_text(pos);
+
+		if (text)
+		{
+			send_cell_formatting(ctx, info);
+			doc_escape(ctx, info->out, text);
+		}
+
+		/* Always try to move down. */
+		next = fz_xml_down(pos);
+		if (next)
+		{
+			/* We can move down, easy! */
+			pos = next;
+			continue;
+		}
+
+		if (pos == top)
+			break;
+
+		/* We can't move down, try moving to next. */
+		next = fz_xml_next(pos);
+		if (next)
+		{
+			/* We can move to next, easy! */
+			pos = next;
+			continue;
+		}
+
+		/* If we can't go down, or next, pop up until we
+		 * find somewhere we can go next from. */
+		while (1)
+		{
+			/* OK. So move up. */
+			pos = fz_xml_up(pos);
+			/* Check for hitting the top. */
+			if (pos == top)
+				pos = NULL;
+			if (pos == NULL)
+				break;
+			next = fz_xml_next(pos);
+			if (next)
+			{
+				pos = next;
+				break;
+			}
+		}
+	}
+}
+
+static void
+arrived_at_cell(fz_context *ctx, doc_info *info, const char *label)
+{
+	int col;
+
+	/* If we have a label queued, and no label is given here, then we're
+	 * processing a 'cell' callback after having had a 'cellname'
+	 * callback. So don't signal it twice! */
+	if (label == NULL && info->label)
+		return;
+
+	col = label ? col_from_label(label) : 0;
+
+	fz_free(ctx, info->label);
+	info->label = NULL;
+	info->label = label ? fz_strdup(ctx, label) : NULL;
+	info->col_at = col;
+}
+
+static void
+show_cell(fz_context *ctx, fz_xml *cell, doc_info *info)
+{
+	char *t = fz_xml_att(cell, "t");
+	fz_xml *v = fz_xml_find_down(cell, "v");
+	const char *r = fz_xml_att(cell, "r");
+
+	arrived_at_cell(ctx, info, r);
+	if (t && t[0] == 's' && t[1] == 0)
+		show_shared_string(ctx, v, info);
+	else
+		show_cell_text(ctx, v, info);
+}
+
+static void
+new_row(fz_context *ctx, doc_info *info)
+{
+	if (info->col_signalled)
+	{
+		/* We've sent at least one cell. So need to close the
+		 * td and tr */
+		fz_write_string(ctx, info->out, "</td>\n</tr>\n");
+	}
+	else
+	{
+		/* We've not sent anything for this row. Keep the counts
+		 * correct. */
+		fz_write_string(ctx, info->out, "<tr></tr>\n");
+	}
+	info->col_at = 1;
+	info->col_signalled = 0;
+	fz_free(ctx, info->label);
+	info->label = NULL;
+}
+
+static void
+process_sheet(fz_context *ctx, fz_archive *arch, const char *name, const char *file, doc_info *info)
+{
+	fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
+
+#ifdef DEBUG_OFFICE_TO_HTML
+	fz_write_printf(ctx, fz_stddbg(ctx), "process_sheet:\n");
+	fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
+#endif
+
+	fz_write_printf(ctx, info->out, "<table id=\"%s\">\n", name);
+
+	info->sheet_name = name;
+	info->col_at = 0;
+	info->col_signalled = 0;
+
+	fz_try(ctx)
+	{
+		fz_xml *pos = xml;
+		fz_xml *next;
+
+		while (pos)
+		{
+			/* When we arrive on a node, check if it's a cell. */
+			if (fz_xml_is_tag(pos, "c"))
+			{
+				show_cell(ctx, pos, info);
+				/* Do NOT go down, we've already dealt with that. */
+			}
+			else
+			{
+				/* Try to move down. */
+				next = fz_xml_down(pos);
+				if (next)
+				{
+					/* We can move down, easy! */
+					pos = next;
+					continue;
+				}
+			}
+			/* Try moving to next. */
+			next = fz_xml_next(pos);
+			if (next)
+			{
+				/* We can move to next, easy! */
+				pos = next;
+				continue;
+			}
+
+			/* If we can't go down, or next, pop up until we
+			 * find somewhere we can go next from. */
+			while (1)
+			{
+				/* OK. So move up. */
+				pos = fz_xml_up(pos);
+				/* Check for hitting the top. */
+				if (pos == NULL)
+					break;
+
+				/* We've returned to a node. See if it's a 'row'. */
+				if (fz_xml_is_tag(pos, "row"))
+					new_row(ctx, info);
+
+				next = fz_xml_next(pos);
+				if (next)
+				{
+					pos = next;
+					break;
+				}
+			}
+		}
+		if (info->col_signalled)
+			fz_write_printf(ctx, info->out, "</td>\n</tr>\n");
+		fz_write_printf(ctx, info->out, "</table>\n");
+	}
+	fz_always(ctx)
+		fz_drop_xml(ctx, xml);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static void
+process_slide(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
+{
+	fz_write_printf(ctx, info->out, "<div id=\"slide%d\">\n", info->page++);
+	process_item(ctx, arch, file, info, 0);
+	fz_write_printf(ctx, info->out, "</div>\n");
+}
+
+static char *
+make_absolute_path(fz_context *ctx, const char *abs, const char *rel)
+{
+	const char *a = abs;
+	const char *aslash = a;
+	int up = 0;
+	size_t z1, z2;
+	char *s;
+
+	if (rel == NULL)
+		return NULL;
+	if (abs == NULL || *rel == '/')
+		return fz_strdup(ctx, rel);
+
+	for (a = abs; *a != 0; a++)
+		if (*a == '/')
+			aslash = a+1;
+
+	while (rel[0] == '.')
+	{
+		if (rel[1] == '/')
+			rel += 2;
+		else if (rel[1] == '.' && rel[2] == '/')
+			rel += 3, up++;
+		else
+			fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
+	}
+	if (rel[0] == 0)
+		fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
+
+	while (up)
+	{
+		while (aslash != abs && aslash[-1] != '/')
+			aslash--;
+
+		up--;
+	}
+
+	z1 = aslash - abs;
+	z2 = strlen(rel);
+	s = fz_malloc(ctx, z1 + z2 + 1);
+	if (z1)
+		memcpy(s, abs, z1);
+	memcpy(s+z1, rel, z2+1);
+
+	return s;
+}
+
+static char *
+collate_t_content(fz_context *ctx, fz_xml *top)
+{
+	char *val = NULL;
+	fz_xml *next;
+	fz_xml *pos = fz_xml_down(top);
+
+	while (pos != top)
+	{
+		/* Capture all the 't' content. */
+		if (fz_xml_is_tag(pos, "t"))
+		{
+			/* Remember the content. */
+			char *s = fz_xml_text(fz_xml_down(pos));
+
+			if (s == NULL)
+			{
+				/* Do nothing */
+			}
+			else if (val == NULL)
+				val = fz_strdup(ctx, s);
+			else
+			{
+				char *val2;
+				size_t z1 = strlen(val);
+				size_t z2 = strlen(s) + 1;
+				fz_try(ctx)
+				{
+					val2 = fz_malloc(ctx, z1 + z2);
+				}
+				fz_catch(ctx)
+				{
+					fz_free(ctx, val);
+					fz_rethrow(ctx);
+				}
+				memcpy(val2, val, z1);
+				memcpy(val2 + z1, s, z2);
+				fz_free(ctx, val);
+				val = val2;
+			}
+			/* Do NOT go down, we've already dealt with that. */
+		}
+		else if (fz_xml_is_tag(pos, "rPr") || fz_xml_is_tag(pos, "rPh"))
+		{
+			/* We do not want the 't' content from within these. */
+		}
+		else
+		{
+			/* Try to move down. */
+			next = fz_xml_down(pos);
+			if (next)
+			{
+				/* We can move down, easy! */
+				pos = next;
+				continue;
+			}
+		}
+		/* Try moving to next. */
+		next = fz_xml_next(pos);
+		if (next)
+		{
+			/* We can move to next, easy! */
+			pos = next;
+			continue;
+		}
+
+		/* If we can't go down, or next, pop up until we
+		 * find somewhere we can go next from. */
+		while (1)
+		{
+			/* OK. So move up. */
+			pos = fz_xml_up(pos);
+			/* Check for hitting the top. */
+			if (pos == top)
+				break;
+			next = fz_xml_next(pos);
+			if (next)
+			{
+				pos = next;
+				break;
+			}
+		}
+	}
+
+	return val;
+}
+
+static fz_xml *
+try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
+{
+	if (!fz_has_archive_entry(ctx, arch, filename))
+		return NULL;
+
+	return fz_parse_xml_archive_entry(ctx, arch, filename, preserve_white);
+}
+
+static void
+load_shared_strings(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
+{
+	fz_xml *pos = fz_xml_find_dfs(rels, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings");
+	const char *ss_file = fz_xml_att(pos, "Target");
+	char *resolved = NULL;
+	fz_xml *xml = NULL;
+	char *str = NULL;
+
+	if (ss_file == NULL)
+		return;
+
+	fz_var(xml);
+	fz_var(str);
+	fz_var(resolved);
+
+	fz_try(ctx)
+	{
+		resolved = make_absolute_path(ctx, file, ss_file);
+		xml = fz_parse_xml_archive_entry(ctx, arch, resolved, 1);
+
+		pos = fz_xml_find_dfs(xml, "si", NULL, NULL);
+		while (pos)
+		{
+			int n = info->shared_string_len;
+			str = collate_t_content(ctx, pos);
+
+			if (n == info->shared_string_max)
+			{
+				int max = info->shared_string_max;
+				int newmax = max ? max * 2 : 1024;
+				char **arr = fz_realloc(ctx, info->shared_strings, sizeof(*arr) * newmax);
+				memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
+				info->shared_strings = arr;
+				info->shared_string_max = newmax;
+			}
+
+			info->shared_strings[n] = str;
+			str = NULL;
+			info->shared_string_len++;
+			pos = fz_xml_find_next_dfs(pos, "si", NULL, NULL);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_xml(ctx, xml);
+		fz_free(ctx, resolved);
+		fz_free(ctx, str);
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static void
+load_footnotes(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
+{
+	char *resolved = NULL;
+	fz_xml *xml = NULL;
+	char *str = NULL;
+
+	fz_var(xml);
+	fz_var(str);
+	fz_var(resolved);
+
+	fz_try(ctx)
+	{
+		fz_xml *pos;
+
+		resolved = make_absolute_path(ctx, file, "footnotes.xml");
+		xml = try_parse_xml_archive_entry(ctx, arch, resolved, 1);
+		if (xml == NULL)
+			break;
+
+		pos = fz_xml_find_dfs(xml, "footnote", NULL, NULL);
+		while (pos)
+		{
+			int n = fz_atoi(fz_xml_att(pos, "w:id"));
+
+			str = collate_t_content(ctx, pos);
+
+			if (str && n >= 0)
+			{
+				if (n >= info->footnotes_max)
+				{
+					int max = info->footnotes_max;
+					int newmax = max ? max * 2 : 1024;
+					char **arr;
+					if (newmax < n)
+						newmax = n+1;
+					arr = fz_realloc(ctx, info->footnotes, sizeof(*arr) * newmax);
+					memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
+					info->footnotes = arr;
+					info->footnotes_max = newmax;
+				}
+
+				info->footnotes[n] = str;
+				str = NULL;
+			}
+			pos = fz_xml_find_next_dfs(pos, "footnote", NULL, NULL);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_xml(ctx, xml);
+		fz_free(ctx, resolved);
+		fz_free(ctx, str);
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static void
+process_office_document(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
+{
+	char *file_rels;
+	fz_xml *xml = NULL;
+	fz_xml *rels = NULL;
+	char *resolved_rel = NULL;
+
+	if (file == NULL)
+		return;
+
+	file_rels = make_rel_name(ctx, file);
+
+	fz_var(resolved_rel);
+
+	fz_var(rels);
+	fz_var(xml);
+
+	fz_try(ctx)
+	{
+		fz_xml *pos;
+
+		rels = fz_parse_xml_archive_entry(ctx, arch, file_rels, 0);
+		xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
+
+		/* XLSX */
+		pos = fz_xml_find_dfs(xml, "sheet", NULL, NULL);
+		if (pos)
+		{
+			load_shared_strings(ctx, arch, rels, info, file);
+			while (pos)
+			{
+				char *name = fz_xml_att(pos, "name");
+				char *id = fz_xml_att(pos, "r:id");
+				char *sheet = lookup_rel(ctx, rels, id);
+
+				if (sheet)
+				{
+					resolved_rel = make_absolute_path(ctx, file, sheet);
+					process_sheet(ctx, arch, name, resolved_rel, info);
+					fz_free(ctx, resolved_rel);
+					resolved_rel = NULL;
+				}
+				pos = fz_xml_find_next_dfs(pos, "sheet", NULL, NULL);
+			}
+			break;
+		}
+
+		/* Let's try it as a powerpoint */
+		pos = fz_xml_find_dfs(xml, "sldId", NULL, NULL);
+		if (pos)
+		{
+			while (pos)
+			{
+				char *id = fz_xml_att(pos, "r:id");
+				char *sheet = lookup_rel(ctx, rels, id);
+
+				if (sheet)
+				{
+					resolved_rel = make_absolute_path(ctx, file, sheet);
+					process_slide(ctx, arch, resolved_rel, info);
+					fz_free(ctx, resolved_rel);
+					resolved_rel = NULL;
+				}
+				pos = fz_xml_find_next_dfs(pos, "sldId", NULL, NULL);
+			}
+			break;
+		}
+
+		/* Let's try it as word. */
+		{
+			load_footnotes(ctx, arch, rels, info, file);
+			process_doc_stream(ctx, xml, info, 1);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_xml(ctx, xml);
+		fz_drop_xml(ctx, rels);
+		fz_free(ctx, resolved_rel);
+		fz_free(ctx, file_rels);
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static void
+process_office_document_properties(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
+{
+	fz_xml *xml = NULL;
+	char *title;
+
+	fz_var(xml);
+
+	fz_try(ctx)
+	{
+		fz_xml *pos;
+
+		xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
+
+		pos = fz_xml_find_dfs(xml, "title", NULL, NULL);
+		title = fz_xml_text(fz_xml_down(pos));
+		if (title)
+		{
+			fz_write_string(ctx, info->out, "<title>");
+			doc_escape(ctx, info->out, title);
+			fz_write_string(ctx, info->out, "</title>");
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_xml(ctx, xml);
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static fz_buffer *
+fz_office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buffer_in, fz_archive *dir, const char *user_css, fz_office_to_html_opts *opts)
+{
+	fz_stream *stream = NULL;
+	fz_archive *archive = NULL;
+	fz_buffer *buffer_out = NULL;
+	fz_xml *xml = NULL;
+	fz_xml *pos = NULL;
+	fz_xml *rels = NULL;
+	const char *schema = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
+	const char *schema_props = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties";
+	doc_info info = { 0 };
+	int i;
+
+	fz_var(archive);
+	fz_var(stream);
+	fz_var(buffer_out);
+	fz_var(xml);
+	fz_var(rels);
+
+	if (opts)
+		info.opts = *opts;
+
+	fz_try(ctx)
+	{
+		if (buffer_in)
+		{
+			stream = fz_open_buffer(ctx, buffer_in);
+			archive = fz_open_archive_with_stream(ctx, stream);
+		}
+		else
+			archive = fz_keep_archive(ctx, dir);
+		buffer_out = fz_new_buffer(ctx, 1024);
+		info.out = fz_new_output_with_buffer(ctx, buffer_out);
+
+		/* Is it an HWPX ?*/
+		xml = try_parse_xml_archive_entry(ctx, archive, "META-INF/container.xml", 0);
+		if (xml)
+		{
+			pos = fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml");
+			if (!pos)
+				fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not hwpx.");
+
+			while (pos)
+			{
+				const char *file = fz_xml_att(pos, "full-path");
+				process_rootfile(ctx, archive, file, &info);
+				pos = fz_xml_find_next_dfs(pos, "rootfile", "media-type", "application/hwpml-package+xml");
+			}
+			fz_close_output(ctx, info.out);
+			break;
+		}
+
+		/* Try other types */
+		{
+			xml = try_parse_xml_archive_entry(ctx, archive, "_rels/.rels", 0);
+
+			fz_write_string(ctx, info.out, "<html>\n");
+
+			pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema_props);
+			if (pos)
+			{
+				const char *file = fz_xml_att(pos, "Target");
+				fz_write_string(ctx, info.out, "<head>\n");
+				process_office_document_properties(ctx, archive, file, &info);
+				fz_write_string(ctx, info.out, "</head>\n");
+			}
+
+			fz_write_string(ctx, info.out, "<body>\n");
+			pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema);
+			if (!pos)
+				fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not docx.");
+
+			while (pos)
+			{
+				const char *file = fz_xml_att(pos, "Target");
+				if (file)
+					process_office_document(ctx, archive, file, &info);
+				pos = fz_xml_find_next_dfs(pos, "Relationship", "Type", schema);
+			}
+		}
+
+		fz_close_output(ctx, info.out);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_xml(ctx, rels);
+		fz_drop_xml(ctx, xml);
+		for (i = 0; i < info.shared_string_len; ++i)
+			fz_free(ctx, info.shared_strings[i]);
+		fz_free(ctx, info.shared_strings);
+		for (i = 0; i < info.footnotes_max; ++i)
+			fz_free(ctx, info.footnotes[i]);
+		fz_free(ctx, info.footnotes);
+		fz_drop_output(ctx, info.out);
+		fz_drop_archive(ctx, archive);
+		fz_drop_stream(ctx, stream);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_buffer(ctx, buffer_out);
+		fz_rethrow(ctx);
+	}
+
+#ifdef DEBUG_OFFICE_TO_HTML
+	{
+		unsigned char *storage;
+		size_t len = fz_buffer_storage(ctx, buffer_out, &storage);
+		fz_write_printf(ctx, fz_stddbg(ctx), "fz_office_to_html: Output buffer, len=%zd:\n", len);
+		fz_write_buffer(ctx, fz_stddbg(ctx), buffer_out);
+	}
+#endif
+
+	return buffer_out;
+}
+
+/* Office document handler */
+
+static fz_buffer *
+office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
+{
+	fz_office_to_html_opts opts = { 0 };
+
+	return fz_office_to_html(ctx, set, buf, zip, user_css, &opts);
+}
+
+static const fz_htdoc_format_t fz_htdoc_office =
+{
+	"Office document",
+	office_to_html,
+	0, 1, 0
+};
+
+static fz_document *
+office_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
+{
+	return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_office);
+}
+
+static const char *office_extensions[] =
+{
+	"docx",
+	"xlsx",
+	"pptx",
+	"hwpx",
+	NULL
+};
+
+static const char *office_mimetypes[] =
+{
+	// DOCX
+	"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+	// XLSX
+	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+	// PPTX
+	"application/vnd.openxmlformats-officedocument.presentationml.presentation",
+	// HWPX
+	"application/haansofthwpx",
+	"application/vnd.hancom.hwpx",
+	NULL
+};
+
+/* We are only ever 75% sure here, to allow a 'better' handler, such as sodochandler
+ * to override us by returning 100. */
+static int
+office_recognize_doc_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *zip, void **state, fz_document_recognize_state_free_fn **free_state)
+{
+	fz_archive *arch = NULL;
+	int ret = 0;
+	fz_xml *xml = NULL;
+
+	if (state)
+		*state = NULL;
+	if (free_state)
+		*free_state = NULL;
+
+	fz_var(arch);
+	fz_var(ret);
+	fz_var(xml);
+
+	fz_try(ctx)
+	{
+		if (stream)
+		{
+			arch = fz_try_open_archive_with_stream(ctx, stream);
+			if (arch == NULL)
+				break;
+		}
+		else
+			arch = fz_keep_archive(ctx, zip);
+
+		xml = fz_try_parse_xml_archive_entry(ctx, arch, "META-INF/container.xml", 0);
+		if (xml)
+		{
+			if (fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml"))
+				ret = 75; /* HWPX */
+			break;
+		}
+		xml = fz_try_parse_xml_archive_entry(ctx, arch, "_rels/.rels", 0);
+		if (xml)
+		{
+			if (fz_xml_find_dfs(xml, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"))
+			{
+				ret = 75; /* DOCX | PPTX | XLSX */
+			}
+			break;
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_xml(ctx, xml);
+		fz_drop_archive(ctx, arch);
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+
+	return ret;
+}
+
+fz_document_handler office_document_handler =
+{
+	NULL,
+	office_open_document,
+	office_extensions,
+	office_mimetypes,
+	office_recognize_doc_content
+};
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children