Python2/PyMuPDF: mupdf-source/source/fitz/xml.c comparison

comparison mupdf-source/source/fitz/xml.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+// Copyright (C) 2004-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+#include "xml-imp.h"
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#if FZ_ENABLE_HTML_ENGINE
+#include <gumbo.h>
+#endif
+#define FZ_XML_MAX_DEPTH 4096
+/* #define FZ_XML_SEQ */
+static const struct { const char *name; int c; } html_entities[] = {
+	{"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163},
+	{"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167},
+	{"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171},
+	{"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176},
+	{"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180},
+	{"micro",181}, {"para",182}, {"middot",183}, {"cedil",184},
+	{"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188},
+	{"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192},
+	{"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196},
+	{"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200},
+	{"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204},
+	{"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208},
+	{"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, {"Ocirc",212},
+	{"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216},
+	{"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220},
+	{"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224},
+	{"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228},
+	{"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232},
+	{"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236},
+	{"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240},
+	{"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},
+	{"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248},
+	{"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252},
+	{"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62},
+	{"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339},
+	{"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710},
+	{"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201},
+	{"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207},
+	{"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217},
+	{"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222},
+	{"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249},
+	{"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913},
+	{"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917},
+	{"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922},
+	{"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927},
+	{"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933},
+	{"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945},
+	{"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949},
+	{"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954},
+	{"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959},
+	{"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964},
+	{"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969},
+	{"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226},
+	{"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254},
+	{"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476},
+	{"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593},
+	{"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629},
+	{"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659},
+	{"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707},
+	{"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713},
+	{"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722},
+	{"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734},
+	{"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746},
+	{"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773},
+	{"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805},
+	{"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838},
+	{"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869},
+	{"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970},
+	{"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674},
+	{"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830},
+};
+struct parser
+{
+	fz_pool *pool;
+	fz_xml *head;
+	int preserve_white;
+	int depth;
+#ifdef FZ_XML_SEQ
+	int seq;
+#endif
+};
+static void xml_indent(fz_context *ctx, fz_output *out, int n)
+{
+	while (n--) {
+		fz_write_byte(ctx, out, ' ');
+		fz_write_byte(ctx, out, ' ');
+	}
+}
+void fz_debug_xml(fz_xml *item, int level)
+{
+	/* This is a bit nasty as it relies on implementation
+	 * details of both fz_stdout, and fz_write_printf coping
+	 * with NULL ctx. */
+	fz_output_xml(NULL, fz_stdout(NULL), item, level);
+}
+void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level)
+{
+	char *s;
+	if (item == NULL)
+		return;
+	/* Skip over the DOC object at the top. */
+	if (item->up == NULL)
+	{
+		fz_xml *child;
+		for (child = fz_xml_down(item); child; child = child->u.node.next)
+			fz_output_xml(ctx, out, child, level + 1);
+		return;
+	}
+	s = fz_xml_text(item);
+	xml_indent(ctx, out, level);
+	if (s)
+	{
+		int c;
+		fz_write_byte(ctx, out, '"');
+		while (*s) {
+			s += fz_chartorune(&c, s);
+			switch (c) {
+			default:
+				if (c > 0xFFFF)
+					fz_write_printf(ctx, out, "\\u{%X}", c);
+				else if (c < 32 || c > 127)
+					fz_write_printf(ctx, out, "\\u%04X", c);
+				else
+					fz_write_byte(ctx, out, c);
+				break;
+			case '\\': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, '\\'); break;
+			case '\b': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'b'); break;
+			case '\f': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'f'); break;
+			case '\n': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'n'); break;
+			case '\r': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'r'); break;
+			case '\t': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 't'); break;
+			}
+		}
+		fz_write_byte(ctx, out, '"');
+#ifdef FZ_XML_SEQ
+		fz_write_printf(ctx, out, " <%d>", item->seq);
+#endif
+		fz_write_byte(ctx, out, '\n');
+	}
+	else
+	{
+		fz_xml *child;
+		struct attribute *att;
+#ifdef FZ_XML_SEQ
+		fz_write_printf(ctx, out, "(%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
+#else
+		fz_write_printf(ctx, out, "(%s\n", item->u.node.u.d.name);
+#endif
+		for (att = item->u.node.u.d.atts; att; att = att->next)
+		{
+			xml_indent(ctx, out, level);
+			fz_write_printf(ctx, out, "=%s %s\n", att->name, att->value);
+		}
+		for (child = fz_xml_down(item); child; child = child->u.node.next)
+			fz_output_xml(ctx, out, child, level + 1);
+		xml_indent(ctx, out, level);
+#ifdef FZ_XML_SEQ
+		fz_write_printf(ctx, out, ")%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
+#else
+		fz_write_printf(ctx, out, ")%s\n", item->u.node.u.d.name);
+#endif
+	}
+}
+fz_xml *fz_xml_prev(fz_xml *item)
+{
+	return item && item->up ? item->u.node.prev : NULL;
+}
+fz_xml *fz_xml_next(fz_xml *item)
+{
+	return item && item->up ? item->u.node.next : NULL;
+}
+fz_xml *fz_xml_up(fz_xml *item)
+{
+	/* Never step up to the DOC. */
+	return item && item->up && item->up->up ? item->up : NULL;
+}
+fz_xml *fz_xml_down(fz_xml *item)
+{
+	/* DOC items can never have MAGIC_TEXT as their down value,
+	 * so this is safe. */
+	return item && !FZ_TEXT_ITEM(item) ? item->down : NULL;
+}
+char *fz_xml_text(fz_xml *item)
+{
+	/* DOC items can never have MAGIC_TEXT as their down value,
+	 * so this is safe. */
+	return (item && FZ_TEXT_ITEM(item)) ? item->u.node.u.text : NULL;
+}
+char *fz_xml_tag(fz_xml *item)
+{
+	/* DOC items can never have MAGIC_TEXT as their down value,
+	 * so this is safe. */
+	return item && !FZ_TEXT_ITEM(item) ? item->u.node.u.d.name : NULL;
+}
+int fz_xml_is_tag(fz_xml *item, const char *name)
+{
+	if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
+		return 0;
+	return !strcmp(item->u.node.u.d.name, name);
+}
+char *fz_xml_att(fz_xml *item, const char *name)
+{
+	struct attribute *att;
+	if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
+		return NULL;
+	for (att = item->u.node.u.d.atts; att; att = att->next)
+		if (!strcmp(att->name, name))
+			return att->value;
+	return NULL;
+}
+char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two)
+{
+	char *val = fz_xml_att(item, one);
+	if (!val)
+		val = fz_xml_att(item, two);
+	return val;
+}
+fz_xml *fz_xml_find(fz_xml *item, const char *tag)
+{
+	/* Skip over any DOC item. */
+	if (item && FZ_DOCUMENT_ITEM(item))
+		item = item->down;
+	while (item)
+	{
+		if (!FZ_TEXT_ITEM(item) && !strcmp(item->u.node.u.d.name, tag))
+			return item;
+		item = item->u.node.next;
+	}
+	return NULL;
+}
+fz_xml *fz_xml_find_next(fz_xml *item, const char *tag)
+{
+	/* Skip over any DOC item. */
+	if (item && FZ_DOCUMENT_ITEM(item))
+		item = item->down;
+	if (item)
+		item = item->u.node.next;
+	return fz_xml_find(item, tag);
+}
+fz_xml *fz_xml_find_down(fz_xml *item, const char *tag)
+{
+	if (item)
+		item = fz_xml_down(item);
+	return fz_xml_find(item, tag);
+}
+int fz_xml_att_eq(fz_xml *item, const char *name, const char *match)
+{
+	const char *val = fz_xml_att(item, name);
+	return val ? !strcmp(val, match) : 0;
+}
+fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match)
+{
+	/* Skip over any document item. */
+	if (item && FZ_DOCUMENT_ITEM(item))
+		item = item->down;
+	while (1)
+	{
+		item = tag ? fz_xml_find(item, tag) : item;
+		if (item == NULL || fz_xml_att_eq(item, att, match))
+			break;
+		item = item->u.node.next;
+	}
+	return item;
+}
+fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match)
+{
+	/* Skip over any document item. */
+	if (item && FZ_DOCUMENT_ITEM(item))
+		item = item->down;
+	if (item != NULL)
+	{
+		do
+		{
+			item = tag ? fz_xml_find_next(item, tag) : item->u.node.next;
+		}
+		while (item != NULL && !fz_xml_att_eq(item, att, match));
+	}
+	return item;
+}
+fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match)
+{
+	return fz_xml_find_match(fz_xml_down(item), tag, att, match);
+}
+fz_xml *fz_xml_root(fz_xml *xml)
+{
+	if (xml == NULL)
+		return NULL;
+	/* If we've been given a node mid-tree, run up to the root to find
+	 * the doc node. */
+	while (xml->up)
+		xml = xml->up;
+	/* And the root is the child of the doc.*/
+	return xml->down;
+}
+void fz_drop_xml(fz_context *ctx, fz_xml *xml)
+{
+	if (!xml)
+		return;
+	/* Wherever we are in the tree, we want the doc node at the root. */
+	while (xml->up)
+		xml = xml->up;
+	/* Drop a reference to the tree as a whole. */
+	if (fz_drop_imp(ctx, xml, &xml->u.doc.refs) == 0)
+		return;
+	fz_drop_pool(ctx, xml->u.doc.pool);
+}
+void fz_detach_xml(fz_context *ctx, fz_xml *node)
+{
+	fz_xml *doc = node;
+	/* If we're already a document node, then this is a NOP. */
+	if (doc->up == NULL)
+		return;
+	/* Move doc to be the doc pointer at the top of the tree. */
+	while (doc->up)
+	{
+		doc = doc->up;
+	}
+	/* Relocate node to be the child of doc. */
+	node->up->down = NULL;
+	doc->down = node;
+	/* NOTE: Suppose that X = doc->down on entry. On exit doc->down == node, but
+	 * X->up = doc. We need to be careful throughout this code to not assume that
+	 * Y is always a child of Y->up. */
+}
+size_t xml_parse_entity(int *c, const char *a)
+{
+	char *b;
+	size_t i;
+	if (a[1] == '#') {
+		if (a[2] == 'x')
+			*c = strtol(a + 3, &b, 16);
+		else
+			*c = strtol(a + 2, &b, 10);
+		if (*b == ';')
+			return b - a + 1;
+	}
+	else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') {
+		*c = '<';
+		return 4;
+	}
+	else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') {
+		*c = '>';
+		return 4;
+	}
+	else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') {
+		*c = '&';
+		return 5;
+	}
+	else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') {
+		*c = '\'';
+		return 6;
+	}
+	else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') {
+		*c = '"';
+		return 6;
+	}
+	/* We should only be doing this for XHTML, but it shouldn't be a problem. */
+	for (i = 0; i < nelem(html_entities); ++i) {
+		size_t n = strlen(html_entities[i].name);
+		if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') {
+			*c = html_entities[i].c;
+			return n + 2;
+		}
+	}
+	*c = *a;
+	return 1;
+}
+static inline int isname(int c)
+{
+	return c == '.' || c == '-' || c == '_' || c == ':' ||
+		(c >= '0' && c <= '9') ||
+		(c >= 'A' && c <= 'Z') ||
+		(c >= 'a' && c <= 'z');
+}
+static inline int iswhite(int c)
+{
+	return c == ' ' || c == '\r' || c == '\n' || c == '\t';
+}
+static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, const char *a, const char *b, int is_text)
+{
+	fz_xml *head, *tail;
+	const char *ns;
+	size_t size;
+	if (is_text)
+		size = offsetof(fz_xml, u.node.u.text) + b-a+1;
+	else
+	{
+		/* skip namespace prefix */
+		for (ns = a; ns < b - 1; ++ns)
+			if (*ns == ':')
+				a = ns + 1;
+		size = offsetof(fz_xml, u.node.u.d.name) + b-a+1;
+	}
+	head = fz_pool_alloc(ctx, parser->pool, size);
+	if (is_text)
+		head->down = MAGIC_TEXT;
+	else
+	{
+		memcpy(head->u.node.u.d.name, a, b - a);
+		head->u.node.u.d.name[b - a] = 0;
+		head->u.node.u.d.atts = NULL;
+		head->down = NULL;
+	}
+	head->up = parser->head;
+	head->u.node.next = NULL;
+#ifdef FZ_XML_SEQ
+	head->u.node.seq = parser->seq++;
+#endif
+	/* During construction, we use head->next to mean "the
+	 * tail of the children. When we close the tag, we
+	 * rewrite it to be NULL. */
+	if (!parser->head->down) {
+		parser->head->down = head;
+		parser->head->u.node.next = head;
+		head->u.node.prev = NULL;
+	}
+	else {
+		tail = parser->head->u.node.next;
+		tail->u.node.next = head;
+		head->u.node.prev = tail;
+		parser->head->u.node.next = head;
+	}
+	parser->head = head;
+	parser->depth++;
+	if (parser->depth >= FZ_XML_MAX_DEPTH)
+		fz_throw(ctx, FZ_ERROR_SYNTAX, "too deep xml element nesting");
+}
+static void xml_emit_att_name(fz_context *ctx, struct parser *parser, const char *a, const char *b)
+{
+	fz_xml *head = parser->head;
+	struct attribute *att;
+	size_t size;
+	size = offsetof(struct attribute, name) + b-a+1;
+	att = fz_pool_alloc(ctx, parser->pool, size);
+	memcpy(att->name, a, b - a);
+	att->name[b - a] = 0;
+	att->value = NULL;
+	att->next = head->u.node.u.d.atts;
+	head->u.node.u.d.atts = att;
+}
+void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val)
+{
+	size_t size = offsetof(struct attribute, name) + strlen(key) + 1;
+	struct attribute *att = fz_pool_alloc(ctx, pool, size);
+	memcpy(att->name, key, strlen(key)+1);
+	att->value = fz_pool_alloc(ctx, pool, strlen(val) + 1);
+	memcpy(att->value, val, strlen(val)+1);
+	att->next = node->u.node.u.d.atts;
+	node->u.node.u.d.atts = att;
+}
+static void xml_emit_att_value(fz_context *ctx, struct parser *parser, const char *a, const char *b)
+{
+	fz_xml *head = parser->head;
+	struct attribute *att = head->u.node.u.d.atts;
+	char *s;
+	int c;
+	/* entities are all longer than UTFmax so runetochar is safe */
+	s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1);
+	while (a < b) {
+		if (*a == '&') {
+			a += xml_parse_entity(&c, a);
+			s += fz_runetochar(s, c);
+		}
+		else {
+			*s++ = *a++;
+		}
+	}
+	*s = 0;
+}
+static void xml_emit_close_tag(fz_context *ctx, struct parser *parser)
+{
+	parser->depth--;
+	parser->head->u.node.next = NULL;
+	if (parser->head->up)
+		parser->head = parser->head->up;
+}
+static void xml_emit_text(fz_context *ctx, struct parser *parser, const char *a, const char *b)
+{
+	fz_xml *head;
+	const char *p;
+	char *s;
+	int c;
+	/* Skip text outside the root tag */
+	if (parser->depth == 0)
+		return;
+	/* Skip all-whitespace text nodes */
+	if (!parser->preserve_white)
+	{
+		for (p = a; p < b; p++)
+			if (!iswhite(*p))
+				break;
+		if (p == b)
+			return;
+	}
+	xml_emit_open_tag(ctx, parser, a, b, 1);
+	head = parser->head;
+	/* entities are all longer than UTFmax so runetochar is safe */
+	s = fz_xml_text(head);
+	while (a < b) {
+		if (*a == '&') {
+			a += xml_parse_entity(&c, a);
+			s += fz_runetochar(s, c);
+		}
+		else {
+			*s++ = *a++;
+		}
+	}
+	*s = 0;
+	xml_emit_close_tag(ctx, parser);
+}
+static void xml_emit_cdata(fz_context *ctx, struct parser *parser, const char *a, const char *b)
+{
+	fz_xml *head;
+	char *s;
+	xml_emit_open_tag(ctx, parser, a, b, 1);
+	head = parser->head;
+	s = head->u.node.u.text;
+	while (a < b)
+		*s++ = *a++;
+	*s = 0;
+	xml_emit_close_tag(ctx, parser);
+}
+static int close_tag(fz_context *ctx, struct parser *parser, const char *mark, const char *p)
+{
+	const char *ns, *tag;
+	/* skip namespace prefix */
+	for (ns = mark; ns < p - 1; ++ns)
+		if (*ns == ':')
+			mark = ns + 1;
+	tag = fz_xml_tag(parser->head);
+	if (tag && strncmp(tag, mark, p-mark) == 0 && tag[p-mark] == 0)
+	{
+		xml_emit_close_tag(ctx, parser);
+		return 0;
+	}
+	return 1;
+}
+static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, const char *p) /* lgtm [cpp/use-of-goto] */
+{
+	const char *mark;
+	int quote;
+parse_text:
+	mark = p;
+	while (*p && *p != '<') ++p;
+	if (*p == '<') {
+		if (mark < p)
+			xml_emit_text(ctx, parser, mark, p);
+		++p;
+		goto parse_element;
+	} else if (mark < p)
+		xml_emit_text(ctx, parser, mark, p);
+	return NULL;
+parse_element:
+	if (*p == '/') { ++p; goto parse_closing_element; }
+	if (*p == '!') { ++p; goto parse_comment; }
+	if (*p == '?') { ++p; goto parse_processing_instruction; }
+	while (iswhite(*p)) ++p;
+	if (isname(*p))
+		goto parse_element_name;
+	return "syntax error in element";
+parse_comment:
+	if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E')
+		goto parse_declaration;
+	if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y')
+		goto parse_declaration;
+	if (*p == '[') goto parse_cdata;
+	if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
+	if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
+	while (*p) {
+		if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
+			p += 3;
+			goto parse_text;
+		}
+		++p;
+	}
+	return "end of data in comment";
+parse_declaration:
+	while (*p) if (*p++ == '>') goto parse_text;
+	return "end of data in declaration";
+parse_cdata:
+	if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
+		return "syntax error in CDATA section";
+	p += 7;
+	mark = p;
+	while (*p) {
+		if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
+			xml_emit_cdata(ctx, parser, mark, p);
+			p += 3;
+			goto parse_text;
+		}
+		++p;
+	}
+	return "end of data in CDATA section";
+parse_processing_instruction:
+	while (*p) {
+		if (p[0] == '?' && p[1] == '>') {
+			p += 2;
+			goto parse_text;
+		}
+		++p;
+	}
+	return "end of data in processing instruction";
+parse_closing_element:
+	while (iswhite(*p)) ++p;
+	mark = p;
+	while (isname(*p)) ++p;
+	if (!isname(*mark))
+		return "syntax error in closing element";
+	if (close_tag(ctx, parser, mark, p))
+		return "opening and closing tag mismatch";
+	while (iswhite(*p)) ++p;
+	if (*p != '>')
+		return "syntax error in closing element";
+	++p;
+	goto parse_text;
+parse_element_name:
+	mark = p;
+	while (isname(*p)) ++p;
+	xml_emit_open_tag(ctx, parser, mark, p, 0);
+	if (*p == '>') {
+		++p;
+		goto parse_text;
+	}
+	if (p[0] == '/' && p[1] == '>') {
+		xml_emit_close_tag(ctx, parser);
+		p += 2;
+		goto parse_text;
+	}
+	if (iswhite(*p))
+		goto parse_attributes;
+	return "syntax error after element name";
+parse_attributes:
+	while (iswhite(*p)) ++p;
+	if (isname(*p))
+		goto parse_attribute_name;
+	if (*p == '>') {
+		++p;
+		goto parse_text;
+	}
+	if (p[0] == '/' && p[1] == '>') {
+		xml_emit_close_tag(ctx, parser);
+		p += 2;
+		goto parse_text;
+	}
+	return "syntax error in attributes";
+parse_attribute_name:
+	mark = p;
+	while (isname(*p)) ++p;
+	xml_emit_att_name(ctx, parser, mark, p);
+	while (iswhite(*p)) ++p;
+	if (*p == '=') { ++p; goto parse_attribute_value; }
+	return "syntax error after attribute name";
+parse_attribute_value:
+	while (iswhite(*p)) ++p;
+	quote = *p++;
+	mark = p;
+	/* special case for handling MOBI filepos=00000 syntax */
+	if (quote >= '0' && quote <= '9') {
+		while (*p >= '0' && *p <= '9') ++p;
+		xml_emit_att_value(ctx, parser, mark, p);
+		goto parse_attributes;
+	}
+	if (quote != '"' && quote != '\'')
+		return "missing quote character";
+	while (*p && *p != quote) ++p;
+	if (*p == quote) {
+		xml_emit_att_value(ctx, parser, mark, p++);
+		goto parse_attributes;
+	}
+	return "end of data in attribute value";
+}
+static int fast_tolower(int c)
+{
+	if ((unsigned)c - 'A' < 26)
+		return c | 32;
+	return c;
+}
+static int fast_strncasecmp(const char *a, const char *b, size_t n)
+{
+	if (!n--)
+		return 0;
+	for (; *a && *b && n && fast_tolower(*a) == fast_tolower(*b); a++, b++, n--)
+		;
+	return fast_tolower(*a) - fast_tolower(*b);
+}
+static char *fast_strcasestr(char *h, char *n)
+{
+	int n0 = fast_tolower(*n++);
+	size_t nn = strlen(n);
+	while (*h != 0)
+	{
+		if (fast_tolower(*h) == n0 && fast_strncasecmp(h+1, n, nn) == 0)
+			return h;
+		++h;
+	}
+	return NULL;
+}
+static int startswith(const char *a, const char *b)
+{
+	return !fast_strncasecmp(a, b, strlen(b));
+}
+/* https://encoding.spec.whatwg.org/#names-and-labels */
+static struct { char *encoding; char *alias; } encoding_aliases[] = {
+	{ "big5", "big5" },
+	{ "big5", "big5-hkscs" },
+	{ "big5", "cn-big5" },
+	{ "big5", "csbig5" },
+	{ "big5", "x-x-big5" },
+	{ "euc-cn", "euc-cn" },
+	{ "euc-jp", "cseucpkdfmtjapanese" },
+	{ "euc-jp", "euc-jp" },
+	{ "euc-jp", "x-euc-jp" },
+	{ "euc-kr", "cseuckr" },
+	{ "euc-kr", "csksc56011987" },
+	{ "euc-kr", "euc-kr" },
+	{ "euc-kr", "iso-ir-149" },
+	{ "euc-kr", "korean" },
+	{ "euc-kr", "ks_c_5601" },
+	{ "euc-kr", "ksc5601" },
+	{ "euc-kr", "ksc_5601" },
+	{ "euc-kr", "windows-949" },
+	{ "euc-tw", "euc-tw" },
+	{ "gb18030", "chinese" },
+	{ "gb18030", "csgb2312" },
+	{ "gb18030", "csiso58gb231280" },
+	{ "gb18030", "gb18030" },
+	{ "gb18030", "gb2312" },
+	{ "gb18030", "gb_2312" },
+	{ "gb18030", "gbk" },
+	{ "gb18030", "iso-ir-58" },
+	{ "gb18030", "x-gbk" },
+	{ "iso-8859-1", "ascii" },
+	{ "iso-8859-1", "iso-8859-1" },
+	{ "iso-8859-1", "iso8859-1" },
+	{ "iso-8859-1", "latin1" },
+	{ "iso-8859-1", "us-ascii" },
+	{ "iso-8859-7", "greek" },
+	{ "iso-8859-7", "greek8" },
+	{ "iso-8859-7", "iso-8859-1" },
+	{ "iso-8859-7", "iso8859-1" },
+	{ "koi8-r", "koi" },
+	{ "koi8-r", "koi8" },
+	{ "koi8-r", "koi8-r" },
+	{ "koi8-r", "koi8-ru" },
+	{ "koi8-r", "koi8-u" },
+	{ "koi8-r", "koi8_r" },
+	{ "shift_jis", "csshiftjis" },
+	{ "shift_jis", "ms932" },
+	{ "shift_jis", "ms_kanji" },
+	{ "shift_jis", "shift-jis" },
+	{ "shift_jis", "shift_jis" },
+	{ "shift_jis", "sjis" },
+	{ "shift_jis", "windows-31j" },
+	{ "shift_jis", "x-sjis" },
+	{ "windows-1250", "cp1250" },
+	{ "windows-1250", "windows-1250" },
+	{ "windows-1251", "cp1251" },
+	{ "windows-1251", "windows-1251" },
+	{ "windows-1252", "cp1252" },
+	{ "windows-1252", "cp819" },
+	{ "windows-1252", "windows-1252" },
+};
+static char *match_encoding_name(char *enc)
+{
+	size_t i;
+	for (i = 0; i < nelem(encoding_aliases); ++i)
+		if (startswith(enc, encoding_aliases[i].alias))
+			return encoding_aliases[i].encoding;
+	return NULL;
+}
+// Look for encoding in <meta http-equiv="content-type" content="text/html; charset=XXX"> tags
+static const char *find_meta_encoding(char *s)
+{
+	const char *table = NULL;
+	char *end, *meta, *charset, *enc;
+	meta = fast_strcasestr(s, "<meta");
+	while (meta && !table)
+	{
+		end = strchr(meta, '>');
+		if (end)
+		{
+			*end = 0;
+			if (fast_strcasestr(meta, "http-equiv") && fast_strcasestr(meta, "content-type"))
+			{
+				charset = fast_strcasestr(meta, "charset=");
+				if (charset)
+				{
+					enc = match_encoding_name(charset + 8);
+					if (enc)
+						table = enc;
+				}
+			}
+			*end = '>';
+		}
+		meta = fast_strcasestr(meta + 5, "<meta");
+	}
+	return table;
+}
+static const char *find_xml_encoding(char *s)
+{
+	const char *table = NULL;
+	char *end, *xml, *enc;
+	end = strchr(s, '>');
+	if (end)
+	{
+		*end = 0;
+		xml = strstr(s, "<?xml");
+		if (xml)
+		{
+			enc = strstr(xml, "encoding=");
+			if (enc)
+			{
+				enc = match_encoding_name(enc + 10);
+				if (enc)
+					table = enc;
+			}
+		}
+		*end = '>';
+	}
+	if (!table)
+		table = find_meta_encoding(s);
+	return table;
+}
+static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree)
+{
+	fz_text_decoder dec;
+	const char *enc;
+	const unsigned char *e = s + n;
+	char *dst, *d;
+	int m;
+	int c;
+	if (s[0] == 0xFE && s[1] == 0xFF) {
+		s += 2;
+		dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_be");
+		while (s + 1 < e) {
+			c = s[0] << 8 | s[1];
+			d += fz_runetochar(d, c);
+			s += 2;
+		}
+		*d = 0;
+		*dofree = 1;
+		return dst;
+	}
+	if (s[0] == 0xFF && s[1] == 0xFE) {
+		s += 2;
+		dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_le");
+		while (s + 1 < e) {
+			c = s[0] | s[1] << 8;
+			d += fz_runetochar(d, c);
+			s += 2;
+		}
+		*d = 0;
+		*dofree = 1;
+		return dst;
+	}
+	enc = find_xml_encoding((char*)s);
+	if (enc)
+	{
+		fz_init_text_decoder(ctx, &dec, enc);
+		// NOTE: use decode_size if memory is more important than speed
+		m = (int)dec.decode_bound(&dec, s, (int)n);
+		dst = Memento_label(fz_malloc(ctx, m), "utf8");
+		dec.decode(&dec, dst, s, (int)n);
+		*dofree = 1;
+		return dst;
+	}
+	*dofree = 0;
+	if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
+		return (char*)s+3;
+	return (char*)s;
+}
+fz_xml *
+fz_parse_xml_stream(fz_context *ctx, fz_stream *stm, int preserve_white)
+{
+	fz_buffer *buf = fz_read_all(ctx, stm, 128);
+	fz_xml *xml = NULL;
+	fz_var(xml);
+	fz_try(ctx)
+		xml = fz_parse_xml(ctx, buf, preserve_white);
+	fz_always(ctx)
+		fz_drop_buffer(ctx, buf);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+	return xml;
+}
+static fz_xml *
+parse_and_drop_buffer(fz_context *ctx, fz_buffer *buf, int preserve_white)
+{
+	fz_xml *xml = NULL;
+	fz_var(xml);
+	fz_try(ctx)
+		xml = fz_parse_xml(ctx, buf, preserve_white);
+	fz_always(ctx)
+		fz_drop_buffer(ctx, buf);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+	return xml;
+}
+fz_xml *
+fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
+{
+	fz_buffer *buf = fz_read_archive_entry(ctx, arch, filename);
+	return parse_and_drop_buffer(ctx, buf, preserve_white);
+}
+fz_xml *
+fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
+{
+	fz_buffer *buf = fz_try_read_archive_entry(ctx, arch, filename);
+	if (buf == NULL)
+		return NULL;
+	return parse_and_drop_buffer(ctx, buf, preserve_white);
+}
+fz_xml *
+fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white)
+{
+	struct parser parser;
+	fz_xml *xml = NULL;
+	fz_xml *root, *node;
+	char *p = NULL;
+	char *error;
+	int dofree = 0;
+	unsigned char *s;
+	size_t n;
+	static unsigned char empty_string[] = "";
+	fz_var(dofree);
+	fz_var(p);
+	if (buf == NULL)
+	{
+		n = 0;
+		s = empty_string;
+	}
+	else
+	{
+		/* ensure we are zero-terminated */
+		fz_terminate_buffer(ctx, buf);
+		n = fz_buffer_storage(ctx, buf, &s);
+	}
+	parser.pool = fz_new_pool(ctx);
+	parser.head = root = fz_pool_alloc_flexible(ctx, parser.pool, fz_xml, u.node.u.d.name, 1);
+	parser.preserve_white = preserve_white;
+	parser.depth = 0;
+#ifdef FZ_XML_SEQ
+	parser.seq = 0;
+#endif
+	fz_try(ctx)
+	{
+		p = convert_to_utf8(ctx, s, n, &dofree);
+		error = xml_parse_document_imp(ctx, &parser, p);
+		if (error)
+			fz_throw(ctx, FZ_ERROR_SYNTAX, "%s", error);
+		for (node = parser.head; node; node = node->up)
+			node->u.node.next = NULL;
+		xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
+		xml->up = NULL;
+		xml->down = root->down;
+		xml->u.doc.refs = 1;
+		xml->u.doc.pool = parser.pool;
+		for (node = root->down; node; node = node->u.node.next)
+			node->up = xml;
+	}
+	fz_always(ctx)
+	{
+		if (dofree)
+			fz_free(ctx, p);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_pool(ctx, parser.pool);
+		fz_rethrow(ctx);
+	}
+	return xml;
+}
+#if FZ_ENABLE_HTML_ENGINE
+/*
+	Parse the contents of buffer into a tree of XML nodes, using the HTML5 syntax.
+	Gumbo doesn't check for malloc errors. Use our pool allocator and let it longjmp
+	out of Gumbo on allocation errors. At the end (success or fail) we release the
+	pool used for Gumbo's parse tree all at once.
+*/
+struct mem_gumbo {
+	fz_context *ctx;
+	fz_pool *pool;
+};
+static void *alloc_gumbo(void *ctx, size_t size)
+{
+	struct mem_gumbo *mem = ctx;
+	return fz_pool_alloc(mem->ctx, mem->pool, size);
+}
+static void dealloc_gumbo(void *ctx, void *ptr)
+{
+	/* nothing */
+}
+static void xml_from_gumbo(fz_context *ctx, struct parser *parser, GumboNode *node)
+{
+	unsigned int i;
+	const char *tag, *end, *sentinel;
+	switch (node->type)
+	{
+	case GUMBO_NODE_ELEMENT:
+		if (node->v.element.tag != GUMBO_TAG_UNKNOWN)
+		{
+			tag = gumbo_normalized_tagname(node->v.element.tag);
+			end = tag + strlen(tag);
+		}
+		else
+		{
+			tag = node->v.element.original_tag.data;
+			sentinel = tag + node->v.element.original_tag.length;
+			if (tag[0] == '<')
+				++tag;
+			for (end = tag; end < sentinel; ++end)
+				if (end[0] == '>' || end[0] == '/' || iswhite(end[0]))
+					break;
+		}
+		xml_emit_open_tag(ctx, parser, tag, end, 0);
+		for (i = 0; i < node->v.element.attributes.length; ++i)
+		{
+			GumboAttribute *att = node->v.element.attributes.data[i];
+			xml_emit_att_name(ctx, parser, att->name, att->name+strlen(att->name));
+			xml_emit_att_value(ctx, parser, att->value, att->value+strlen(att->value));
+		}
+		for (i = 0; i < node->v.element.children.length; ++i)
+		{
+			GumboNode *child = node->v.element.children.data[i];
+			xml_from_gumbo(ctx, parser, child);
+		}
+		xml_emit_close_tag(ctx, parser);
+		break;
+	case GUMBO_NODE_TEXT:
+	case GUMBO_NODE_CDATA:
+	case GUMBO_NODE_WHITESPACE:
+		xml_emit_text(ctx, parser, node->v.text.text, node->v.text.text+strlen(node->v.text.text));
+		break;
+	case GUMBO_NODE_DOCUMENT:
+	case GUMBO_NODE_COMMENT:
+	case GUMBO_NODE_TEMPLATE:
+		break;
+	}
+}
+#endif
+fz_xml *
+fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf)
+{
+#if FZ_ENABLE_HTML_ENGINE
+	struct parser parser;
+	fz_xml *xml = NULL;
+	fz_xml root, *node;
+	char *p = NULL;
+	int dofree = 0;
+	unsigned char *s;
+	size_t n;
+	GumboOutput *soup = NULL;
+	GumboOptions opts;
+	struct mem_gumbo mem;
+	static unsigned char empty_string[] = "";
+	fz_var(mem.pool);
+	fz_var(soup);
+	fz_var(dofree);
+	fz_var(p);
+	if (buf == NULL)
+	{
+		n = 0;
+		s = empty_string;
+	}
+	else
+	{
+		/* ensure we are zero-terminated */
+		fz_terminate_buffer(ctx, buf);
+		n = fz_buffer_storage(ctx, buf, &s);
+	}
+	mem.ctx = ctx;
+	mem.pool = NULL;
+	memset(&root, 0, sizeof(root));
+	parser.pool = fz_new_pool(ctx);
+	parser.head = &root;
+	parser.preserve_white = 1;
+	parser.depth = 0;
+#ifdef FZ_XML_SEQ
+	parser.seq = 0;
+#endif
+	fz_try(ctx)
+	{
+		p = convert_to_utf8(ctx, s, n, &dofree);
+		mem.pool = fz_new_pool(ctx);
+		memset(&opts, 0, sizeof opts);
+		opts.allocator = alloc_gumbo;
+		opts.deallocator = dealloc_gumbo;
+		opts.userdata = &mem;
+		opts.tab_stop = 8;
+		opts.stop_on_first_error = 0;
+		opts.max_errors = -1;
+		opts.fragment_context = GUMBO_TAG_LAST;
+		opts.fragment_namespace = GUMBO_NAMESPACE_HTML;
+		soup = gumbo_parse_with_options(&opts, (const char *)p, strlen(p));
+		xml_from_gumbo(ctx, &parser, soup->root);
+		for (node = parser.head; node; node = node->up)
+			node->u.node.next = NULL;
+		xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
+		xml->up = NULL;
+		xml->down = root.down;
+		xml->u.doc.pool = parser.pool;
+		xml->u.doc.refs = 1;
+		for (node = root.down; node; node = node->u.node.next)
+			node->up = xml;
+	}
+	fz_always(ctx)
+	{
+		if (soup)
+			gumbo_destroy_output(&opts, soup);
+		fz_drop_pool(ctx, mem.pool);
+		if (dofree)
+			fz_free(ctx, p);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_pool(ctx, parser.pool);
+		fz_rethrow(ctx);
+	}
+	return xml;
+#else
+	fz_throw(ctx, FZ_ERROR_GENERIC, "HTML Engine not enabled in this build");
+#endif
+}
+fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
+{
+	return fz_xml_find_dfs_top(item, tag, att, match, NULL);
+}
+fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
+{
+	/* Skip over any DOC object. */
+	if (item && FZ_DOCUMENT_ITEM(item))
+		item = item->down;
+	while (item)
+	{
+		if (!FZ_TEXT_ITEM(item) && (tag == NULL || !strcmp(item->u.node.u.d.name, tag)))
+		{
+			if (att == NULL || (match == NULL ? fz_xml_att(item, att) != NULL : fz_xml_att_eq(item, att, match)))
+				return item;
+		}
+		if (!FZ_TEXT_ITEM(item) && item->down)
+			item = item->down;
+		else if (item->u.node.next)
+			item = item->u.node.next;
+		else
+			while (1) {
+				item = item->up;
+				/* Stop searching if we hit our declared 'top' item. */
+				if (item == top)
+					return NULL;
+				/* We should never reach item == NULL, but just in case. */
+				if (item == NULL)
+					return NULL;
+				/* If we reach the DOC object at the top, we're done. */
+				if (item->up == NULL)
+					return NULL;
+				if (item->u.node.next)
+				{
+					item = item->u.node.next;
+					break;
+				}
+			}
+	}
+	return NULL;
+}
+fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
+{
+	return fz_xml_find_next_dfs_top(item, tag, att, match, NULL);
+}
+fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
+{
+	/* Skip over any DOC object. */
+	if (item && FZ_DOCUMENT_ITEM(item))
+		item = item->down;
+	if (item == NULL)
+		return NULL;
+	if (item->down)
+		item = item->down;
+	else if (item->u.node.next)
+		item = item->u.node.next;
+	else
+		while (1) {
+			item = item->up;
+			/* Stop searching if we hit our declared 'top' item. */
+			if (item == top)
+				return NULL;
+			/* We should never reach item == NULL, but just in case. */
+			if (item == NULL)
+				return NULL;
+			/* If we reach the DOC object at the top, we're done. */
+			if (item->up == NULL)
+				return NULL;
+			if (item->u.node.next)
+			{
+				item = item->u.node.next;
+				break;
+			}
+		}
+	return fz_xml_find_dfs_top(item, tag, att, match, top);
+}
+fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml)
+{
+	fz_xml *dom = xml;
+	if (xml == NULL)
+		return xml;
+	while (dom->up)
+		dom = dom->up;
+	fz_keep_imp(ctx, dom, &dom->u.doc.refs);
+	/* Return the original node pointer, not the dom pointer! */
+	return xml;
+}

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/source/fitz/xml.c @ 2:b50eed0cc0ef upstream