diff mupdf-source/include/mupdf/fitz/xml.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/include/mupdf/fitz/xml.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,435 @@
+// Copyright (C) 2004-2024 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#ifndef MUPDF_FITZ_XML_H
+#define MUPDF_FITZ_XML_H
+
+#include "mupdf/fitz/system.h"
+#include "mupdf/fitz/context.h"
+#include "mupdf/fitz/buffer.h"
+#include "mupdf/fitz/pool.h"
+#include "mupdf/fitz/archive.h"
+
+/**
+	XML document model
+*/
+
+typedef struct fz_xml fz_xml;
+
+/* For backwards compatibility */
+typedef fz_xml fz_xml_doc;
+
+/**
+	Parse the contents of buffer into a tree of xml nodes.
+
+	preserve_white: whether to keep or delete all-whitespace nodes.
+*/
+fz_xml *fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white);
+
+/**
+	Parse the contents of a stream into a tree of xml nodes.
+
+	preserve_white: whether to keep or delete all-whitespace nodes.
+*/
+fz_xml *fz_parse_xml_stream(fz_context *ctx, fz_stream *stream, int preserve_white);
+
+/**
+	Parse the contents of an archive entry into a tree of xml nodes.
+
+	preserve_white: whether to keep or delete all-whitespace nodes.
+*/
+fz_xml *fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *dir, const char *filename, int preserve_white);
+
+/**
+	Try to parse the contents of an archive entry into a tree of xml nodes.
+
+	preserve_white: whether to keep or delete all-whitespace nodes.
+
+	Will return NULL if the archive entry can't be found. Otherwise behaves
+	the same as fz_parse_xml_archive_entry. May throw exceptions.
+*/
+fz_xml *fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *dir, const char *filename, int preserve_white);
+
+/**
+	Parse the contents of a buffer into a tree of XML nodes,
+	using the HTML5 parsing algorithm.
+*/
+fz_xml *fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf);
+
+/**
+	Add a reference to the XML.
+*/
+fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml);
+
+/**
+	Drop a reference to the XML. When the last reference is
+	dropped, the node and all its children and siblings will
+	be freed.
+*/
+void fz_drop_xml(fz_context *ctx, fz_xml *xml);
+
+/**
+	Detach a node from the tree, unlinking it from its parent,
+	and setting the document root to the node.
+*/
+void fz_detach_xml(fz_context *ctx, fz_xml *node);
+
+/**
+	Return the topmost XML node of a document.
+*/
+fz_xml *fz_xml_root(fz_xml_doc *xml);
+
+/**
+	Return previous sibling of XML node.
+*/
+fz_xml *fz_xml_prev(fz_xml *item);
+
+/**
+	Return next sibling of XML node.
+*/
+fz_xml *fz_xml_next(fz_xml *item);
+
+/**
+	Return parent of XML node.
+*/
+fz_xml *fz_xml_up(fz_xml *item);
+
+/**
+	Return first child of XML node.
+*/
+fz_xml *fz_xml_down(fz_xml *item);
+
+/**
+	Return true if the tag name matches.
+*/
+int fz_xml_is_tag(fz_xml *item, const char *name);
+
+/**
+	Return tag of XML node. Return NULL for text nodes.
+*/
+char *fz_xml_tag(fz_xml *item);
+
+/**
+	Return the value of an attribute of an XML node.
+	NULL if the attribute doesn't exist.
+*/
+char *fz_xml_att(fz_xml *item, const char *att);
+
+/**
+	Return the value of an attribute of an XML node.
+	If the first attribute doesn't exist, try the second.
+	NULL if neither attribute exists.
+*/
+char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two);
+
+/**
+	Check for a matching attribute on an XML node.
+
+	If the node has the requested attribute (name), and the value
+	matches (match) then return 1. Otherwise, 0.
+*/
+int fz_xml_att_eq(fz_xml *item, const char *name, const char *match);
+
+/**
+	Add an attribute to an XML node.
+*/
+void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val);
+
+/**
+	Return the text content of an XML node.
+	Return NULL if the node is a tag.
+*/
+char *fz_xml_text(fz_xml *item);
+
+/**
+	Pretty-print an XML tree to given output.
+*/
+void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level);
+
+/**
+	Pretty-print an XML tree to stdout. (Deprecated, use
+	fz_output_xml in preference).
+*/
+void fz_debug_xml(fz_xml *item, int level);
+
+/**
+	Search the siblings of XML nodes starting with item looking for
+	the first with the given tag.
+
+	Return NULL if none found.
+*/
+fz_xml *fz_xml_find(fz_xml *item, const char *tag);
+
+/**
+	Search the siblings of XML nodes starting with the next sibling
+	of item looking for the first with the given tag.
+
+	Return NULL if none found.
+*/
+fz_xml *fz_xml_find_next(fz_xml *item, const char *tag);
+
+/**
+	Search the siblings of XML nodes starting with the first child
+	of item looking for the first with the given tag.
+
+	Return NULL if none found.
+*/
+fz_xml *fz_xml_find_down(fz_xml *item, const char *tag);
+
+/**
+	Search the siblings of XML nodes starting with item looking for
+	the first with the given tag (or any tag if tag is NULL), and
+	with a matching attribute.
+
+	Return NULL if none found.
+*/
+fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match);
+
+/**
+	Search the siblings of XML nodes starting with the next sibling
+	of item looking for the first with the given tag (or any tag if tag
+	is NULL), and with a matching attribute.
+
+	Return NULL if none found.
+*/
+fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match);
+
+/**
+	Search the siblings of XML nodes starting with the first child
+	of item looking for the first with the given tag (or any tag if
+	tag is NULL), and with a matching attribute.
+
+	Return NULL if none found.
+*/
+fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match);
+
+/**
+	Perform a depth first search from item, returning the first
+	child that matches the given tag (or any tag if tag is NULL),
+	with the given attribute (if att is non NULL), that matches
+	match (if match is non NULL).
+*/
+fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match);
+
+/**
+	Perform a depth first search from item, returning the first
+	child that matches the given tag (or any tag if tag is NULL),
+	with the given attribute (if att is non NULL), that matches
+	match (if match is non NULL). The search stops if it ever
+	reaches the top of the tree, or the declared 'top' item.
+*/
+fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top);
+
+/**
+	Perform a depth first search onwards from item, returning the first
+	child that matches the given tag (or any tag if tag is NULL),
+	with the given attribute (if att is non NULL), that matches
+	match (if match is non NULL).
+*/
+fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match);
+
+/**
+	Perform a depth first search onwards from item, returning the first
+	child that matches the given tag (or any tag if tag is NULL),
+	with the given attribute (if att is non NULL), that matches
+	match (if match is non NULL). The search stops if it ever reaches
+	the top of the tree, or the declared 'top' item.
+*/
+fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top);
+
+/**
+	DOM-like functions for html in xml.
+*/
+
+/**
+	Return a borrowed reference for the 'body' element of
+	the given DOM.
+*/
+fz_xml *fz_dom_body(fz_context *ctx, fz_xml *dom);
+
+/**
+	Return a borrowed reference for the document (the top
+	level element) of the DOM.
+*/
+fz_xml *fz_dom_document_element(fz_context *ctx, fz_xml *dom);
+
+/**
+	Create an element of a given tag type for the given DOM.
+
+	The element is not linked into the DOM yet.
+*/
+fz_xml *fz_dom_create_element(fz_context *ctx, fz_xml *dom, const char *tag);
+
+/**
+	Create a text node for the given DOM.
+
+	The text node is not linked into the DOM yet.
+*/
+fz_xml *fz_dom_create_text_node(fz_context *ctx, fz_xml *dom, const char *text);
+
+/**
+	Find the first element matching the requirements in a depth first traversal from elt.
+
+	The tagname must match tag, unless tag is NULL, when all tag names are considered to match.
+
+	If att is NULL, then attributes are not considered when matching.
+	Otherwise:
+		If match is NULL, then only nodes that have an att attribute match.
+		If match is non-NULL, then only nodes whose att attribute has a value matching match are accepted.
+
+	Returns NULL (if no match found), or a borrowed reference to the first matching element.
+*/
+fz_xml *fz_dom_find(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match);
+
+/**
+	Find the next element matching the requirements.
+*/
+fz_xml *fz_dom_find_next(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match);
+
+/**
+	Insert an element as the last child of a parent, unlinking the
+	child from its current position if required.
+*/
+void fz_dom_append_child(fz_context *ctx, fz_xml *parent, fz_xml *child);
+
+/**
+	Insert an element (new_elt), before another element (node),
+	unlinking the new_elt from its current position if required.
+*/
+void fz_dom_insert_before(fz_context *ctx, fz_xml *node, fz_xml *new_elt);
+
+/**
+	Insert an element (new_elt), after another element (node),
+	unlinking the new_elt from its current position if required.
+*/
+void fz_dom_insert_after(fz_context *ctx, fz_xml *node, fz_xml *new_elt);
+
+/**
+	Remove an element from the DOM. The element can be added back elsewhere
+	if required.
+
+	No reference counting changes for the element.
+*/
+void fz_dom_remove(fz_context *ctx, fz_xml *elt);
+
+/**
+	Clone an element (and its children).
+
+	A borrowed reference to the clone is returned. The clone is not
+	yet linked into the DOM.
+*/
+fz_xml *fz_dom_clone(fz_context *ctx, fz_xml *elt);
+
+/**
+	Return a borrowed reference to the first child of a node,
+	or NULL if there isn't one.
+*/
+fz_xml *fz_dom_first_child(fz_context *ctx, fz_xml *elt);
+
+/**
+	Return a borrowed reference to the parent of a node,
+	or NULL if there isn't one.
+*/
+fz_xml *fz_dom_parent(fz_context *ctx, fz_xml *elt);
+
+/**
+	Return a borrowed reference to the next sibling of a node,
+	or NULL if there isn't one.
+*/
+fz_xml *fz_dom_next(fz_context *ctx, fz_xml *elt);
+
+/**
+	Return a borrowed reference to the previous sibling of a node,
+	or NULL if there isn't one.
+*/
+fz_xml *fz_dom_previous(fz_context *ctx, fz_xml *elt);
+
+/**
+	Add an attribute to an element.
+
+	Ownership of att and value remain with the caller.
+*/
+void fz_dom_add_attribute(fz_context *ctx, fz_xml *elt, const char *att, const char *value);
+
+/**
+	Remove an attribute from an element.
+*/
+void fz_dom_remove_attribute(fz_context *ctx, fz_xml *elt, const char *att);
+
+/**
+	Retrieve the value of a given attribute from a given element.
+
+	Returns a borrowed pointer to the value or NULL if not found.
+*/
+const char *fz_dom_attribute(fz_context *ctx, fz_xml *elt, const char *att);
+
+/**
+	Enumerate through the attributes of an element.
+
+	Call with i=0,1,2,3... to enumerate attributes.
+
+	On return *att and the return value will be NULL if there are not
+	that many attributes to read. Otherwise, *att will be filled in
+	with a borrowed pointer to the attribute name, and the return
+	value will be a borrowed pointer to the value.
+*/
+const char *fz_dom_get_attribute(fz_context *ctx, fz_xml *elt, int i, const char **att);
+
+/**
+	Make new xml dom root element.
+*/
+fz_xml *fz_new_dom(fz_context *ctx, const char *tag);
+
+/**
+	Create a new dom node.
+
+	This will NOT be linked in yet.
+*/
+fz_xml *fz_new_dom_node(fz_context *ctx, fz_xml *dom, const char *tag);
+
+/**
+	Create a new dom text node.
+
+	This will NOT be linked in yet.
+*/
+fz_xml *fz_new_dom_text_node(fz_context *ctx, fz_xml *dom, const char *text);
+
+/**
+	Write our xml structure out to an xml stream.
+
+	Properly formatted XML is only allowed to have a single top-level node
+	under which everything must sit. Our structures allow for multiple
+	top level nodes. If required, we will output an extra 'ROOT' node
+	at the top so that the xml is well-formed.
+
+	If 'indented' is non-zero then additional whitespace will be added to
+	make the XML easier to read in a text editor. The indented output will
+	NOT be properly compliant.
+*/
+void fz_write_xml(fz_context *ctx, fz_xml *root, fz_output *out, int indented);
+
+/**
+	As for fz_write_xml, but direct to a file.
+*/
+void fz_save_xml(fz_context *ctx, fz_xml *root, const char *path, int indented);
+
+#endif