Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/html/xml-dom.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/html/xml-dom.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,655 @@ +// Copyright (C) 2022-2025 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "html-imp.h" + +#include "string.h" + +fz_xml *fz_story_document(fz_context *ctx, fz_story *story) +{ + if (story == NULL || story->dom == NULL) + return NULL; + + return story->dom; +} + +fz_xml *fz_dom_body(fz_context *ctx, fz_xml *dom) +{ + if (dom == NULL) + return NULL; + + return fz_xml_find_dfs(dom, "body", NULL, NULL); +} + +fz_xml *fz_dom_document_element(fz_context *ctx, fz_xml *dom) +{ + if (dom == NULL) + return NULL; + + while (dom->up) + dom = dom->up; + + return dom->down; +} + +static fz_xml * +doc_pointer(fz_xml *a) +{ + while (a->up) + a = a->up; + + return a; +} + +static void +check_same_doc(fz_context *ctx, fz_xml *a, fz_xml *b) +{ + /* Sanity check: The child and parent must come from the same doc. 
*/ + if (doc_pointer(a) != doc_pointer(b)) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "Parent and child must be from the same document"); +} + +/* Helper function to skip forward if we are passed a + * doc pointer in circumstances where we should not be. */ +static fz_xml * +skip_doc_pointer(fz_xml *x) +{ + return (x == NULL || !FZ_DOCUMENT_ITEM(x)) ? x : x->down; +} + +fz_xml * +fz_new_dom(fz_context *ctx, const char *tag) +{ + fz_pool *pool = fz_new_pool(ctx); + fz_xml *xml; + + fz_try(ctx) + { + xml = fz_pool_alloc(ctx, pool, sizeof *xml); + xml->up = NULL; + xml->down = NULL; + xml->u.doc.refs = 1; + xml->u.doc.pool = pool; + xml->down = fz_new_dom_node(ctx, xml, tag); + xml->down->up = xml; + } + fz_catch(ctx) + { + fz_drop_pool(ctx, pool); + fz_rethrow(ctx); + } + + return xml->down; +} + +fz_xml * +fz_new_dom_node(fz_context *ctx, fz_xml *dom, const char *tag) +{ + const char *ns; + fz_xml *xml; + size_t size; + + dom = doc_pointer(dom); + + /* skip namespace prefix */ + for (ns = tag; *ns; ++ns) + if (*ns == ':') + tag = ns + 1; + + size = offsetof(fz_xml, u.node.u.d.name) + ns-tag+1; + + xml = fz_pool_alloc(ctx, dom->u.doc.pool, size); + + memcpy(xml->u.node.u.d.name, tag, ns-tag+1); + xml->u.node.u.d.atts = NULL; + xml->down = NULL; + xml->up = dom; + xml->u.node.next = NULL; + xml->u.node.prev = NULL; +#ifdef FZ_XML_SEQ + /* We don't have sequence numbers here. */ + xml->seq = 0; +#endif + + return xml; +} + +fz_xml * +fz_new_dom_text_node(fz_context *ctx, fz_xml *dom, const char *text) +{ + fz_xml *xml; + size_t len = text ? strlen(text) : 0; + size_t size; + + dom = doc_pointer(dom); + + size = offsetof(fz_xml, u.node.u.text) + len + 1; + + xml = fz_pool_alloc(ctx, dom->u.doc.pool, size); + + if (text) + memcpy(xml->u.node.u.text, text, len); + xml->u.node.u.text[len] = 0; + xml->down = MAGIC_TEXT; + xml->up = dom; + xml->u.node.next = NULL; + xml->u.node.prev = NULL; +#ifdef FZ_XML_SEQ + /* We don't have sequence numbers here. 
*/ + xml->u.node.seq = 0; +#endif + + return xml; +} + +static fz_xml * +clone_xml(fz_context *ctx, fz_xml *dom, fz_xml *node) +{ + fz_xml *clone; + struct attribute **dst; + struct attribute *attr; + fz_xml *child, *prev; + + if (dom == NULL || node == NULL) + return NULL; + + /* Text nodes are simple. No children. */ + if (FZ_TEXT_ITEM(node)) + { + return fz_new_dom_text_node(ctx, dom, node->u.node.u.text); + } + + /* Clone a non-text node. */ + clone = fz_new_dom_node(ctx, dom, node->u.node.u.d.name); + + /* Clone the attributes. */ + attr = node->u.node.u.d.atts; + dst = &clone->u.node.u.d.atts; + while (attr) + { + size_t len = strlen(attr->name) + 1; + size_t size = offsetof(struct attribute, name) + len; + struct attribute *a = fz_pool_alloc(ctx, dom->u.doc.pool, size); + memcpy(a->name, attr->name, len); + a->next = NULL; + a->value = NULL; + if (attr->value) + { + a->value = fz_pool_alloc(ctx, dom->u.doc.pool, strlen(attr->value)+1); + strcpy(a->value, attr->value); + } + *dst = a; + dst = &a->next; + attr = attr->next; + } + + /* If we have no children, we're done. */ + if (node->down == NULL) + return clone; + + /* Copy the first child. */ + clone->down = clone_xml(ctx, dom, node->down); + clone->down->up = clone; + + /* And then run along all the successive children. */ + prev = clone->down; + child = node->down->u.node.next; + while (child) + { + prev->u.node.next = clone_xml(ctx, dom, child); + prev->u.node.prev = prev; + prev = prev->u.node.next; + prev->up = clone; + child = child->u.node.next; + } + + return clone; +} + +fz_xml *fz_dom_clone(fz_context *ctx, fz_xml *elt) +{ + fz_xml *dom; + + if (elt == NULL) + return NULL; + + /* We shouldn't be passed a document item really, but + * cope. */ + if (FZ_DOCUMENT_ITEM(elt)) + elt = elt->down; + + /* Find the document pointer. 
*/ + dom = elt; + while (dom->up) + dom = dom->up; + + return clone_xml(ctx, dom, elt); +} + +fz_xml *fz_dom_create_element(fz_context *ctx, fz_xml *dom, const char *tag) +{ + if (dom == NULL || tag == NULL) + return NULL; + + /* We make a new node, unconnected to anything else. + * up will still point to the dom root though. */ + return fz_new_dom_node(ctx, dom, tag); +} + +fz_xml *fz_dom_create_text_node(fz_context *ctx, fz_xml *dom, const char *text) +{ + if (dom == NULL || text == NULL) + return NULL; + + /* We make a new node, unconnected to anything else. */ + return fz_new_dom_text_node(ctx, dom, text); +} + +fz_xml *fz_dom_find(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match) +{ + if (elt == NULL) + return NULL; + + return fz_xml_find_dfs(elt, tag, att, match); +} + +fz_xml *fz_dom_find_next(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match) +{ + if (elt == NULL) + return NULL; + + return fz_xml_find_next_dfs(elt, tag, att, match); +} + +void fz_dom_append_child(fz_context *ctx, fz_xml *parent, fz_xml *child) +{ + fz_xml *x; + + child = skip_doc_pointer(child); + + if (parent == NULL || child == NULL) + return; + + check_same_doc(ctx, parent, child); + + /* Sanity checks: We can't add child to parent if parent is + * a child of child. */ + x = parent; + while (x) + { + if (x == child) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a parent to its child."); + x = x->up; + } + + /* First unlink child from anywhere it's currently linked in. */ + if (child->u.node.prev) + child->u.node.prev->u.node.next = child->u.node.next; + else if (child->up->down == child && !FZ_DOCUMENT_ITEM(child->up)) + child->up->down = child->u.node.next; + if (child->u.node.next) + child->u.node.next->u.node.prev = child->u.node.prev; + child->u.node.next = NULL; + child->u.node.prev = NULL; + + /* Now find where to insert the child. */ + if (parent->down == NULL) + { + /* Insert as first (and only) child. 
*/ + parent->down = child; + } + else + { + /* Find x, the current last child. */ + x = parent->down; + while (x->u.node.next) + x = x->u.node.next; + + /* And insert xchild after that. */ + x->u.node.next = child; + child->u.node.prev = x; + } + child->up = parent; +} + +void fz_dom_insert_before(fz_context *ctx, fz_xml *existing, fz_xml *elt) +{ + fz_xml *x; + + existing = skip_doc_pointer(existing); + elt = skip_doc_pointer(elt); + + if (existing == NULL || elt == NULL) + return; + + check_same_doc(ctx, existing, elt); + + /* Sanity check: We can't add elt before existing if existing is + * a child of elt. */ + x = existing; + while (x) + { + if (x == elt) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a node before its child."); + x = x->up; + } + + /* First unlink elt from anywhere it's currently linked in. */ + if (elt->u.node.prev) + elt->u.node.prev->u.node.next = elt->u.node.next; + else if (elt->up && !FZ_DOCUMENT_ITEM(elt->up)) + elt->up->down = elt->u.node.next; + if (elt->u.node.next) + elt->u.node.next->u.node.prev = elt->u.node.prev; + elt->u.node.next = NULL; + elt->u.node.prev = NULL; + elt->up = NULL; + + /* Now insert the element */ + elt->u.node.prev = existing->u.node.prev; + if (elt->u.node.prev) + elt->u.node.prev->u.node.next = elt; + else if (existing->up && !FZ_DOCUMENT_ITEM(existing->up)) + existing->up->down = elt; + elt->u.node.next = existing; + existing->u.node.prev = elt; + elt->up = existing->up; +} + +void fz_dom_insert_after(fz_context *ctx, fz_xml *existing, fz_xml *elt) +{ + fz_xml *x; + + existing = skip_doc_pointer(existing); + elt = skip_doc_pointer(elt); + + if (existing == NULL || elt == NULL) + return; + + check_same_doc(ctx, existing, elt); + + /* Sanity check: We can't add elt before existing if existing is + * a child of elt. 
*/ + x = existing; + while (x) + { + if (x == elt) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a node after its child."); + x = x->up; + } + + /* First unlink child from anywhere it's currently linked in. */ + if (elt->u.node.prev) + elt->u.node.prev->u.node.next = elt->u.node.next; + else if (elt->up && !FZ_DOCUMENT_ITEM(elt->up)) + elt->up->down = elt->u.node.next; + if (elt->u.node.next) + elt->u.node.next->u.node.prev = elt->u.node.prev; + elt->u.node.next = NULL; + elt->u.node.prev = NULL; + + /* Now insert the element */ + elt->u.node.next = existing->u.node.next; + if (elt->u.node.next) + elt->u.node.next->u.node.prev = elt; + elt->u.node.prev = existing; + existing->u.node.next = elt; + elt->up = existing->up; +} + +void fz_dom_remove(fz_context *ctx, fz_xml *elt) +{ + elt = skip_doc_pointer(elt); + + if (elt == NULL) + return; + + /* Unlink child from anywhere it's currently linked in. */ + if (elt->u.node.prev) + elt->u.node.prev->u.node.next = elt->u.node.next; + else if (elt->up && !FZ_DOCUMENT_ITEM(elt)) + elt->up->down = elt->u.node.next; + if (elt->u.node.next) + elt->u.node.next->u.node.prev = elt->u.node.prev; + elt->u.node.next = NULL; + elt->u.node.prev = NULL; + elt->up = doc_pointer(elt); +} + +fz_xml *fz_dom_first_child(fz_context *ctx, fz_xml *elt) +{ + elt = skip_doc_pointer(elt); + + if (elt == NULL || FZ_TEXT_ITEM(elt)) + return NULL; + + return elt->down; +} + +fz_xml *fz_dom_parent(fz_context *ctx, fz_xml *elt) +{ + elt = skip_doc_pointer(elt); + + if (elt == NULL) + return NULL; + + if (FZ_DOCUMENT_ITEM(elt->up)) + return NULL; + + return elt->up; +} + +fz_xml *fz_dom_next(fz_context *ctx, fz_xml *elt) +{ + elt = skip_doc_pointer(elt); + + if (elt == NULL) + return NULL; + + return elt->u.node.next; +} + +fz_xml *fz_dom_previous(fz_context *ctx, fz_xml *elt) +{ + elt = skip_doc_pointer(elt); + + if (elt == NULL) + return NULL; + + return elt->u.node.prev; +} + +void fz_dom_add_attribute(fz_context *ctx, fz_xml *elt, const char *att, 
const char *value) +{ + struct attribute *attr; + size_t len, size; + char *mvalue = NULL; + fz_xml *doc; + + elt = skip_doc_pointer(elt); + + if (elt == NULL || att == NULL) + return; + + if (FZ_TEXT_ITEM(elt)) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "Cannot add attributes to text node."); + + /* Move value to being a malloced thing, with the entity parsing done. */ + if (value) { + char *d; + const char *s = value; + d = mvalue = fz_malloc(ctx, strlen(value)+1); + + while (*s) + { + if (*s == '&') { + int c; + s += xml_parse_entity(&c, s); + d += fz_runetochar(d, c); + } + else + *d++ = *s++; + } + *d = 0; + } + + /* Do we have an attribute we can reuse? */ + attr = elt->u.node.u.d.atts; + while (attr) + { + if (strcmp(att, attr->name) == 0) + { + /* Reuse this one. */ + break; + } + attr = attr->next; + } + + if (attr && attr->value) + { + if (mvalue == NULL) + { + /* Just rewrite the existing value to be NULL. This + * 'leaks' the old value within the pool, so it will + * be cleaned up at the end. */ + attr->value = NULL; + return; + } + if (strcmp(mvalue, attr->value) == 0) + { + /* Old and new values match. Nothing to change. */ + return; + } + } + + doc = doc_pointer(elt); + /* Move mvalue to be an fz_pool thing. */ + if (mvalue) + { + char *tmp; + fz_try(ctx) + { + tmp = fz_pool_alloc(ctx, doc->u.doc.pool, strlen(mvalue)+1); + strcpy(tmp, mvalue); + } + fz_always(ctx) + fz_free(ctx, mvalue); + fz_catch(ctx) + fz_rethrow(ctx); + mvalue = tmp; + } + + /* Make a new one and prepend it. 
*/ + len = strlen(att) + 1; + size = offsetof(struct attribute, name) + len; + attr = fz_pool_alloc(ctx, doc->u.doc.pool, size); + memcpy(attr->name, att, len); + attr->next = elt->u.node.u.d.atts; + elt->u.node.u.d.atts = attr; + attr->value = mvalue; +} + +void fz_dom_remove_attribute(fz_context *ctx, fz_xml *elt, const char *att) +{ + struct attribute **attr; + + elt = skip_doc_pointer(elt); + + if (elt == NULL || att == NULL) + return; + + if (FZ_TEXT_ITEM(elt)) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "Cannot add attributes to text node."); + + attr = &elt->u.node.u.d.atts; + while (*attr) + { + if (strcmp(att, (*attr)->name) == 0) + { + /* Delete this one. */ + /* The old attr/value are 'leaked' within the pool. */ + *attr = (*attr)->next; + break; + } + attr = &(*attr)->next; + } +} + +const char *fz_dom_attribute(fz_context *ctx, fz_xml *elt, const char *att) +{ + struct attribute *attr; + + elt = skip_doc_pointer(elt); + + if (elt == NULL || att == NULL) + return NULL; + + /* Text nodes don't have attributes. */ + if (FZ_TEXT_ITEM(elt)) + return NULL; + + attr = elt->u.node.u.d.atts; + while (attr) + { + if (strcmp(att, attr->name) == 0) + { + /* Found! */ + return attr->value; + } + } + return NULL; +} + +const char *fz_dom_get_attribute(fz_context *ctx, fz_xml *elt, int i, const char **att) +{ + struct attribute *attr; + + if (elt == NULL || att == NULL) + { + if (att) + *att = NULL; + return NULL; + } + + /* Text nodes don't have attributes. */ + if (FZ_TEXT_ITEM(elt) || i < 0) + { + *att = NULL; + return NULL; + } + + attr = elt->u.node.u.d.atts; + while (attr) + { + if (i == 0) + { + /* Found! */ + *att = attr->name; + return attr->value; + } + i--; + attr = attr->next; + } + + *att = NULL; + return NULL; +}
