comparison mupdf-source/source/html/xml-dom.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2022-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "html-imp.h"
24
25 #include "string.h"
26
27 fz_xml *fz_story_document(fz_context *ctx, fz_story *story)
28 {
29 if (story == NULL || story->dom == NULL)
30 return NULL;
31
32 return story->dom;
33 }
34
35 fz_xml *fz_dom_body(fz_context *ctx, fz_xml *dom)
36 {
37 if (dom == NULL)
38 return NULL;
39
40 return fz_xml_find_dfs(dom, "body", NULL, NULL);
41 }
42
43 fz_xml *fz_dom_document_element(fz_context *ctx, fz_xml *dom)
44 {
45 if (dom == NULL)
46 return NULL;
47
48 while (dom->up)
49 dom = dom->up;
50
51 return dom->down;
52 }
53
54 static fz_xml *
55 doc_pointer(fz_xml *a)
56 {
57 while (a->up)
58 a = a->up;
59
60 return a;
61 }
62
63 static void
64 check_same_doc(fz_context *ctx, fz_xml *a, fz_xml *b)
65 {
66 /* Sanity check: The child and parent must come from the same doc. */
67 if (doc_pointer(a) != doc_pointer(b))
68 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Parent and child must be from the same document");
69 }
70
71 /* Helper function to skip forward if we are passed a
72 * doc pointer in circumstances where we should not be. */
73 static fz_xml *
74 skip_doc_pointer(fz_xml *x)
75 {
76 return (x == NULL || !FZ_DOCUMENT_ITEM(x)) ? x : x->down;
77 }
78
79 fz_xml *
80 fz_new_dom(fz_context *ctx, const char *tag)
81 {
82 fz_pool *pool = fz_new_pool(ctx);
83 fz_xml *xml;
84
85 fz_try(ctx)
86 {
87 xml = fz_pool_alloc(ctx, pool, sizeof *xml);
88 xml->up = NULL;
89 xml->down = NULL;
90 xml->u.doc.refs = 1;
91 xml->u.doc.pool = pool;
92 xml->down = fz_new_dom_node(ctx, xml, tag);
93 xml->down->up = xml;
94 }
95 fz_catch(ctx)
96 {
97 fz_drop_pool(ctx, pool);
98 fz_rethrow(ctx);
99 }
100
101 return xml->down;
102 }
103
104 fz_xml *
105 fz_new_dom_node(fz_context *ctx, fz_xml *dom, const char *tag)
106 {
107 const char *ns;
108 fz_xml *xml;
109 size_t size;
110
111 dom = doc_pointer(dom);
112
113 /* skip namespace prefix */
114 for (ns = tag; *ns; ++ns)
115 if (*ns == ':')
116 tag = ns + 1;
117
118 size = offsetof(fz_xml, u.node.u.d.name) + ns-tag+1;
119
120 xml = fz_pool_alloc(ctx, dom->u.doc.pool, size);
121
122 memcpy(xml->u.node.u.d.name, tag, ns-tag+1);
123 xml->u.node.u.d.atts = NULL;
124 xml->down = NULL;
125 xml->up = dom;
126 xml->u.node.next = NULL;
127 xml->u.node.prev = NULL;
128 #ifdef FZ_XML_SEQ
129 /* We don't have sequence numbers here. */
130 xml->seq = 0;
131 #endif
132
133 return xml;
134 }
135
136 fz_xml *
137 fz_new_dom_text_node(fz_context *ctx, fz_xml *dom, const char *text)
138 {
139 fz_xml *xml;
140 size_t len = text ? strlen(text) : 0;
141 size_t size;
142
143 dom = doc_pointer(dom);
144
145 size = offsetof(fz_xml, u.node.u.text) + len + 1;
146
147 xml = fz_pool_alloc(ctx, dom->u.doc.pool, size);
148
149 if (text)
150 memcpy(xml->u.node.u.text, text, len);
151 xml->u.node.u.text[len] = 0;
152 xml->down = MAGIC_TEXT;
153 xml->up = dom;
154 xml->u.node.next = NULL;
155 xml->u.node.prev = NULL;
156 #ifdef FZ_XML_SEQ
157 /* We don't have sequence numbers here. */
158 xml->u.node.seq = 0;
159 #endif
160
161 return xml;
162 }
163
164 static fz_xml *
165 clone_xml(fz_context *ctx, fz_xml *dom, fz_xml *node)
166 {
167 fz_xml *clone;
168 struct attribute **dst;
169 struct attribute *attr;
170 fz_xml *child, *prev;
171
172 if (dom == NULL || node == NULL)
173 return NULL;
174
175 /* Text nodes are simple. No children. */
176 if (FZ_TEXT_ITEM(node))
177 {
178 return fz_new_dom_text_node(ctx, dom, node->u.node.u.text);
179 }
180
181 /* Clone a non-text node. */
182 clone = fz_new_dom_node(ctx, dom, node->u.node.u.d.name);
183
184 /* Clone the attributes. */
185 attr = node->u.node.u.d.atts;
186 dst = &clone->u.node.u.d.atts;
187 while (attr)
188 {
189 size_t len = strlen(attr->name) + 1;
190 size_t size = offsetof(struct attribute, name) + len;
191 struct attribute *a = fz_pool_alloc(ctx, dom->u.doc.pool, size);
192 memcpy(a->name, attr->name, len);
193 a->next = NULL;
194 a->value = NULL;
195 if (attr->value)
196 {
197 a->value = fz_pool_alloc(ctx, dom->u.doc.pool, strlen(attr->value)+1);
198 strcpy(a->value, attr->value);
199 }
200 *dst = a;
201 dst = &a->next;
202 attr = attr->next;
203 }
204
205 /* If we have no children, we're done. */
206 if (node->down == NULL)
207 return clone;
208
209 /* Copy the first child. */
210 clone->down = clone_xml(ctx, dom, node->down);
211 clone->down->up = clone;
212
213 /* And then run along all the successive children. */
214 prev = clone->down;
215 child = node->down->u.node.next;
216 while (child)
217 {
218 prev->u.node.next = clone_xml(ctx, dom, child);
219 prev->u.node.prev = prev;
220 prev = prev->u.node.next;
221 prev->up = clone;
222 child = child->u.node.next;
223 }
224
225 return clone;
226 }
227
228 fz_xml *fz_dom_clone(fz_context *ctx, fz_xml *elt)
229 {
230 fz_xml *dom;
231
232 if (elt == NULL)
233 return NULL;
234
235 /* We shouldn't be passed a document item really, but
236 * cope. */
237 if (FZ_DOCUMENT_ITEM(elt))
238 elt = elt->down;
239
240 /* Find the document pointer. */
241 dom = elt;
242 while (dom->up)
243 dom = dom->up;
244
245 return clone_xml(ctx, dom, elt);
246 }
247
248 fz_xml *fz_dom_create_element(fz_context *ctx, fz_xml *dom, const char *tag)
249 {
250 if (dom == NULL || tag == NULL)
251 return NULL;
252
253 /* We make a new node, unconnected to anything else.
254 * up will still point to the dom root though. */
255 return fz_new_dom_node(ctx, dom, tag);
256 }
257
258 fz_xml *fz_dom_create_text_node(fz_context *ctx, fz_xml *dom, const char *text)
259 {
260 if (dom == NULL || text == NULL)
261 return NULL;
262
263 /* We make a new node, unconnected to anything else. */
264 return fz_new_dom_text_node(ctx, dom, text);
265 }
266
267 fz_xml *fz_dom_find(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match)
268 {
269 if (elt == NULL)
270 return NULL;
271
272 return fz_xml_find_dfs(elt, tag, att, match);
273 }
274
275 fz_xml *fz_dom_find_next(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match)
276 {
277 if (elt == NULL)
278 return NULL;
279
280 return fz_xml_find_next_dfs(elt, tag, att, match);
281 }
282
283 void fz_dom_append_child(fz_context *ctx, fz_xml *parent, fz_xml *child)
284 {
285 fz_xml *x;
286
287 child = skip_doc_pointer(child);
288
289 if (parent == NULL || child == NULL)
290 return;
291
292 check_same_doc(ctx, parent, child);
293
294 /* Sanity checks: We can't add child to parent if parent is
295 * a child of child. */
296 x = parent;
297 while (x)
298 {
299 if (x == child)
300 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a parent to its child.");
301 x = x->up;
302 }
303
304 /* First unlink child from anywhere it's currently linked in. */
305 if (child->u.node.prev)
306 child->u.node.prev->u.node.next = child->u.node.next;
307 else if (child->up->down == child && !FZ_DOCUMENT_ITEM(child->up))
308 child->up->down = child->u.node.next;
309 if (child->u.node.next)
310 child->u.node.next->u.node.prev = child->u.node.prev;
311 child->u.node.next = NULL;
312 child->u.node.prev = NULL;
313
314 /* Now find where to insert the child. */
315 if (parent->down == NULL)
316 {
317 /* Insert as first (and only) child. */
318 parent->down = child;
319 }
320 else
321 {
322 /* Find x, the current last child. */
323 x = parent->down;
324 while (x->u.node.next)
325 x = x->u.node.next;
326
327 /* And insert xchild after that. */
328 x->u.node.next = child;
329 child->u.node.prev = x;
330 }
331 child->up = parent;
332 }
333
334 void fz_dom_insert_before(fz_context *ctx, fz_xml *existing, fz_xml *elt)
335 {
336 fz_xml *x;
337
338 existing = skip_doc_pointer(existing);
339 elt = skip_doc_pointer(elt);
340
341 if (existing == NULL || elt == NULL)
342 return;
343
344 check_same_doc(ctx, existing, elt);
345
346 /* Sanity check: We can't add elt before existing if existing is
347 * a child of elt. */
348 x = existing;
349 while (x)
350 {
351 if (x == elt)
352 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a node before its child.");
353 x = x->up;
354 }
355
356 /* First unlink elt from anywhere it's currently linked in. */
357 if (elt->u.node.prev)
358 elt->u.node.prev->u.node.next = elt->u.node.next;
359 else if (elt->up && !FZ_DOCUMENT_ITEM(elt->up))
360 elt->up->down = elt->u.node.next;
361 if (elt->u.node.next)
362 elt->u.node.next->u.node.prev = elt->u.node.prev;
363 elt->u.node.next = NULL;
364 elt->u.node.prev = NULL;
365 elt->up = NULL;
366
367 /* Now insert the element */
368 elt->u.node.prev = existing->u.node.prev;
369 if (elt->u.node.prev)
370 elt->u.node.prev->u.node.next = elt;
371 else if (existing->up && !FZ_DOCUMENT_ITEM(existing->up))
372 existing->up->down = elt;
373 elt->u.node.next = existing;
374 existing->u.node.prev = elt;
375 elt->up = existing->up;
376 }
377
378 void fz_dom_insert_after(fz_context *ctx, fz_xml *existing, fz_xml *elt)
379 {
380 fz_xml *x;
381
382 existing = skip_doc_pointer(existing);
383 elt = skip_doc_pointer(elt);
384
385 if (existing == NULL || elt == NULL)
386 return;
387
388 check_same_doc(ctx, existing, elt);
389
390 /* Sanity check: We can't add elt before existing if existing is
391 * a child of elt. */
392 x = existing;
393 while (x)
394 {
395 if (x == elt)
396 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't add a node after its child.");
397 x = x->up;
398 }
399
400 /* First unlink child from anywhere it's currently linked in. */
401 if (elt->u.node.prev)
402 elt->u.node.prev->u.node.next = elt->u.node.next;
403 else if (elt->up && !FZ_DOCUMENT_ITEM(elt->up))
404 elt->up->down = elt->u.node.next;
405 if (elt->u.node.next)
406 elt->u.node.next->u.node.prev = elt->u.node.prev;
407 elt->u.node.next = NULL;
408 elt->u.node.prev = NULL;
409
410 /* Now insert the element */
411 elt->u.node.next = existing->u.node.next;
412 if (elt->u.node.next)
413 elt->u.node.next->u.node.prev = elt;
414 elt->u.node.prev = existing;
415 existing->u.node.next = elt;
416 elt->up = existing->up;
417 }
418
419 void fz_dom_remove(fz_context *ctx, fz_xml *elt)
420 {
421 elt = skip_doc_pointer(elt);
422
423 if (elt == NULL)
424 return;
425
426 /* Unlink child from anywhere it's currently linked in. */
427 if (elt->u.node.prev)
428 elt->u.node.prev->u.node.next = elt->u.node.next;
429 else if (elt->up && !FZ_DOCUMENT_ITEM(elt))
430 elt->up->down = elt->u.node.next;
431 if (elt->u.node.next)
432 elt->u.node.next->u.node.prev = elt->u.node.prev;
433 elt->u.node.next = NULL;
434 elt->u.node.prev = NULL;
435 elt->up = doc_pointer(elt);
436 }
437
438 fz_xml *fz_dom_first_child(fz_context *ctx, fz_xml *elt)
439 {
440 elt = skip_doc_pointer(elt);
441
442 if (elt == NULL || FZ_TEXT_ITEM(elt))
443 return NULL;
444
445 return elt->down;
446 }
447
448 fz_xml *fz_dom_parent(fz_context *ctx, fz_xml *elt)
449 {
450 elt = skip_doc_pointer(elt);
451
452 if (elt == NULL)
453 return NULL;
454
455 if (FZ_DOCUMENT_ITEM(elt->up))
456 return NULL;
457
458 return elt->up;
459 }
460
461 fz_xml *fz_dom_next(fz_context *ctx, fz_xml *elt)
462 {
463 elt = skip_doc_pointer(elt);
464
465 if (elt == NULL)
466 return NULL;
467
468 return elt->u.node.next;
469 }
470
471 fz_xml *fz_dom_previous(fz_context *ctx, fz_xml *elt)
472 {
473 elt = skip_doc_pointer(elt);
474
475 if (elt == NULL)
476 return NULL;
477
478 return elt->u.node.prev;
479 }
480
481 void fz_dom_add_attribute(fz_context *ctx, fz_xml *elt, const char *att, const char *value)
482 {
483 struct attribute *attr;
484 size_t len, size;
485 char *mvalue = NULL;
486 fz_xml *doc;
487
488 elt = skip_doc_pointer(elt);
489
490 if (elt == NULL || att == NULL)
491 return;
492
493 if (FZ_TEXT_ITEM(elt))
494 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Cannot add attributes to text node.");
495
496 /* Move value to being a malloced thing, with the entity parsing done. */
497 if (value) {
498 char *d;
499 const char *s = value;
500 d = mvalue = fz_malloc(ctx, strlen(value)+1);
501
502 while (*s)
503 {
504 if (*s == '&') {
505 int c;
506 s += xml_parse_entity(&c, s);
507 d += fz_runetochar(d, c);
508 }
509 else
510 *d++ = *s++;
511 }
512 *d = 0;
513 }
514
515 /* Do we have an attribute we can reuse? */
516 attr = elt->u.node.u.d.atts;
517 while (attr)
518 {
519 if (strcmp(att, attr->name) == 0)
520 {
521 /* Reuse this one. */
522 break;
523 }
524 attr = attr->next;
525 }
526
527 if (attr && attr->value)
528 {
529 if (mvalue == NULL)
530 {
531 /* Just rewrite the existing value to be NULL. This
532 * 'leaks' the old value within the pool, so it will
533 * be cleaned up at the end. */
534 attr->value = NULL;
535 return;
536 }
537 if (strcmp(mvalue, attr->value) == 0)
538 {
539 /* Old and new values match. Nothing to change. */
540 return;
541 }
542 }
543
544 doc = doc_pointer(elt);
545 /* Move mvalue to be an fz_pool thing. */
546 if (mvalue)
547 {
548 char *tmp;
549 fz_try(ctx)
550 {
551 tmp = fz_pool_alloc(ctx, doc->u.doc.pool, strlen(mvalue)+1);
552 strcpy(tmp, mvalue);
553 }
554 fz_always(ctx)
555 fz_free(ctx, mvalue);
556 fz_catch(ctx)
557 fz_rethrow(ctx);
558 mvalue = tmp;
559 }
560
561 /* Make a new one and prepend it. */
562 len = strlen(att) + 1;
563 size = offsetof(struct attribute, name) + len;
564 attr = fz_pool_alloc(ctx, doc->u.doc.pool, size);
565 memcpy(attr->name, att, len);
566 attr->next = elt->u.node.u.d.atts;
567 elt->u.node.u.d.atts = attr;
568 attr->value = mvalue;
569 }
570
571 void fz_dom_remove_attribute(fz_context *ctx, fz_xml *elt, const char *att)
572 {
573 struct attribute **attr;
574
575 elt = skip_doc_pointer(elt);
576
577 if (elt == NULL || att == NULL)
578 return;
579
580 if (FZ_TEXT_ITEM(elt))
581 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Cannot add attributes to text node.");
582
583 attr = &elt->u.node.u.d.atts;
584 while (*attr)
585 {
586 if (strcmp(att, (*attr)->name) == 0)
587 {
588 /* Delete this one. */
589 /* The old attr/value are 'leaked' within the pool. */
590 *attr = (*attr)->next;
591 break;
592 }
593 attr = &(*attr)->next;
594 }
595 }
596
597 const char *fz_dom_attribute(fz_context *ctx, fz_xml *elt, const char *att)
598 {
599 struct attribute *attr;
600
601 elt = skip_doc_pointer(elt);
602
603 if (elt == NULL || att == NULL)
604 return NULL;
605
606 /* Text nodes don't have attributes. */
607 if (FZ_TEXT_ITEM(elt))
608 return NULL;
609
610 attr = elt->u.node.u.d.atts;
611 while (attr)
612 {
613 if (strcmp(att, attr->name) == 0)
614 {
615 /* Found! */
616 return attr->value;
617 }
618 }
619 return NULL;
620 }
621
622 const char *fz_dom_get_attribute(fz_context *ctx, fz_xml *elt, int i, const char **att)
623 {
624 struct attribute *attr;
625
626 if (elt == NULL || att == NULL)
627 {
628 if (att)
629 *att = NULL;
630 return NULL;
631 }
632
633 /* Text nodes don't have attributes. */
634 if (FZ_TEXT_ITEM(elt) || i < 0)
635 {
636 *att = NULL;
637 return NULL;
638 }
639
640 attr = elt->u.node.u.d.atts;
641 while (attr)
642 {
643 if (i == 0)
644 {
645 /* Found! */
646 *att = attr->name;
647 return attr->value;
648 }
649 i--;
650 attr = attr->next;
651 }
652
653 *att = NULL;
654 return NULL;
655 }