comparison mupdf-source/source/fitz/xml.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "xml-imp.h"
24
25 #include <string.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28
29 #if FZ_ENABLE_HTML_ENGINE
30 #include <gumbo.h>
31 #endif
32
/* Limit on node nesting: xml_emit_open_tag throws a syntax error as soon
 * as the parser depth reaches this value. */
33 #define FZ_XML_MAX_DEPTH 4096
34
/* Define FZ_XML_SEQ to number nodes in creation order; the number is stored
 * by xml_emit_open_tag and printed by fz_output_xml. */
35 /* #define FZ_XML_SEQ */
36
/* Named character references understood by xml_parse_entity.
 * 'name' is the entity name without the leading '&' and trailing ';',
 * 'c' is the code point it expands to. The table is searched linearly. */
37 static const struct { const char *name; int c; } html_entities[] = {
38 {"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163},
39 {"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167},
40 {"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171},
41 {"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176},
42 {"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180},
43 {"micro",181}, {"para",182}, {"middot",183}, {"cedil",184},
44 {"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188},
45 {"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192},
46 {"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196},
47 {"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200},
48 {"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204},
49 {"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208},
50 {"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, {"Ocirc",212},
51 {"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216},
52 {"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220},
53 {"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224},
54 {"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228},
55 {"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232},
56 {"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236},
57 {"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240},
58 {"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},
59 {"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248},
60 {"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252},
61 {"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62},
62 {"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339},
63 {"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710},
64 {"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201},
65 {"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207},
66 {"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217},
67 {"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222},
68 {"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249},
69 {"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913},
70 {"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917},
71 {"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922},
72 {"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927},
73 {"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933},
74 {"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945},
75 {"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949},
76 {"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954},
77 {"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959},
78 {"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964},
79 {"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969},
80 {"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226},
81 {"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254},
82 {"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476},
83 {"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593},
84 {"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629},
85 {"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659},
86 {"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707},
87 {"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713},
88 {"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722},
89 {"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734},
90 {"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746},
91 {"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773},
92 {"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805},
93 {"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838},
94 {"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869},
95 {"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970},
96 {"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674},
97 {"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830},
98 };
99
/* State shared by the xml_emit_* callbacks while a tree is being built. */
100 struct parser
101 {
/* Pool from which nodes, attributes and attribute values are allocated. */
102 fz_pool *pool;
/* Node currently open; newly emitted nodes become its children. */
103 fz_xml *head;
/* When zero, text nodes consisting only of whitespace are dropped. */
104 int preserve_white;
/* Number of currently open nodes; checked against FZ_XML_MAX_DEPTH. */
105 int depth;
106 #ifdef FZ_XML_SEQ
/* Sequence number handed to the next node that is created. */
107 int seq;
108 #endif
109 };
110
111 static void xml_indent(fz_context *ctx, fz_output *out, int n)
112 {
113 while (n--) {
114 fz_write_byte(ctx, out, ' ');
115 fz_write_byte(ctx, out, ' ');
116 }
117 }
118
119 void fz_debug_xml(fz_xml *item, int level)
120 {
121 /* This is a bit nasty as it relies on implementation
122 * details of both fz_stdout, and fz_write_printf coping
123 * with NULL ctx. */
124 fz_output_xml(NULL, fz_stdout(NULL), item, level);
125 }
126
127 void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level)
128 {
129 char *s;
130
131 if (item == NULL)
132 return;
133
134 /* Skip over the DOC object at the top. */
135 if (item->up == NULL)
136 {
137 fz_xml *child;
138 for (child = fz_xml_down(item); child; child = child->u.node.next)
139 fz_output_xml(ctx, out, child, level + 1);
140 return;
141 }
142
143 s = fz_xml_text(item);
144 xml_indent(ctx, out, level);
145 if (s)
146 {
147 int c;
148 fz_write_byte(ctx, out, '"');
149 while (*s) {
150 s += fz_chartorune(&c, s);
151 switch (c) {
152 default:
153 if (c > 0xFFFF)
154 fz_write_printf(ctx, out, "\\u{%X}", c);
155 else if (c < 32 || c > 127)
156 fz_write_printf(ctx, out, "\\u%04X", c);
157 else
158 fz_write_byte(ctx, out, c);
159 break;
160 case '\\': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, '\\'); break;
161 case '\b': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'b'); break;
162 case '\f': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'f'); break;
163 case '\n': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'n'); break;
164 case '\r': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'r'); break;
165 case '\t': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 't'); break;
166 }
167 }
168 fz_write_byte(ctx, out, '"');
169 #ifdef FZ_XML_SEQ
170 fz_write_printf(ctx, out, " <%d>", item->seq);
171 #endif
172 fz_write_byte(ctx, out, '\n');
173 }
174 else
175 {
176 fz_xml *child;
177 struct attribute *att;
178
179 #ifdef FZ_XML_SEQ
180 fz_write_printf(ctx, out, "(%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
181 #else
182 fz_write_printf(ctx, out, "(%s\n", item->u.node.u.d.name);
183 #endif
184 for (att = item->u.node.u.d.atts; att; att = att->next)
185 {
186 xml_indent(ctx, out, level);
187 fz_write_printf(ctx, out, "=%s %s\n", att->name, att->value);
188 }
189 for (child = fz_xml_down(item); child; child = child->u.node.next)
190 fz_output_xml(ctx, out, child, level + 1);
191 xml_indent(ctx, out, level);
192 #ifdef FZ_XML_SEQ
193 fz_write_printf(ctx, out, ")%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
194 #else
195 fz_write_printf(ctx, out, ")%s\n", item->u.node.u.d.name);
196 #endif
197 }
198 }
199
200 fz_xml *fz_xml_prev(fz_xml *item)
201 {
202 return item && item->up ? item->u.node.prev : NULL;
203 }
204
205 fz_xml *fz_xml_next(fz_xml *item)
206 {
207 return item && item->up ? item->u.node.next : NULL;
208 }
209
210 fz_xml *fz_xml_up(fz_xml *item)
211 {
212 /* Never step up to the DOC. */
213 return item && item->up && item->up->up ? item->up : NULL;
214 }
215
216 fz_xml *fz_xml_down(fz_xml *item)
217 {
218 /* DOC items can never have MAGIC_TEXT as their down value,
219 * so this is safe. */
220 return item && !FZ_TEXT_ITEM(item) ? item->down : NULL;
221 }
222
223 char *fz_xml_text(fz_xml *item)
224 {
225 /* DOC items can never have MAGIC_TEXT as their down value,
226 * so this is safe. */
227 return (item && FZ_TEXT_ITEM(item)) ? item->u.node.u.text : NULL;
228 }
229
230 char *fz_xml_tag(fz_xml *item)
231 {
232 /* DOC items can never have MAGIC_TEXT as their down value,
233 * so this is safe. */
234 return item && !FZ_TEXT_ITEM(item) ? item->u.node.u.d.name : NULL;
235 }
236
237 int fz_xml_is_tag(fz_xml *item, const char *name)
238 {
239 if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
240 return 0;
241 return !strcmp(item->u.node.u.d.name, name);
242 }
243
244 char *fz_xml_att(fz_xml *item, const char *name)
245 {
246 struct attribute *att;
247 if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
248 return NULL;
249 for (att = item->u.node.u.d.atts; att; att = att->next)
250 if (!strcmp(att->name, name))
251 return att->value;
252 return NULL;
253 }
254
255 char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two)
256 {
257 char *val = fz_xml_att(item, one);
258 if (!val)
259 val = fz_xml_att(item, two);
260 return val;
261 }
262
263 fz_xml *fz_xml_find(fz_xml *item, const char *tag)
264 {
265 /* Skip over any DOC item. */
266 if (item && FZ_DOCUMENT_ITEM(item))
267 item = item->down;
268
269 while (item)
270 {
271 if (!FZ_TEXT_ITEM(item) && !strcmp(item->u.node.u.d.name, tag))
272 return item;
273 item = item->u.node.next;
274 }
275 return NULL;
276 }
277
278 fz_xml *fz_xml_find_next(fz_xml *item, const char *tag)
279 {
280 /* Skip over any DOC item. */
281 if (item && FZ_DOCUMENT_ITEM(item))
282 item = item->down;
283
284 if (item)
285 item = item->u.node.next;
286 return fz_xml_find(item, tag);
287 }
288
289 fz_xml *fz_xml_find_down(fz_xml *item, const char *tag)
290 {
291 if (item)
292 item = fz_xml_down(item);
293 return fz_xml_find(item, tag);
294 }
295
296 int fz_xml_att_eq(fz_xml *item, const char *name, const char *match)
297 {
298 const char *val = fz_xml_att(item, name);
299
300 return val ? !strcmp(val, match) : 0;
301 }
302
303 fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match)
304 {
305 /* Skip over any document item. */
306 if (item && FZ_DOCUMENT_ITEM(item))
307 item = item->down;
308
309 while (1)
310 {
311 item = tag ? fz_xml_find(item, tag) : item;
312 if (item == NULL || fz_xml_att_eq(item, att, match))
313 break;
314 item = item->u.node.next;
315 }
316
317 return item;
318 }
319
320 fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match)
321 {
322 /* Skip over any document item. */
323 if (item && FZ_DOCUMENT_ITEM(item))
324 item = item->down;
325
326 if (item != NULL)
327 {
328 do
329 {
330 item = tag ? fz_xml_find_next(item, tag) : item->u.node.next;
331 }
332 while (item != NULL && !fz_xml_att_eq(item, att, match));
333 }
334
335 return item;
336 }
337
338 fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match)
339 {
340 return fz_xml_find_match(fz_xml_down(item), tag, att, match);
341 }
342
343 fz_xml *fz_xml_root(fz_xml *xml)
344 {
345 if (xml == NULL)
346 return NULL;
347
348 /* If we've been given a node mid-tree, run up to the root to find
349 * the doc node. */
350 while (xml->up)
351 xml = xml->up;
352
353 /* And the root is the child of the doc.*/
354 return xml->down;
355 }
356
357 void fz_drop_xml(fz_context *ctx, fz_xml *xml)
358 {
359 if (!xml)
360 return;
361
362 /* Wherever we are in the tree, we want the doc node at the root. */
363 while (xml->up)
364 xml = xml->up;
365
366 /* Drop a reference to the tree as a whole. */
367 if (fz_drop_imp(ctx, xml, &xml->u.doc.refs) == 0)
368 return;
369
370 fz_drop_pool(ctx, xml->u.doc.pool);
371 }
372
373 void fz_detach_xml(fz_context *ctx, fz_xml *node)
374 {
375 fz_xml *doc = node;
376
377 /* If we're already a document node, then this is a NOP. */
378 if (doc->up == NULL)
379 return;
380
381 /* Move doc to be the doc pointer at the top of the tree. */
382 while (doc->up)
383 {
384 doc = doc->up;
385 }
386
387 /* Relocate node to be the child of doc. */
388 node->up->down = NULL;
389 doc->down = node;
390
391 /* NOTE: Suppose that X = doc->down on entry. On exit doc->down == node, but
392 * X->up = doc. We need to be careful throughout this code to not assume that
393 * Y is always a child of Y->up. */
394 }
395
396 size_t xml_parse_entity(int *c, const char *a)
397 {
398 char *b;
399 size_t i;
400
401 if (a[1] == '#') {
402 if (a[2] == 'x')
403 *c = strtol(a + 3, &b, 16);
404 else
405 *c = strtol(a + 2, &b, 10);
406 if (*b == ';')
407 return b - a + 1;
408 }
409 else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') {
410 *c = '<';
411 return 4;
412 }
413 else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') {
414 *c = '>';
415 return 4;
416 }
417 else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') {
418 *c = '&';
419 return 5;
420 }
421 else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') {
422 *c = '\'';
423 return 6;
424 }
425 else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') {
426 *c = '"';
427 return 6;
428 }
429
430 /* We should only be doing this for XHTML, but it shouldn't be a problem. */
431 for (i = 0; i < nelem(html_entities); ++i) {
432 size_t n = strlen(html_entities[i].name);
433 if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') {
434 *c = html_entities[i].c;
435 return n + 2;
436 }
437 }
438
439 *c = *a;
440 return 1;
441 }
442
/* Return 1 if 'c' may appear in a tag or attribute name as accepted by
 * this parser (ASCII letters, digits, '.', '-', '_' and ':'), else 0. */
static inline int isname(int c)
{
	switch (c)
	{
	case '.':
	case '-':
	case '_':
	case ':':
		return 1;
	default:
		break;
	}
	if (c >= '0' && c <= '9')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	return c >= 'a' && c <= 'z';
}
450
/* Return 1 if 'c' is a space, carriage return, newline or tab, else 0. */
static inline int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\r':
	case '\n':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
455
456 static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, const char *a, const char *b, int is_text)
457 {
458 fz_xml *head, *tail;
459 const char *ns;
460 size_t size;
461
462 if (is_text)
463 size = offsetof(fz_xml, u.node.u.text) + b-a+1;
464 else
465 {
466 /* skip namespace prefix */
467 for (ns = a; ns < b - 1; ++ns)
468 if (*ns == ':')
469 a = ns + 1;
470
471 size = offsetof(fz_xml, u.node.u.d.name) + b-a+1;
472 }
473 head = fz_pool_alloc(ctx, parser->pool, size);
474
475 if (is_text)
476 head->down = MAGIC_TEXT;
477 else
478 {
479 memcpy(head->u.node.u.d.name, a, b - a);
480 head->u.node.u.d.name[b - a] = 0;
481 head->u.node.u.d.atts = NULL;
482 head->down = NULL;
483 }
484
485 head->up = parser->head;
486 head->u.node.next = NULL;
487 #ifdef FZ_XML_SEQ
488 head->u.node.seq = parser->seq++;
489 #endif
490
491 /* During construction, we use head->next to mean "the
492 * tail of the children. When we close the tag, we
493 * rewrite it to be NULL. */
494 if (!parser->head->down) {
495 parser->head->down = head;
496 parser->head->u.node.next = head;
497 head->u.node.prev = NULL;
498 }
499 else {
500 tail = parser->head->u.node.next;
501 tail->u.node.next = head;
502 head->u.node.prev = tail;
503 parser->head->u.node.next = head;
504 }
505
506 parser->head = head;
507 parser->depth++;
508 if (parser->depth >= FZ_XML_MAX_DEPTH)
509 fz_throw(ctx, FZ_ERROR_SYNTAX, "too deep xml element nesting");
510 }
511
512 static void xml_emit_att_name(fz_context *ctx, struct parser *parser, const char *a, const char *b)
513 {
514 fz_xml *head = parser->head;
515 struct attribute *att;
516 size_t size;
517
518 size = offsetof(struct attribute, name) + b-a+1;
519 att = fz_pool_alloc(ctx, parser->pool, size);
520 memcpy(att->name, a, b - a);
521 att->name[b - a] = 0;
522 att->value = NULL;
523 att->next = head->u.node.u.d.atts;
524 head->u.node.u.d.atts = att;
525 }
526
527 void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val)
528 {
529 size_t size = offsetof(struct attribute, name) + strlen(key) + 1;
530 struct attribute *att = fz_pool_alloc(ctx, pool, size);
531 memcpy(att->name, key, strlen(key)+1);
532 att->value = fz_pool_alloc(ctx, pool, strlen(val) + 1);
533 memcpy(att->value, val, strlen(val)+1);
534 att->next = node->u.node.u.d.atts;
535 node->u.node.u.d.atts = att;
536 }
537
538 static void xml_emit_att_value(fz_context *ctx, struct parser *parser, const char *a, const char *b)
539 {
540 fz_xml *head = parser->head;
541 struct attribute *att = head->u.node.u.d.atts;
542 char *s;
543 int c;
544
545 /* entities are all longer than UTFmax so runetochar is safe */
546 s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1);
547 while (a < b) {
548 if (*a == '&') {
549 a += xml_parse_entity(&c, a);
550 s += fz_runetochar(s, c);
551 }
552 else {
553 *s++ = *a++;
554 }
555 }
556 *s = 0;
557 }
558
559 static void xml_emit_close_tag(fz_context *ctx, struct parser *parser)
560 {
561 parser->depth--;
562 parser->head->u.node.next = NULL;
563 if (parser->head->up)
564 parser->head = parser->head->up;
565 }
566
567 static void xml_emit_text(fz_context *ctx, struct parser *parser, const char *a, const char *b)
568 {
569 fz_xml *head;
570 const char *p;
571 char *s;
572 int c;
573
574 /* Skip text outside the root tag */
575 if (parser->depth == 0)
576 return;
577
578 /* Skip all-whitespace text nodes */
579 if (!parser->preserve_white)
580 {
581 for (p = a; p < b; p++)
582 if (!iswhite(*p))
583 break;
584 if (p == b)
585 return;
586 }
587
588 xml_emit_open_tag(ctx, parser, a, b, 1);
589 head = parser->head;
590
591 /* entities are all longer than UTFmax so runetochar is safe */
592 s = fz_xml_text(head);
593 while (a < b) {
594 if (*a == '&') {
595 a += xml_parse_entity(&c, a);
596 s += fz_runetochar(s, c);
597 }
598 else {
599 *s++ = *a++;
600 }
601 }
602 *s = 0;
603
604 xml_emit_close_tag(ctx, parser);
605 }
606
607 static void xml_emit_cdata(fz_context *ctx, struct parser *parser, const char *a, const char *b)
608 {
609 fz_xml *head;
610 char *s;
611
612 xml_emit_open_tag(ctx, parser, a, b, 1);
613 head = parser->head;
614
615 s = head->u.node.u.text;
616 while (a < b)
617 *s++ = *a++;
618 *s = 0;
619
620 xml_emit_close_tag(ctx, parser);
621 }
622
623 static int close_tag(fz_context *ctx, struct parser *parser, const char *mark, const char *p)
624 {
625 const char *ns, *tag;
626
627 /* skip namespace prefix */
628 for (ns = mark; ns < p - 1; ++ns)
629 if (*ns == ':')
630 mark = ns + 1;
631
632 tag = fz_xml_tag(parser->head);
633 if (tag && strncmp(tag, mark, p-mark) == 0 && tag[p-mark] == 0)
634 {
635 xml_emit_close_tag(ctx, parser);
636 return 0;
637 }
638 return 1;
639 }
640
641 static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, const char *p) /* lgtm [cpp/use-of-goto] */
642 {
643 const char *mark;
644 int quote;
645
646 parse_text:
647 mark = p;
648 while (*p && *p != '<') ++p;
649 if (*p == '<') {
650 if (mark < p)
651 xml_emit_text(ctx, parser, mark, p);
652 ++p;
653 goto parse_element;
654 } else if (mark < p)
655 xml_emit_text(ctx, parser, mark, p);
656 return NULL;
657
658 parse_element:
659 if (*p == '/') { ++p; goto parse_closing_element; }
660 if (*p == '!') { ++p; goto parse_comment; }
661 if (*p == '?') { ++p; goto parse_processing_instruction; }
662 while (iswhite(*p)) ++p;
663 if (isname(*p))
664 goto parse_element_name;
665 return "syntax error in element";
666
667 parse_comment:
668 if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E')
669 goto parse_declaration;
670 if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y')
671 goto parse_declaration;
672 if (*p == '[') goto parse_cdata;
673 if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
674 if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
675 while (*p) {
676 if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
677 p += 3;
678 goto parse_text;
679 }
680 ++p;
681 }
682 return "end of data in comment";
683
684 parse_declaration:
685 while (*p) if (*p++ == '>') goto parse_text;
686 return "end of data in declaration";
687
688 parse_cdata:
689 if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
690 return "syntax error in CDATA section";
691 p += 7;
692 mark = p;
693 while (*p) {
694 if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
695 xml_emit_cdata(ctx, parser, mark, p);
696 p += 3;
697 goto parse_text;
698 }
699 ++p;
700 }
701 return "end of data in CDATA section";
702
703 parse_processing_instruction:
704 while (*p) {
705 if (p[0] == '?' && p[1] == '>') {
706 p += 2;
707 goto parse_text;
708 }
709 ++p;
710 }
711 return "end of data in processing instruction";
712
713 parse_closing_element:
714 while (iswhite(*p)) ++p;
715 mark = p;
716 while (isname(*p)) ++p;
717 if (!isname(*mark))
718 return "syntax error in closing element";
719 if (close_tag(ctx, parser, mark, p))
720 return "opening and closing tag mismatch";
721 while (iswhite(*p)) ++p;
722 if (*p != '>')
723 return "syntax error in closing element";
724 ++p;
725 goto parse_text;
726
727 parse_element_name:
728 mark = p;
729 while (isname(*p)) ++p;
730 xml_emit_open_tag(ctx, parser, mark, p, 0);
731 if (*p == '>') {
732 ++p;
733 goto parse_text;
734 }
735 if (p[0] == '/' && p[1] == '>') {
736 xml_emit_close_tag(ctx, parser);
737 p += 2;
738 goto parse_text;
739 }
740 if (iswhite(*p))
741 goto parse_attributes;
742 return "syntax error after element name";
743
744 parse_attributes:
745 while (iswhite(*p)) ++p;
746 if (isname(*p))
747 goto parse_attribute_name;
748 if (*p == '>') {
749 ++p;
750 goto parse_text;
751 }
752 if (p[0] == '/' && p[1] == '>') {
753 xml_emit_close_tag(ctx, parser);
754 p += 2;
755 goto parse_text;
756 }
757 return "syntax error in attributes";
758
759 parse_attribute_name:
760 mark = p;
761 while (isname(*p)) ++p;
762 xml_emit_att_name(ctx, parser, mark, p);
763 while (iswhite(*p)) ++p;
764 if (*p == '=') { ++p; goto parse_attribute_value; }
765 return "syntax error after attribute name";
766
767 parse_attribute_value:
768 while (iswhite(*p)) ++p;
769 quote = *p++;
770 mark = p;
771
772 /* special case for handling MOBI filepos=00000 syntax */
773 if (quote >= '0' && quote <= '9') {
774 while (*p >= '0' && *p <= '9') ++p;
775 xml_emit_att_value(ctx, parser, mark, p);
776 goto parse_attributes;
777 }
778
779 if (quote != '"' && quote != '\'')
780 return "missing quote character";
781 while (*p && *p != quote) ++p;
782 if (*p == quote) {
783 xml_emit_att_value(ctx, parser, mark, p++);
784 goto parse_attributes;
785 }
786 return "end of data in attribute value";
787 }
788
/* Lower-case an ASCII upper-case letter; every other value is returned
 * unchanged. Locale independent. */
static int fast_tolower(int c)
{
	if (c >= 'A' && c <= 'Z')
		return c + ('a' - 'A');
	return c;
}
795
/* Compare at most 'n' characters of 'a' and 'b', ignoring ASCII case.
 * Returns zero if they are equal, otherwise the difference between the
 * first pair of lower-cased characters that differ. */
static int fast_strncasecmp(const char *a, const char *b, size_t n)
{
	size_t left;

	if (n == 0)
		return 0;

	/* Advance over at most n - 1 matching characters; the final
	 * comparison below looks at the character we stop on. */
	left = n - 1;
	while (left != 0 && *a != 0 && *b != 0)
	{
		if (fast_tolower(*a) != fast_tolower(*b))
			break;
		++a;
		++b;
		--left;
	}
	return fast_tolower(*a) - fast_tolower(*b);
}
804
/* Return a pointer to the first occurrence of 'n' in 'h', ignoring ASCII
 * case, or NULL if there is none. 'n' must not be empty: its first
 * character is always consumed before the rest is measured. */
static char *fast_strcasestr(char *h, char *n)
{
	int first = fast_tolower(n[0]);
	char *rest = n + 1;
	size_t rest_len = strlen(rest);
	char *cur;

	for (cur = h; *cur != 0; ++cur)
	{
		if (fast_tolower(*cur) != first)
			continue;
		if (fast_strncasecmp(cur + 1, rest, rest_len) == 0)
			return cur;
	}
	return NULL;
}
817
/* Return non-zero if 'a' begins with 'b', ignoring ASCII case. */
static int startswith(const char *a, const char *b)
{
	size_t prefix_len = strlen(b);

	return fast_strncasecmp(a, b, prefix_len) == 0;
}
822
/* https://encoding.spec.whatwg.org/#names-and-labels */
/* Maps a label that may appear in a document ('alias') to the name of
 * the decoder to use ('encoding'). match_encoding_name scans the table
 * in order and returns the first entry whose alias is a prefix of the
 * label, so a label must not appear under two encodings. */
static struct { char *encoding; char *alias; } encoding_aliases[] = {
	{ "big5", "big5" },
	{ "big5", "big5-hkscs" },
	{ "big5", "cn-big5" },
	{ "big5", "csbig5" },
	{ "big5", "x-x-big5" },
	{ "euc-cn", "euc-cn" },
	{ "euc-jp", "cseucpkdfmtjapanese" },
	{ "euc-jp", "euc-jp" },
	{ "euc-jp", "x-euc-jp" },
	{ "euc-kr", "cseuckr" },
	{ "euc-kr", "csksc56011987" },
	{ "euc-kr", "euc-kr" },
	{ "euc-kr", "iso-ir-149" },
	{ "euc-kr", "korean" },
	{ "euc-kr", "ks_c_5601" },
	{ "euc-kr", "ksc5601" },
	{ "euc-kr", "ksc_5601" },
	{ "euc-kr", "windows-949" },
	{ "euc-tw", "euc-tw" },
	{ "gb18030", "chinese" },
	{ "gb18030", "csgb2312" },
	{ "gb18030", "csiso58gb231280" },
	{ "gb18030", "gb18030" },
	{ "gb18030", "gb2312" },
	{ "gb18030", "gb_2312" },
	{ "gb18030", "gbk" },
	{ "gb18030", "iso-ir-58" },
	{ "gb18030", "x-gbk" },
	{ "iso-8859-1", "ascii" },
	{ "iso-8859-1", "iso-8859-1" },
	{ "iso-8859-1", "iso8859-1" },
	{ "iso-8859-1", "latin1" },
	{ "iso-8859-1", "us-ascii" },
	{ "iso-8859-7", "greek" },
	{ "iso-8859-7", "greek8" },
	/* These two rows used to repeat the iso-8859-1 labels, which made
	 * them unreachable and left "iso-8859-7" itself unrecognised. */
	{ "iso-8859-7", "iso-8859-7" },
	{ "iso-8859-7", "iso8859-7" },
	{ "koi8-r", "koi" },
	{ "koi8-r", "koi8" },
	{ "koi8-r", "koi8-r" },
	{ "koi8-r", "koi8-ru" },
	{ "koi8-r", "koi8-u" },
	{ "koi8-r", "koi8_r" },
	{ "shift_jis", "csshiftjis" },
	{ "shift_jis", "ms932" },
	{ "shift_jis", "ms_kanji" },
	{ "shift_jis", "shift-jis" },
	{ "shift_jis", "shift_jis" },
	{ "shift_jis", "sjis" },
	{ "shift_jis", "windows-31j" },
	{ "shift_jis", "x-sjis" },
	{ "windows-1250", "cp1250" },
	{ "windows-1250", "windows-1250" },
	{ "windows-1251", "cp1251" },
	{ "windows-1251", "windows-1251" },
	{ "windows-1252", "cp1252" },
	{ "windows-1252", "cp819" },
	{ "windows-1252", "windows-1252" },
};
884
885 static char *match_encoding_name(char *enc)
886 {
887 size_t i;
888 for (i = 0; i < nelem(encoding_aliases); ++i)
889 if (startswith(enc, encoding_aliases[i].alias))
890 return encoding_aliases[i].encoding;
891 return NULL;
892 }
893
// Look for encoding in <meta http-equiv="content-type" content="text/html; charset=XXX"> tags
// Returns the encoding name from the first such tag with a known charset, or NULL.
// The buffer is modified while a tag is examined and restored before returning.
static const char *find_meta_encoding(char *s)
{
	char *meta, *close, *charset, *enc;

	for (meta = fast_strcasestr(s, "<meta"); meta != NULL; meta = fast_strcasestr(meta + 5, "<meta"))
	{
		close = strchr(meta, '>');
		if (close == NULL)
			continue;

		/* Terminate the tag so the searches below stay inside it. */
		*close = 0;
		enc = NULL;
		if (fast_strcasestr(meta, "http-equiv") && fast_strcasestr(meta, "content-type"))
		{
			charset = fast_strcasestr(meta, "charset=");
			if (charset != NULL)
				enc = match_encoding_name(charset + 8);
		}
		*close = '>';

		if (enc != NULL)
			return enc;
	}

	return NULL;
}
924
/* Work out the declared encoding of the document at 's'. The text up to
 * the first '>' is searched for an "<?xml ... encoding=" declaration; if
 * that gives nothing, the <meta> tags are tried. Returns an encoding
 * name from encoding_aliases or NULL. The buffer is modified during the
 * search and restored afterwards. */
static const char *find_xml_encoding(char *s)
{
	const char *found = NULL;
	char *close = strchr(s, '>');

	if (close != NULL)
	{
		char *decl, *attr;

		/* Terminate at the first '>' so the searches stop there. */
		*close = 0;
		decl = strstr(s, "<?xml");
		attr = decl ? strstr(decl, "encoding=") : NULL;
		if (attr != NULL)
		{
			/* 10 = length of "encoding=" plus the opening quote. */
			found = match_encoding_name(attr + 10);
		}
		*close = '>';
	}

	return found ? found : find_meta_encoding(s);
}
953
954 static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree)
955 {
956 fz_text_decoder dec;
957 const char *enc;
958 const unsigned char *e = s + n;
959 char *dst, *d;
960 int m;
961 int c;
962
963 if (s[0] == 0xFE && s[1] == 0xFF) {
964 s += 2;
965 dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_be");
966 while (s + 1 < e) {
967 c = s[0] << 8 | s[1];
968 d += fz_runetochar(d, c);
969 s += 2;
970 }
971 *d = 0;
972 *dofree = 1;
973 return dst;
974 }
975
976 if (s[0] == 0xFF && s[1] == 0xFE) {
977 s += 2;
978 dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_le");
979 while (s + 1 < e) {
980 c = s[0] | s[1] << 8;
981 d += fz_runetochar(d, c);
982 s += 2;
983 }
984 *d = 0;
985 *dofree = 1;
986 return dst;
987 }
988
989 enc = find_xml_encoding((char*)s);
990 if (enc)
991 {
992 fz_init_text_decoder(ctx, &dec, enc);
993 // NOTE: use decode_size if memory is more important than speed
994 m = (int)dec.decode_bound(&dec, s, (int)n);
995 dst = Memento_label(fz_malloc(ctx, m), "utf8");
996 dec.decode(&dec, dst, s, (int)n);
997 *dofree = 1;
998 return dst;
999 }
1000
1001 *dofree = 0;
1002
1003 if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
1004 return (char*)s+3;
1005
1006 return (char*)s;
1007 }
1008
1009 fz_xml *
1010 fz_parse_xml_stream(fz_context *ctx, fz_stream *stm, int preserve_white)
1011 {
1012 fz_buffer *buf = fz_read_all(ctx, stm, 128);
1013 fz_xml *xml = NULL;
1014
1015 fz_var(xml);
1016
1017 fz_try(ctx)
1018 xml = fz_parse_xml(ctx, buf, preserve_white);
1019 fz_always(ctx)
1020 fz_drop_buffer(ctx, buf);
1021 fz_catch(ctx)
1022 fz_rethrow(ctx);
1023
1024 return xml;
1025 }
1026
1027 static fz_xml *
1028 parse_and_drop_buffer(fz_context *ctx, fz_buffer *buf, int preserve_white)
1029 {
1030 fz_xml *xml = NULL;
1031
1032 fz_var(xml);
1033
1034 fz_try(ctx)
1035 xml = fz_parse_xml(ctx, buf, preserve_white);
1036 fz_always(ctx)
1037 fz_drop_buffer(ctx, buf);
1038 fz_catch(ctx)
1039 fz_rethrow(ctx);
1040
1041 return xml;
1042 }
1043
1044 fz_xml *
1045 fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
1046 {
1047 fz_buffer *buf = fz_read_archive_entry(ctx, arch, filename);
1048
1049 return parse_and_drop_buffer(ctx, buf, preserve_white);
1050 }
1051
1052 fz_xml *
1053 fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
1054 {
1055 fz_buffer *buf = fz_try_read_archive_entry(ctx, arch, filename);
1056
1057 if (buf == NULL)
1058 return NULL;
1059
1060 return parse_and_drop_buffer(ctx, buf, preserve_white);
1061 }
1062
1063 fz_xml *
1064 fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white)
1065 {
1066 struct parser parser;
1067 fz_xml *xml = NULL;
1068 fz_xml *root, *node;
1069 char *p = NULL;
1070 char *error;
1071 int dofree = 0;
1072 unsigned char *s;
1073 size_t n;
1074 static unsigned char empty_string[] = "";
1075
1076 fz_var(dofree);
1077 fz_var(p);
1078
1079 if (buf == NULL)
1080 {
1081 n = 0;
1082 s = empty_string;
1083 }
1084 else
1085 {
1086 /* ensure we are zero-terminated */
1087 fz_terminate_buffer(ctx, buf);
1088 n = fz_buffer_storage(ctx, buf, &s);
1089 }
1090
1091 parser.pool = fz_new_pool(ctx);
1092 parser.head = root = fz_pool_alloc_flexible(ctx, parser.pool, fz_xml, u.node.u.d.name, 1);
1093 parser.preserve_white = preserve_white;
1094 parser.depth = 0;
1095 #ifdef FZ_XML_SEQ
1096 parser.seq = 0;
1097 #endif
1098
1099 fz_try(ctx)
1100 {
1101 p = convert_to_utf8(ctx, s, n, &dofree);
1102
1103 error = xml_parse_document_imp(ctx, &parser, p);
1104 if (error)
1105 fz_throw(ctx, FZ_ERROR_SYNTAX, "%s", error);
1106
1107 for (node = parser.head; node; node = node->up)
1108 node->u.node.next = NULL;
1109
1110 xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
1111 xml->up = NULL;
1112 xml->down = root->down;
1113 xml->u.doc.refs = 1;
1114 xml->u.doc.pool = parser.pool;
1115
1116 for (node = root->down; node; node = node->u.node.next)
1117 node->up = xml;
1118 }
1119 fz_always(ctx)
1120 {
1121 if (dofree)
1122 fz_free(ctx, p);
1123 }
1124 fz_catch(ctx)
1125 {
1126 fz_drop_pool(ctx, parser.pool);
1127 fz_rethrow(ctx);
1128 }
1129
1130 return xml;
1131 }
1132
1133 #if FZ_ENABLE_HTML_ENGINE
1134 /*
1135 Parse the contents of buffer into a tree of XML nodes, using the HTML5 syntax.
1136
1137 Gumbo doesn't check for malloc errors. Use our pool allocator and let it longjmp
1138 out of Gumbo on allocation errors. At the end (success or fail) we release the
1139 pool used for Gumbo's parse tree all at once.
1140 */
1141
1142 struct mem_gumbo {
1143 fz_context *ctx;
1144 fz_pool *pool;
1145 };
1146
1147 static void *alloc_gumbo(void *ctx, size_t size)
1148 {
1149 struct mem_gumbo *mem = ctx;
1150 return fz_pool_alloc(mem->ctx, mem->pool, size);
1151 }
1152
/*
	Gumbo deallocation callback. Deliberately does nothing: individual
	blocks are never returned, the whole pool is released at once.
*/
static void dealloc_gumbo(void *ctx, void *ptr)
{
	(void)ctx;
	(void)ptr;
}
1157
1158 static void xml_from_gumbo(fz_context *ctx, struct parser *parser, GumboNode *node)
1159 {
1160 unsigned int i;
1161 const char *tag, *end, *sentinel;
1162
1163 switch (node->type)
1164 {
1165 case GUMBO_NODE_ELEMENT:
1166 if (node->v.element.tag != GUMBO_TAG_UNKNOWN)
1167 {
1168 tag = gumbo_normalized_tagname(node->v.element.tag);
1169 end = tag + strlen(tag);
1170 }
1171 else
1172 {
1173 tag = node->v.element.original_tag.data;
1174 sentinel = tag + node->v.element.original_tag.length;
1175 if (tag[0] == '<')
1176 ++tag;
1177 for (end = tag; end < sentinel; ++end)
1178 if (end[0] == '>' || end[0] == '/' || iswhite(end[0]))
1179 break;
1180 }
1181 xml_emit_open_tag(ctx, parser, tag, end, 0);
1182 for (i = 0; i < node->v.element.attributes.length; ++i)
1183 {
1184 GumboAttribute *att = node->v.element.attributes.data[i];
1185 xml_emit_att_name(ctx, parser, att->name, att->name+strlen(att->name));
1186 xml_emit_att_value(ctx, parser, att->value, att->value+strlen(att->value));
1187 }
1188 for (i = 0; i < node->v.element.children.length; ++i)
1189 {
1190 GumboNode *child = node->v.element.children.data[i];
1191 xml_from_gumbo(ctx, parser, child);
1192 }
1193 xml_emit_close_tag(ctx, parser);
1194 break;
1195
1196 case GUMBO_NODE_TEXT:
1197 case GUMBO_NODE_CDATA:
1198 case GUMBO_NODE_WHITESPACE:
1199 xml_emit_text(ctx, parser, node->v.text.text, node->v.text.text+strlen(node->v.text.text));
1200 break;
1201
1202 case GUMBO_NODE_DOCUMENT:
1203 case GUMBO_NODE_COMMENT:
1204 case GUMBO_NODE_TEMPLATE:
1205 break;
1206 }
1207 }
1208 #endif
1209
1210 fz_xml *
1211 fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf)
1212 {
1213 #if FZ_ENABLE_HTML_ENGINE
1214 struct parser parser;
1215 fz_xml *xml = NULL;
1216 fz_xml root, *node;
1217 char *p = NULL;
1218 int dofree = 0;
1219 unsigned char *s;
1220 size_t n;
1221 GumboOutput *soup = NULL;
1222 GumboOptions opts;
1223 struct mem_gumbo mem;
1224 static unsigned char empty_string[] = "";
1225
1226 fz_var(mem.pool);
1227 fz_var(soup);
1228 fz_var(dofree);
1229 fz_var(p);
1230
1231 if (buf == NULL)
1232 {
1233 n = 0;
1234 s = empty_string;
1235 }
1236 else
1237 {
1238 /* ensure we are zero-terminated */
1239 fz_terminate_buffer(ctx, buf);
1240 n = fz_buffer_storage(ctx, buf, &s);
1241 }
1242
1243 mem.ctx = ctx;
1244 mem.pool = NULL;
1245
1246 memset(&root, 0, sizeof(root));
1247 parser.pool = fz_new_pool(ctx);
1248 parser.head = &root;
1249 parser.preserve_white = 1;
1250 parser.depth = 0;
1251 #ifdef FZ_XML_SEQ
1252 parser.seq = 0;
1253 #endif
1254
1255 fz_try(ctx)
1256 {
1257 p = convert_to_utf8(ctx, s, n, &dofree);
1258
1259 mem.pool = fz_new_pool(ctx);
1260 memset(&opts, 0, sizeof opts);
1261 opts.allocator = alloc_gumbo;
1262 opts.deallocator = dealloc_gumbo;
1263 opts.userdata = &mem;
1264 opts.tab_stop = 8;
1265 opts.stop_on_first_error = 0;
1266 opts.max_errors = -1;
1267 opts.fragment_context = GUMBO_TAG_LAST;
1268 opts.fragment_namespace = GUMBO_NAMESPACE_HTML;
1269
1270 soup = gumbo_parse_with_options(&opts, (const char *)p, strlen(p));
1271
1272 xml_from_gumbo(ctx, &parser, soup->root);
1273
1274 for (node = parser.head; node; node = node->up)
1275 node->u.node.next = NULL;
1276
1277 xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
1278 xml->up = NULL;
1279 xml->down = root.down;
1280 xml->u.doc.pool = parser.pool;
1281 xml->u.doc.refs = 1;
1282
1283 for (node = root.down; node; node = node->u.node.next)
1284 node->up = xml;
1285 }
1286 fz_always(ctx)
1287 {
1288 if (soup)
1289 gumbo_destroy_output(&opts, soup);
1290 fz_drop_pool(ctx, mem.pool);
1291 if (dofree)
1292 fz_free(ctx, p);
1293 }
1294 fz_catch(ctx)
1295 {
1296 fz_drop_pool(ctx, parser.pool);
1297 fz_rethrow(ctx);
1298 }
1299
1300 return xml;
1301 #else
1302 fz_throw(ctx, FZ_ERROR_GENERIC, "HTML Engine not enabled in this build");
1303 #endif
1304 }
1305
1306 fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
1307 {
1308 return fz_xml_find_dfs_top(item, tag, att, match, NULL);
1309 }
1310
1311 fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
1312 {
1313 /* Skip over any DOC object. */
1314 if (item && FZ_DOCUMENT_ITEM(item))
1315 item = item->down;
1316
1317 while (item)
1318 {
1319 if (!FZ_TEXT_ITEM(item) && (tag == NULL || !strcmp(item->u.node.u.d.name, tag)))
1320 {
1321 if (att == NULL || (match == NULL ? fz_xml_att(item, att) != NULL : fz_xml_att_eq(item, att, match)))
1322 return item;
1323 }
1324
1325 if (!FZ_TEXT_ITEM(item) && item->down)
1326 item = item->down;
1327 else if (item->u.node.next)
1328 item = item->u.node.next;
1329 else
1330 while (1) {
1331 item = item->up;
1332 /* Stop searching if we hit our declared 'top' item. */
1333 if (item == top)
1334 return NULL;
1335 /* We should never reach item == NULL, but just in case. */
1336 if (item == NULL)
1337 return NULL;
1338 /* If we reach the DOC object at the top, we're done. */
1339 if (item->up == NULL)
1340 return NULL;
1341 if (item->u.node.next)
1342 {
1343 item = item->u.node.next;
1344 break;
1345 }
1346 }
1347 }
1348
1349 return NULL;
1350 }
1351
1352 fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
1353 {
1354 return fz_xml_find_next_dfs_top(item, tag, att, match, NULL);
1355 }
1356
1357 fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
1358 {
1359 /* Skip over any DOC object. */
1360 if (item && FZ_DOCUMENT_ITEM(item))
1361 item = item->down;
1362
1363 if (item == NULL)
1364 return NULL;
1365
1366 if (item->down)
1367 item = item->down;
1368 else if (item->u.node.next)
1369 item = item->u.node.next;
1370 else
1371 while (1) {
1372 item = item->up;
1373 /* Stop searching if we hit our declared 'top' item. */
1374 if (item == top)
1375 return NULL;
1376 /* We should never reach item == NULL, but just in case. */
1377 if (item == NULL)
1378 return NULL;
1379 /* If we reach the DOC object at the top, we're done. */
1380 if (item->up == NULL)
1381 return NULL;
1382 if (item->u.node.next)
1383 {
1384 item = item->u.node.next;
1385 break;
1386 }
1387 }
1388
1389 return fz_xml_find_dfs_top(item, tag, att, match, top);
1390 }
1391
1392 fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml)
1393 {
1394 fz_xml *dom = xml;
1395 if (xml == NULL)
1396 return xml;
1397
1398 while (dom->up)
1399 dom = dom->up;
1400
1401 fz_keep_imp(ctx, dom, &dom->u.doc.refs);
1402
1403 /* Return the original node pointer, not the dom pointer! */
1404 return xml;
1405 }