Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/fitz/xml.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "xml-imp.h" | |
| 24 | |
| 25 #include <string.h> | |
| 26 #include <stdlib.h> | |
| 27 #include <stdio.h> | |
| 28 | |
| 29 #if FZ_ENABLE_HTML_ENGINE | |
| 30 #include <gumbo.h> | |
| 31 #endif | |
| 32 | |
| 33 #define FZ_XML_MAX_DEPTH 4096 | |
| 34 | |
| 35 /* #define FZ_XML_SEQ */ | |
| 36 | |
/*
 * Named character entities and the Unicode code point each one stands for.
 * The table is searched linearly by xml_parse_entity; names are matched
 * exactly (case-sensitive) and must be followed by ';' in the input.
 */
static const struct { const char *name; int c; } html_entities[] = {
	{"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163},
	{"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167},
	{"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171},
	{"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176},
	{"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180},
	{"micro",181}, {"para",182}, {"middot",183}, {"cedil",184},
	{"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188},
	{"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192},
	{"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196},
	{"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200},
	{"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204},
	{"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208},
	{"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, {"Ocirc",212},
	{"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216},
	{"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220},
	{"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224},
	{"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228},
	{"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232},
	{"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236},
	{"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240},
	{"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},
	{"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248},
	{"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252},
	{"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62},
	{"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339},
	{"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710},
	{"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201},
	{"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207},
	{"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217},
	{"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222},
	{"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249},
	{"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913},
	{"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917},
	{"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922},
	{"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927},
	{"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933},
	{"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945},
	{"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949},
	{"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954},
	{"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959},
	{"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964},
	{"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969},
	{"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226},
	{"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254},
	{"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476},
	{"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593},
	{"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629},
	{"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659},
	{"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707},
	{"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713},
	{"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722},
	{"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734},
	{"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746},
	{"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773},
	{"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805},
	{"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838},
	{"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869},
	{"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970},
	{"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674},
	{"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830},
};
| 99 | |
| 100 struct parser | |
| 101 { | |
| 102 fz_pool *pool; | |
| 103 fz_xml *head; | |
| 104 int preserve_white; | |
| 105 int depth; | |
| 106 #ifdef FZ_XML_SEQ | |
| 107 int seq; | |
| 108 #endif | |
| 109 }; | |
| 110 | |
| 111 static void xml_indent(fz_context *ctx, fz_output *out, int n) | |
| 112 { | |
| 113 while (n--) { | |
| 114 fz_write_byte(ctx, out, ' '); | |
| 115 fz_write_byte(ctx, out, ' '); | |
| 116 } | |
| 117 } | |
| 118 | |
| 119 void fz_debug_xml(fz_xml *item, int level) | |
| 120 { | |
| 121 /* This is a bit nasty as it relies on implementation | |
| 122 * details of both fz_stdout, and fz_write_printf coping | |
| 123 * with NULL ctx. */ | |
| 124 fz_output_xml(NULL, fz_stdout(NULL), item, level); | |
| 125 } | |
| 126 | |
| 127 void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level) | |
| 128 { | |
| 129 char *s; | |
| 130 | |
| 131 if (item == NULL) | |
| 132 return; | |
| 133 | |
| 134 /* Skip over the DOC object at the top. */ | |
| 135 if (item->up == NULL) | |
| 136 { | |
| 137 fz_xml *child; | |
| 138 for (child = fz_xml_down(item); child; child = child->u.node.next) | |
| 139 fz_output_xml(ctx, out, child, level + 1); | |
| 140 return; | |
| 141 } | |
| 142 | |
| 143 s = fz_xml_text(item); | |
| 144 xml_indent(ctx, out, level); | |
| 145 if (s) | |
| 146 { | |
| 147 int c; | |
| 148 fz_write_byte(ctx, out, '"'); | |
| 149 while (*s) { | |
| 150 s += fz_chartorune(&c, s); | |
| 151 switch (c) { | |
| 152 default: | |
| 153 if (c > 0xFFFF) | |
| 154 fz_write_printf(ctx, out, "\\u{%X}", c); | |
| 155 else if (c < 32 || c > 127) | |
| 156 fz_write_printf(ctx, out, "\\u%04X", c); | |
| 157 else | |
| 158 fz_write_byte(ctx, out, c); | |
| 159 break; | |
| 160 case '\\': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, '\\'); break; | |
| 161 case '\b': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'b'); break; | |
| 162 case '\f': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'f'); break; | |
| 163 case '\n': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'n'); break; | |
| 164 case '\r': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'r'); break; | |
| 165 case '\t': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 't'); break; | |
| 166 } | |
| 167 } | |
| 168 fz_write_byte(ctx, out, '"'); | |
| 169 #ifdef FZ_XML_SEQ | |
| 170 fz_write_printf(ctx, out, " <%d>", item->seq); | |
| 171 #endif | |
| 172 fz_write_byte(ctx, out, '\n'); | |
| 173 } | |
| 174 else | |
| 175 { | |
| 176 fz_xml *child; | |
| 177 struct attribute *att; | |
| 178 | |
| 179 #ifdef FZ_XML_SEQ | |
| 180 fz_write_printf(ctx, out, "(%s <%d>\n", item->u.node.u.d.name, item->u.node.seq); | |
| 181 #else | |
| 182 fz_write_printf(ctx, out, "(%s\n", item->u.node.u.d.name); | |
| 183 #endif | |
| 184 for (att = item->u.node.u.d.atts; att; att = att->next) | |
| 185 { | |
| 186 xml_indent(ctx, out, level); | |
| 187 fz_write_printf(ctx, out, "=%s %s\n", att->name, att->value); | |
| 188 } | |
| 189 for (child = fz_xml_down(item); child; child = child->u.node.next) | |
| 190 fz_output_xml(ctx, out, child, level + 1); | |
| 191 xml_indent(ctx, out, level); | |
| 192 #ifdef FZ_XML_SEQ | |
| 193 fz_write_printf(ctx, out, ")%s <%d>\n", item->u.node.u.d.name, item->u.node.seq); | |
| 194 #else | |
| 195 fz_write_printf(ctx, out, ")%s\n", item->u.node.u.d.name); | |
| 196 #endif | |
| 197 } | |
| 198 } | |
| 199 | |
| 200 fz_xml *fz_xml_prev(fz_xml *item) | |
| 201 { | |
| 202 return item && item->up ? item->u.node.prev : NULL; | |
| 203 } | |
| 204 | |
| 205 fz_xml *fz_xml_next(fz_xml *item) | |
| 206 { | |
| 207 return item && item->up ? item->u.node.next : NULL; | |
| 208 } | |
| 209 | |
| 210 fz_xml *fz_xml_up(fz_xml *item) | |
| 211 { | |
| 212 /* Never step up to the DOC. */ | |
| 213 return item && item->up && item->up->up ? item->up : NULL; | |
| 214 } | |
| 215 | |
| 216 fz_xml *fz_xml_down(fz_xml *item) | |
| 217 { | |
| 218 /* DOC items can never have MAGIC_TEXT as their down value, | |
| 219 * so this is safe. */ | |
| 220 return item && !FZ_TEXT_ITEM(item) ? item->down : NULL; | |
| 221 } | |
| 222 | |
| 223 char *fz_xml_text(fz_xml *item) | |
| 224 { | |
| 225 /* DOC items can never have MAGIC_TEXT as their down value, | |
| 226 * so this is safe. */ | |
| 227 return (item && FZ_TEXT_ITEM(item)) ? item->u.node.u.text : NULL; | |
| 228 } | |
| 229 | |
| 230 char *fz_xml_tag(fz_xml *item) | |
| 231 { | |
| 232 /* DOC items can never have MAGIC_TEXT as their down value, | |
| 233 * so this is safe. */ | |
| 234 return item && !FZ_TEXT_ITEM(item) ? item->u.node.u.d.name : NULL; | |
| 235 } | |
| 236 | |
| 237 int fz_xml_is_tag(fz_xml *item, const char *name) | |
| 238 { | |
| 239 if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item)) | |
| 240 return 0; | |
| 241 return !strcmp(item->u.node.u.d.name, name); | |
| 242 } | |
| 243 | |
| 244 char *fz_xml_att(fz_xml *item, const char *name) | |
| 245 { | |
| 246 struct attribute *att; | |
| 247 if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item)) | |
| 248 return NULL; | |
| 249 for (att = item->u.node.u.d.atts; att; att = att->next) | |
| 250 if (!strcmp(att->name, name)) | |
| 251 return att->value; | |
| 252 return NULL; | |
| 253 } | |
| 254 | |
| 255 char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two) | |
| 256 { | |
| 257 char *val = fz_xml_att(item, one); | |
| 258 if (!val) | |
| 259 val = fz_xml_att(item, two); | |
| 260 return val; | |
| 261 } | |
| 262 | |
| 263 fz_xml *fz_xml_find(fz_xml *item, const char *tag) | |
| 264 { | |
| 265 /* Skip over any DOC item. */ | |
| 266 if (item && FZ_DOCUMENT_ITEM(item)) | |
| 267 item = item->down; | |
| 268 | |
| 269 while (item) | |
| 270 { | |
| 271 if (!FZ_TEXT_ITEM(item) && !strcmp(item->u.node.u.d.name, tag)) | |
| 272 return item; | |
| 273 item = item->u.node.next; | |
| 274 } | |
| 275 return NULL; | |
| 276 } | |
| 277 | |
| 278 fz_xml *fz_xml_find_next(fz_xml *item, const char *tag) | |
| 279 { | |
| 280 /* Skip over any DOC item. */ | |
| 281 if (item && FZ_DOCUMENT_ITEM(item)) | |
| 282 item = item->down; | |
| 283 | |
| 284 if (item) | |
| 285 item = item->u.node.next; | |
| 286 return fz_xml_find(item, tag); | |
| 287 } | |
| 288 | |
| 289 fz_xml *fz_xml_find_down(fz_xml *item, const char *tag) | |
| 290 { | |
| 291 if (item) | |
| 292 item = fz_xml_down(item); | |
| 293 return fz_xml_find(item, tag); | |
| 294 } | |
| 295 | |
| 296 int fz_xml_att_eq(fz_xml *item, const char *name, const char *match) | |
| 297 { | |
| 298 const char *val = fz_xml_att(item, name); | |
| 299 | |
| 300 return val ? !strcmp(val, match) : 0; | |
| 301 } | |
| 302 | |
| 303 fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match) | |
| 304 { | |
| 305 /* Skip over any document item. */ | |
| 306 if (item && FZ_DOCUMENT_ITEM(item)) | |
| 307 item = item->down; | |
| 308 | |
| 309 while (1) | |
| 310 { | |
| 311 item = tag ? fz_xml_find(item, tag) : item; | |
| 312 if (item == NULL || fz_xml_att_eq(item, att, match)) | |
| 313 break; | |
| 314 item = item->u.node.next; | |
| 315 } | |
| 316 | |
| 317 return item; | |
| 318 } | |
| 319 | |
| 320 fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match) | |
| 321 { | |
| 322 /* Skip over any document item. */ | |
| 323 if (item && FZ_DOCUMENT_ITEM(item)) | |
| 324 item = item->down; | |
| 325 | |
| 326 if (item != NULL) | |
| 327 { | |
| 328 do | |
| 329 { | |
| 330 item = tag ? fz_xml_find_next(item, tag) : item->u.node.next; | |
| 331 } | |
| 332 while (item != NULL && !fz_xml_att_eq(item, att, match)); | |
| 333 } | |
| 334 | |
| 335 return item; | |
| 336 } | |
| 337 | |
| 338 fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match) | |
| 339 { | |
| 340 return fz_xml_find_match(fz_xml_down(item), tag, att, match); | |
| 341 } | |
| 342 | |
| 343 fz_xml *fz_xml_root(fz_xml *xml) | |
| 344 { | |
| 345 if (xml == NULL) | |
| 346 return NULL; | |
| 347 | |
| 348 /* If we've been given a node mid-tree, run up to the root to find | |
| 349 * the doc node. */ | |
| 350 while (xml->up) | |
| 351 xml = xml->up; | |
| 352 | |
| 353 /* And the root is the child of the doc.*/ | |
| 354 return xml->down; | |
| 355 } | |
| 356 | |
| 357 void fz_drop_xml(fz_context *ctx, fz_xml *xml) | |
| 358 { | |
| 359 if (!xml) | |
| 360 return; | |
| 361 | |
| 362 /* Wherever we are in the tree, we want the doc node at the root. */ | |
| 363 while (xml->up) | |
| 364 xml = xml->up; | |
| 365 | |
| 366 /* Drop a reference to the tree as a whole. */ | |
| 367 if (fz_drop_imp(ctx, xml, &xml->u.doc.refs) == 0) | |
| 368 return; | |
| 369 | |
| 370 fz_drop_pool(ctx, xml->u.doc.pool); | |
| 371 } | |
| 372 | |
| 373 void fz_detach_xml(fz_context *ctx, fz_xml *node) | |
| 374 { | |
| 375 fz_xml *doc = node; | |
| 376 | |
| 377 /* If we're already a document node, then this is a NOP. */ | |
| 378 if (doc->up == NULL) | |
| 379 return; | |
| 380 | |
| 381 /* Move doc to be the doc pointer at the top of the tree. */ | |
| 382 while (doc->up) | |
| 383 { | |
| 384 doc = doc->up; | |
| 385 } | |
| 386 | |
| 387 /* Relocate node to be the child of doc. */ | |
| 388 node->up->down = NULL; | |
| 389 doc->down = node; | |
| 390 | |
| 391 /* NOTE: Suppose that X = doc->down on entry. On exit doc->down == node, but | |
| 392 * X->up = doc. We need to be careful throughout this code to not assume that | |
| 393 * Y is always a child of Y->up. */ | |
| 394 } | |
| 395 | |
| 396 size_t xml_parse_entity(int *c, const char *a) | |
| 397 { | |
| 398 char *b; | |
| 399 size_t i; | |
| 400 | |
| 401 if (a[1] == '#') { | |
| 402 if (a[2] == 'x') | |
| 403 *c = strtol(a + 3, &b, 16); | |
| 404 else | |
| 405 *c = strtol(a + 2, &b, 10); | |
| 406 if (*b == ';') | |
| 407 return b - a + 1; | |
| 408 } | |
| 409 else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') { | |
| 410 *c = '<'; | |
| 411 return 4; | |
| 412 } | |
| 413 else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') { | |
| 414 *c = '>'; | |
| 415 return 4; | |
| 416 } | |
| 417 else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') { | |
| 418 *c = '&'; | |
| 419 return 5; | |
| 420 } | |
| 421 else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') { | |
| 422 *c = '\''; | |
| 423 return 6; | |
| 424 } | |
| 425 else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') { | |
| 426 *c = '"'; | |
| 427 return 6; | |
| 428 } | |
| 429 | |
| 430 /* We should only be doing this for XHTML, but it shouldn't be a problem. */ | |
| 431 for (i = 0; i < nelem(html_entities); ++i) { | |
| 432 size_t n = strlen(html_entities[i].name); | |
| 433 if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') { | |
| 434 *c = html_entities[i].c; | |
| 435 return n + 2; | |
| 436 } | |
| 437 } | |
| 438 | |
| 439 *c = *a; | |
| 440 return 1; | |
| 441 } | |
| 442 | |
/* Return 1 if c may appear in a tag or attribute name (ASCII alphanumerics and . - _ :), else 0. */
static inline int isname(int c)
{
	if (c >= 'a' && c <= 'z')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	if (c >= '0' && c <= '9')
		return 1;
	switch (c)
	{
	case '.':
	case '-':
	case '_':
	case ':':
		return 1;
	default:
		return 0;
	}
}
| 450 | |
/* Return 1 if c is a space, carriage return, line feed or tab, else 0. */
static inline int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\r':
	case '\n':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
| 455 | |
| 456 static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, const char *a, const char *b, int is_text) | |
| 457 { | |
| 458 fz_xml *head, *tail; | |
| 459 const char *ns; | |
| 460 size_t size; | |
| 461 | |
| 462 if (is_text) | |
| 463 size = offsetof(fz_xml, u.node.u.text) + b-a+1; | |
| 464 else | |
| 465 { | |
| 466 /* skip namespace prefix */ | |
| 467 for (ns = a; ns < b - 1; ++ns) | |
| 468 if (*ns == ':') | |
| 469 a = ns + 1; | |
| 470 | |
| 471 size = offsetof(fz_xml, u.node.u.d.name) + b-a+1; | |
| 472 } | |
| 473 head = fz_pool_alloc(ctx, parser->pool, size); | |
| 474 | |
| 475 if (is_text) | |
| 476 head->down = MAGIC_TEXT; | |
| 477 else | |
| 478 { | |
| 479 memcpy(head->u.node.u.d.name, a, b - a); | |
| 480 head->u.node.u.d.name[b - a] = 0; | |
| 481 head->u.node.u.d.atts = NULL; | |
| 482 head->down = NULL; | |
| 483 } | |
| 484 | |
| 485 head->up = parser->head; | |
| 486 head->u.node.next = NULL; | |
| 487 #ifdef FZ_XML_SEQ | |
| 488 head->u.node.seq = parser->seq++; | |
| 489 #endif | |
| 490 | |
| 491 /* During construction, we use head->next to mean "the | |
| 492 * tail of the children. When we close the tag, we | |
| 493 * rewrite it to be NULL. */ | |
| 494 if (!parser->head->down) { | |
| 495 parser->head->down = head; | |
| 496 parser->head->u.node.next = head; | |
| 497 head->u.node.prev = NULL; | |
| 498 } | |
| 499 else { | |
| 500 tail = parser->head->u.node.next; | |
| 501 tail->u.node.next = head; | |
| 502 head->u.node.prev = tail; | |
| 503 parser->head->u.node.next = head; | |
| 504 } | |
| 505 | |
| 506 parser->head = head; | |
| 507 parser->depth++; | |
| 508 if (parser->depth >= FZ_XML_MAX_DEPTH) | |
| 509 fz_throw(ctx, FZ_ERROR_SYNTAX, "too deep xml element nesting"); | |
| 510 } | |
| 511 | |
| 512 static void xml_emit_att_name(fz_context *ctx, struct parser *parser, const char *a, const char *b) | |
| 513 { | |
| 514 fz_xml *head = parser->head; | |
| 515 struct attribute *att; | |
| 516 size_t size; | |
| 517 | |
| 518 size = offsetof(struct attribute, name) + b-a+1; | |
| 519 att = fz_pool_alloc(ctx, parser->pool, size); | |
| 520 memcpy(att->name, a, b - a); | |
| 521 att->name[b - a] = 0; | |
| 522 att->value = NULL; | |
| 523 att->next = head->u.node.u.d.atts; | |
| 524 head->u.node.u.d.atts = att; | |
| 525 } | |
| 526 | |
| 527 void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val) | |
| 528 { | |
| 529 size_t size = offsetof(struct attribute, name) + strlen(key) + 1; | |
| 530 struct attribute *att = fz_pool_alloc(ctx, pool, size); | |
| 531 memcpy(att->name, key, strlen(key)+1); | |
| 532 att->value = fz_pool_alloc(ctx, pool, strlen(val) + 1); | |
| 533 memcpy(att->value, val, strlen(val)+1); | |
| 534 att->next = node->u.node.u.d.atts; | |
| 535 node->u.node.u.d.atts = att; | |
| 536 } | |
| 537 | |
| 538 static void xml_emit_att_value(fz_context *ctx, struct parser *parser, const char *a, const char *b) | |
| 539 { | |
| 540 fz_xml *head = parser->head; | |
| 541 struct attribute *att = head->u.node.u.d.atts; | |
| 542 char *s; | |
| 543 int c; | |
| 544 | |
| 545 /* entities are all longer than UTFmax so runetochar is safe */ | |
| 546 s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1); | |
| 547 while (a < b) { | |
| 548 if (*a == '&') { | |
| 549 a += xml_parse_entity(&c, a); | |
| 550 s += fz_runetochar(s, c); | |
| 551 } | |
| 552 else { | |
| 553 *s++ = *a++; | |
| 554 } | |
| 555 } | |
| 556 *s = 0; | |
| 557 } | |
| 558 | |
| 559 static void xml_emit_close_tag(fz_context *ctx, struct parser *parser) | |
| 560 { | |
| 561 parser->depth--; | |
| 562 parser->head->u.node.next = NULL; | |
| 563 if (parser->head->up) | |
| 564 parser->head = parser->head->up; | |
| 565 } | |
| 566 | |
| 567 static void xml_emit_text(fz_context *ctx, struct parser *parser, const char *a, const char *b) | |
| 568 { | |
| 569 fz_xml *head; | |
| 570 const char *p; | |
| 571 char *s; | |
| 572 int c; | |
| 573 | |
| 574 /* Skip text outside the root tag */ | |
| 575 if (parser->depth == 0) | |
| 576 return; | |
| 577 | |
| 578 /* Skip all-whitespace text nodes */ | |
| 579 if (!parser->preserve_white) | |
| 580 { | |
| 581 for (p = a; p < b; p++) | |
| 582 if (!iswhite(*p)) | |
| 583 break; | |
| 584 if (p == b) | |
| 585 return; | |
| 586 } | |
| 587 | |
| 588 xml_emit_open_tag(ctx, parser, a, b, 1); | |
| 589 head = parser->head; | |
| 590 | |
| 591 /* entities are all longer than UTFmax so runetochar is safe */ | |
| 592 s = fz_xml_text(head); | |
| 593 while (a < b) { | |
| 594 if (*a == '&') { | |
| 595 a += xml_parse_entity(&c, a); | |
| 596 s += fz_runetochar(s, c); | |
| 597 } | |
| 598 else { | |
| 599 *s++ = *a++; | |
| 600 } | |
| 601 } | |
| 602 *s = 0; | |
| 603 | |
| 604 xml_emit_close_tag(ctx, parser); | |
| 605 } | |
| 606 | |
| 607 static void xml_emit_cdata(fz_context *ctx, struct parser *parser, const char *a, const char *b) | |
| 608 { | |
| 609 fz_xml *head; | |
| 610 char *s; | |
| 611 | |
| 612 xml_emit_open_tag(ctx, parser, a, b, 1); | |
| 613 head = parser->head; | |
| 614 | |
| 615 s = head->u.node.u.text; | |
| 616 while (a < b) | |
| 617 *s++ = *a++; | |
| 618 *s = 0; | |
| 619 | |
| 620 xml_emit_close_tag(ctx, parser); | |
| 621 } | |
| 622 | |
| 623 static int close_tag(fz_context *ctx, struct parser *parser, const char *mark, const char *p) | |
| 624 { | |
| 625 const char *ns, *tag; | |
| 626 | |
| 627 /* skip namespace prefix */ | |
| 628 for (ns = mark; ns < p - 1; ++ns) | |
| 629 if (*ns == ':') | |
| 630 mark = ns + 1; | |
| 631 | |
| 632 tag = fz_xml_tag(parser->head); | |
| 633 if (tag && strncmp(tag, mark, p-mark) == 0 && tag[p-mark] == 0) | |
| 634 { | |
| 635 xml_emit_close_tag(ctx, parser); | |
| 636 return 0; | |
| 637 } | |
| 638 return 1; | |
| 639 } | |
| 640 | |
| 641 static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, const char *p) /* lgtm [cpp/use-of-goto] */ | |
| 642 { | |
| 643 const char *mark; | |
| 644 int quote; | |
| 645 | |
| 646 parse_text: | |
| 647 mark = p; | |
| 648 while (*p && *p != '<') ++p; | |
| 649 if (*p == '<') { | |
| 650 if (mark < p) | |
| 651 xml_emit_text(ctx, parser, mark, p); | |
| 652 ++p; | |
| 653 goto parse_element; | |
| 654 } else if (mark < p) | |
| 655 xml_emit_text(ctx, parser, mark, p); | |
| 656 return NULL; | |
| 657 | |
| 658 parse_element: | |
| 659 if (*p == '/') { ++p; goto parse_closing_element; } | |
| 660 if (*p == '!') { ++p; goto parse_comment; } | |
| 661 if (*p == '?') { ++p; goto parse_processing_instruction; } | |
| 662 while (iswhite(*p)) ++p; | |
| 663 if (isname(*p)) | |
| 664 goto parse_element_name; | |
| 665 return "syntax error in element"; | |
| 666 | |
| 667 parse_comment: | |
| 668 if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E') | |
| 669 goto parse_declaration; | |
| 670 if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y') | |
| 671 goto parse_declaration; | |
| 672 if (*p == '[') goto parse_cdata; | |
| 673 if (*p++ != '-') return "syntax error in comment (<! not followed by --)"; | |
| 674 if (*p++ != '-') return "syntax error in comment (<!- not followed by -)"; | |
| 675 while (*p) { | |
| 676 if (p[0] == '-' && p[1] == '-' && p[2] == '>') { | |
| 677 p += 3; | |
| 678 goto parse_text; | |
| 679 } | |
| 680 ++p; | |
| 681 } | |
| 682 return "end of data in comment"; | |
| 683 | |
| 684 parse_declaration: | |
| 685 while (*p) if (*p++ == '>') goto parse_text; | |
| 686 return "end of data in declaration"; | |
| 687 | |
| 688 parse_cdata: | |
| 689 if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[') | |
| 690 return "syntax error in CDATA section"; | |
| 691 p += 7; | |
| 692 mark = p; | |
| 693 while (*p) { | |
| 694 if (p[0] == ']' && p[1] == ']' && p[2] == '>') { | |
| 695 xml_emit_cdata(ctx, parser, mark, p); | |
| 696 p += 3; | |
| 697 goto parse_text; | |
| 698 } | |
| 699 ++p; | |
| 700 } | |
| 701 return "end of data in CDATA section"; | |
| 702 | |
| 703 parse_processing_instruction: | |
| 704 while (*p) { | |
| 705 if (p[0] == '?' && p[1] == '>') { | |
| 706 p += 2; | |
| 707 goto parse_text; | |
| 708 } | |
| 709 ++p; | |
| 710 } | |
| 711 return "end of data in processing instruction"; | |
| 712 | |
| 713 parse_closing_element: | |
| 714 while (iswhite(*p)) ++p; | |
| 715 mark = p; | |
| 716 while (isname(*p)) ++p; | |
| 717 if (!isname(*mark)) | |
| 718 return "syntax error in closing element"; | |
| 719 if (close_tag(ctx, parser, mark, p)) | |
| 720 return "opening and closing tag mismatch"; | |
| 721 while (iswhite(*p)) ++p; | |
| 722 if (*p != '>') | |
| 723 return "syntax error in closing element"; | |
| 724 ++p; | |
| 725 goto parse_text; | |
| 726 | |
| 727 parse_element_name: | |
| 728 mark = p; | |
| 729 while (isname(*p)) ++p; | |
| 730 xml_emit_open_tag(ctx, parser, mark, p, 0); | |
| 731 if (*p == '>') { | |
| 732 ++p; | |
| 733 goto parse_text; | |
| 734 } | |
| 735 if (p[0] == '/' && p[1] == '>') { | |
| 736 xml_emit_close_tag(ctx, parser); | |
| 737 p += 2; | |
| 738 goto parse_text; | |
| 739 } | |
| 740 if (iswhite(*p)) | |
| 741 goto parse_attributes; | |
| 742 return "syntax error after element name"; | |
| 743 | |
| 744 parse_attributes: | |
| 745 while (iswhite(*p)) ++p; | |
| 746 if (isname(*p)) | |
| 747 goto parse_attribute_name; | |
| 748 if (*p == '>') { | |
| 749 ++p; | |
| 750 goto parse_text; | |
| 751 } | |
| 752 if (p[0] == '/' && p[1] == '>') { | |
| 753 xml_emit_close_tag(ctx, parser); | |
| 754 p += 2; | |
| 755 goto parse_text; | |
| 756 } | |
| 757 return "syntax error in attributes"; | |
| 758 | |
| 759 parse_attribute_name: | |
| 760 mark = p; | |
| 761 while (isname(*p)) ++p; | |
| 762 xml_emit_att_name(ctx, parser, mark, p); | |
| 763 while (iswhite(*p)) ++p; | |
| 764 if (*p == '=') { ++p; goto parse_attribute_value; } | |
| 765 return "syntax error after attribute name"; | |
| 766 | |
| 767 parse_attribute_value: | |
| 768 while (iswhite(*p)) ++p; | |
| 769 quote = *p++; | |
| 770 mark = p; | |
| 771 | |
| 772 /* special case for handling MOBI filepos=00000 syntax */ | |
| 773 if (quote >= '0' && quote <= '9') { | |
| 774 while (*p >= '0' && *p <= '9') ++p; | |
| 775 xml_emit_att_value(ctx, parser, mark, p); | |
| 776 goto parse_attributes; | |
| 777 } | |
| 778 | |
| 779 if (quote != '"' && quote != '\'') | |
| 780 return "missing quote character"; | |
| 781 while (*p && *p != quote) ++p; | |
| 782 if (*p == quote) { | |
| 783 xml_emit_att_value(ctx, parser, mark, p++); | |
| 784 goto parse_attributes; | |
| 785 } | |
| 786 return "end of data in attribute value"; | |
| 787 } | |
| 788 | |
/* Lower-case c if it is an ASCII upper-case letter; every other value is returned unchanged. */
static int fast_tolower(int c)
{
	return (c >= 'A' && c <= 'Z') ? (c | 32) : c;
}
| 795 | |
/*
 * Compare at most n bytes of a and b, ignoring ASCII case. Returns 0 if
 * they are equal over that range, otherwise the difference between the
 * lower-cased bytes at the first mismatch.
 */
static int fast_strncasecmp(const char *a, const char *b, size_t n)
{
	if (n == 0)
		return 0;

	/* Walk at most n - 1 matching bytes; the return statement then
	 * compares the byte the walk stopped on. */
	--n;
	while (*a && *b && n && fast_tolower(*a) == fast_tolower(*b))
	{
		a++;
		b++;
		n--;
	}
	return fast_tolower(*a) - fast_tolower(*b);
}
| 804 | |
/*
 * Find the first occurrence of needle n in haystack h, ignoring ASCII case.
 * Returns a pointer into h, or NULL if n does not occur. An empty needle
 * matches at the start of h (as strstr does); previously an empty needle
 * made strlen read past the needle's terminator.
 */
static char *fast_strcasestr(char *h, char *n)
{
	int first;
	size_t rest_len;

	if (*n == 0)
		return h;

	/* Compare the first byte inline and the remainder with fast_strncasecmp. */
	first = fast_tolower(*n);
	rest_len = strlen(n + 1);

	for (; *h != 0; ++h)
	{
		if (fast_tolower(*h) == first && fast_strncasecmp(h + 1, n + 1, rest_len) == 0)
			return h;
	}
	return NULL;
}
| 817 | |
/* Return 1 if a begins with b, ignoring ASCII case, else 0. */
static int startswith(const char *a, const char *b)
{
	size_t prefix_len = strlen(b);

	return fast_strncasecmp(a, b, prefix_len) == 0;
}
| 822 | |
/* https://encoding.spec.whatwg.org/#names-and-labels */
/*
 * Maps a label found in a document (alias) to the encoding name used by the
 * decoder (encoding). match_encoding_name scans this table in order and
 * accepts the first alias that is a case-insensitive prefix of the input.
 *
 * The last two iso-8859-7 rows previously repeated the iso-8859-1 labels,
 * so a document declaring "iso-8859-7" was never recognised.
 */
static struct { char *encoding; char *alias; } encoding_aliases[] = {
	{ "big5", "big5" },
	{ "big5", "big5-hkscs" },
	{ "big5", "cn-big5" },
	{ "big5", "csbig5" },
	{ "big5", "x-x-big5" },
	{ "euc-cn", "euc-cn" },
	{ "euc-jp", "cseucpkdfmtjapanese" },
	{ "euc-jp", "euc-jp" },
	{ "euc-jp", "x-euc-jp" },
	{ "euc-kr", "cseuckr" },
	{ "euc-kr", "csksc56011987" },
	{ "euc-kr", "euc-kr" },
	{ "euc-kr", "iso-ir-149" },
	{ "euc-kr", "korean" },
	{ "euc-kr", "ks_c_5601" },
	{ "euc-kr", "ksc5601" },
	{ "euc-kr", "ksc_5601" },
	{ "euc-kr", "windows-949" },
	{ "euc-tw", "euc-tw" },
	{ "gb18030", "chinese" },
	{ "gb18030", "csgb2312" },
	{ "gb18030", "csiso58gb231280" },
	{ "gb18030", "gb18030" },
	{ "gb18030", "gb2312" },
	{ "gb18030", "gb_2312" },
	{ "gb18030", "gbk" },
	{ "gb18030", "iso-ir-58" },
	{ "gb18030", "x-gbk" },
	{ "iso-8859-1", "ascii" },
	{ "iso-8859-1", "iso-8859-1" },
	{ "iso-8859-1", "iso8859-1" },
	{ "iso-8859-1", "latin1" },
	{ "iso-8859-1", "us-ascii" },
	{ "iso-8859-7", "greek" },
	{ "iso-8859-7", "greek8" },
	{ "iso-8859-7", "iso-8859-7" },
	{ "iso-8859-7", "iso8859-7" },
	{ "koi8-r", "koi" },
	{ "koi8-r", "koi8" },
	{ "koi8-r", "koi8-r" },
	{ "koi8-r", "koi8-ru" },
	{ "koi8-r", "koi8-u" },
	{ "koi8-r", "koi8_r" },
	{ "shift_jis", "csshiftjis" },
	{ "shift_jis", "ms932" },
	{ "shift_jis", "ms_kanji" },
	{ "shift_jis", "shift-jis" },
	{ "shift_jis", "shift_jis" },
	{ "shift_jis", "sjis" },
	{ "shift_jis", "windows-31j" },
	{ "shift_jis", "x-sjis" },
	{ "windows-1250", "cp1250" },
	{ "windows-1250", "windows-1250" },
	{ "windows-1251", "cp1251" },
	{ "windows-1251", "windows-1251" },
	{ "windows-1252", "cp1252" },
	{ "windows-1252", "cp819" },
	{ "windows-1252", "windows-1252" },
};
| 884 | |
| 885 static char *match_encoding_name(char *enc) | |
| 886 { | |
| 887 size_t i; | |
| 888 for (i = 0; i < nelem(encoding_aliases); ++i) | |
| 889 if (startswith(enc, encoding_aliases[i].alias)) | |
| 890 return encoding_aliases[i].encoding; | |
| 891 return NULL; | |
| 892 } | |
| 893 | |
/*
	Look for an encoding in
	<meta http-equiv="content-type" content="text/html; charset=XXX"> tags.

	Every "<meta" occurrence (matched case-insensitively) is examined until one
	yields a known encoding. While a tag is inspected its closing '>' is
	temporarily overwritten with a NUL so the searches stay inside that tag;
	the '>' is restored before moving on.

	Returns the canonical encoding name, or NULL if none was found.
*/
static const char *find_meta_encoding(char *s)
{
	const char *found = NULL;
	char *tag, *close, *value;

	for (tag = fast_strcasestr(s, "<meta"); tag && !found; tag = fast_strcasestr(tag + 5, "<meta"))
	{
		close = strchr(tag, '>');
		if (!close)
			continue;

		/* Confine the searches below to this one tag. */
		*close = 0;
		if (fast_strcasestr(tag, "http-equiv") && fast_strcasestr(tag, "content-type"))
		{
			value = fast_strcasestr(tag, "charset=");
			if (value)
				found = match_encoding_name(value + 8);
		}
		*close = '>';
	}

	return found;
}
| 924 | |
/*
	Work out which legacy text encoding (if any) the document declares.

	First the text up to the first '>' is searched for an XML declaration
	carrying an encoding="..." (or encoding='...') pseudo-attribute. If that
	does not produce a known encoding, the <meta> tags are consulted through
	find_meta_encoding().

	The first '>' is temporarily replaced with a NUL while the declaration is
	examined, and restored afterwards.

	Returns the canonical encoding name, or NULL if nothing was recognised.
*/
static const char *find_xml_encoding(char *s)
{
	const char *table = NULL;
	char *end, *xml, *enc;

	end = strchr(s, '>');
	if (end)
	{
		*end = 0;
		xml = strstr(s, "<?xml");
		if (xml)
		{
			enc = strstr(xml, "encoding=");
			if (enc)
			{
				enc += 9; /* strlen("encoding=") */
				/* Only step over an actual opening quote. Skipping a
				 * character blindly could jump past the temporary
				 * terminator and match text outside the declaration. */
				if (*enc == '"' || *enc == '\'')
					table = match_encoding_name(enc + 1);
			}
		}
		*end = '>';
	}

	if (!table)
		table = find_meta_encoding(s);

	return table;
}
| 953 | |
| 954 static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree) | |
| 955 { | |
| 956 fz_text_decoder dec; | |
| 957 const char *enc; | |
| 958 const unsigned char *e = s + n; | |
| 959 char *dst, *d; | |
| 960 int m; | |
| 961 int c; | |
| 962 | |
| 963 if (s[0] == 0xFE && s[1] == 0xFF) { | |
| 964 s += 2; | |
| 965 dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_be"); | |
| 966 while (s + 1 < e) { | |
| 967 c = s[0] << 8 | s[1]; | |
| 968 d += fz_runetochar(d, c); | |
| 969 s += 2; | |
| 970 } | |
| 971 *d = 0; | |
| 972 *dofree = 1; | |
| 973 return dst; | |
| 974 } | |
| 975 | |
| 976 if (s[0] == 0xFF && s[1] == 0xFE) { | |
| 977 s += 2; | |
| 978 dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_le"); | |
| 979 while (s + 1 < e) { | |
| 980 c = s[0] | s[1] << 8; | |
| 981 d += fz_runetochar(d, c); | |
| 982 s += 2; | |
| 983 } | |
| 984 *d = 0; | |
| 985 *dofree = 1; | |
| 986 return dst; | |
| 987 } | |
| 988 | |
| 989 enc = find_xml_encoding((char*)s); | |
| 990 if (enc) | |
| 991 { | |
| 992 fz_init_text_decoder(ctx, &dec, enc); | |
| 993 // NOTE: use decode_size if memory is more important than speed | |
| 994 m = (int)dec.decode_bound(&dec, s, (int)n); | |
| 995 dst = Memento_label(fz_malloc(ctx, m), "utf8"); | |
| 996 dec.decode(&dec, dst, s, (int)n); | |
| 997 *dofree = 1; | |
| 998 return dst; | |
| 999 } | |
| 1000 | |
| 1001 *dofree = 0; | |
| 1002 | |
| 1003 if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF) | |
| 1004 return (char*)s+3; | |
| 1005 | |
| 1006 return (char*)s; | |
| 1007 } | |
| 1008 | |
| 1009 fz_xml * | |
| 1010 fz_parse_xml_stream(fz_context *ctx, fz_stream *stm, int preserve_white) | |
| 1011 { | |
| 1012 fz_buffer *buf = fz_read_all(ctx, stm, 128); | |
| 1013 fz_xml *xml = NULL; | |
| 1014 | |
| 1015 fz_var(xml); | |
| 1016 | |
| 1017 fz_try(ctx) | |
| 1018 xml = fz_parse_xml(ctx, buf, preserve_white); | |
| 1019 fz_always(ctx) | |
| 1020 fz_drop_buffer(ctx, buf); | |
| 1021 fz_catch(ctx) | |
| 1022 fz_rethrow(ctx); | |
| 1023 | |
| 1024 return xml; | |
| 1025 } | |
| 1026 | |
| 1027 static fz_xml * | |
| 1028 parse_and_drop_buffer(fz_context *ctx, fz_buffer *buf, int preserve_white) | |
| 1029 { | |
| 1030 fz_xml *xml = NULL; | |
| 1031 | |
| 1032 fz_var(xml); | |
| 1033 | |
| 1034 fz_try(ctx) | |
| 1035 xml = fz_parse_xml(ctx, buf, preserve_white); | |
| 1036 fz_always(ctx) | |
| 1037 fz_drop_buffer(ctx, buf); | |
| 1038 fz_catch(ctx) | |
| 1039 fz_rethrow(ctx); | |
| 1040 | |
| 1041 return xml; | |
| 1042 } | |
| 1043 | |
| 1044 fz_xml * | |
| 1045 fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white) | |
| 1046 { | |
| 1047 fz_buffer *buf = fz_read_archive_entry(ctx, arch, filename); | |
| 1048 | |
| 1049 return parse_and_drop_buffer(ctx, buf, preserve_white); | |
| 1050 } | |
| 1051 | |
| 1052 fz_xml * | |
| 1053 fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white) | |
| 1054 { | |
| 1055 fz_buffer *buf = fz_try_read_archive_entry(ctx, arch, filename); | |
| 1056 | |
| 1057 if (buf == NULL) | |
| 1058 return NULL; | |
| 1059 | |
| 1060 return parse_and_drop_buffer(ctx, buf, preserve_white); | |
| 1061 } | |
| 1062 | |
| 1063 fz_xml * | |
| 1064 fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white) | |
| 1065 { | |
| 1066 struct parser parser; | |
| 1067 fz_xml *xml = NULL; | |
| 1068 fz_xml *root, *node; | |
| 1069 char *p = NULL; | |
| 1070 char *error; | |
| 1071 int dofree = 0; | |
| 1072 unsigned char *s; | |
| 1073 size_t n; | |
| 1074 static unsigned char empty_string[] = ""; | |
| 1075 | |
| 1076 fz_var(dofree); | |
| 1077 fz_var(p); | |
| 1078 | |
| 1079 if (buf == NULL) | |
| 1080 { | |
| 1081 n = 0; | |
| 1082 s = empty_string; | |
| 1083 } | |
| 1084 else | |
| 1085 { | |
| 1086 /* ensure we are zero-terminated */ | |
| 1087 fz_terminate_buffer(ctx, buf); | |
| 1088 n = fz_buffer_storage(ctx, buf, &s); | |
| 1089 } | |
| 1090 | |
| 1091 parser.pool = fz_new_pool(ctx); | |
| 1092 parser.head = root = fz_pool_alloc_flexible(ctx, parser.pool, fz_xml, u.node.u.d.name, 1); | |
| 1093 parser.preserve_white = preserve_white; | |
| 1094 parser.depth = 0; | |
| 1095 #ifdef FZ_XML_SEQ | |
| 1096 parser.seq = 0; | |
| 1097 #endif | |
| 1098 | |
| 1099 fz_try(ctx) | |
| 1100 { | |
| 1101 p = convert_to_utf8(ctx, s, n, &dofree); | |
| 1102 | |
| 1103 error = xml_parse_document_imp(ctx, &parser, p); | |
| 1104 if (error) | |
| 1105 fz_throw(ctx, FZ_ERROR_SYNTAX, "%s", error); | |
| 1106 | |
| 1107 for (node = parser.head; node; node = node->up) | |
| 1108 node->u.node.next = NULL; | |
| 1109 | |
| 1110 xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml); | |
| 1111 xml->up = NULL; | |
| 1112 xml->down = root->down; | |
| 1113 xml->u.doc.refs = 1; | |
| 1114 xml->u.doc.pool = parser.pool; | |
| 1115 | |
| 1116 for (node = root->down; node; node = node->u.node.next) | |
| 1117 node->up = xml; | |
| 1118 } | |
| 1119 fz_always(ctx) | |
| 1120 { | |
| 1121 if (dofree) | |
| 1122 fz_free(ctx, p); | |
| 1123 } | |
| 1124 fz_catch(ctx) | |
| 1125 { | |
| 1126 fz_drop_pool(ctx, parser.pool); | |
| 1127 fz_rethrow(ctx); | |
| 1128 } | |
| 1129 | |
| 1130 return xml; | |
| 1131 } | |
| 1132 | |
| 1133 #if FZ_ENABLE_HTML_ENGINE | |
| 1134 /* | |
| 1135 Parse the contents of buffer into a tree of XML nodes, using the HTML5 syntax. | |
| 1136 | |
| 1137 Gumbo doesn't check for malloc errors. Use our pool allocator and let it longjmp | |
| 1138 out of Gumbo on allocation errors. At the end (success or fail) we release the | |
| 1139 pool used for Gumbo's parse tree all at once. | |
| 1140 */ | |
| 1141 | |
/* Allocator state passed to Gumbo's allocation callbacks as their user data. */
struct mem_gumbo {
	fz_context *ctx; /* context used for the pool allocation calls */
	fz_pool *pool; /* pool that alloc_gumbo() allocates from */
};
| 1146 | |
| 1147 static void *alloc_gumbo(void *ctx, size_t size) | |
| 1148 { | |
| 1149 struct mem_gumbo *mem = ctx; | |
| 1150 return fz_pool_alloc(mem->ctx, mem->pool, size); | |
| 1151 } | |
| 1152 | |
/*
	Gumbo deallocation callback: intentionally does nothing. Individual
	allocations are not released here.
*/
static void dealloc_gumbo(void *ctx, void *ptr)
{
	(void)ctx;
	(void)ptr;
}
| 1157 | |
| 1158 static void xml_from_gumbo(fz_context *ctx, struct parser *parser, GumboNode *node) | |
| 1159 { | |
| 1160 unsigned int i; | |
| 1161 const char *tag, *end, *sentinel; | |
| 1162 | |
| 1163 switch (node->type) | |
| 1164 { | |
| 1165 case GUMBO_NODE_ELEMENT: | |
| 1166 if (node->v.element.tag != GUMBO_TAG_UNKNOWN) | |
| 1167 { | |
| 1168 tag = gumbo_normalized_tagname(node->v.element.tag); | |
| 1169 end = tag + strlen(tag); | |
| 1170 } | |
| 1171 else | |
| 1172 { | |
| 1173 tag = node->v.element.original_tag.data; | |
| 1174 sentinel = tag + node->v.element.original_tag.length; | |
| 1175 if (tag[0] == '<') | |
| 1176 ++tag; | |
| 1177 for (end = tag; end < sentinel; ++end) | |
| 1178 if (end[0] == '>' || end[0] == '/' || iswhite(end[0])) | |
| 1179 break; | |
| 1180 } | |
| 1181 xml_emit_open_tag(ctx, parser, tag, end, 0); | |
| 1182 for (i = 0; i < node->v.element.attributes.length; ++i) | |
| 1183 { | |
| 1184 GumboAttribute *att = node->v.element.attributes.data[i]; | |
| 1185 xml_emit_att_name(ctx, parser, att->name, att->name+strlen(att->name)); | |
| 1186 xml_emit_att_value(ctx, parser, att->value, att->value+strlen(att->value)); | |
| 1187 } | |
| 1188 for (i = 0; i < node->v.element.children.length; ++i) | |
| 1189 { | |
| 1190 GumboNode *child = node->v.element.children.data[i]; | |
| 1191 xml_from_gumbo(ctx, parser, child); | |
| 1192 } | |
| 1193 xml_emit_close_tag(ctx, parser); | |
| 1194 break; | |
| 1195 | |
| 1196 case GUMBO_NODE_TEXT: | |
| 1197 case GUMBO_NODE_CDATA: | |
| 1198 case GUMBO_NODE_WHITESPACE: | |
| 1199 xml_emit_text(ctx, parser, node->v.text.text, node->v.text.text+strlen(node->v.text.text)); | |
| 1200 break; | |
| 1201 | |
| 1202 case GUMBO_NODE_DOCUMENT: | |
| 1203 case GUMBO_NODE_COMMENT: | |
| 1204 case GUMBO_NODE_TEMPLATE: | |
| 1205 break; | |
| 1206 } | |
| 1207 } | |
| 1208 #endif | |
| 1209 | |
| 1210 fz_xml * | |
| 1211 fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf) | |
| 1212 { | |
| 1213 #if FZ_ENABLE_HTML_ENGINE | |
| 1214 struct parser parser; | |
| 1215 fz_xml *xml = NULL; | |
| 1216 fz_xml root, *node; | |
| 1217 char *p = NULL; | |
| 1218 int dofree = 0; | |
| 1219 unsigned char *s; | |
| 1220 size_t n; | |
| 1221 GumboOutput *soup = NULL; | |
| 1222 GumboOptions opts; | |
| 1223 struct mem_gumbo mem; | |
| 1224 static unsigned char empty_string[] = ""; | |
| 1225 | |
| 1226 fz_var(mem.pool); | |
| 1227 fz_var(soup); | |
| 1228 fz_var(dofree); | |
| 1229 fz_var(p); | |
| 1230 | |
| 1231 if (buf == NULL) | |
| 1232 { | |
| 1233 n = 0; | |
| 1234 s = empty_string; | |
| 1235 } | |
| 1236 else | |
| 1237 { | |
| 1238 /* ensure we are zero-terminated */ | |
| 1239 fz_terminate_buffer(ctx, buf); | |
| 1240 n = fz_buffer_storage(ctx, buf, &s); | |
| 1241 } | |
| 1242 | |
| 1243 mem.ctx = ctx; | |
| 1244 mem.pool = NULL; | |
| 1245 | |
| 1246 memset(&root, 0, sizeof(root)); | |
| 1247 parser.pool = fz_new_pool(ctx); | |
| 1248 parser.head = &root; | |
| 1249 parser.preserve_white = 1; | |
| 1250 parser.depth = 0; | |
| 1251 #ifdef FZ_XML_SEQ | |
| 1252 parser.seq = 0; | |
| 1253 #endif | |
| 1254 | |
| 1255 fz_try(ctx) | |
| 1256 { | |
| 1257 p = convert_to_utf8(ctx, s, n, &dofree); | |
| 1258 | |
| 1259 mem.pool = fz_new_pool(ctx); | |
| 1260 memset(&opts, 0, sizeof opts); | |
| 1261 opts.allocator = alloc_gumbo; | |
| 1262 opts.deallocator = dealloc_gumbo; | |
| 1263 opts.userdata = &mem; | |
| 1264 opts.tab_stop = 8; | |
| 1265 opts.stop_on_first_error = 0; | |
| 1266 opts.max_errors = -1; | |
| 1267 opts.fragment_context = GUMBO_TAG_LAST; | |
| 1268 opts.fragment_namespace = GUMBO_NAMESPACE_HTML; | |
| 1269 | |
| 1270 soup = gumbo_parse_with_options(&opts, (const char *)p, strlen(p)); | |
| 1271 | |
| 1272 xml_from_gumbo(ctx, &parser, soup->root); | |
| 1273 | |
| 1274 for (node = parser.head; node; node = node->up) | |
| 1275 node->u.node.next = NULL; | |
| 1276 | |
| 1277 xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml); | |
| 1278 xml->up = NULL; | |
| 1279 xml->down = root.down; | |
| 1280 xml->u.doc.pool = parser.pool; | |
| 1281 xml->u.doc.refs = 1; | |
| 1282 | |
| 1283 for (node = root.down; node; node = node->u.node.next) | |
| 1284 node->up = xml; | |
| 1285 } | |
| 1286 fz_always(ctx) | |
| 1287 { | |
| 1288 if (soup) | |
| 1289 gumbo_destroy_output(&opts, soup); | |
| 1290 fz_drop_pool(ctx, mem.pool); | |
| 1291 if (dofree) | |
| 1292 fz_free(ctx, p); | |
| 1293 } | |
| 1294 fz_catch(ctx) | |
| 1295 { | |
| 1296 fz_drop_pool(ctx, parser.pool); | |
| 1297 fz_rethrow(ctx); | |
| 1298 } | |
| 1299 | |
| 1300 return xml; | |
| 1301 #else | |
| 1302 fz_throw(ctx, FZ_ERROR_GENERIC, "HTML Engine not enabled in this build"); | |
| 1303 #endif | |
| 1304 } | |
| 1305 | |
| 1306 fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match) | |
| 1307 { | |
| 1308 return fz_xml_find_dfs_top(item, tag, att, match, NULL); | |
| 1309 } | |
| 1310 | |
| 1311 fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top) | |
| 1312 { | |
| 1313 /* Skip over any DOC object. */ | |
| 1314 if (item && FZ_DOCUMENT_ITEM(item)) | |
| 1315 item = item->down; | |
| 1316 | |
| 1317 while (item) | |
| 1318 { | |
| 1319 if (!FZ_TEXT_ITEM(item) && (tag == NULL || !strcmp(item->u.node.u.d.name, tag))) | |
| 1320 { | |
| 1321 if (att == NULL || (match == NULL ? fz_xml_att(item, att) != NULL : fz_xml_att_eq(item, att, match))) | |
| 1322 return item; | |
| 1323 } | |
| 1324 | |
| 1325 if (!FZ_TEXT_ITEM(item) && item->down) | |
| 1326 item = item->down; | |
| 1327 else if (item->u.node.next) | |
| 1328 item = item->u.node.next; | |
| 1329 else | |
| 1330 while (1) { | |
| 1331 item = item->up; | |
| 1332 /* Stop searching if we hit our declared 'top' item. */ | |
| 1333 if (item == top) | |
| 1334 return NULL; | |
| 1335 /* We should never reach item == NULL, but just in case. */ | |
| 1336 if (item == NULL) | |
| 1337 return NULL; | |
| 1338 /* If we reach the DOC object at the top, we're done. */ | |
| 1339 if (item->up == NULL) | |
| 1340 return NULL; | |
| 1341 if (item->u.node.next) | |
| 1342 { | |
| 1343 item = item->u.node.next; | |
| 1344 break; | |
| 1345 } | |
| 1346 } | |
| 1347 } | |
| 1348 | |
| 1349 return NULL; | |
| 1350 } | |
| 1351 | |
| 1352 fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match) | |
| 1353 { | |
| 1354 return fz_xml_find_next_dfs_top(item, tag, att, match, NULL); | |
| 1355 } | |
| 1356 | |
| 1357 fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top) | |
| 1358 { | |
| 1359 /* Skip over any DOC object. */ | |
| 1360 if (item && FZ_DOCUMENT_ITEM(item)) | |
| 1361 item = item->down; | |
| 1362 | |
| 1363 if (item == NULL) | |
| 1364 return NULL; | |
| 1365 | |
| 1366 if (item->down) | |
| 1367 item = item->down; | |
| 1368 else if (item->u.node.next) | |
| 1369 item = item->u.node.next; | |
| 1370 else | |
| 1371 while (1) { | |
| 1372 item = item->up; | |
| 1373 /* Stop searching if we hit our declared 'top' item. */ | |
| 1374 if (item == top) | |
| 1375 return NULL; | |
| 1376 /* We should never reach item == NULL, but just in case. */ | |
| 1377 if (item == NULL) | |
| 1378 return NULL; | |
| 1379 /* If we reach the DOC object at the top, we're done. */ | |
| 1380 if (item->up == NULL) | |
| 1381 return NULL; | |
| 1382 if (item->u.node.next) | |
| 1383 { | |
| 1384 item = item->u.node.next; | |
| 1385 break; | |
| 1386 } | |
| 1387 } | |
| 1388 | |
| 1389 return fz_xml_find_dfs_top(item, tag, att, match, top); | |
| 1390 } | |
| 1391 | |
| 1392 fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml) | |
| 1393 { | |
| 1394 fz_xml *dom = xml; | |
| 1395 if (xml == NULL) | |
| 1396 return xml; | |
| 1397 | |
| 1398 while (dom->up) | |
| 1399 dom = dom->up; | |
| 1400 | |
| 1401 fz_keep_imp(ctx, dom, &dom->u.doc.refs); | |
| 1402 | |
| 1403 /* Return the original node pointer, not the dom pointer! */ | |
| 1404 return xml; | |
| 1405 } |
