Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/src/parser.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // | |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 // you may not use this file except in compliance with the License. | |
| 5 // You may obtain a copy of the License at | |
| 6 // | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // | |
| 9 // Unless required by applicable law or agreed to in writing, software | |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 // See the License for the specific language governing permissions and | |
| 13 // limitations under the License. | |
| 14 // | |
| 15 // Author: jdtang@google.com (Jonathan Tang) | |
| 16 | |
| 17 #include <assert.h> | |
| 18 #include <ctype.h> | |
| 19 #include <stdarg.h> | |
| 20 #include <stdlib.h> | |
| 21 #include <string.h> | |
| 22 #include <strings.h> | |
| 23 | |
| 24 #include "attribute.h" | |
| 25 #include "error.h" | |
| 26 #include "gumbo.h" | |
| 27 #include "insertion_mode.h" | |
| 28 #include "parser.h" | |
| 29 #include "tokenizer.h" | |
| 30 #include "tokenizer_states.h" | |
| 31 #include "utf8.h" | |
| 32 #include "util.h" | |
| 33 #include "vector.h" | |
| 34 | |
| 35 #define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i) | |
| 36 | |
| 37 #define GUMBO_STRING(literal) \ | |
| 38 { literal, sizeof(literal) - 1 } | |
| 39 #define TERMINATOR \ | |
| 40 { "", 0 } | |
| 41 | |
| 42 typedef char gumbo_tagset[GUMBO_TAG_LAST]; | |
| 43 #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML) | |
| 44 #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG) | |
| 45 #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML) | |
| 46 | |
// Evaluates to true when `tag` is below GUMBO_TAG_LAST and the entry stored
// for it in `tagset` equals the bit of `namespace`. Every argument is
// parenthesized so that compound expressions (e.g. a conditional) expand with
// the intended precedence. Note that `tag` is still evaluated twice.
#define TAGSET_INCLUDES(tagset, namespace, tag) \
  ((tag) < GUMBO_TAG_LAST &&                    \
      (tagset)[(int) (tag)] == (1 << (int) (namespace)))
| 49 | |
| 50 // selected forward declarations as it is getting hard to find | |
| 51 // an appropriate order | |
| 52 static bool node_html_tag_is(const GumboNode*, GumboTag); | |
| 53 static GumboInsertionMode get_current_template_insertion_mode( | |
| 54 const GumboParser*); | |
| 55 static bool handle_in_template(GumboParser*, GumboToken*); | |
| 56 static void destroy_node(GumboParser*, GumboNode*); | |
| 57 | |
// Allocation callback that ignores its first argument and forwards the
// request to malloc().
static void* malloc_wrapper(void* unused, size_t size) {
  (void) unused;
  void* block = malloc(size);
  return block;
}
| 59 | |
// Deallocation callback that ignores its first argument and hands `ptr` to
// free().
static void free_wrapper(void* unused, void* ptr) {
  (void) unused;
  free(ptr);
}
| 61 | |
| 62 const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL, | |
| 63 8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML}; | |
| 64 | |
| 65 static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html"); | |
| 66 static const GumboStringPiece kPublicIdHtml4_0 = | |
| 67 GUMBO_STRING("-//W3C//DTD HTML 4.0//EN"); | |
| 68 static const GumboStringPiece kPublicIdHtml4_01 = | |
| 69 GUMBO_STRING("-//W3C//DTD HTML 4.01//EN"); | |
| 70 static const GumboStringPiece kPublicIdXhtml1_0 = | |
| 71 GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN"); | |
| 72 static const GumboStringPiece kPublicIdXhtml1_1 = | |
| 73 GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN"); | |
| 74 static const GumboStringPiece kSystemIdRecHtml4_0 = | |
| 75 GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd"); | |
| 76 static const GumboStringPiece kSystemIdHtml4 = | |
| 77 GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd"); | |
| 78 static const GumboStringPiece kSystemIdXhtmlStrict1_1 = | |
| 79 GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"); | |
| 80 static const GumboStringPiece kSystemIdXhtml1_1 = | |
| 81 GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"); | |
| 82 static const GumboStringPiece kSystemIdLegacyCompat = | |
| 83 GUMBO_STRING("about:legacy-compat"); | |
| 84 | |
| 85 // The doctype arrays have an explicit terminator because we want to pass them | |
| 86 // to a helper function, and passing them as a pointer discards sizeof | |
| 87 // information. The SVG arrays are used only by one-off functions, and so loops | |
| 88 // over them use sizeof directly instead of a terminator. | |
| 89 | |
| 90 static const GumboStringPiece kQuirksModePublicIdPrefixes[] = { | |
| 91 GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"), | |
| 92 GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"), | |
| 93 GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"), | |
| 94 GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"), | |
| 95 GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"), | |
| 96 GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"), | |
| 97 GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"), | |
| 98 GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"), | |
| 99 GUMBO_STRING("-//IETF//DTD HTML 2.0//"), | |
| 100 GUMBO_STRING("-//IETF//DTD HTML 2.1E//"), | |
| 101 GUMBO_STRING("-//IETF//DTD HTML 3.0//"), | |
| 102 GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"), | |
| 103 GUMBO_STRING("-//IETF//DTD HTML 3.2//"), | |
| 104 GUMBO_STRING("-//IETF//DTD HTML 3//"), | |
| 105 GUMBO_STRING("-//IETF//DTD HTML Level 0//"), | |
| 106 GUMBO_STRING("-//IETF//DTD HTML Level 1//"), | |
| 107 GUMBO_STRING("-//IETF//DTD HTML Level 2//"), | |
| 108 GUMBO_STRING("-//IETF//DTD HTML Level 3//"), | |
| 109 GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"), | |
| 110 GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"), | |
| 111 GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"), | |
| 112 GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"), | |
| 113 GUMBO_STRING("-//IETF//DTD HTML Strict//"), | |
| 114 GUMBO_STRING("-//IETF//DTD HTML//"), | |
| 115 GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"), | |
| 116 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"), | |
| 117 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"), | |
| 118 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"), | |
| 119 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"), | |
| 120 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"), | |
| 121 GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"), | |
| 122 GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"), | |
| 123 GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"), | |
| 124 GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"), | |
| 125 GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), | |
| 126 GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), | |
| 127 GUMBO_STRING( | |
| 128 "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" | |
| 129 "extensions to HTML 4.0//"), | |
| 130 GUMBO_STRING( | |
| 131 "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" | |
| 132 "extensions to HTML 4.0//"), | |
| 133 GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"), | |
| 134 GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), | |
| 135 GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"), | |
| 136 GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"), | |
| 137 GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"), | |
| 138 GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"), | |
| 139 GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"), | |
| 140 GUMBO_STRING("-//W3C//DTD HTML 3.2//"), | |
| 141 GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"), | |
| 142 GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"), | |
| 143 GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"), | |
| 144 GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"), | |
| 145 GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"), | |
| 146 GUMBO_STRING("-//W3C//DTD W3 HTML//"), | |
| 147 GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"), | |
| 148 GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"), | |
| 149 GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR}; | |
| 150 | |
| 151 static const GumboStringPiece kQuirksModePublicIdExactMatches[] = { | |
| 152 GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"), | |
| 153 GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"), | |
| 154 TERMINATOR}; | |
| 155 | |
| 156 static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = { | |
| 157 GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"), | |
| 158 TERMINATOR}; | |
| 159 | |
| 160 static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = { | |
| 161 GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"), | |
| 162 GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR}; | |
| 163 | |
| 164 static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] = | |
| 165 {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"), | |
| 166 GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR}; | |
| 167 | |
// Namespace URIs, indexed by GumboNamespaceEnum. The entries are positional,
// so this array must be kept in the same order as that enum.
static const char* kLegalXmlns[] = {
    "http://www.w3.org/1999/xhtml",
    "http://www.w3.org/2000/svg",
    "http://www.w3.org/1998/Math/MathML",
};
| 171 | |
| 172 typedef struct _ReplacementEntry { | |
| 173 const GumboStringPiece from; | |
| 174 const GumboStringPiece to; | |
| 175 } ReplacementEntry; | |
| 176 | |
| 177 #define REPLACEMENT_ENTRY(from, to) \ | |
| 178 { GUMBO_STRING(from), GUMBO_STRING(to) } | |
| 179 | |
| 180 // Static data for SVG attribute replacements. | |
| 181 // https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes | |
| 182 static const ReplacementEntry kSvgAttributeReplacements[] = { | |
| 183 REPLACEMENT_ENTRY("attributename", "attributeName"), | |
| 184 REPLACEMENT_ENTRY("attributetype", "attributeType"), | |
| 185 REPLACEMENT_ENTRY("basefrequency", "baseFrequency"), | |
| 186 REPLACEMENT_ENTRY("baseprofile", "baseProfile"), | |
| 187 REPLACEMENT_ENTRY("calcmode", "calcMode"), | |
| 188 REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"), | |
| 189 // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"), | |
| 190 // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"), | |
| 191 REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"), | |
| 192 REPLACEMENT_ENTRY("edgemode", "edgeMode"), | |
| 193 // REPLACEMENT_ENTRY("externalresourcesrequired", | |
| 194 // "externalResourcesRequired"), | |
| 195 // REPLACEMENT_ENTRY("filterres", "filterRes"), | |
| 196 REPLACEMENT_ENTRY("filterunits", "filterUnits"), | |
| 197 REPLACEMENT_ENTRY("glyphref", "glyphRef"), | |
| 198 REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"), | |
| 199 REPLACEMENT_ENTRY("gradientunits", "gradientUnits"), | |
| 200 REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"), | |
| 201 REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"), | |
| 202 REPLACEMENT_ENTRY("keypoints", "keyPoints"), | |
| 203 REPLACEMENT_ENTRY("keysplines", "keySplines"), | |
| 204 REPLACEMENT_ENTRY("keytimes", "keyTimes"), | |
| 205 REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"), | |
| 206 REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"), | |
| 207 REPLACEMENT_ENTRY("markerheight", "markerHeight"), | |
| 208 REPLACEMENT_ENTRY("markerunits", "markerUnits"), | |
| 209 REPLACEMENT_ENTRY("markerwidth", "markerWidth"), | |
| 210 REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"), | |
| 211 REPLACEMENT_ENTRY("maskunits", "maskUnits"), | |
| 212 REPLACEMENT_ENTRY("numoctaves", "numOctaves"), | |
| 213 REPLACEMENT_ENTRY("pathlength", "pathLength"), | |
| 214 REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"), | |
| 215 REPLACEMENT_ENTRY("patterntransform", "patternTransform"), | |
| 216 REPLACEMENT_ENTRY("patternunits", "patternUnits"), | |
| 217 REPLACEMENT_ENTRY("pointsatx", "pointsAtX"), | |
| 218 REPLACEMENT_ENTRY("pointsaty", "pointsAtY"), | |
| 219 REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"), | |
| 220 REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"), | |
| 221 REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"), | |
| 222 REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"), | |
| 223 REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"), | |
| 224 REPLACEMENT_ENTRY("repeatcount", "repeatCount"), | |
| 225 REPLACEMENT_ENTRY("repeatdur", "repeatDur"), | |
| 226 REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"), | |
| 227 REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"), | |
| 228 REPLACEMENT_ENTRY("specularconstant", "specularConstant"), | |
| 229 REPLACEMENT_ENTRY("specularexponent", "specularExponent"), | |
| 230 REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"), | |
| 231 REPLACEMENT_ENTRY("startoffset", "startOffset"), | |
| 232 REPLACEMENT_ENTRY("stddeviation", "stdDeviation"), | |
| 233 REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"), | |
| 234 REPLACEMENT_ENTRY("surfacescale", "surfaceScale"), | |
| 235 REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"), | |
| 236 REPLACEMENT_ENTRY("tablevalues", "tableValues"), | |
| 237 REPLACEMENT_ENTRY("targetx", "targetX"), | |
| 238 REPLACEMENT_ENTRY("targety", "targetY"), | |
| 239 REPLACEMENT_ENTRY("textlength", "textLength"), | |
| 240 REPLACEMENT_ENTRY("viewbox", "viewBox"), | |
| 241 REPLACEMENT_ENTRY("viewtarget", "viewTarget"), | |
| 242 REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"), | |
| 243 REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"), | |
| 244 REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"), | |
| 245 }; | |
| 246 | |
| 247 static const ReplacementEntry kSvgTagReplacements[] = { | |
| 248 REPLACEMENT_ENTRY("altglyph", "altGlyph"), | |
| 249 REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"), | |
| 250 REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"), | |
| 251 REPLACEMENT_ENTRY("animatecolor", "animateColor"), | |
| 252 REPLACEMENT_ENTRY("animatemotion", "animateMotion"), | |
| 253 REPLACEMENT_ENTRY("animatetransform", "animateTransform"), | |
| 254 REPLACEMENT_ENTRY("clippath", "clipPath"), | |
| 255 REPLACEMENT_ENTRY("feblend", "feBlend"), | |
| 256 REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"), | |
| 257 REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"), | |
| 258 REPLACEMENT_ENTRY("fecomposite", "feComposite"), | |
| 259 REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"), | |
| 260 REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"), | |
| 261 REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"), | |
| 262 REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"), | |
| 263 REPLACEMENT_ENTRY("feflood", "feFlood"), | |
| 264 REPLACEMENT_ENTRY("fefunca", "feFuncA"), | |
| 265 REPLACEMENT_ENTRY("fefuncb", "feFuncB"), | |
| 266 REPLACEMENT_ENTRY("fefuncg", "feFuncG"), | |
| 267 REPLACEMENT_ENTRY("fefuncr", "feFuncR"), | |
| 268 REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"), | |
| 269 REPLACEMENT_ENTRY("feimage", "feImage"), | |
| 270 REPLACEMENT_ENTRY("femerge", "feMerge"), | |
| 271 REPLACEMENT_ENTRY("femergenode", "feMergeNode"), | |
| 272 REPLACEMENT_ENTRY("femorphology", "feMorphology"), | |
| 273 REPLACEMENT_ENTRY("feoffset", "feOffset"), | |
| 274 REPLACEMENT_ENTRY("fepointlight", "fePointLight"), | |
| 275 REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"), | |
| 276 REPLACEMENT_ENTRY("fespotlight", "feSpotLight"), | |
| 277 REPLACEMENT_ENTRY("fetile", "feTile"), | |
| 278 REPLACEMENT_ENTRY("feturbulence", "feTurbulence"), | |
| 279 REPLACEMENT_ENTRY("foreignobject", "foreignObject"), | |
| 280 REPLACEMENT_ENTRY("glyphref", "glyphRef"), | |
| 281 REPLACEMENT_ENTRY("lineargradient", "linearGradient"), | |
| 282 REPLACEMENT_ENTRY("radialgradient", "radialGradient"), | |
| 283 REPLACEMENT_ENTRY("textpath", "textPath"), | |
| 284 }; | |
| 285 | |
| 286 typedef struct _NamespacedAttributeReplacement { | |
| 287 const char* from; | |
| 288 const char* local_name; | |
| 289 const GumboAttributeNamespaceEnum attr_namespace; | |
| 290 } NamespacedAttributeReplacement; | |
| 291 | |
| 292 static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = { | |
| 293 {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, | |
| 294 {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, | |
| 295 {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK}, | |
| 296 {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK}, | |
| 297 {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK}, | |
| 298 {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK}, | |
| 299 {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK}, | |
| 300 {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}, | |
| 301 {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML}, | |
| 302 {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML}, | |
| 303 {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS}, | |
| 304 {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS}, | |
| 305 }; | |
| 306 | |
| 307 // The "scope marker" for the list of active formatting elements. We use a | |
| 308 // pointer to this as a generic marker element, since the particular element | |
| 309 // scope doesn't matter. | |
| 310 static const GumboNode kActiveFormattingScopeMarker; | |
| 311 | |
// The tag_is and tag_in helpers take a bool to distinguish start tags (true)
// from end tags (false); these named constants keep call sites readable.
static const bool kStartTag = true;
static const bool kEndTag = false;
| 316 | |
| 317 // Because GumboStringPieces are immutable, we can't insert a character directly | |
| 318 // into a text node. Instead, we accumulate all pending characters here and | |
| 319 // flush them out to a text node whenever a new element is inserted. | |
| 320 // | |
| 321 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character | |
| 322 typedef struct _TextNodeBufferState { | |
| 323 // The accumulated text to be inserted into the current text node. | |
| 324 GumboStringBuffer _buffer; | |
| 325 | |
| 326 // A pointer to the original text represented by this text node. Note that | |
| 327 // because of foster parenting and other strange DOM manipulations, this may | |
| 328 // include other non-text HTML tags in it; it is defined as the span of | |
| 329 // original text from the first character in this text node to the last | |
| 330 // character in this text node. | |
| 331 const char* _start_original_text; | |
| 332 | |
| 333 // The source position of the start of this text node. | |
| 334 GumboSourcePosition _start_position; | |
| 335 | |
| 336 // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE). | |
| 337 GumboNodeType _type; | |
| 338 } TextNodeBufferState; | |
| 339 | |
| 340 typedef struct GumboInternalParserState { | |
| 341 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode | |
| 342 GumboInsertionMode _insertion_mode; | |
| 343 | |
| 344 // Used for run_generic_parsing_algorithm, which needs to switch back to the | |
| 345 // original insertion mode at its conclusion. | |
| 346 GumboInsertionMode _original_insertion_mode; | |
| 347 | |
| 348 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements | |
| 349 GumboVector /*GumboNode*/ _open_elements; | |
| 350 | |
| 351 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements | |
| 352 GumboVector /*GumboNode*/ _active_formatting_elements; | |
| 353 | |
| 354 // The stack of template insertion modes. | |
| 355 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode | |
| 356 GumboVector /*InsertionMode*/ _template_insertion_modes; | |
| 357 | |
| 358 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers | |
| 359 GumboNode* _head_element; | |
| 360 GumboNode* _form_element; | |
| 361 | |
| 362 // The element used as fragment context when parsing in fragment mode | |
| 363 GumboNode* _fragment_ctx; | |
| 364 | |
| 365 // The flag for when the spec says "Reprocess the current token in..." | |
| 366 bool _reprocess_current_token; | |
| 367 | |
| 368 // The flag for "acknowledge the token's self-closing flag". | |
| 369 bool _self_closing_flag_acknowledged; | |
| 370 | |
| 371 // The "frameset-ok" flag from the spec. | |
| 372 bool _frameset_ok; | |
| 373 | |
| 374 // The flag for "If the next token is a LINE FEED, ignore that token...". | |
| 375 bool _ignore_next_linefeed; | |
| 376 | |
| 377 // The flag for "whenever a node would be inserted into the current node, it | |
| 378 // must instead be foster parented". This is used for misnested table | |
| 379 // content, which needs to be handled according to "in body" rules yet foster | |
| 380 // parented outside of the table. | |
| 381 // It would perhaps be more explicit to have this as a parameter to | |
| 382 // handle_in_body and insert_element, but given how special-purpose this is | |
| 383 // and the number of call-sites that would need to take the extra parameter, | |
| 384 // it's easier just to have a state flag. | |
| 385 bool _foster_parent_insertions; | |
| 386 | |
| 387 // The accumulated text node buffer state. | |
| 388 TextNodeBufferState _text_node; | |
| 389 | |
| 390 // The current token. | |
| 391 GumboToken* _current_token; | |
| 392 | |
| 393 // The way that the spec is written, the </body> and </html> tags are *always* | |
| 394 // implicit, because encountering one of those tokens merely switches the | |
| 395 // insertion mode out of "in body". So we have individual state flags for | |
| 396 // those end tags that are then inspected by pop_current_node when the <body> | |
| 397 // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG | |
| 398 // flag appropriately. | |
| 399 bool _closed_body_tag; | |
| 400 bool _closed_html_tag; | |
| 401 } GumboParserState; | |
| 402 | |
| 403 static bool token_has_attribute(const GumboToken* token, const char* name) { | |
| 404 assert(token->type == GUMBO_TOKEN_START_TAG); | |
| 405 return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL; | |
| 406 } | |
| 407 | |
| 408 // Checks if the value of the specified attribute is a case-insensitive match | |
| 409 // for the specified string. | |
| 410 static bool attribute_matches( | |
| 411 const GumboVector* attributes, const char* name, const char* value) { | |
| 412 const GumboAttribute* attr = gumbo_get_attribute(attributes, name); | |
| 413 return attr ? strcasecmp(value, attr->value) == 0 : false; | |
| 414 } | |
| 415 | |
| 416 // Checks if the value of the specified attribute is a case-sensitive match | |
| 417 // for the specified string. | |
| 418 static bool attribute_matches_case_sensitive( | |
| 419 const GumboVector* attributes, const char* name, const char* value) { | |
| 420 const GumboAttribute* attr = gumbo_get_attribute(attributes, name); | |
| 421 return attr ? strcmp(value, attr->value) == 0 : false; | |
| 422 } | |
| 423 | |
| 424 // Checks if the specified attribute vectors are identical. | |
| 425 static bool all_attributes_match( | |
| 426 const GumboVector* attr1, const GumboVector* attr2) { | |
| 427 unsigned int num_unmatched_attr2_elements = attr2->length; | |
| 428 for (unsigned int i = 0; i < attr1->length; ++i) { | |
| 429 const GumboAttribute* attr = attr1->data[i]; | |
| 430 if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) { | |
| 431 --num_unmatched_attr2_elements; | |
| 432 } else { | |
| 433 return false; | |
| 434 } | |
| 435 } | |
| 436 return num_unmatched_attr2_elements == 0; | |
| 437 } | |
| 438 | |
| 439 static void set_frameset_not_ok(GumboParser* parser) { | |
| 440 gumbo_debug("Setting frameset_ok to false.\n"); | |
| 441 parser->_parser_state->_frameset_ok = false; | |
| 442 } | |
| 443 | |
| 444 static GumboNode* create_node(GumboParser* parser, GumboNodeType type) { | |
| 445 GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode)); | |
| 446 node->parent = NULL; | |
| 447 node->index_within_parent = -1; | |
| 448 node->type = type; | |
| 449 node->parse_flags = GUMBO_INSERTION_NORMAL; | |
| 450 return node; | |
| 451 } | |
| 452 | |
| 453 static GumboNode* new_document_node(GumboParser* parser) { | |
| 454 GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT); | |
| 455 document_node->parse_flags = GUMBO_INSERTION_BY_PARSER; | |
| 456 gumbo_vector_init(parser, 1, &document_node->v.document.children); | |
| 457 | |
| 458 // Must be initialized explicitly, as there's no guarantee that we'll see a | |
| 459 // doc type token. | |
| 460 GumboDocument* document = &document_node->v.document; | |
| 461 document->has_doctype = false; | |
| 462 document->name = NULL; | |
| 463 document->public_identifier = NULL; | |
| 464 document->system_identifier = NULL; | |
| 465 return document_node; | |
| 466 } | |
| 467 | |
| 468 static void output_init(GumboParser* parser) { | |
| 469 GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput)); | |
| 470 output->root = NULL; | |
| 471 output->document = new_document_node(parser); | |
| 472 parser->_output = output; | |
| 473 gumbo_init_errors(parser); | |
| 474 } | |
| 475 | |
| 476 static void parser_state_init(GumboParser* parser) { | |
| 477 GumboParserState* parser_state = | |
| 478 gumbo_parser_allocate(parser, sizeof(GumboParserState)); | |
| 479 parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL; | |
| 480 parser_state->_reprocess_current_token = false; | |
| 481 parser_state->_frameset_ok = true; | |
| 482 parser_state->_ignore_next_linefeed = false; | |
| 483 parser_state->_foster_parent_insertions = false; | |
| 484 parser_state->_text_node._type = GUMBO_NODE_WHITESPACE; | |
| 485 gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer); | |
| 486 gumbo_vector_init(parser, 10, &parser_state->_open_elements); | |
| 487 gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements); | |
| 488 gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes); | |
| 489 parser_state->_head_element = NULL; | |
| 490 parser_state->_form_element = NULL; | |
| 491 parser_state->_fragment_ctx = NULL; | |
| 492 parser_state->_current_token = NULL; | |
| 493 parser_state->_closed_body_tag = false; | |
| 494 parser_state->_closed_html_tag = false; | |
| 495 parser->_parser_state = parser_state; | |
| 496 } | |
| 497 | |
| 498 static void parser_state_destroy(GumboParser* parser) { | |
| 499 GumboParserState* state = parser->_parser_state; | |
| 500 if (state->_fragment_ctx) { | |
| 501 destroy_node(parser, state->_fragment_ctx); | |
| 502 } | |
| 503 gumbo_vector_destroy(parser, &state->_active_formatting_elements); | |
| 504 gumbo_vector_destroy(parser, &state->_open_elements); | |
| 505 gumbo_vector_destroy(parser, &state->_template_insertion_modes); | |
| 506 gumbo_string_buffer_destroy(parser, &state->_text_node._buffer); | |
| 507 gumbo_parser_deallocate(parser, state); | |
| 508 } | |
| 509 | |
| 510 static GumboNode* get_document_node(GumboParser* parser) { | |
| 511 return parser->_output->document; | |
| 512 } | |
| 513 | |
| 514 static bool is_fragment_parser(const GumboParser* parser) { | |
| 515 return !!parser->_parser_state->_fragment_ctx; | |
| 516 } | |
| 517 | |
| 518 // Returns the node at the bottom of the stack of open elements, or NULL if no | |
| 519 // elements have been added yet. | |
| 520 static GumboNode* get_current_node(GumboParser* parser) { | |
| 521 GumboVector* open_elements = &parser->_parser_state->_open_elements; | |
| 522 if (open_elements->length == 0) { | |
| 523 assert(!parser->_output->root); | |
| 524 return NULL; | |
| 525 } | |
| 526 assert(open_elements->length > 0); | |
| 527 assert(open_elements->data != NULL); | |
| 528 return open_elements->data[open_elements->length - 1]; | |
| 529 } | |
| 530 | |
| 531 static GumboNode* get_adjusted_current_node(GumboParser* parser) { | |
| 532 GumboParserState* state = parser->_parser_state; | |
| 533 if (state->_open_elements.length == 1 && state->_fragment_ctx) { | |
| 534 return state->_fragment_ctx; | |
| 535 } | |
| 536 return get_current_node(parser); | |
| 537 } | |
| 538 | |
| 539 // Returns true if the given needle is in the given array of literal | |
| 540 // GumboStringPieces. If exact_match is true, this requires that they match | |
| 541 // exactly; otherwise, this performs a prefix match to check if any of the | |
| 542 // elements in haystack start with needle. This always performs a | |
| 543 // case-insensitive match. | |
| 544 static bool is_in_static_list( | |
| 545 const char* needle, const GumboStringPiece* haystack, bool exact_match) { | |
| 546 for (unsigned int i = 0; haystack[i].length > 0; ++i) { | |
| 547 if ((exact_match && !strcmp(needle, haystack[i].data)) || | |
| 548 (!exact_match && !strcasecmp(needle, haystack[i].data))) { | |
| 549 return true; | |
| 550 } | |
| 551 } | |
| 552 return false; | |
| 553 } | |
| 554 | |
| 555 static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { | |
| 556 parser->_parser_state->_insertion_mode = mode; | |
| 557 } | |
| 558 | |
| 559 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately | |
| 560 // This is a helper function that returns the appropriate insertion mode instead | |
| 561 // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to | |
| 562 // indicate that there is no appropriate insertion mode, and the loop should | |
| 563 // continue. | |
| 564 static GumboInsertionMode get_appropriate_insertion_mode( | |
| 565 const GumboParser* parser, int index) { | |
| 566 const GumboVector* open_elements = &parser->_parser_state->_open_elements; | |
| 567 const GumboNode* node = open_elements->data[index]; | |
| 568 const bool is_last = index == 0; | |
| 569 | |
| 570 if (is_last && is_fragment_parser(parser)) { | |
| 571 node = parser->_parser_state->_fragment_ctx; | |
| 572 } | |
| 573 | |
| 574 assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); | |
| 575 switch (node->v.element.tag) { | |
| 576 case GUMBO_TAG_SELECT: { | |
| 577 if (is_last) { | |
| 578 return GUMBO_INSERTION_MODE_IN_SELECT; | |
| 579 } | |
| 580 for (int i = index; i > 0; --i) { | |
| 581 const GumboNode* ancestor = open_elements->data[i]; | |
| 582 if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) { | |
| 583 return GUMBO_INSERTION_MODE_IN_SELECT; | |
| 584 } | |
| 585 if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) { | |
| 586 return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE; | |
| 587 } | |
| 588 } | |
| 589 return GUMBO_INSERTION_MODE_IN_SELECT; | |
| 590 } | |
| 591 case GUMBO_TAG_TD: | |
| 592 case GUMBO_TAG_TH: | |
| 593 if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL; | |
| 594 break; | |
| 595 case GUMBO_TAG_TR: | |
| 596 return GUMBO_INSERTION_MODE_IN_ROW; | |
| 597 case GUMBO_TAG_TBODY: | |
| 598 case GUMBO_TAG_THEAD: | |
| 599 case GUMBO_TAG_TFOOT: | |
| 600 return GUMBO_INSERTION_MODE_IN_TABLE_BODY; | |
| 601 case GUMBO_TAG_CAPTION: | |
| 602 return GUMBO_INSERTION_MODE_IN_CAPTION; | |
| 603 case GUMBO_TAG_COLGROUP: | |
| 604 return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; | |
| 605 case GUMBO_TAG_TABLE: | |
| 606 return GUMBO_INSERTION_MODE_IN_TABLE; | |
| 607 case GUMBO_TAG_TEMPLATE: | |
| 608 return get_current_template_insertion_mode(parser); | |
| 609 case GUMBO_TAG_HEAD: | |
| 610 if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD; | |
| 611 break; | |
| 612 case GUMBO_TAG_BODY: | |
| 613 return GUMBO_INSERTION_MODE_IN_BODY; | |
| 614 case GUMBO_TAG_FRAMESET: | |
| 615 return GUMBO_INSERTION_MODE_IN_FRAMESET; | |
| 616 case GUMBO_TAG_HTML: | |
| 617 return parser->_parser_state->_head_element | |
| 618 ? GUMBO_INSERTION_MODE_AFTER_HEAD | |
| 619 : GUMBO_INSERTION_MODE_BEFORE_HEAD; | |
| 620 default: | |
| 621 break; | |
| 622 } | |
| 623 return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; | |
| 624 } | |
| 625 | |
| 626 // This performs the actual "reset the insertion mode" loop. | |
| 627 static void reset_insertion_mode_appropriately(GumboParser* parser) { | |
| 628 const GumboVector* open_elements = &parser->_parser_state->_open_elements; | |
| 629 for (int i = open_elements->length; --i >= 0;) { | |
| 630 GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i); | |
| 631 if (mode != GUMBO_INSERTION_MODE_INITIAL) { | |
| 632 set_insertion_mode(parser, mode); | |
| 633 return; | |
| 634 } | |
| 635 } | |
| 636 // Should never get here, because is_last will be set on the last iteration | |
| 637 // and will force GUMBO_INSERTION_MODE_IN_BODY. | |
| 638 assert(0); | |
| 639 } | |
| 640 | |
| 641 static GumboError* parser_add_parse_error( | |
| 642 GumboParser* parser, const GumboToken* token) { | |
| 643 gumbo_debug("Adding parse error.\n"); | |
| 644 GumboError* error = gumbo_add_error(parser); | |
| 645 if (!error) { | |
| 646 return NULL; | |
| 647 } | |
| 648 error->type = GUMBO_ERR_PARSER; | |
| 649 error->position = token->position; | |
| 650 error->original_text = token->original_text.data; | |
| 651 GumboParserError* extra_data = &error->v.parser; | |
| 652 extra_data->input_type = token->type; | |
| 653 extra_data->input_tag = GUMBO_TAG_UNKNOWN; | |
| 654 if (token->type == GUMBO_TOKEN_START_TAG) { | |
| 655 extra_data->input_tag = token->v.start_tag.tag; | |
| 656 } else if (token->type == GUMBO_TOKEN_END_TAG) { | |
| 657 extra_data->input_tag = token->v.end_tag; | |
| 658 } | |
| 659 GumboParserState* state = parser->_parser_state; | |
| 660 extra_data->parser_state = state->_insertion_mode; | |
| 661 gumbo_vector_init( | |
| 662 parser, state->_open_elements.length, &extra_data->tag_stack); | |
| 663 for (unsigned int i = 0; i < state->_open_elements.length; ++i) { | |
| 664 const GumboNode* node = state->_open_elements.data[i]; | |
| 665 assert( | |
| 666 node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); | |
| 667 gumbo_vector_add( | |
| 668 parser, (void*) node->v.element.tag, &extra_data->tag_stack); | |
| 669 } | |
| 670 return error; | |
| 671 } | |
| 672 | |
| 673 // Returns true if the specified token is either a start or end tag (specified | |
| 674 // by is_start) with one of the tag types in the varargs list. Terminate the | |
| 675 // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of | |
| 676 // the spec references tags that are not in the spec. | |
| 677 static bool tag_in( | |
| 678 const GumboToken* token, bool is_start, const gumbo_tagset tags) { | |
| 679 GumboTag token_tag; | |
| 680 if (is_start && token->type == GUMBO_TOKEN_START_TAG) { | |
| 681 token_tag = token->v.start_tag.tag; | |
| 682 } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { | |
| 683 token_tag = token->v.end_tag; | |
| 684 } else { | |
| 685 return false; | |
| 686 } | |
| 687 return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0); | |
| 688 } | |
| 689 | |
| 690 // Like tag_in, but for the single-tag case. | |
| 691 static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { | |
| 692 if (is_start && token->type == GUMBO_TOKEN_START_TAG) { | |
| 693 return token->v.start_tag.tag == tag; | |
| 694 } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { | |
| 695 return token->v.end_tag == tag; | |
| 696 } else { | |
| 697 return false; | |
| 698 } | |
| 699 } | |
| 700 | |
| 701 // Like tag_in, but checks for the tag of a node, rather than a token. | |
| 702 static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) { | |
| 703 assert(node != NULL); | |
| 704 if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { | |
| 705 return false; | |
| 706 } | |
| 707 return TAGSET_INCLUDES( | |
| 708 tags, node->v.element.tag_namespace, node->v.element.tag); | |
| 709 } | |
| 710 | |
| 711 // Like node_tag_in, but for the single-tag case. | |
| 712 static bool node_qualified_tag_is( | |
| 713 const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) { | |
| 714 assert(node); | |
| 715 return (node->type == GUMBO_NODE_ELEMENT || | |
| 716 node->type == GUMBO_NODE_TEMPLATE) && | |
| 717 node->v.element.tag == tag && node->v.element.tag_namespace == ns; | |
| 718 } | |
| 719 | |
| 720 // Like node_tag_in, but for the single-tag case in the HTML namespace | |
| 721 static bool node_html_tag_is(const GumboNode* node, GumboTag tag) { | |
| 722 return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); | |
| 723 } | |
| 724 | |
| 725 static void push_template_insertion_mode( | |
| 726 GumboParser* parser, GumboInsertionMode mode) { | |
| 727 gumbo_vector_add( | |
| 728 parser, (void*) mode, &parser->_parser_state->_template_insertion_modes); | |
| 729 } | |
| 730 | |
| 731 static void pop_template_insertion_mode(GumboParser* parser) { | |
| 732 gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes); | |
| 733 } | |
| 734 | |
| 735 // Returns the current template insertion mode. If the stack of template | |
| 736 // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. | |
| 737 static GumboInsertionMode get_current_template_insertion_mode( | |
| 738 const GumboParser* parser) { | |
| 739 GumboVector* template_insertion_modes = | |
| 740 &parser->_parser_state->_template_insertion_modes; | |
| 741 if (template_insertion_modes->length == 0) { | |
| 742 return GUMBO_INSERTION_MODE_INITIAL; | |
| 743 } | |
| 744 return (GumboInsertionMode) | |
| 745 (intptr_t) template_insertion_modes->data[(template_insertion_modes->length - 1)]; | |
| 746 } | |
| 747 | |
| 748 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point | |
| 749 static bool is_mathml_integration_point(const GumboNode* node) { | |
| 750 return node_tag_in_set( | |
| 751 node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), | |
| 752 TAG_MATHML(MS), TAG_MATHML(MTEXT)}); | |
| 753 } | |
| 754 | |
| 755 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point | |
| 756 static bool is_html_integration_point(const GumboNode* node) { | |
| 757 return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT), | |
| 758 TAG_SVG(DESC), TAG_SVG(TITLE)}) || | |
| 759 (node_qualified_tag_is( | |
| 760 node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && | |
| 761 (attribute_matches( | |
| 762 &node->v.element.attributes, "encoding", "text/html") || | |
| 763 attribute_matches(&node->v.element.attributes, "encoding", | |
| 764 "application/xhtml+xml"))); | |
| 765 } | |
| 766 | |
// This represents a place to insert a node, consisting of a target parent and a
// child index within that parent. If the node should be inserted at the end of
// the parent's children, index will be -1.
typedef struct {
  // Parent node that receives the new child.
  GumboNode* target;
  // Position among target's children, or -1 to append.
  int index;
} InsertionLocation;
| 774 | |
| 775 InsertionLocation get_appropriate_insertion_location( | |
| 776 GumboParser* parser, GumboNode* override_target) { | |
| 777 InsertionLocation retval = {override_target, -1}; | |
| 778 if (retval.target == NULL) { | |
| 779 // No override target; default to the current node, but special-case the | |
| 780 // root node since get_current_node() assumes the stack of open elements is | |
| 781 // non-empty. | |
| 782 retval.target = parser->_output->root != NULL ? get_current_node(parser) | |
| 783 : get_document_node(parser); | |
| 784 } | |
| 785 if (!parser->_parser_state->_foster_parent_insertions || | |
| 786 !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY), | |
| 787 TAG(TFOOT), TAG(THEAD), TAG(TR)})) { | |
| 788 return retval; | |
| 789 } | |
| 790 | |
| 791 // Foster-parenting case. | |
| 792 int last_template_index = -1; | |
| 793 int last_table_index = -1; | |
| 794 GumboVector* open_elements = &parser->_parser_state->_open_elements; | |
| 795 for (unsigned int i = 0; i < open_elements->length; ++i) { | |
| 796 if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) { | |
| 797 last_template_index = i; | |
| 798 } | |
| 799 if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) { | |
| 800 last_table_index = i; | |
| 801 } | |
| 802 } | |
| 803 if (last_template_index != -1 && | |
| 804 (last_table_index == -1 || last_template_index > last_table_index)) { | |
| 805 retval.target = open_elements->data[last_template_index]; | |
| 806 return retval; | |
| 807 } | |
| 808 if (last_table_index == -1) { | |
| 809 retval.target = open_elements->data[0]; | |
| 810 return retval; | |
| 811 } | |
| 812 GumboNode* last_table = open_elements->data[last_table_index]; | |
| 813 if (last_table->parent != NULL) { | |
| 814 retval.target = last_table->parent; | |
| 815 retval.index = last_table->index_within_parent; | |
| 816 return retval; | |
| 817 } | |
| 818 | |
| 819 retval.target = open_elements->data[last_table_index - 1]; | |
| 820 return retval; | |
| 821 } | |
| 822 | |
| 823 // Appends a node to the end of its parent, setting the "parent" and | |
| 824 // "index_within_parent" fields appropriately. | |
| 825 static void append_node( | |
| 826 GumboParser* parser, GumboNode* parent, GumboNode* node) { | |
| 827 assert(node->parent == NULL); | |
| 828 assert(node->index_within_parent == -1); | |
| 829 GumboVector* children; | |
| 830 if (parent->type == GUMBO_NODE_ELEMENT || | |
| 831 parent->type == GUMBO_NODE_TEMPLATE) { | |
| 832 children = &parent->v.element.children; | |
| 833 } else { | |
| 834 assert(parent->type == GUMBO_NODE_DOCUMENT); | |
| 835 children = &parent->v.document.children; | |
| 836 } | |
| 837 node->parent = parent; | |
| 838 node->index_within_parent = children->length; | |
| 839 gumbo_vector_add(parser, (void*) node, children); | |
| 840 assert(node->index_within_parent < children->length); | |
| 841 } | |
| 842 | |
| 843 // Inserts a node at the specified InsertionLocation, updating the | |
| 844 // "parent" and "index_within_parent" fields of it and all its siblings. | |
| 845 // If the index of the location is -1, this calls append_node. | |
| 846 static void insert_node( | |
| 847 GumboParser* parser, GumboNode* node, InsertionLocation location) { | |
| 848 assert(node->parent == NULL); | |
| 849 assert(node->index_within_parent == -1); | |
| 850 GumboNode* parent = location.target; | |
| 851 int index = location.index; | |
| 852 if (index != -1) { | |
| 853 GumboVector* children = NULL; | |
| 854 if (parent->type == GUMBO_NODE_ELEMENT || | |
| 855 parent->type == GUMBO_NODE_TEMPLATE) { | |
| 856 children = &parent->v.element.children; | |
| 857 } else if (parent->type == GUMBO_NODE_DOCUMENT) { | |
| 858 children = &parent->v.document.children; | |
| 859 assert(children->length == 0); | |
| 860 } else { | |
| 861 assert(0); | |
| 862 } | |
| 863 | |
| 864 assert(index >= 0); | |
| 865 assert((unsigned int) index < children->length); | |
| 866 node->parent = parent; | |
| 867 node->index_within_parent = index; | |
| 868 gumbo_vector_insert_at(parser, (void*) node, index, children); | |
| 869 assert(node->index_within_parent < children->length); | |
| 870 for (unsigned int i = index + 1; i < children->length; ++i) { | |
| 871 GumboNode* sibling = children->data[i]; | |
| 872 sibling->index_within_parent = i; | |
| 873 assert(sibling->index_within_parent < children->length); | |
| 874 } | |
| 875 } else { | |
| 876 append_node(parser, parent, node); | |
| 877 } | |
| 878 } | |
| 879 | |
| 880 static void maybe_flush_text_node_buffer(GumboParser* parser) { | |
| 881 GumboParserState* state = parser->_parser_state; | |
| 882 TextNodeBufferState* buffer_state = &state->_text_node; | |
| 883 if (buffer_state->_buffer.length == 0) { | |
| 884 return; | |
| 885 } | |
| 886 | |
| 887 assert(buffer_state->_type == GUMBO_NODE_WHITESPACE || | |
| 888 buffer_state->_type == GUMBO_NODE_TEXT || | |
| 889 buffer_state->_type == GUMBO_NODE_CDATA); | |
| 890 GumboNode* text_node = create_node(parser, buffer_state->_type); | |
| 891 GumboText* text_node_data = &text_node->v.text; | |
| 892 text_node_data->text = | |
| 893 gumbo_string_buffer_to_string(parser, &buffer_state->_buffer); | |
| 894 text_node_data->original_text.data = buffer_state->_start_original_text; | |
| 895 text_node_data->original_text.length = | |
| 896 state->_current_token->original_text.data - | |
| 897 buffer_state->_start_original_text; | |
| 898 text_node_data->start_pos = buffer_state->_start_position; | |
| 899 | |
| 900 gumbo_debug("Flushing text node buffer of %.*s.\n", | |
| 901 (int) buffer_state->_buffer.length, buffer_state->_buffer.data); | |
| 902 | |
| 903 InsertionLocation location = get_appropriate_insertion_location(parser, NULL); | |
| 904 if (location.target->type == GUMBO_NODE_DOCUMENT) { | |
| 905 // The DOM does not allow Document nodes to have Text children, so per the | |
| 906 // spec, they are dropped on the floor. | |
| 907 destroy_node(parser, text_node); | |
| 908 } else { | |
| 909 insert_node(parser, text_node, location); | |
| 910 } | |
| 911 | |
| 912 gumbo_string_buffer_clear(parser, &buffer_state->_buffer); | |
| 913 buffer_state->_type = GUMBO_NODE_WHITESPACE; | |
| 914 assert(buffer_state->_buffer.length == 0); | |
| 915 } | |
| 916 | |
| 917 static void record_end_of_element( | |
| 918 GumboToken* current_token, GumboElement* element) { | |
| 919 element->end_pos = current_token->position; | |
| 920 element->original_end_tag = current_token->type == GUMBO_TOKEN_END_TAG | |
| 921 ? current_token->original_text | |
| 922 : kGumboEmptyString; | |
| 923 } | |
| 924 | |
| 925 static GumboNode* pop_current_node(GumboParser* parser) { | |
| 926 GumboParserState* state = parser->_parser_state; | |
| 927 maybe_flush_text_node_buffer(parser); | |
| 928 if (state->_open_elements.length > 0) { | |
| 929 assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); | |
| 930 gumbo_debug("Popping %s node.\n", | |
| 931 gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)); | |
| 932 } | |
| 933 GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements); | |
| 934 if (!current_node) { | |
| 935 assert(state->_open_elements.length == 0); | |
| 936 return NULL; | |
| 937 } | |
| 938 assert(current_node->type == GUMBO_NODE_ELEMENT || | |
| 939 current_node->type == GUMBO_NODE_TEMPLATE); | |
| 940 bool is_closed_body_or_html_tag = | |
| 941 (node_html_tag_is(current_node, GUMBO_TAG_BODY) && | |
| 942 state->_closed_body_tag) || | |
| 943 (node_html_tag_is(current_node, GUMBO_TAG_HTML) && | |
| 944 state->_closed_html_tag); | |
| 945 if ((state->_current_token->type != GUMBO_TOKEN_END_TAG || | |
| 946 !node_html_tag_is(current_node, state->_current_token->v.end_tag)) && | |
| 947 !is_closed_body_or_html_tag) { | |
| 948 current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; | |
| 949 } | |
| 950 if (!is_closed_body_or_html_tag) { | |
| 951 record_end_of_element(state->_current_token, ¤t_node->v.element); | |
| 952 } | |
| 953 return current_node; | |
| 954 } | |
| 955 | |
| 956 static void append_comment_node( | |
| 957 GumboParser* parser, GumboNode* node, const GumboToken* token) { | |
| 958 maybe_flush_text_node_buffer(parser); | |
| 959 GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT); | |
| 960 comment->type = GUMBO_NODE_COMMENT; | |
| 961 comment->parse_flags = GUMBO_INSERTION_NORMAL; | |
| 962 comment->v.text.text = token->v.text; | |
| 963 comment->v.text.original_text = token->original_text; | |
| 964 comment->v.text.start_pos = token->position; | |
| 965 append_node(parser, node, comment); | |
| 966 } | |
| 967 | |
| 968 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context | |
| 969 static void clear_stack_to_table_row_context(GumboParser* parser) { | |
| 970 while (!node_tag_in_set(get_current_node(parser), | |
| 971 (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) { | |
| 972 pop_current_node(parser); | |
| 973 } | |
| 974 } | |
| 975 | |
| 976 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context | |
| 977 static void clear_stack_to_table_context(GumboParser* parser) { | |
| 978 while (!node_tag_in_set(get_current_node(parser), | |
| 979 (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) { | |
| 980 pop_current_node(parser); | |
| 981 } | |
| 982 } | |
| 983 | |
| 984 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context | |
| 985 void clear_stack_to_table_body_context(GumboParser* parser) { | |
| 986 while (!node_tag_in_set(get_current_node(parser), | |
| 987 (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), | |
| 988 TAG(TEMPLATE)})) { | |
| 989 pop_current_node(parser); | |
| 990 } | |
| 991 } | |
| 992 | |
| 993 // Creates a parser-inserted element in the HTML namespace and returns it. | |
| 994 static GumboNode* create_element(GumboParser* parser, GumboTag tag) { | |
| 995 GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT); | |
| 996 GumboElement* element = &node->v.element; | |
| 997 gumbo_vector_init(parser, 1, &element->children); | |
| 998 gumbo_vector_init(parser, 0, &element->attributes); | |
| 999 element->tag = tag; | |
| 1000 element->tag_namespace = GUMBO_NAMESPACE_HTML; | |
| 1001 element->original_tag = kGumboEmptyString; | |
| 1002 element->original_end_tag = kGumboEmptyString; | |
| 1003 element->start_pos = (parser->_parser_state->_current_token) | |
| 1004 ? parser->_parser_state->_current_token->position | |
| 1005 : kGumboEmptySourcePosition; | |
| 1006 element->end_pos = kGumboEmptySourcePosition; | |
| 1007 return node; | |
| 1008 } | |
| 1009 | |
| 1010 // Constructs an element from the given start tag token. | |
| 1011 static GumboNode* create_element_from_token( | |
| 1012 GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { | |
| 1013 assert(token->type == GUMBO_TOKEN_START_TAG); | |
| 1014 GumboTokenStartTag* start_tag = &token->v.start_tag; | |
| 1015 | |
| 1016 GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML && | |
| 1017 start_tag->tag == GUMBO_TAG_TEMPLATE) | |
| 1018 ? GUMBO_NODE_TEMPLATE | |
| 1019 : GUMBO_NODE_ELEMENT; | |
| 1020 | |
| 1021 GumboNode* node = create_node(parser, type); | |
| 1022 GumboElement* element = &node->v.element; | |
| 1023 gumbo_vector_init(parser, 1, &element->children); | |
| 1024 element->attributes = start_tag->attributes; | |
| 1025 element->tag = start_tag->tag; | |
| 1026 element->tag_namespace = tag_namespace; | |
| 1027 | |
| 1028 assert(token->original_text.length >= 2); | |
| 1029 assert(token->original_text.data[0] == '<'); | |
| 1030 assert(token->original_text.data[token->original_text.length - 1] == '>'); | |
| 1031 element->original_tag = token->original_text; | |
| 1032 element->start_pos = token->position; | |
| 1033 element->original_end_tag = kGumboEmptyString; | |
| 1034 element->end_pos = kGumboEmptySourcePosition; | |
| 1035 | |
| 1036 // The element takes ownership of the attributes from the token, so any | |
| 1037 // allocated-memory fields should be nulled out. | |
| 1038 start_tag->attributes = kGumboEmptyVector; | |
| 1039 return node; | |
| 1040 } | |
| 1041 | |
| 1042 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element | |
| 1043 static void insert_element(GumboParser* parser, GumboNode* node, | |
| 1044 bool is_reconstructing_formatting_elements) { | |
| 1045 GumboParserState* state = parser->_parser_state; | |
| 1046 // NOTE(jdtang): The text node buffer must always be flushed before inserting | |
| 1047 // a node, otherwise we're handling nodes in a different order than the spec | |
| 1048 // mandated. However, one clause of the spec (character tokens in the body) | |
| 1049 // requires that we reconstruct the active formatting elements *before* adding | |
| 1050 // the character, and reconstructing the active formatting elements may itself | |
| 1051 // result in the insertion of new elements (which should be pushed onto the | |
| 1052 // stack of open elements before the buffer is flushed). We solve this (for | |
| 1053 // the time being, the spec has been rewritten for <template> and the new | |
| 1054 // version may be simpler here) with a boolean flag to this method. | |
| 1055 if (!is_reconstructing_formatting_elements) { | |
| 1056 maybe_flush_text_node_buffer(parser); | |
| 1057 } | |
| 1058 InsertionLocation location = get_appropriate_insertion_location(parser, NULL); | |
| 1059 insert_node(parser, node, location); | |
| 1060 gumbo_vector_add(parser, (void*) node, &state->_open_elements); | |
| 1061 } | |
| 1062 | |
| 1063 // Convenience method that combines create_element_from_token and | |
| 1064 // insert_element, inserting the generated element directly into the current | |
| 1065 // node. Returns the node inserted. | |
| 1066 static GumboNode* insert_element_from_token( | |
| 1067 GumboParser* parser, GumboToken* token) { | |
| 1068 GumboNode* element = | |
| 1069 create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML); | |
| 1070 insert_element(parser, element, false); | |
| 1071 gumbo_debug("Inserting <%s> element (@%x) from token.\n", | |
| 1072 gumbo_normalized_tagname(element->v.element.tag), element); | |
| 1073 return element; | |
| 1074 } | |
| 1075 | |
| 1076 // Convenience method that combines create_element and insert_element, inserting | |
| 1077 // a parser-generated element of a specific tag type. Returns the node | |
| 1078 // inserted. | |
| 1079 static GumboNode* insert_element_of_tag_type( | |
| 1080 GumboParser* parser, GumboTag tag, GumboParseFlags reason) { | |
| 1081 GumboNode* element = create_element(parser, tag); | |
| 1082 element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason; | |
| 1083 insert_element(parser, element, false); | |
| 1084 gumbo_debug("Inserting %s element (@%x) from tag type.\n", | |
| 1085 gumbo_normalized_tagname(tag), element); | |
| 1086 return element; | |
| 1087 } | |
| 1088 | |
| 1089 // Convenience method for creating foreign namespaced element. Returns the node | |
| 1090 // inserted. | |
| 1091 static GumboNode* insert_foreign_element( | |
| 1092 GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { | |
| 1093 assert(token->type == GUMBO_TOKEN_START_TAG); | |
| 1094 GumboNode* element = create_element_from_token(parser, token, tag_namespace); | |
| 1095 insert_element(parser, element, false); | |
| 1096 if (token_has_attribute(token, "xmlns") && | |
| 1097 !attribute_matches_case_sensitive(&token->v.start_tag.attributes, "xmlns", | |
| 1098 kLegalXmlns[tag_namespace])) { | |
| 1099 // TODO(jdtang): Since there're multiple possible error codes here, we | |
| 1100 // eventually need reason codes to differentiate them. | |
| 1101 parser_add_parse_error(parser, token); | |
| 1102 } | |
| 1103 if (token_has_attribute(token, "xmlns:xlink") && | |
| 1104 !attribute_matches_case_sensitive(&token->v.start_tag.attributes, | |
| 1105 "xmlns:xlink", "http://www.w3.org/1999/xlink")) { | |
| 1106 parser_add_parse_error(parser, token); | |
| 1107 } | |
| 1108 return element; | |
| 1109 } | |
| 1110 | |
| 1111 static void insert_text_token(GumboParser* parser, GumboToken* token) { | |
| 1112 assert(token->type == GUMBO_TOKEN_WHITESPACE || | |
| 1113 token->type == GUMBO_TOKEN_CHARACTER || | |
| 1114 token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA); | |
| 1115 TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; | |
| 1116 if (buffer_state->_buffer.length == 0) { | |
| 1117 // Initialize position fields. | |
| 1118 buffer_state->_start_original_text = token->original_text.data; | |
| 1119 buffer_state->_start_position = token->position; | |
| 1120 } | |
| 1121 gumbo_string_buffer_append_codepoint( | |
| 1122 parser, token->v.character, &buffer_state->_buffer); | |
| 1123 if (token->type == GUMBO_TOKEN_CHARACTER) { | |
| 1124 buffer_state->_type = GUMBO_NODE_TEXT; | |
| 1125 } else if (token->type == GUMBO_TOKEN_CDATA) { | |
| 1126 buffer_state->_type = GUMBO_NODE_CDATA; | |
| 1127 } | |
| 1128 gumbo_debug("Inserting text token '%c'.\n", token->v.character); | |
| 1129 } | |
| 1130 | |
| 1131 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm | |
| 1132 static void run_generic_parsing_algorithm( | |
| 1133 GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) { | |
| 1134 insert_element_from_token(parser, token); | |
| 1135 gumbo_tokenizer_set_state(parser, lexer_state); | |
| 1136 parser->_parser_state->_original_insertion_mode = | |
| 1137 parser->_parser_state->_insertion_mode; | |
| 1138 parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT; | |
| 1139 } | |
| 1140 | |
| 1141 static void acknowledge_self_closing_tag(GumboParser* parser) { | |
| 1142 parser->_parser_state->_self_closing_flag_acknowledged = true; | |
| 1143 } | |
| 1144 | |
| 1145 // Returns true if there's an anchor tag in the list of active formatting | |
| 1146 // elements, and fills in its index if so. | |
| 1147 static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) { | |
| 1148 GumboVector* elements = &parser->_parser_state->_active_formatting_elements; | |
| 1149 for (int i = elements->length; --i >= 0;) { | |
| 1150 GumboNode* node = elements->data[i]; | |
| 1151 if (node == &kActiveFormattingScopeMarker) { | |
| 1152 return false; | |
| 1153 } | |
| 1154 if (node_html_tag_is(node, GUMBO_TAG_A)) { | |
| 1155 *anchor_index = i; | |
| 1156 return true; | |
| 1157 } | |
| 1158 } | |
| 1159 return false; | |
| 1160 } | |
| 1161 | |
| 1162 // Counts the number of open formatting elements in the list of active | |
| 1163 // formatting elements (after the last active scope marker) that have a specific | |
| 1164 // tag. If this is > 0, then earliest_matching_index will be filled in with the | |
| 1165 // index of the first such element. | |
| 1166 static int count_formatting_elements_of_tag(GumboParser* parser, | |
| 1167 const GumboNode* desired_node, int* earliest_matching_index) { | |
| 1168 const GumboElement* desired_element = &desired_node->v.element; | |
| 1169 GumboVector* elements = &parser->_parser_state->_active_formatting_elements; | |
| 1170 int num_identical_elements = 0; | |
| 1171 for (int i = elements->length; --i >= 0;) { | |
| 1172 GumboNode* node = elements->data[i]; | |
| 1173 if (node == &kActiveFormattingScopeMarker) { | |
| 1174 break; | |
| 1175 } | |
| 1176 assert(node->type == GUMBO_NODE_ELEMENT); | |
| 1177 if (node_qualified_tag_is( | |
| 1178 node, desired_element->tag_namespace, desired_element->tag) && | |
| 1179 all_attributes_match( | |
| 1180 &node->v.element.attributes, &desired_element->attributes)) { | |
| 1181 num_identical_elements++; | |
| 1182 *earliest_matching_index = i; | |
| 1183 } | |
| 1184 } | |
| 1185 return num_identical_elements; | |
| 1186 } | |
| 1187 | |
| 1188 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements | |
| 1189 static void add_formatting_element(GumboParser* parser, const GumboNode* node) { | |
| 1190 assert(node == &kActiveFormattingScopeMarker || | |
| 1191 node->type == GUMBO_NODE_ELEMENT); | |
| 1192 GumboVector* elements = &parser->_parser_state->_active_formatting_elements; | |
| 1193 if (node == &kActiveFormattingScopeMarker) { | |
| 1194 gumbo_debug("Adding a scope marker.\n"); | |
| 1195 } else { | |
| 1196 gumbo_debug("Adding a formatting element.\n"); | |
| 1197 } | |
| 1198 | |
| 1199 // Hunt for identical elements. | |
| 1200 int earliest_identical_element = elements->length; | |
| 1201 int num_identical_elements = count_formatting_elements_of_tag( | |
| 1202 parser, node, &earliest_identical_element); | |
| 1203 | |
| 1204 // Noah's Ark clause: if there're at least 3, remove the earliest. | |
| 1205 if (num_identical_elements >= 3) { | |
| 1206 gumbo_debug("Noah's ark clause: removing element at %d.\n", | |
| 1207 earliest_identical_element); | |
| 1208 gumbo_vector_remove_at(parser, earliest_identical_element, elements); | |
| 1209 } | |
| 1210 | |
| 1211 gumbo_vector_add(parser, (void*) node, elements); | |
| 1212 } | |
| 1213 | |
| 1214 static bool is_open_element(GumboParser* parser, const GumboNode* node) { | |
| 1215 GumboVector* open_elements = &parser->_parser_state->_open_elements; | |
| 1216 for (unsigned int i = 0; i < open_elements->length; ++i) { | |
| 1217 if (open_elements->data[i] == node) { | |
| 1218 return true; | |
| 1219 } | |
| 1220 } | |
| 1221 return false; | |
| 1222 } | |
| 1223 | |
| 1224 // Clones attributes, tags, etc. of a node, but does not copy the content. The | |
| 1225 // clone shares no structure with the original node: all owned strings and | |
| 1226 // values are fresh copies. | |
| 1227 GumboNode* clone_node( | |
| 1228 GumboParser* parser, GumboNode* node, GumboParseFlags reason) { | |
| 1229 assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); | |
| 1230 GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode)); | |
| 1231 *new_node = *node; | |
| 1232 new_node->parent = NULL; | |
| 1233 new_node->index_within_parent = -1; | |
| 1234 // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may | |
| 1235 // have a separate end tag. | |
| 1236 new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG; | |
| 1237 new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER; | |
| 1238 GumboElement* element = &new_node->v.element; | |
| 1239 gumbo_vector_init(parser, 1, &element->children); | |
| 1240 | |
| 1241 const GumboVector* old_attributes = &node->v.element.attributes; | |
| 1242 gumbo_vector_init(parser, old_attributes->length, &element->attributes); | |
| 1243 for (unsigned int i = 0; i < old_attributes->length; ++i) { | |
| 1244 const GumboAttribute* old_attr = old_attributes->data[i]; | |
| 1245 GumboAttribute* attr = | |
| 1246 gumbo_parser_allocate(parser, sizeof(GumboAttribute)); | |
| 1247 *attr = *old_attr; | |
| 1248 attr->name = gumbo_copy_stringz(parser, old_attr->name); | |
| 1249 attr->value = gumbo_copy_stringz(parser, old_attr->value); | |
| 1250 gumbo_vector_add(parser, attr, &element->attributes); | |
| 1251 } | |
| 1252 return new_node; | |
| 1253 } | |
| 1254 | |
// "Reconstruct active formatting elements" part of the spec.
// This implementation is based on the html5lib translation from the mess of
// GOTOs in the spec to reasonably structured programming.
// http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
//
// Starting from the end of the list of active formatting elements, this
// rewinds to just after the nearest entry that is a scope marker or is still
// on the stack of open elements (or to the start of the list), then walks
// forward: each entry is cloned, the clone is inserted and pushed onto the
// stack of open elements, and the list entry is replaced by the clone.
static void reconstruct_active_formatting_elements(GumboParser* parser) {
  GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
  // Step 1
  if (elements->length == 0) {
    return;
  }

  // Step 2 & 3
  // Nothing to do if the last entry is a marker or is already open.
  unsigned int i = elements->length - 1;
  GumboNode* element = elements->data[i];
  if (element == &kActiveFormattingScopeMarker ||
      is_open_element(parser, element)) {
    return;
  }

  // Step 6
  // Rewind until a marker or an open element is found, or the list start is
  // reached.
  do {
    if (i == 0) {
      // Step 4
      // i is unsigned, so this wraps to the maximum value; the ++i after the
      // loop wraps it back to 0.
      i = -1;  // Incremented to 0 below.
      break;
    }
    // Step 5
    element = elements->data[--i];
  } while (element != &kActiveFormattingScopeMarker &&
           !is_open_element(parser, element));

  // Move to the first entry that needs to be reconstructed.
  ++i;
  gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
      gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
  for (; i < elements->length; ++i) {
    // Step 7 & 8.
    assert(elements->length > 0);
    assert(i < elements->length);
    element = elements->data[i];
    assert(element != &kActiveFormattingScopeMarker);
    GumboNode* clone = clone_node(
        parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
    // Step 9.
    // Inserted and pushed directly rather than through insert_element().
    InsertionLocation location =
        get_appropriate_insertion_location(parser, NULL);
    insert_node(parser, clone, location);
    gumbo_vector_add(
        parser, (void*) clone, &parser->_parser_state->_open_elements);

    // Step 10.
    // The list entry now refers to the clone instead of the original element.
    elements->data[i] = clone;
    gumbo_debug("Reconstructed %s element at %d.\n",
        gumbo_normalized_tagname(clone->v.element.tag), i);
  }
}
| 1310 | |
| 1311 static void clear_active_formatting_elements(GumboParser* parser) { | |
| 1312 GumboVector* elements = &parser->_parser_state->_active_formatting_elements; | |
| 1313 int num_elements_cleared = 0; | |
| 1314 const GumboNode* node; | |
| 1315 do { | |
| 1316 node = gumbo_vector_pop(parser, elements); | |
| 1317 ++num_elements_cleared; | |
| 1318 } while (node && node != &kActiveFormattingScopeMarker); | |
| 1319 gumbo_debug("Cleared %d elements from active formatting list.\n", | |
| 1320 num_elements_cleared); | |
| 1321 } | |
| 1322 | |
| 1323 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode | |
| 1324 static GumboQuirksModeEnum compute_quirks_mode( | |
| 1325 const GumboTokenDocType* doctype) { | |
| 1326 if (doctype->force_quirks || strcmp(doctype->name, kDoctypeHtml.data) || | |
| 1327 is_in_static_list( | |
| 1328 doctype->public_identifier, kQuirksModePublicIdPrefixes, false) || | |
| 1329 is_in_static_list( | |
| 1330 doctype->public_identifier, kQuirksModePublicIdExactMatches, true) || | |
| 1331 is_in_static_list( | |
| 1332 doctype->system_identifier, kQuirksModeSystemIdExactMatches, true) || | |
| 1333 (is_in_static_list(doctype->public_identifier, | |
| 1334 kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) && | |
| 1335 !doctype->has_system_identifier)) { | |
| 1336 return GUMBO_DOCTYPE_QUIRKS; | |
| 1337 } else if (is_in_static_list(doctype->public_identifier, | |
| 1338 kLimitedQuirksPublicIdPrefixes, false) || | |
| 1339 (is_in_static_list(doctype->public_identifier, | |
| 1340 kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) && | |
| 1341 doctype->has_system_identifier)) { | |
| 1342 return GUMBO_DOCTYPE_LIMITED_QUIRKS; | |
| 1343 } | |
| 1344 return GUMBO_DOCTYPE_NO_QUIRKS; | |
| 1345 } | |
| 1346 | |
| 1347 // The following functions are all defined by the "has an element in __ scope" | |
| 1348 // sections of the HTML5 spec: | |
| 1349 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope | |
| 1350 // The basic idea behind them is that they check for an element of the given | |
| 1351 // qualified name, contained within a scope formed by a set of other qualified | |
| 1352 // names. For example, "has an element in list scope" looks for an element of | |
| 1353 // the given qualified name within the nearest enclosing <ol> or <ul>, along | |
| 1354 // with a bunch of generic element types that serve to "firewall" their content | |
| 1355 // from the rest of the document. Note that because of the way the spec is | |
| 1356 // written, | |
| 1357 // all elements are expected to be in the HTML namespace | |
| 1358 static bool has_an_element_in_specific_scope(GumboParser* parser, | |
| 1359 int expected_size, const GumboTag* expected, bool negate, | |
| 1360 const gumbo_tagset tags) { | |
| 1361 GumboVector* open_elements = &parser->_parser_state->_open_elements; | |
| 1362 for (int i = open_elements->length; --i >= 0;) { | |
| 1363 const GumboNode* node = open_elements->data[i]; | |
| 1364 if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) | |
| 1365 continue; | |
| 1366 | |
| 1367 GumboTag node_tag = node->v.element.tag; | |
| 1368 GumboNamespaceEnum node_ns = node->v.element.tag_namespace; | |
| 1369 for (int j = 0; j < expected_size; ++j) { | |
| 1370 if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML) | |
| 1371 return true; | |
| 1372 } | |
| 1373 | |
| 1374 bool found = TAGSET_INCLUDES(tags, node_ns, node_tag); | |
| 1375 if (negate != found) return false; | |
| 1376 } | |
| 1377 return false; | |
| 1378 } | |
| 1379 | |
| 1380 // Checks for the presence of an open element of the specified tag type. | |
| 1381 static bool has_open_element(GumboParser* parser, GumboTag tag) { | |
| 1382 return has_an_element_in_specific_scope( | |
| 1383 parser, 1, &tag, false, (gumbo_tagset){TAG(HTML)}); | |
| 1384 } | |
| 1385 | |
| 1386 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope | |
| 1387 static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) { | |
| 1388 return has_an_element_in_specific_scope(parser, 1, &tag, false, | |
| 1389 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), | |
| 1390 TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), | |
| 1391 TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), | |
| 1392 TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), | |
| 1393 TAG_SVG(TITLE)}); | |
| 1394 } | |
| 1395 | |
| 1396 // Like "has an element in scope", but for the specific case of looking for a | |
| 1397 // unique target node, not for any node with a given tag name. This duplicates | |
| 1398 // much of the algorithm from has_an_element_in_specific_scope because the | |
| 1399 // predicate is different when checking for an exact node, and it's easier & | |
| 1400 // faster just to duplicate the code for this one case than to try and | |
| 1401 // parameterize it. | |
| 1402 static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) { | |
| 1403 GumboVector* open_elements = &parser->_parser_state->_open_elements; | |
| 1404 for (int i = open_elements->length; --i >= 0;) { | |
| 1405 const GumboNode* current = open_elements->data[i]; | |
| 1406 if (current == node) { | |
| 1407 return true; | |
| 1408 } | |
| 1409 if (current->type != GUMBO_NODE_ELEMENT && | |
| 1410 current->type != GUMBO_NODE_TEMPLATE) { | |
| 1411 continue; | |
| 1412 } | |
| 1413 if (node_tag_in_set(current, | |
| 1414 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), | |
| 1415 TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), | |
| 1416 TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), | |
| 1417 TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), | |
| 1418 TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)})) { | |
| 1419 return false; | |
| 1420 } | |
| 1421 } | |
| 1422 assert(false); | |
| 1423 return false; | |
| 1424 } | |
| 1425 | |
| 1426 // Like has_an_element_in_scope, but restricts the expected qualified name to a | |
| 1427 // range of possible qualified names instead of just a single one. | |
| 1428 static bool has_an_element_in_scope_with_tagname( | |
| 1429 GumboParser* parser, int expected_len, const GumboTag expected[]) { | |
| 1430 return has_an_element_in_specific_scope(parser, expected_len, expected, false, | |
| 1431 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), | |
| 1432 TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), | |
| 1433 TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), | |
| 1434 TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), | |
| 1435 TAG_SVG(TITLE)}); | |
| 1436 } | |
| 1437 | |
| 1438 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope | |
| 1439 static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) { | |
| 1440 return has_an_element_in_specific_scope(parser, 1, &tag, false, | |
| 1441 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), | |
| 1442 TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), | |
| 1443 TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), | |
| 1444 TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), | |
| 1445 TAG_SVG(TITLE), TAG(OL), TAG(UL)}); | |
| 1446 } | |
| 1447 | |
| 1448 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope | |
| 1449 static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) { | |
| 1450 return has_an_element_in_specific_scope(parser, 1, &tag, false, | |
| 1451 (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), | |
| 1452 TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), | |
| 1453 TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), | |
| 1454 TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), | |
| 1455 TAG_SVG(TITLE), TAG(BUTTON)}); | |
| 1456 } | |
| 1457 | |
| 1458 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope | |
| 1459 static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) { | |
| 1460 return has_an_element_in_specific_scope(parser, 1, &tag, false, | |
| 1461 (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)}); | |
| 1462 } | |
| 1463 | |
| 1464 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope | |
| 1465 static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) { | |
| 1466 return has_an_element_in_specific_scope( | |
| 1467 parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)}); | |
| 1468 } | |
| 1469 | |
| 1470 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags | |
| 1471 // "exception" is the "element to exclude from the process" listed in the spec. | |
| 1472 // Pass GUMBO_TAG_LAST to not exclude any of them. | |
| 1473 static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) { | |
| 1474 for (; node_tag_in_set(get_current_node(parser), | |
| 1475 (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), | |
| 1476 TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)}) && | |
| 1477 !node_html_tag_is(get_current_node(parser), exception); | |
| 1478 pop_current_node(parser)) | |
| 1479 ; | |
| 1480 } | |
| 1481 | |
| 1482 // This is the "generate all implied end tags thoroughly" clause of the spec. | |
| 1483 // https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags | |
| 1484 static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) { | |
| 1485 for ( | |
| 1486 ; node_tag_in_set(get_current_node(parser), | |
| 1487 (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), | |
| 1488 TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC), | |
| 1489 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)}); | |
| 1490 pop_current_node(parser)) | |
| 1491 ; | |
| 1492 } | |
| 1493 | |
| 1494 // This factors out the clauses relating to "act as if an end tag token with tag | |
| 1495 // name "table" had been seen. Returns true if there's a table element in table | |
| 1496 // scope which was successfully closed, false if not and the token should be | |
| 1497 // ignored. Does not add parse errors; callers should handle that. | |
| 1498 static bool close_table(GumboParser* parser) { | |
| 1499 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) { | |
| 1500 return false; | |
| 1501 } | |
| 1502 | |
| 1503 GumboNode* node = pop_current_node(parser); | |
| 1504 while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) { | |
| 1505 node = pop_current_node(parser); | |
| 1506 } | |
| 1507 reset_insertion_mode_appropriately(parser); | |
| 1508 return true; | |
| 1509 } | |
| 1510 | |
| 1511 // This factors out the clauses relating to "act as if an end tag token with tag | |
| 1512 // name `cell_tag` had been seen". | |
| 1513 static bool close_table_cell( | |
| 1514 GumboParser* parser, const GumboToken* token, GumboTag cell_tag) { | |
| 1515 bool result = true; | |
| 1516 generate_implied_end_tags(parser, GUMBO_TAG_LAST); | |
| 1517 const GumboNode* node = get_current_node(parser); | |
| 1518 if (!node_html_tag_is(node, cell_tag)) { | |
| 1519 parser_add_parse_error(parser, token); | |
| 1520 result = false; | |
| 1521 } | |
| 1522 do { | |
| 1523 node = pop_current_node(parser); | |
| 1524 } while (!node_html_tag_is(node, cell_tag)); | |
| 1525 | |
| 1526 clear_active_formatting_elements(parser); | |
| 1527 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); | |
| 1528 return result; | |
| 1529 } | |
| 1530 | |
| 1531 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell | |
| 1532 // This holds the logic to determine whether we should close a <td> or a <th>. | |
| 1533 static bool close_current_cell(GumboParser* parser, const GumboToken* token) { | |
| 1534 if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) { | |
| 1535 assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH)); | |
| 1536 return close_table_cell(parser, token, GUMBO_TAG_TD); | |
| 1537 } else { | |
| 1538 assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH)); | |
| 1539 return close_table_cell(parser, token, GUMBO_TAG_TH); | |
| 1540 } | |
| 1541 } | |
| 1542 | |
| 1543 // This factors out the "act as if an end tag of tag name 'select' had been | |
| 1544 // seen" clause of the spec, since it's referenced in several places. It pops | |
| 1545 // all nodes from the stack until the current <select> has been closed, then | |
| 1546 // resets the insertion mode appropriately. | |
| 1547 static void close_current_select(GumboParser* parser) { | |
| 1548 GumboNode* node = pop_current_node(parser); | |
| 1549 while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) { | |
| 1550 node = pop_current_node(parser); | |
| 1551 } | |
| 1552 reset_insertion_mode_appropriately(parser); | |
| 1553 } | |
| 1554 | |
| 1555 // The list of nodes in the "special" category: | |
| 1556 // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special | |
| 1557 static bool is_special_node(const GumboNode* node) { | |
| 1558 assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); | |
| 1559 return node_tag_in_set(node, | |
| 1560 (gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE), | |
| 1561 TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE), | |
| 1562 TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL), | |
| 1563 TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR), | |
| 1564 TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET), | |
| 1565 TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME), | |
| 1566 TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), | |
| 1567 TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME), | |
| 1568 TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING), | |
| 1569 TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED), | |
| 1570 TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), | |
| 1571 TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), | |
| 1572 TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY), | |
| 1573 TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH), | |
| 1574 TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP), | |
| 1575 | |
| 1576 TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), | |
| 1577 TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), | |
| 1578 | |
| 1579 TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC)}); | |
| 1580 } | |
| 1581 | |
| 1582 // Implicitly closes currently open elements until it reaches an element with | |
| 1583 // the | |
| 1584 // specified qualified name. If the elements closed are in the set handled by | |
| 1585 // generate_implied_end_tags, this is normal operation and this function returns | |
| 1586 // true. Otherwise, a parse error is recorded and this function returns false. | |
| 1587 static bool implicitly_close_tags(GumboParser* parser, GumboToken* token, | |
| 1588 GumboNamespaceEnum target_ns, GumboTag target) { | |
| 1589 bool result = true; | |
| 1590 generate_implied_end_tags(parser, target); | |
| 1591 if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) { | |
| 1592 parser_add_parse_error(parser, token); | |
| 1593 while ( | |
| 1594 !node_qualified_tag_is(get_current_node(parser), target_ns, target)) { | |
| 1595 pop_current_node(parser); | |
| 1596 } | |
| 1597 result = false; | |
| 1598 } | |
| 1599 assert(node_qualified_tag_is(get_current_node(parser), target_ns, target)); | |
| 1600 pop_current_node(parser); | |
| 1601 return result; | |
| 1602 } | |
| 1603 | |
| 1604 // If the stack of open elements has a <p> tag in button scope, this acts as if | |
| 1605 // a </p> tag was encountered, implicitly closing tags. Returns false if a | |
| 1606 // parse error occurs. This is a convenience function because this particular | |
| 1607 // clause appears several times in the spec. | |
| 1608 static bool maybe_implicitly_close_p_tag( | |
| 1609 GumboParser* parser, GumboToken* token) { | |
| 1610 if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { | |
| 1611 return implicitly_close_tags( | |
| 1612 parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P); | |
| 1613 } | |
| 1614 return true; | |
| 1615 } | |
| 1616 | |
| 1617 // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt> | |
| 1618 // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>. | |
| 1619 static void maybe_implicitly_close_list_tag( | |
| 1620 GumboParser* parser, GumboToken* token, bool is_li) { | |
| 1621 GumboParserState* state = parser->_parser_state; | |
| 1622 state->_frameset_ok = false; | |
| 1623 for (int i = state->_open_elements.length; --i >= 0;) { | |
| 1624 const GumboNode* node = state->_open_elements.data[i]; | |
| 1625 bool is_list_tag = | |
| 1626 is_li ? node_html_tag_is(node, GUMBO_TAG_LI) | |
| 1627 : node_tag_in_set(node, (gumbo_tagset){TAG(DD), TAG(DT)}); | |
| 1628 if (is_list_tag) { | |
| 1629 implicitly_close_tags( | |
| 1630 parser, token, node->v.element.tag_namespace, node->v.element.tag); | |
| 1631 return; | |
| 1632 } | |
| 1633 if (is_special_node(node) && | |
| 1634 !node_tag_in_set( | |
| 1635 node, (gumbo_tagset){TAG(ADDRESS), TAG(DIV), TAG(P)})) { | |
| 1636 return; | |
| 1637 } | |
| 1638 } | |
| 1639 } | |
| 1640 | |
| 1641 static void merge_attributes( | |
| 1642 GumboParser* parser, GumboToken* token, GumboNode* node) { | |
| 1643 assert(token->type == GUMBO_TOKEN_START_TAG); | |
| 1644 assert(node->type == GUMBO_NODE_ELEMENT); | |
| 1645 const GumboVector* token_attr = &token->v.start_tag.attributes; | |
| 1646 GumboVector* node_attr = &node->v.element.attributes; | |
| 1647 | |
| 1648 for (unsigned int i = 0; i < token_attr->length; ++i) { | |
| 1649 GumboAttribute* attr = token_attr->data[i]; | |
| 1650 if (!gumbo_get_attribute(node_attr, attr->name)) { | |
| 1651 // Ownership of the attribute is transferred by this gumbo_vector_add, | |
| 1652 // so it has to be nulled out of the original token so it doesn't get | |
| 1653 // double-deleted. | |
| 1654 gumbo_vector_add(parser, attr, node_attr); | |
| 1655 token_attr->data[i] = NULL; | |
| 1656 } | |
| 1657 } | |
| 1658 // When attributes are merged, it means the token has been ignored and merged | |
| 1659 // with another token, so we need to free its memory. The attributes that are | |
| 1660 // transferred need to be nulled-out in the vector above so that they aren't | |
| 1661 // double-deleted. | |
| 1662 gumbo_token_destroy(parser, token); | |
| 1663 | |
| 1664 #ifndef NDEBUG | |
| 1665 // Mark this sentinel so the assertion in the main loop knows it's been | |
| 1666 // destroyed. | |
| 1667 token->v.start_tag.attributes = kGumboEmptyVector; | |
| 1668 #endif | |
| 1669 } | |
| 1670 | |
| 1671 const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) { | |
| 1672 for (size_t i = 0; i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry); | |
| 1673 ++i) { | |
| 1674 const ReplacementEntry* entry = &kSvgTagReplacements[i]; | |
| 1675 if (gumbo_string_equals_ignore_case(tag, &entry->from)) { | |
| 1676 return entry->to.data; | |
| 1677 } | |
| 1678 } | |
| 1679 return NULL; | |
| 1680 } | |
| 1681 | |
| 1682 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes | |
| 1683 // This destructively modifies any matching attributes on the token and sets the | |
| 1684 // namespace appropriately. | |
| 1685 static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) { | |
| 1686 assert(token->type == GUMBO_TOKEN_START_TAG); | |
| 1687 const GumboVector* attributes = &token->v.start_tag.attributes; | |
| 1688 for (size_t i = 0; i < sizeof(kForeignAttributeReplacements) / | |
| 1689 sizeof(NamespacedAttributeReplacement); | |
| 1690 ++i) { | |
| 1691 const NamespacedAttributeReplacement* entry = | |
| 1692 &kForeignAttributeReplacements[i]; | |
| 1693 GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from); | |
| 1694 if (!attr) { | |
| 1695 continue; | |
| 1696 } | |
| 1697 gumbo_parser_deallocate(parser, (void*) attr->name); | |
| 1698 attr->attr_namespace = entry->attr_namespace; | |
| 1699 attr->name = gumbo_copy_stringz(parser, entry->local_name); | |
| 1700 } | |
| 1701 } | |
| 1702 | |
| 1703 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes | |
| 1704 // This destructively modifies any matching attributes on the token. | |
| 1705 static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) { | |
| 1706 assert(token->type == GUMBO_TOKEN_START_TAG); | |
| 1707 const GumboVector* attributes = &token->v.start_tag.attributes; | |
| 1708 for (size_t i = 0; | |
| 1709 i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) { | |
| 1710 const ReplacementEntry* entry = &kSvgAttributeReplacements[i]; | |
| 1711 GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data); | |
| 1712 if (!attr) { | |
| 1713 continue; | |
| 1714 } | |
| 1715 gumbo_parser_deallocate(parser, (void*) attr->name); | |
| 1716 attr->name = gumbo_copy_stringz(parser, entry->to.data); | |
| 1717 } | |
| 1718 } | |
| 1719 | |
| 1720 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes | |
| 1721 // Note that this may destructively modify the token with the new attribute | |
| 1722 // value. | |
| 1723 static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) { | |
| 1724 assert(token->type == GUMBO_TOKEN_START_TAG); | |
| 1725 GumboAttribute* attr = | |
| 1726 gumbo_get_attribute(&token->v.start_tag.attributes, "definitionurl"); | |
| 1727 if (!attr) { | |
| 1728 return; | |
| 1729 } | |
| 1730 gumbo_parser_deallocate(parser, (void*) attr->name); | |
| 1731 attr->name = gumbo_copy_stringz(parser, "definitionURL"); | |
| 1732 } | |
| 1733 | |
| 1734 static bool doctype_matches(const GumboTokenDocType* doctype, | |
| 1735 const GumboStringPiece* public_id, const GumboStringPiece* system_id, | |
| 1736 bool allow_missing_system_id) { | |
| 1737 return !strcmp(doctype->public_identifier, public_id->data) && | |
| 1738 (allow_missing_system_id || doctype->has_system_identifier) && | |
| 1739 !strcmp(doctype->system_identifier, system_id->data); | |
| 1740 } | |
| 1741 | |
| 1742 static bool maybe_add_doctype_error( | |
| 1743 GumboParser* parser, const GumboToken* token) { | |
| 1744 const GumboTokenDocType* doctype = &token->v.doc_type; | |
| 1745 bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data); | |
| 1746 if ((!html_doctype || doctype->has_public_identifier || | |
| 1747 (doctype->has_system_identifier && | |
| 1748 !strcmp( | |
| 1749 doctype->system_identifier, kSystemIdLegacyCompat.data))) && | |
| 1750 !(html_doctype && (doctype_matches(doctype, &kPublicIdHtml4_0, | |
| 1751 &kSystemIdRecHtml4_0, true) || | |
| 1752 doctype_matches(doctype, &kPublicIdHtml4_01, | |
| 1753 &kSystemIdHtml4, true) || | |
| 1754 doctype_matches(doctype, &kPublicIdXhtml1_0, | |
| 1755 &kSystemIdXhtmlStrict1_1, false) || | |
| 1756 doctype_matches(doctype, &kPublicIdXhtml1_1, | |
| 1757 &kSystemIdXhtml1_1, false)))) { | |
| 1758 parser_add_parse_error(parser, token); | |
| 1759 return false; | |
| 1760 } | |
| 1761 return true; | |
| 1762 } | |
| 1763 | |
| 1764 static void remove_from_parent(GumboParser* parser, GumboNode* node) { | |
| 1765 if (!node->parent) { | |
| 1766 // The node may not have a parent if, for example, it is a newly-cloned copy | |
| 1767 // of an active formatting element. DOM manipulations continue with the | |
| 1768 // orphaned fragment of the DOM tree until it's appended/foster-parented to | |
| 1769 // the common ancestor at the end of the adoption agency algorithm. | |
| 1770 return; | |
| 1771 } | |
| 1772 assert(node->parent->type == GUMBO_NODE_ELEMENT); | |
| 1773 GumboVector* children = &node->parent->v.element.children; | |
| 1774 int index = gumbo_vector_index_of(children, node); | |
| 1775 assert(index != -1); | |
| 1776 | |
| 1777 gumbo_vector_remove_at(parser, index, children); | |
| 1778 node->parent = NULL; | |
| 1779 node->index_within_parent = -1; | |
| 1780 for (unsigned int i = index; i < children->length; ++i) { | |
| 1781 GumboNode* child = children->data[i]; | |
| 1782 child->index_within_parent = i; | |
| 1783 } | |
| 1784 } | |
| 1785 | |
| 1786 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser | |
| 1787 // Also described in the "in body" handling for end formatting tags. | |
| 1788 static bool adoption_agency_algorithm( | |
| 1789 GumboParser* parser, GumboToken* token, GumboTag subject) { | |
| 1790 GumboParserState* state = parser->_parser_state; | |
| 1791 gumbo_debug("Entering adoption agency algorithm.\n"); | |
| 1792 // Step 1. | |
| 1793 GumboNode* current_node = get_current_node(parser); | |
| 1794 if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML && | |
| 1795 current_node->v.element.tag == subject && | |
| 1796 gumbo_vector_index_of( | |
| 1797 &state->_active_formatting_elements, current_node) == -1) { | |
| 1798 pop_current_node(parser); | |
| 1799 return false; | |
| 1800 } | |
| 1801 // Steps 2-4 & 20: | |
| 1802 for (unsigned int i = 0; i < 8; ++i) { | |
| 1803 // Step 5. | |
| 1804 GumboNode* formatting_node = NULL; | |
| 1805 int formatting_node_in_open_elements = -1; | |
| 1806 for (int j = state->_active_formatting_elements.length; --j >= 0;) { | |
| 1807 GumboNode* current_node = state->_active_formatting_elements.data[j]; | |
| 1808 if (current_node == &kActiveFormattingScopeMarker) { | |
| 1809 gumbo_debug("Broke on scope marker; aborting.\n"); | |
| 1810 // Last scope marker; abort the algorithm. | |
| 1811 return false; | |
| 1812 } | |
| 1813 if (node_html_tag_is(current_node, subject)) { | |
| 1814 // Found it. | |
| 1815 formatting_node = current_node; | |
| 1816 formatting_node_in_open_elements = | |
| 1817 gumbo_vector_index_of(&state->_open_elements, formatting_node); | |
| 1818 gumbo_debug("Formatting element of tag %s at %d.\n", | |
| 1819 gumbo_normalized_tagname(subject), | |
| 1820 formatting_node_in_open_elements); | |
| 1821 break; | |
| 1822 } | |
| 1823 } | |
| 1824 if (!formatting_node) { | |
| 1825 // No matching tag; not a parse error outright, but fall through to the | |
| 1826 // "any other end tag" clause (which may potentially add a parse error, | |
| 1827 // but not always). | |
| 1828 gumbo_debug("No active formatting elements; aborting.\n"); | |
| 1829 return false; | |
| 1830 } | |
| 1831 | |
| 1832 // Step 6 | |
| 1833 if (formatting_node_in_open_elements == -1) { | |
| 1834 gumbo_debug("Formatting node not on stack of open elements.\n"); | |
| 1835 parser_add_parse_error(parser, token); | |
| 1836 gumbo_vector_remove( | |
| 1837 parser, formatting_node, &state->_active_formatting_elements); | |
| 1838 return false; | |
| 1839 } | |
| 1840 | |
| 1841 // Step 7 | |
| 1842 if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) { | |
| 1843 parser_add_parse_error(parser, token); | |
| 1844 gumbo_debug("Element not in scope.\n"); | |
| 1845 return false; | |
| 1846 } | |
| 1847 | |
| 1848 // Step 8 | |
| 1849 if (formatting_node != get_current_node(parser)) { | |
| 1850 parser_add_parse_error(parser, token); // But continue onwards. | |
| 1851 } | |
| 1852 assert(formatting_node); | |
| 1853 assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML)); | |
| 1854 assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY)); | |
| 1855 | |
| 1856 // Step 9 & 10 | |
| 1857 GumboNode* furthest_block = NULL; | |
| 1858 for (unsigned int j = formatting_node_in_open_elements; | |
| 1859 j < state->_open_elements.length; ++j) { | |
| 1860 assert(j > 0); | |
| 1861 GumboNode* current = state->_open_elements.data[j]; | |
| 1862 if (is_special_node(current)) { | |
| 1863 // Step 9. | |
| 1864 furthest_block = current; | |
| 1865 break; | |
| 1866 } | |
| 1867 } | |
| 1868 if (!furthest_block) { | |
| 1869 // Step 10. | |
| 1870 while (get_current_node(parser) != formatting_node) { | |
| 1871 pop_current_node(parser); | |
| 1872 } | |
| 1873 // And the formatting element itself. | |
| 1874 pop_current_node(parser); | |
| 1875 gumbo_vector_remove( | |
| 1876 parser, formatting_node, &state->_active_formatting_elements); | |
| 1877 return false; | |
| 1878 } | |
| 1879 assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML)); | |
| 1880 assert(furthest_block); | |
| 1881 | |
| 1882 // Step 11. | |
| 1883 // Elements may be moved and reparented by this algorithm, so | |
| 1884 // common_ancestor is not necessarily the same as formatting_node->parent. | |
| 1885 GumboNode* common_ancestor = | |
| 1886 state->_open_elements.data[gumbo_vector_index_of(&state->_open_elements, | |
| 1887 formatting_node) - | |
| 1888 1]; | |
| 1889 gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n", | |
| 1890 gumbo_normalized_tagname(common_ancestor->v.element.tag), | |
| 1891 gumbo_normalized_tagname(furthest_block->v.element.tag)); | |
| 1892 | |
| 1893 // Step 12. | |
| 1894 int bookmark = gumbo_vector_index_of( | |
| 1895 &state->_active_formatting_elements, formatting_node) + | |
| 1896 1; | |
| 1897 gumbo_debug("Bookmark at %d.\n", bookmark); | |
| 1898 // Step 13. | |
| 1899 GumboNode* node = furthest_block; | |
| 1900 GumboNode* last_node = furthest_block; | |
| 1901 // Must be stored explicitly, in case node is removed from the stack of open | |
| 1902 // elements, to handle step 9.4. | |
| 1903 int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node); | |
| 1904 assert(saved_node_index > 0); | |
| 1905 // Step 13.1. | |
| 1906 for (int j = 0;;) { | |
| 1907 // Step 13.2. | |
| 1908 ++j; | |
| 1909 // Step 13.3. | |
| 1910 int node_index = gumbo_vector_index_of(&state->_open_elements, node); | |
| 1911 gumbo_debug( | |
| 1912 "Current index: %d, last index: %d.\n", node_index, saved_node_index); | |
| 1913 if (node_index == -1) { | |
| 1914 node_index = saved_node_index; | |
| 1915 } | |
| 1916 saved_node_index = --node_index; | |
| 1917 assert(node_index > 0); | |
| 1918 assert((unsigned int) node_index < state->_open_elements.capacity); | |
| 1919 node = state->_open_elements.data[node_index]; | |
| 1920 assert(node->parent); | |
| 1921 if (node == formatting_node) { | |
| 1922 // Step 13.4. | |
| 1923 break; | |
| 1924 } | |
| 1925 int formatting_index = | |
| 1926 gumbo_vector_index_of(&state->_active_formatting_elements, node); | |
| 1927 if (j > 3 && formatting_index != -1) { | |
| 1928 // Step 13.5. | |
| 1929 gumbo_debug("Removing formatting element at %d.\n", formatting_index); | |
| 1930 gumbo_vector_remove_at( | |
| 1931 parser, formatting_index, &state->_active_formatting_elements); | |
| 1932 // Removing the element shifts all indices over by one, so we may need | |
| 1933 // to move the bookmark. | |
| 1934 if (formatting_index < bookmark) { | |
| 1935 --bookmark; | |
| 1936 gumbo_debug("Moving bookmark to %d.\n", bookmark); | |
| 1937 } | |
| 1938 continue; | |
| 1939 } | |
| 1940 if (formatting_index == -1) { | |
| 1941 // Step 13.6. | |
| 1942 gumbo_vector_remove_at(parser, node_index, &state->_open_elements); | |
| 1943 continue; | |
| 1944 } | |
| 1945 // Step 13.7. | |
| 1946 // "common ancestor as the intended parent" doesn't actually mean insert | |
| 1947 // it into the common ancestor; that happens below. | |
| 1948 node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED); | |
| 1949 assert(formatting_index >= 0); | |
| 1950 state->_active_formatting_elements.data[formatting_index] = node; | |
| 1951 assert(node_index >= 0); | |
| 1952 state->_open_elements.data[node_index] = node; | |
| 1953 // Step 13.8. | |
| 1954 if (last_node == furthest_block) { | |
| 1955 bookmark = formatting_index + 1; | |
| 1956 gumbo_debug("Bookmark moved to %d.\n", bookmark); | |
| 1957 assert((unsigned int) bookmark <= state->_active_formatting_elements.length); | |
| 1958 } | |
| 1959 // Step 13.9. | |
| 1960 last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; | |
| 1961 remove_from_parent(parser, last_node); | |
| 1962 append_node(parser, node, last_node); | |
| 1963 // Step 13.10. | |
| 1964 last_node = node; | |
| 1965 } // Step 13.11. | |
| 1966 | |
| 1967 // Step 14. | |
| 1968 gumbo_debug("Removing %s node from parent ", | |
| 1969 gumbo_normalized_tagname(last_node->v.element.tag)); | |
| 1970 remove_from_parent(parser, last_node); | |
| 1971 last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; | |
| 1972 InsertionLocation location = | |
| 1973 get_appropriate_insertion_location(parser, common_ancestor); | |
| 1974 gumbo_debug("and inserting it into %s.\n", | |
| 1975 gumbo_normalized_tagname(location.target->v.element.tag)); | |
| 1976 insert_node(parser, last_node, location); | |
| 1977 | |
| 1978 // Step 15. | |
| 1979 GumboNode* new_formatting_node = clone_node( | |
| 1980 parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED); | |
| 1981 formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; | |
| 1982 | |
| 1983 // Step 16. Instead of appending nodes one-by-one, we swap the children | |
| 1984 // vector of furthest_block with the empty children of new_formatting_node, | |
| 1985 // reducing memory traffic and allocations. We still have to reset their | |
| 1986 // parent pointers, though. | |
| 1987 GumboVector temp = new_formatting_node->v.element.children; | |
| 1988 new_formatting_node->v.element.children = | |
| 1989 furthest_block->v.element.children; | |
| 1990 furthest_block->v.element.children = temp; | |
| 1991 | |
| 1992 temp = new_formatting_node->v.element.children; | |
| 1993 for (unsigned int i = 0; i < temp.length; ++i) { | |
| 1994 GumboNode* child = temp.data[i]; | |
| 1995 child->parent = new_formatting_node; | |
| 1996 } | |
| 1997 | |
| 1998 // Step 17. | |
| 1999 append_node(parser, furthest_block, new_formatting_node); | |
| 2000 | |
| 2001 // Step 18. | |
| 2002 // If the formatting node was before the bookmark, it may shift over all | |
| 2003 // indices after it, so we need to explicitly find the index and possibly | |
| 2004 // adjust the bookmark. | |
| 2005 int formatting_node_index = gumbo_vector_index_of( | |
| 2006 &state->_active_formatting_elements, formatting_node); | |
| 2007 assert(formatting_node_index != -1); | |
| 2008 if (formatting_node_index < bookmark) { | |
| 2009 gumbo_debug( | |
| 2010 "Formatting node at %d is before bookmark at %d; decrementing.\n", | |
| 2011 formatting_node_index, bookmark); | |
| 2012 --bookmark; | |
| 2013 } | |
| 2014 gumbo_vector_remove_at( | |
| 2015 parser, formatting_node_index, &state->_active_formatting_elements); | |
| 2016 assert(bookmark >= 0); | |
| 2017 assert((unsigned int) bookmark <= state->_active_formatting_elements.length); | |
| 2018 gumbo_vector_insert_at(parser, new_formatting_node, bookmark, | |
| 2019 &state->_active_formatting_elements); | |
| 2020 | |
| 2021 // Step 19. | |
| 2022 gumbo_vector_remove(parser, formatting_node, &state->_open_elements); | |
| 2023 int insert_at = | |
| 2024 gumbo_vector_index_of(&state->_open_elements, furthest_block) + 1; | |
| 2025 assert(insert_at >= 0); | |
| 2026 assert((unsigned int) insert_at <= state->_open_elements.length); | |
| 2027 gumbo_vector_insert_at( | |
| 2028 parser, new_formatting_node, insert_at, &state->_open_elements); | |
| 2029 } // Step 20. | |
| 2030 return true; | |
| 2031 } | |
| 2032 | |
| 2033 // This is here to clean up memory when the spec says "Ignore current token." | |
| 2034 static void ignore_token(GumboParser* parser) { | |
| 2035 GumboToken* token = parser->_parser_state->_current_token; | |
| 2036 // Ownership of the token's internal buffers are normally transferred to the | |
| 2037 // element, but if no element is emitted (as happens in non-verbatim-mode | |
| 2038 // when a token is ignored), we need to free it here to prevent a memory | |
| 2039 // leak. | |
| 2040 gumbo_token_destroy(parser, token); | |
| 2041 #ifndef NDEBUG | |
| 2042 if (token->type == GUMBO_TOKEN_START_TAG) { | |
| 2043 // Mark this sentinel so the assertion in the main loop knows it's been | |
| 2044 // destroyed. | |
| 2045 token->v.start_tag.attributes = kGumboEmptyVector; | |
| 2046 } | |
| 2047 #endif | |
| 2048 } | |
| 2049 | |
| 2050 // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html | |
| 2051 static void finish_parsing(GumboParser* parser) { | |
| 2052 gumbo_debug("Finishing parsing"); | |
| 2053 maybe_flush_text_node_buffer(parser); | |
| 2054 GumboParserState* state = parser->_parser_state; | |
| 2055 for (GumboNode* node = pop_current_node(parser); node; | |
| 2056 node = pop_current_node(parser)) { | |
| 2057 if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) || | |
| 2058 (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) { | |
| 2059 continue; | |
| 2060 } | |
| 2061 node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; | |
| 2062 } | |
| 2063 while (pop_current_node(parser)) | |
| 2064 ; // Pop them all. | |
| 2065 } | |
| 2066 | |
| 2067 static bool handle_initial(GumboParser* parser, GumboToken* token) { | |
| 2068 GumboDocument* document = &get_document_node(parser)->v.document; | |
| 2069 if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 2070 ignore_token(parser); | |
| 2071 return true; | |
| 2072 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 2073 append_comment_node(parser, get_document_node(parser), token); | |
| 2074 return true; | |
| 2075 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 2076 document->has_doctype = true; | |
| 2077 document->name = token->v.doc_type.name; | |
| 2078 document->public_identifier = token->v.doc_type.public_identifier; | |
| 2079 document->system_identifier = token->v.doc_type.system_identifier; | |
| 2080 document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type); | |
| 2081 set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML); | |
| 2082 return maybe_add_doctype_error(parser, token); | |
| 2083 } | |
| 2084 parser_add_parse_error(parser, token); | |
| 2085 document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS; | |
| 2086 set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML); | |
| 2087 parser->_parser_state->_reprocess_current_token = true; | |
| 2088 return true; | |
| 2089 } | |
| 2090 | |
| 2091 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode | |
| 2092 static bool handle_before_html(GumboParser* parser, GumboToken* token) { | |
| 2093 if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 2094 parser_add_parse_error(parser, token); | |
| 2095 ignore_token(parser); | |
| 2096 return false; | |
| 2097 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 2098 append_comment_node(parser, get_document_node(parser), token); | |
| 2099 return true; | |
| 2100 } else if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 2101 ignore_token(parser); | |
| 2102 return true; | |
| 2103 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 2104 GumboNode* html_node = insert_element_from_token(parser, token); | |
| 2105 parser->_output->root = html_node; | |
| 2106 set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD); | |
| 2107 return true; | |
| 2108 } else if (token->type == GUMBO_TOKEN_END_TAG && | |
| 2109 !tag_in(token, false, | |
| 2110 (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) { | |
| 2111 parser_add_parse_error(parser, token); | |
| 2112 ignore_token(parser); | |
| 2113 return false; | |
| 2114 } else { | |
| 2115 GumboNode* html_node = insert_element_of_tag_type( | |
| 2116 parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED); | |
| 2117 assert(html_node); | |
| 2118 parser->_output->root = html_node; | |
| 2119 set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD); | |
| 2120 parser->_parser_state->_reprocess_current_token = true; | |
| 2121 return true; | |
| 2122 } | |
| 2123 } | |
| 2124 | |
| 2125 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode | |
| 2126 static bool handle_before_head(GumboParser* parser, GumboToken* token) { | |
| 2127 if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 2128 parser_add_parse_error(parser, token); | |
| 2129 ignore_token(parser); | |
| 2130 return false; | |
| 2131 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 2132 append_comment_node(parser, get_current_node(parser), token); | |
| 2133 return true; | |
| 2134 } else if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 2135 ignore_token(parser); | |
| 2136 return true; | |
| 2137 } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) { | |
| 2138 GumboNode* node = insert_element_from_token(parser, token); | |
| 2139 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); | |
| 2140 parser->_parser_state->_head_element = node; | |
| 2141 return true; | |
| 2142 } else if (token->type == GUMBO_TOKEN_END_TAG && | |
| 2143 !tag_in(token, false, | |
| 2144 (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) { | |
| 2145 parser_add_parse_error(parser, token); | |
| 2146 ignore_token(parser); | |
| 2147 return false; | |
| 2148 } else { | |
| 2149 GumboNode* node = insert_element_of_tag_type( | |
| 2150 parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED); | |
| 2151 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); | |
| 2152 parser->_parser_state->_head_element = node; | |
| 2153 parser->_parser_state->_reprocess_current_token = true; | |
| 2154 return true; | |
| 2155 } | |
| 2156 } | |
| 2157 | |
| 2158 // Forward declarations: the insertion-mode handlers call each other (handle_in_head delegates to handle_in_body and vice versa), so these must be declared before their definitions. | |
| 2159 static bool handle_token(GumboParser* parser, GumboToken* token); | |
| 2160 static bool handle_in_body(GumboParser* parser, GumboToken* token); | |
| 2161 | |
| 2162 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead | |
| 2163 static bool handle_in_head(GumboParser* parser, GumboToken* token) { | |
| 2164 if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 2165 insert_text_token(parser, token); | |
| 2166 return true; | |
| 2167 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 2168 parser_add_parse_error(parser, token); | |
| 2169 ignore_token(parser); | |
| 2170 return false; | |
| 2171 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 2172 append_comment_node(parser, get_current_node(parser), token); | |
| 2173 return true; | |
| 2174 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 2175 return handle_in_body(parser, token); | |
| 2176 } else if (tag_in(token, kStartTag, | |
| 2177 (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), | |
| 2178 TAG(MENUITEM), TAG(LINK)})) { | |
| 2179 insert_element_from_token(parser, token); | |
| 2180 pop_current_node(parser); | |
| 2181 acknowledge_self_closing_tag(parser); | |
| 2182 return true; | |
| 2183 } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) { | |
| 2184 insert_element_from_token(parser, token); | |
| 2185 pop_current_node(parser); | |
| 2186 acknowledge_self_closing_tag(parser); | |
| 2187 // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the | |
| 2188 // spec doesn't apply. If clients want to handle meta-tag re-encoding, they | |
| 2189 // should specifically look for that string in the document and re-encode it | |
| 2190 // before passing to Gumbo. | |
| 2191 return true; | |
| 2192 } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) { | |
| 2193 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); | |
| 2194 return true; | |
| 2195 } else if (tag_in( | |
| 2196 token, kStartTag, (gumbo_tagset){TAG(NOFRAMES), TAG(STYLE)})) { | |
| 2197 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); | |
| 2198 return true; | |
| 2199 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) { | |
| 2200 insert_element_from_token(parser, token); | |
| 2201 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT); | |
| 2202 return true; | |
| 2203 } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) { | |
| 2204 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT); | |
| 2205 return true; | |
| 2206 } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) { | |
| 2207 GumboNode* head = pop_current_node(parser); | |
| 2208 AVOID_UNUSED_VARIABLE_WARNING(head); | |
| 2209 assert(node_html_tag_is(head, GUMBO_TAG_HEAD)); | |
| 2210 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); | |
| 2211 return true; | |
| 2212 } else if (tag_in(token, kEndTag, | |
| 2213 (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)})) { | |
| 2214 pop_current_node(parser); | |
| 2215 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); | |
| 2216 parser->_parser_state->_reprocess_current_token = true; | |
| 2217 return true; | |
| 2218 } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) { | |
| 2219 insert_element_from_token(parser, token); | |
| 2220 add_formatting_element(parser, &kActiveFormattingScopeMarker); | |
| 2221 parser->_parser_state->_frameset_ok = false; | |
| 2222 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); | |
| 2223 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); | |
| 2224 return true; | |
| 2225 } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { | |
| 2226 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2227 parser_add_parse_error(parser, token); | |
| 2228 ignore_token(parser); | |
| 2229 return false; | |
| 2230 } | |
| 2231 generate_all_implied_end_tags_thoroughly(parser); | |
| 2232 bool success = true; | |
| 2233 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) { | |
| 2234 parser_add_parse_error(parser, token); | |
| 2235 success = false; | |
| 2236 } | |
| 2237 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE)) | |
| 2238 ; | |
| 2239 clear_active_formatting_elements(parser); | |
| 2240 pop_template_insertion_mode(parser); | |
| 2241 reset_insertion_mode_appropriately(parser); | |
| 2242 return success; | |
| 2243 } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || | |
| 2244 (token->type == GUMBO_TOKEN_END_TAG)) { | |
| 2245 parser_add_parse_error(parser, token); | |
| 2246 ignore_token(parser); | |
| 2247 return false; | |
| 2248 } else { | |
| 2249 pop_current_node(parser); | |
| 2250 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); | |
| 2251 parser->_parser_state->_reprocess_current_token = true; | |
| 2252 return true; | |
| 2253 } | |
| 2254 return true; | |
| 2255 } | |
| 2256 | |
| 2257 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript | |
| 2258 static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) { | |
| 2259 if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 2260 parser_add_parse_error(parser, token); | |
| 2261 return false; | |
| 2262 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 2263 return handle_in_body(parser, token); | |
| 2264 } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) { | |
| 2265 const GumboNode* node = pop_current_node(parser); | |
| 2266 assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT)); | |
| 2267 AVOID_UNUSED_VARIABLE_WARNING(node); | |
| 2268 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); | |
| 2269 return true; | |
| 2270 } else if (token->type == GUMBO_TOKEN_WHITESPACE || | |
| 2271 token->type == GUMBO_TOKEN_COMMENT || | |
| 2272 tag_in(token, kStartTag, | |
| 2273 (gumbo_tagset){TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), | |
| 2274 TAG(META), TAG(NOFRAMES), TAG(STYLE)})) { | |
| 2275 return handle_in_head(parser, token); | |
| 2276 } else if (tag_in( | |
| 2277 token, kStartTag, (gumbo_tagset){TAG(HEAD), TAG(NOSCRIPT)}) || | |
| 2278 (token->type == GUMBO_TOKEN_END_TAG && | |
| 2279 !tag_is(token, kEndTag, GUMBO_TAG_BR))) { | |
| 2280 parser_add_parse_error(parser, token); | |
| 2281 ignore_token(parser); | |
| 2282 return false; | |
| 2283 } else { | |
| 2284 parser_add_parse_error(parser, token); | |
| 2285 const GumboNode* node = pop_current_node(parser); | |
| 2286 assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT)); | |
| 2287 AVOID_UNUSED_VARIABLE_WARNING(node); | |
| 2288 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); | |
| 2289 parser->_parser_state->_reprocess_current_token = true; | |
| 2290 return false; | |
| 2291 } | |
| 2292 } | |
| 2293 | |
| 2294 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode | |
| 2295 static bool handle_after_head(GumboParser* parser, GumboToken* token) { | |
| 2296 GumboParserState* state = parser->_parser_state; | |
| 2297 if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 2298 insert_text_token(parser, token); | |
| 2299 return true; | |
| 2300 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 2301 parser_add_parse_error(parser, token); | |
| 2302 ignore_token(parser); | |
| 2303 return false; | |
| 2304 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 2305 append_comment_node(parser, get_current_node(parser), token); | |
| 2306 return true; | |
| 2307 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 2308 return handle_in_body(parser, token); | |
| 2309 } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) { | |
| 2310 insert_element_from_token(parser, token); | |
| 2311 state->_frameset_ok = false; | |
| 2312 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); | |
| 2313 return true; | |
| 2314 } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { | |
| 2315 insert_element_from_token(parser, token); | |
| 2316 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET); | |
| 2317 return true; | |
| 2318 } else if (tag_in(token, kStartTag, | |
| 2319 (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), | |
| 2320 TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), | |
| 2321 TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)})) { | |
| 2322 parser_add_parse_error(parser, token); | |
| 2323 assert(state->_head_element != NULL); | |
| 2324 // This must be flushed before we push the head element on, as there may be | |
| 2325 // pending character tokens that should be attached to the root. | |
| 2326 maybe_flush_text_node_buffer(parser); | |
| 2327 gumbo_vector_add(parser, state->_head_element, &state->_open_elements); | |
| 2328 bool result = handle_in_head(parser, token); | |
| 2329 gumbo_vector_remove(parser, state->_head_element, &state->_open_elements); | |
| 2330 return result; | |
| 2331 } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { | |
| 2332 return handle_in_head(parser, token); | |
| 2333 } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || | |
| 2334 (token->type == GUMBO_TOKEN_END_TAG && | |
| 2335 !tag_in(token, kEndTag, | |
| 2336 (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)}))) { | |
| 2337 parser_add_parse_error(parser, token); | |
| 2338 ignore_token(parser); | |
| 2339 return false; | |
| 2340 } else { | |
| 2341 insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED); | |
| 2342 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); | |
| 2343 state->_reprocess_current_token = true; | |
| 2344 return true; | |
| 2345 } | |
| 2346 } | |
| 2347 | |
| 2348 static void destroy_node(GumboParser* parser, GumboNode* node) { | |
| 2349 switch (node->type) { | |
| 2350 case GUMBO_NODE_DOCUMENT: { | |
| 2351 GumboDocument* doc = &node->v.document; | |
| 2352 for (unsigned int i = 0; i < doc->children.length; ++i) { | |
| 2353 destroy_node(parser, doc->children.data[i]); | |
| 2354 } | |
| 2355 gumbo_parser_deallocate(parser, (void*) doc->children.data); | |
| 2356 gumbo_parser_deallocate(parser, (void*) doc->name); | |
| 2357 gumbo_parser_deallocate(parser, (void*) doc->public_identifier); | |
| 2358 gumbo_parser_deallocate(parser, (void*) doc->system_identifier); | |
| 2359 } break; | |
| 2360 case GUMBO_NODE_TEMPLATE: | |
| 2361 case GUMBO_NODE_ELEMENT: | |
| 2362 for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) { | |
| 2363 gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]); | |
| 2364 } | |
| 2365 gumbo_parser_deallocate(parser, node->v.element.attributes.data); | |
| 2366 for (unsigned int i = 0; i < node->v.element.children.length; ++i) { | |
| 2367 destroy_node(parser, node->v.element.children.data[i]); | |
| 2368 } | |
| 2369 gumbo_parser_deallocate(parser, node->v.element.children.data); | |
| 2370 break; | |
| 2371 case GUMBO_NODE_TEXT: | |
| 2372 case GUMBO_NODE_CDATA: | |
| 2373 case GUMBO_NODE_COMMENT: | |
| 2374 case GUMBO_NODE_WHITESPACE: | |
| 2375 gumbo_parser_deallocate(parser, (void*) node->v.text.text); | |
| 2376 break; | |
| 2377 } | |
| 2378 gumbo_parser_deallocate(parser, node); | |
| 2379 } | |
| 2380 | |
| 2381 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody | |
| 2382 static bool handle_in_body(GumboParser* parser, GumboToken* token) { | |
| 2383 GumboParserState* state = parser->_parser_state; | |
| 2384 assert(state->_open_elements.length > 0); | |
| 2385 if (token->type == GUMBO_TOKEN_NULL) { | |
| 2386 parser_add_parse_error(parser, token); | |
| 2387 ignore_token(parser); | |
| 2388 return false; | |
| 2389 } else if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 2390 reconstruct_active_formatting_elements(parser); | |
| 2391 insert_text_token(parser, token); | |
| 2392 return true; | |
| 2393 } else if (token->type == GUMBO_TOKEN_CHARACTER || | |
| 2394 token->type == GUMBO_TOKEN_CDATA) { | |
| 2395 reconstruct_active_formatting_elements(parser); | |
| 2396 insert_text_token(parser, token); | |
| 2397 set_frameset_not_ok(parser); | |
| 2398 return true; | |
| 2399 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 2400 append_comment_node(parser, get_current_node(parser), token); | |
| 2401 return true; | |
| 2402 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 2403 parser_add_parse_error(parser, token); | |
| 2404 ignore_token(parser); | |
| 2405 return false; | |
| 2406 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 2407 parser_add_parse_error(parser, token); | |
| 2408 if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2409 ignore_token(parser); | |
| 2410 return false; | |
| 2411 } | |
| 2412 assert(parser->_output->root != NULL); | |
| 2413 assert(parser->_output->root->type == GUMBO_NODE_ELEMENT); | |
| 2414 merge_attributes(parser, token, parser->_output->root); | |
| 2415 return false; | |
| 2416 } else if (tag_in(token, kStartTag, | |
| 2417 (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), | |
| 2418 TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES), | |
| 2419 TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) || | |
| 2420 tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { | |
| 2421 return handle_in_head(parser, token); | |
| 2422 } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) { | |
| 2423 parser_add_parse_error(parser, token); | |
| 2424 if (state->_open_elements.length < 2 || | |
| 2425 !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || | |
| 2426 has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2427 ignore_token(parser); | |
| 2428 return false; | |
| 2429 } | |
| 2430 state->_frameset_ok = false; | |
| 2431 merge_attributes(parser, token, state->_open_elements.data[1]); | |
| 2432 return false; | |
| 2433 } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { | |
| 2434 parser_add_parse_error(parser, token); | |
| 2435 if (state->_open_elements.length < 2 || | |
| 2436 !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || | |
| 2437 !state->_frameset_ok) { | |
| 2438 ignore_token(parser); | |
| 2439 return false; | |
| 2440 } | |
| 2441 // Save the body node for later removal. | |
| 2442 GumboNode* body_node = state->_open_elements.data[1]; | |
| 2443 | |
| 2444 // Pop all nodes except root HTML element. | |
| 2445 GumboNode* node; | |
| 2446 do { | |
| 2447 node = pop_current_node(parser); | |
| 2448 } while (node != state->_open_elements.data[1]); | |
| 2449 | |
| 2450 // Removing & destroying the body node is going to kill any nodes that have | |
| 2451 // been added to the list of active formatting elements, and so we should | |
| 2452 // clear it to prevent a use-after-free if the list of active formatting | |
| 2453 // elements is reconstructed afterwards. This may happen if whitespace | |
| 2454 // follows the </frameset>. | |
| 2455 clear_active_formatting_elements(parser); | |
| 2456 | |
| 2457 // Remove the body node. We may want to factor this out into a generic | |
| 2458 // helper, but right now this is the only code that needs to do this. | |
| 2459 GumboVector* children = &parser->_output->root->v.element.children; | |
| 2460 for (unsigned int i = 0; i < children->length; ++i) { | |
| 2461 if (children->data[i] == body_node) { | |
| 2462 gumbo_vector_remove_at(parser, i, children); | |
| 2463 break; | |
| 2464 } | |
| 2465 } | |
| 2466 destroy_node(parser, body_node); | |
| 2467 | |
| 2468 // Insert the <frameset>, and switch the insertion mode. | |
| 2469 insert_element_from_token(parser, token); | |
| 2470 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET); | |
| 2471 return true; | |
| 2472 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 2473 for (unsigned int i = 0; i < state->_open_elements.length; ++i) { | |
| 2474 if (!node_tag_in_set(state->_open_elements.data[i], | |
| 2475 (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), | |
| 2476 TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), | |
| 2477 TAG(HTML)})) { | |
| 2478 parser_add_parse_error(parser, token); | |
| 2479 } | |
| 2480 } | |
| 2481 if (get_current_template_insertion_mode(parser) != | |
| 2482 GUMBO_INSERTION_MODE_INITIAL) { | |
| 2483 return handle_in_template(parser, token); | |
| 2484 } | |
| 2485 return true; | |
| 2486 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(HTML)})) { | |
| 2487 if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) { | |
| 2488 parser_add_parse_error(parser, token); | |
| 2489 ignore_token(parser); | |
| 2490 return false; | |
| 2491 } | |
| 2492 bool success = true; | |
| 2493 for (unsigned int i = 0; i < state->_open_elements.length; ++i) { | |
| 2494 if (!node_tag_in_set(state->_open_elements.data[i], | |
| 2495 (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), | |
| 2496 TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), | |
| 2497 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), | |
| 2498 TAG(BODY), TAG(HTML)})) { | |
| 2499 parser_add_parse_error(parser, token); | |
| 2500 success = false; | |
| 2501 break; | |
| 2502 } | |
| 2503 } | |
| 2504 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY); | |
| 2505 if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { | |
| 2506 parser->_parser_state->_reprocess_current_token = true; | |
| 2507 } else { | |
| 2508 GumboNode* body = state->_open_elements.data[1]; | |
| 2509 assert(node_html_tag_is(body, GUMBO_TAG_BODY)); | |
| 2510 record_end_of_element(state->_current_token, &body->v.element); | |
| 2511 } | |
| 2512 return success; | |
| 2513 } else if (tag_in(token, kStartTag, | |
| 2514 (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), | |
| 2515 TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIR), | |
| 2516 TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION), | |
| 2517 TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), | |
| 2518 TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P), | |
| 2519 TAG(SECTION), TAG(SUMMARY), TAG(UL)})) { | |
| 2520 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2521 insert_element_from_token(parser, token); | |
| 2522 return result; | |
| 2523 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), | |
| 2524 TAG(H4), TAG(H5), TAG(H6)})) { | |
| 2525 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2526 if (node_tag_in_set( | |
| 2527 get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), | |
| 2528 TAG(H4), TAG(H5), TAG(H6)})) { | |
| 2529 parser_add_parse_error(parser, token); | |
| 2530 pop_current_node(parser); | |
| 2531 result = false; | |
| 2532 } | |
| 2533 insert_element_from_token(parser, token); | |
| 2534 return result; | |
| 2535 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(PRE), TAG(LISTING)})) { | |
| 2536 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2537 insert_element_from_token(parser, token); | |
| 2538 state->_ignore_next_linefeed = true; | |
| 2539 state->_frameset_ok = false; | |
| 2540 return result; | |
| 2541 } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { | |
| 2542 if (state->_form_element != NULL && | |
| 2543 !has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2544 gumbo_debug("Ignoring nested form.\n"); | |
| 2545 parser_add_parse_error(parser, token); | |
| 2546 ignore_token(parser); | |
| 2547 return false; | |
| 2548 } | |
| 2549 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2550 GumboNode* form_element = insert_element_from_token(parser, token); | |
| 2551 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2552 state->_form_element = form_element; | |
| 2553 } | |
| 2554 return result; | |
| 2555 } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) { | |
| 2556 maybe_implicitly_close_list_tag(parser, token, true); | |
| 2557 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2558 insert_element_from_token(parser, token); | |
| 2559 return result; | |
| 2560 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(DD), TAG(DT)})) { | |
| 2561 maybe_implicitly_close_list_tag(parser, token, false); | |
| 2562 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2563 insert_element_from_token(parser, token); | |
| 2564 return result; | |
| 2565 } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) { | |
| 2566 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2567 insert_element_from_token(parser, token); | |
| 2568 gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT); | |
| 2569 return result; | |
| 2570 } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) { | |
| 2571 if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) { | |
| 2572 parser_add_parse_error(parser, token); | |
| 2573 implicitly_close_tags( | |
| 2574 parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON); | |
| 2575 state->_reprocess_current_token = true; | |
| 2576 return false; | |
| 2577 } | |
| 2578 reconstruct_active_formatting_elements(parser); | |
| 2579 insert_element_from_token(parser, token); | |
| 2580 state->_frameset_ok = false; | |
| 2581 return true; | |
| 2582 } else if (tag_in(token, kEndTag, | |
| 2583 (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), | |
| 2584 TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS), | |
| 2585 TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), | |
| 2586 TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), | |
| 2587 TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), | |
| 2588 TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) { | |
| 2589 GumboTag tag = token->v.end_tag; | |
| 2590 if (!has_an_element_in_scope(parser, tag)) { | |
| 2591 parser_add_parse_error(parser, token); | |
| 2592 ignore_token(parser); | |
| 2593 return false; | |
| 2594 } | |
| 2595 implicitly_close_tags( | |
| 2596 parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag); | |
| 2597 return true; | |
| 2598 } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) { | |
| 2599 if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2600 if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) { | |
| 2601 parser_add_parse_error(parser, token); | |
| 2602 ignore_token(parser); | |
| 2603 return false; | |
| 2604 } | |
| 2605 bool success = true; | |
| 2606 generate_implied_end_tags(parser, GUMBO_TAG_LAST); | |
| 2607 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) { | |
| 2608 parser_add_parse_error(parser, token); | |
| 2609 return false; | |
| 2610 } | |
| 2611 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM)) | |
| 2612 ; | |
| 2613 return success; | |
| 2614 } else { | |
| 2615 bool result = true; | |
| 2616 const GumboNode* node = state->_form_element; | |
| 2617 assert(!node || node->type == GUMBO_NODE_ELEMENT); | |
| 2618 state->_form_element = NULL; | |
| 2619 if (!node || !has_node_in_scope(parser, node)) { | |
| 2620 gumbo_debug("Closing an unopened form.\n"); | |
| 2621 parser_add_parse_error(parser, token); | |
| 2622 ignore_token(parser); | |
| 2623 return false; | |
| 2624 } | |
| 2625 // This differs from implicitly_close_tags because we remove *only* the | |
| 2626 // <form> element; other nodes are left in scope. | |
| 2627 generate_implied_end_tags(parser, GUMBO_TAG_LAST); | |
| 2628 if (get_current_node(parser) != node) { | |
| 2629 parser_add_parse_error(parser, token); | |
| 2630 result = false; | |
| 2631 } | |
| 2632 | |
| 2633 GumboVector* open_elements = &state->_open_elements; | |
| 2634 int index = gumbo_vector_index_of(open_elements, node); | |
| 2635 assert(index >= 0); | |
| 2636 gumbo_vector_remove_at(parser, index, open_elements); | |
| 2637 return result; | |
| 2638 } | |
| 2639 } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) { | |
| 2640 if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { | |
| 2641 parser_add_parse_error(parser, token); | |
| 2642 // reconstruct_active_formatting_elements(parser); | |
| 2643 insert_element_of_tag_type( | |
| 2644 parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG); | |
| 2645 state->_reprocess_current_token = true; | |
| 2646 return false; | |
| 2647 } | |
| 2648 return implicitly_close_tags( | |
| 2649 parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P); | |
| 2650 } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) { | |
| 2651 if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) { | |
| 2652 parser_add_parse_error(parser, token); | |
| 2653 ignore_token(parser); | |
| 2654 return false; | |
| 2655 } | |
| 2656 return implicitly_close_tags( | |
| 2657 parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI); | |
| 2658 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(DD), TAG(DT)})) { | |
| 2659 assert(token->type == GUMBO_TOKEN_END_TAG); | |
| 2660 GumboTag token_tag = token->v.end_tag; | |
| 2661 if (!has_an_element_in_scope(parser, token_tag)) { | |
| 2662 parser_add_parse_error(parser, token); | |
| 2663 ignore_token(parser); | |
| 2664 return false; | |
| 2665 } | |
| 2666 return implicitly_close_tags( | |
| 2667 parser, token, GUMBO_NAMESPACE_HTML, token_tag); | |
| 2668 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), | |
| 2669 TAG(H4), TAG(H5), TAG(H6)})) { | |
| 2670 if (!has_an_element_in_scope_with_tagname( | |
| 2671 parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, | |
| 2672 GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) { | |
| 2673 // No heading open; ignore the token entirely. | |
| 2674 parser_add_parse_error(parser, token); | |
| 2675 ignore_token(parser); | |
| 2676 return false; | |
| 2677 } else { | |
| 2678 generate_implied_end_tags(parser, GUMBO_TAG_LAST); | |
| 2679 const GumboNode* current_node = get_current_node(parser); | |
| 2680 bool success = node_html_tag_is(current_node, token->v.end_tag); | |
| 2681 if (!success) { | |
| 2682 // There're children of the heading currently open; close them below and | |
| 2683 // record a parse error. | |
| 2684 // TODO(jdtang): Add a way to distinguish this error case from the one | |
| 2685 // above. | |
| 2686 parser_add_parse_error(parser, token); | |
| 2687 } | |
| 2688 do { | |
| 2689 current_node = pop_current_node(parser); | |
| 2690 } while (!node_tag_in_set( | |
| 2691 current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), | |
| 2692 TAG(H4), TAG(H5), TAG(H6)})); | |
| 2693 return success; | |
| 2694 } | |
| 2695 } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) { | |
| 2696 bool success = true; | |
| 2697 int last_a; | |
| 2698 int has_matching_a = find_last_anchor_index(parser, &last_a); | |
| 2699 if (has_matching_a) { | |
| 2700 assert(has_matching_a == 1); | |
| 2701 parser_add_parse_error(parser, token); | |
| 2702 adoption_agency_algorithm(parser, token, GUMBO_TAG_A); | |
| 2703 // The adoption agency algorithm usually removes all instances of <a> | |
| 2704 // from the list of active formatting elements, but in case it doesn't, | |
| 2705 // we're supposed to do this. (The conditions where it might not are | |
| 2706 // listed in the spec.) | |
| 2707 if (find_last_anchor_index(parser, &last_a)) { | |
| 2708 void* last_element = gumbo_vector_remove_at( | |
| 2709 parser, last_a, &state->_active_formatting_elements); | |
| 2710 gumbo_vector_remove(parser, last_element, &state->_open_elements); | |
| 2711 } | |
| 2712 success = false; | |
| 2713 } | |
| 2714 reconstruct_active_formatting_elements(parser); | |
| 2715 add_formatting_element(parser, insert_element_from_token(parser, token)); | |
| 2716 return success; | |
| 2717 } else if (tag_in(token, kStartTag, | |
| 2718 (gumbo_tagset){TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), | |
| 2719 TAG(I), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), | |
| 2720 TAG(TT), TAG(U)})) { | |
| 2721 reconstruct_active_formatting_elements(parser); | |
| 2722 add_formatting_element(parser, insert_element_from_token(parser, token)); | |
| 2723 return true; | |
| 2724 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) { | |
| 2725 bool result = true; | |
| 2726 reconstruct_active_formatting_elements(parser); | |
| 2727 if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) { | |
| 2728 result = false; | |
| 2729 parser_add_parse_error(parser, token); | |
| 2730 adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR); | |
| 2731 reconstruct_active_formatting_elements(parser); | |
| 2732 } | |
| 2733 insert_element_from_token(parser, token); | |
| 2734 add_formatting_element(parser, get_current_node(parser)); | |
| 2735 return result; | |
| 2736 } else if (tag_in(token, kEndTag, | |
| 2737 (gumbo_tagset){TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), | |
| 2738 TAG(FONT), TAG(I), TAG(NOBR), TAG(S), TAG(SMALL), | |
| 2739 TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)})) { | |
| 2740 return adoption_agency_algorithm(parser, token, token->v.end_tag); | |
| 2741 } else if (tag_in(token, kStartTag, | |
| 2742 (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) { | |
| 2743 reconstruct_active_formatting_elements(parser); | |
| 2744 insert_element_from_token(parser, token); | |
| 2745 add_formatting_element(parser, &kActiveFormattingScopeMarker); | |
| 2746 set_frameset_not_ok(parser); | |
| 2747 return true; | |
| 2748 } else if (tag_in(token, kEndTag, | |
| 2749 (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) { | |
| 2750 GumboTag token_tag = token->v.end_tag; | |
| 2751 if (!has_an_element_in_table_scope(parser, token_tag)) { | |
| 2752 parser_add_parse_error(parser, token); | |
| 2753 ignore_token(parser); | |
| 2754 return false; | |
| 2755 } | |
| 2756 implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag); | |
| 2757 clear_active_formatting_elements(parser); | |
| 2758 return true; | |
| 2759 } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) { | |
| 2760 if (get_document_node(parser)->v.document.doc_type_quirks_mode != | |
| 2761 GUMBO_DOCTYPE_QUIRKS) { | |
| 2762 maybe_implicitly_close_p_tag(parser, token); | |
| 2763 } | |
| 2764 insert_element_from_token(parser, token); | |
| 2765 set_frameset_not_ok(parser); | |
| 2766 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 2767 return true; | |
| 2768 } else if (tag_in(token, kStartTag, | |
| 2769 (gumbo_tagset){TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), | |
| 2770 TAG(IMAGE), TAG(KEYGEN), TAG(WBR)})) { | |
| 2771 bool success = true; | |
| 2772 if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) { | |
| 2773 success = false; | |
| 2774 parser_add_parse_error(parser, token); | |
| 2775 token->v.start_tag.tag = GUMBO_TAG_IMG; | |
| 2776 } | |
| 2777 reconstruct_active_formatting_elements(parser); | |
| 2778 GumboNode* node = insert_element_from_token(parser, token); | |
| 2779 if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) { | |
| 2780 success = false; | |
| 2781 parser_add_parse_error(parser, token); | |
| 2782 node->v.element.tag = GUMBO_TAG_IMG; | |
| 2783 node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE; | |
| 2784 } | |
| 2785 pop_current_node(parser); | |
| 2786 acknowledge_self_closing_tag(parser); | |
| 2787 set_frameset_not_ok(parser); | |
| 2788 return success; | |
| 2789 } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) { | |
| 2790 if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) { | |
| 2791 // Must be before the element is inserted, as that takes ownership of the | |
| 2792 // token's attribute vector. | |
| 2793 set_frameset_not_ok(parser); | |
| 2794 } | |
| 2795 reconstruct_active_formatting_elements(parser); | |
| 2796 insert_element_from_token(parser, token); | |
| 2797 pop_current_node(parser); | |
| 2798 acknowledge_self_closing_tag(parser); | |
| 2799 return true; | |
| 2800 } else if (tag_in(token, kStartTag, | |
| 2801 (gumbo_tagset){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})) { | |
| 2802 insert_element_from_token(parser, token); | |
| 2803 pop_current_node(parser); | |
| 2804 acknowledge_self_closing_tag(parser); | |
| 2805 return true; | |
| 2806 } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) { | |
| 2807 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2808 insert_element_from_token(parser, token); | |
| 2809 pop_current_node(parser); | |
| 2810 acknowledge_self_closing_tag(parser); | |
| 2811 set_frameset_not_ok(parser); | |
| 2812 return result; | |
| 2813 } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) { | |
| 2814 parser_add_parse_error(parser, token); | |
| 2815 if (parser->_parser_state->_form_element != NULL && | |
| 2816 !has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2817 ignore_token(parser); | |
| 2818 return false; | |
| 2819 } | |
| 2820 acknowledge_self_closing_tag(parser); | |
| 2821 maybe_implicitly_close_p_tag(parser, token); | |
| 2822 set_frameset_not_ok(parser); | |
| 2823 | |
| 2824 GumboVector* token_attrs = &token->v.start_tag.attributes; | |
| 2825 GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt"); | |
| 2826 GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action"); | |
| 2827 GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name"); | |
| 2828 | |
| 2829 GumboNode* form = insert_element_of_tag_type( | |
| 2830 parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX); | |
| 2831 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2832 parser->_parser_state->_form_element = form; | |
| 2833 } | |
| 2834 if (action_attr) { | |
| 2835 gumbo_vector_add(parser, action_attr, &form->v.element.attributes); | |
| 2836 } | |
| 2837 insert_element_of_tag_type( | |
| 2838 parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX); | |
| 2839 pop_current_node(parser); // <hr> | |
| 2840 | |
| 2841 insert_element_of_tag_type( | |
| 2842 parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX); | |
| 2843 TextNodeBufferState* text_state = &parser->_parser_state->_text_node; | |
| 2844 text_state->_start_original_text = token->original_text.data; | |
| 2845 text_state->_start_position = token->position; | |
| 2846 text_state->_type = GUMBO_NODE_TEXT; | |
| 2847 if (prompt_attr) { | |
| 2848 size_t prompt_attr_length = strlen(prompt_attr->value); | |
| 2849 gumbo_string_buffer_destroy(parser, &text_state->_buffer); | |
| 2850 text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value); | |
| 2851 text_state->_buffer.length = prompt_attr_length; | |
| 2852 text_state->_buffer.capacity = prompt_attr_length + 1; | |
| 2853 gumbo_destroy_attribute(parser, prompt_attr); | |
| 2854 } else { | |
| 2855 GumboStringPiece prompt_text = | |
| 2856 GUMBO_STRING("This is a searchable index. Enter search keywords: "); | |
| 2857 gumbo_string_buffer_append_string( | |
| 2858 parser, &prompt_text, &text_state->_buffer); | |
| 2859 } | |
| 2860 | |
| 2861 GumboNode* input = insert_element_of_tag_type( | |
| 2862 parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX); | |
| 2863 for (unsigned int i = 0; i < token_attrs->length; ++i) { | |
| 2864 GumboAttribute* attr = token_attrs->data[i]; | |
| 2865 if (attr != prompt_attr && attr != action_attr && attr != name_attr) { | |
| 2866 gumbo_vector_add(parser, attr, &input->v.element.attributes); | |
| 2867 } | |
| 2868 token_attrs->data[i] = NULL; | |
| 2869 } | |
| 2870 | |
| 2871 // All attributes have been successfully transferred and nulled out at this | |
| 2872 // point, so the call to ignore_token will free the memory for it without | |
| 2873 // touching the attributes. | |
| 2874 ignore_token(parser); | |
| 2875 | |
| 2876 // The name attribute, if present, should be destroyed since it's ignored | |
| 2877 // when copying over. The action attribute should be kept since it's moved | |
| 2878 // to the form. | |
| 2879 if (name_attr) { | |
| 2880 gumbo_destroy_attribute(parser, name_attr); | |
| 2881 } | |
| 2882 | |
| 2883 GumboAttribute* name = | |
| 2884 gumbo_parser_allocate(parser, sizeof(GumboAttribute)); | |
| 2885 GumboStringPiece name_str = GUMBO_STRING("name"); | |
| 2886 GumboStringPiece isindex_str = GUMBO_STRING("isindex"); | |
| 2887 name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE; | |
| 2888 name->name = gumbo_copy_stringz(parser, "name"); | |
| 2889 name->value = gumbo_copy_stringz(parser, "isindex"); | |
| 2890 name->original_name = name_str; | |
| 2891 name->original_value = isindex_str; | |
| 2892 name->name_start = kGumboEmptySourcePosition; | |
| 2893 name->name_end = kGumboEmptySourcePosition; | |
| 2894 name->value_start = kGumboEmptySourcePosition; | |
| 2895 name->value_end = kGumboEmptySourcePosition; | |
| 2896 gumbo_vector_add(parser, name, &input->v.element.attributes); | |
| 2897 | |
| 2898 pop_current_node(parser); // <input> | |
| 2899 pop_current_node(parser); // <label> | |
| 2900 insert_element_of_tag_type( | |
| 2901 parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX); | |
| 2902 pop_current_node(parser); // <hr> | |
| 2903 pop_current_node(parser); // <form> | |
| 2904 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 2905 parser->_parser_state->_form_element = NULL; | |
| 2906 } | |
| 2907 return false; | |
| 2908 } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) { | |
| 2909 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); | |
| 2910 parser->_parser_state->_ignore_next_linefeed = true; | |
| 2911 set_frameset_not_ok(parser); | |
| 2912 return true; | |
| 2913 } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) { | |
| 2914 bool result = maybe_implicitly_close_p_tag(parser, token); | |
| 2915 reconstruct_active_formatting_elements(parser); | |
| 2916 set_frameset_not_ok(parser); | |
| 2917 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); | |
| 2918 return result; | |
| 2919 } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) { | |
| 2920 set_frameset_not_ok(parser); | |
| 2921 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); | |
| 2922 return true; | |
| 2923 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) { | |
| 2924 run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); | |
| 2925 return true; | |
| 2926 } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) { | |
| 2927 reconstruct_active_formatting_elements(parser); | |
| 2928 insert_element_from_token(parser, token); | |
| 2929 set_frameset_not_ok(parser); | |
| 2930 GumboInsertionMode state = parser->_parser_state->_insertion_mode; | |
| 2931 if (state == GUMBO_INSERTION_MODE_IN_TABLE || | |
| 2932 state == GUMBO_INSERTION_MODE_IN_CAPTION || | |
| 2933 state == GUMBO_INSERTION_MODE_IN_TABLE_BODY || | |
| 2934 state == GUMBO_INSERTION_MODE_IN_ROW || | |
| 2935 state == GUMBO_INSERTION_MODE_IN_CELL) { | |
| 2936 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE); | |
| 2937 } else { | |
| 2938 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT); | |
| 2939 } | |
| 2940 return true; | |
| 2941 } else if (tag_in(token, kStartTag, | |
| 2942 (gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) { | |
| 2943 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { | |
| 2944 pop_current_node(parser); | |
| 2945 } | |
| 2946 reconstruct_active_formatting_elements(parser); | |
| 2947 insert_element_from_token(parser, token); | |
| 2948 return true; | |
| 2949 } else if (tag_in(token, kStartTag, | |
| 2950 (gumbo_tagset){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})) { | |
| 2951 bool success = true; | |
| 2952 GumboTag exception = | |
| 2953 tag_in(token, kStartTag, (gumbo_tagset){TAG(RT), TAG(RP)}) | |
| 2954 ? GUMBO_TAG_RTC | |
| 2955 : GUMBO_TAG_LAST; | |
| 2956 if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) { | |
| 2957 generate_implied_end_tags(parser, exception); | |
| 2958 } | |
| 2959 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) && | |
| 2960 !(exception == GUMBO_TAG_LAST || | |
| 2961 node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) { | |
| 2962 parser_add_parse_error(parser, token); | |
| 2963 success = false; | |
| 2964 } | |
| 2965 insert_element_from_token(parser, token); | |
| 2966 return success; | |
| 2967 } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) { | |
| 2968 parser_add_parse_error(parser, token); | |
| 2969 reconstruct_active_formatting_elements(parser); | |
| 2970 insert_element_of_tag_type( | |
| 2971 parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG); | |
| 2972 pop_current_node(parser); | |
| 2973 return false; | |
| 2974 } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) { | |
| 2975 reconstruct_active_formatting_elements(parser); | |
| 2976 adjust_mathml_attributes(parser, token); | |
| 2977 adjust_foreign_attributes(parser, token); | |
| 2978 insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML); | |
| 2979 if (token->v.start_tag.is_self_closing) { | |
| 2980 pop_current_node(parser); | |
| 2981 acknowledge_self_closing_tag(parser); | |
| 2982 } | |
| 2983 return true; | |
| 2984 } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) { | |
| 2985 reconstruct_active_formatting_elements(parser); | |
| 2986 adjust_svg_attributes(parser, token); | |
| 2987 adjust_foreign_attributes(parser, token); | |
| 2988 insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG); | |
| 2989 if (token->v.start_tag.is_self_closing) { | |
| 2990 pop_current_node(parser); | |
| 2991 acknowledge_self_closing_tag(parser); | |
| 2992 } | |
| 2993 return true; | |
| 2994 } else if (tag_in(token, kStartTag, | |
| 2995 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), | |
| 2996 TAG(FRAME), TAG(HEAD), TAG(TBODY), TAG(TD), TAG(TFOOT), | |
| 2997 TAG(TH), TAG(THEAD), TAG(TR)})) { | |
| 2998 parser_add_parse_error(parser, token); | |
| 2999 ignore_token(parser); | |
| 3000 return false; | |
| 3001 } else if (token->type == GUMBO_TOKEN_START_TAG) { | |
| 3002 reconstruct_active_formatting_elements(parser); | |
| 3003 insert_element_from_token(parser, token); | |
| 3004 return true; | |
| 3005 } else { | |
| 3006 assert(token->type == GUMBO_TOKEN_END_TAG); | |
| 3007 GumboTag end_tag = token->v.end_tag; | |
| 3008 assert(state->_open_elements.length > 0); | |
| 3009 assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); | |
| 3010 // Walk up the stack of open elements until we find one that either: | |
| 3011 // a) Matches the tag name we saw | |
| 3012 // b) Is in the "special" category. | |
| 3013 // If we see a), implicitly close everything up to and including it. If we | |
| 3014 // see b), then record a parse error, don't close anything (except the | |
| 3015 // implied end tags) and ignore the end tag token. | |
| 3016 for (int i = state->_open_elements.length; --i >= 0;) { | |
| 3017 const GumboNode* node = state->_open_elements.data[i]; | |
| 3018 if (node_html_tag_is(node, end_tag)) { | |
| 3019 generate_implied_end_tags(parser, end_tag); | |
| 3020 // TODO(jdtang): Do I need to add a parse error here? The condition in | |
| 3021 // the spec seems like it's the inverse of the loop condition above, and | |
| 3022 // so would never fire. | |
| 3023 while (node != pop_current_node(parser)) | |
| 3024 ; // Pop everything. | |
| 3025 return true; | |
| 3026 } else if (is_special_node(node)) { | |
| 3027 parser_add_parse_error(parser, token); | |
| 3028 ignore_token(parser); | |
| 3029 return false; | |
| 3030 } | |
| 3031 } | |
| 3032 // <html> is in the special category, so we should never get here. | |
| 3033 assert(0); | |
| 3034 return false; | |
| 3035 } | |
| 3036 } | |
| 3037 | |
| 3038 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata | |
| 3039 static bool handle_text(GumboParser* parser, GumboToken* token) { | |
| 3040 if (token->type == GUMBO_TOKEN_CHARACTER || | |
| 3041 token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 3042 insert_text_token(parser, token); | |
| 3043 } else { | |
| 3044 // We provide only bare-bones script handling that doesn't involve any of | |
| 3045 // the parser-pause/already-started/script-nesting flags or re-entrant | |
| 3046 // invocations of the tokenizer. Because the intended usage of this library | |
| 3047 // is mostly for templating, refactoring, and static-analysis libraries, we | |
| 3048 // provide the script body as a text-node child of the <script> element. | |
| 3049 // This behavior doesn't support document.write of partial HTML elements, | |
| 3050 // but should be adequate for almost all other scripting support. | |
| 3051 if (token->type == GUMBO_TOKEN_EOF) { | |
| 3052 parser_add_parse_error(parser, token); | |
| 3053 parser->_parser_state->_reprocess_current_token = true; | |
| 3054 } | |
| 3055 pop_current_node(parser); | |
| 3056 set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode); | |
| 3057 } | |
| 3058 return true; | |
| 3059 } | |
| 3060 | |
| 3061 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable | |
| 3062 static bool handle_in_table(GumboParser* parser, GumboToken* token) { | |
| 3063 GumboParserState* state = parser->_parser_state; | |
| 3064 if (token->type == GUMBO_TOKEN_CHARACTER || | |
| 3065 token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 3066 // The "pending table character tokens" list described in the spec is | |
| 3067 // nothing more than the TextNodeBufferState. We accumulate text tokens as | |
| 3068 // normal, except that when we go to flush them in the handle_in_table_text, | |
| 3069 // we set _foster_parent_insertions if there're non-whitespace characters in | |
| 3070 // the buffer. | |
| 3071 assert(state->_text_node._buffer.length == 0); | |
| 3072 state->_original_insertion_mode = state->_insertion_mode; | |
| 3073 state->_reprocess_current_token = true; | |
| 3074 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT); | |
| 3075 return true; | |
| 3076 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 3077 parser_add_parse_error(parser, token); | |
| 3078 ignore_token(parser); | |
| 3079 return false; | |
| 3080 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 3081 append_comment_node(parser, get_current_node(parser), token); | |
| 3082 return true; | |
| 3083 } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) { | |
| 3084 clear_stack_to_table_context(parser); | |
| 3085 add_formatting_element(parser, &kActiveFormattingScopeMarker); | |
| 3086 insert_element_from_token(parser, token); | |
| 3087 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION); | |
| 3088 return true; | |
| 3089 } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) { | |
| 3090 clear_stack_to_table_context(parser); | |
| 3091 insert_element_from_token(parser, token); | |
| 3092 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); | |
| 3093 return true; | |
| 3094 } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { | |
| 3095 clear_stack_to_table_context(parser); | |
| 3096 insert_element_of_tag_type( | |
| 3097 parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED); | |
| 3098 parser->_parser_state->_reprocess_current_token = true; | |
| 3099 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); | |
| 3100 return true; | |
| 3101 } else if (tag_in(token, kStartTag, | |
| 3102 (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD), | |
| 3103 TAG(TH), TAG(TR)})) { | |
| 3104 clear_stack_to_table_context(parser); | |
| 3105 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); | |
| 3106 if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH), TAG(TR)})) { | |
| 3107 insert_element_of_tag_type( | |
| 3108 parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED); | |
| 3109 state->_reprocess_current_token = true; | |
| 3110 } else { | |
| 3111 insert_element_from_token(parser, token); | |
| 3112 } | |
| 3113 return true; | |
| 3114 } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) { | |
| 3115 parser_add_parse_error(parser, token); | |
| 3116 if (close_table(parser)) { | |
| 3117 parser->_parser_state->_reprocess_current_token = true; | |
| 3118 } else { | |
| 3119 ignore_token(parser); | |
| 3120 } | |
| 3121 return false; | |
| 3122 } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { | |
| 3123 if (!close_table(parser)) { | |
| 3124 parser_add_parse_error(parser, token); | |
| 3125 return false; | |
| 3126 } | |
| 3127 return true; | |
| 3128 } else if (tag_in(token, kEndTag, | |
| 3129 (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), | |
| 3130 TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT), | |
| 3131 TAG(TH), TAG(THEAD), TAG(TR)})) { | |
| 3132 parser_add_parse_error(parser, token); | |
| 3133 ignore_token(parser); | |
| 3134 return false; | |
| 3135 } else if (tag_in(token, kStartTag, | |
| 3136 (gumbo_tagset){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) || | |
| 3137 (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) { | |
| 3138 return handle_in_head(parser, token); | |
| 3139 } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) && | |
| 3140 attribute_matches( | |
| 3141 &token->v.start_tag.attributes, "type", "hidden")) { | |
| 3142 parser_add_parse_error(parser, token); | |
| 3143 insert_element_from_token(parser, token); | |
| 3144 pop_current_node(parser); | |
| 3145 return false; | |
| 3146 } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { | |
| 3147 parser_add_parse_error(parser, token); | |
| 3148 if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 3149 ignore_token(parser); | |
| 3150 return false; | |
| 3151 } | |
| 3152 state->_form_element = insert_element_from_token(parser, token); | |
| 3153 pop_current_node(parser); | |
| 3154 return false; | |
| 3155 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3156 return handle_in_body(parser, token); | |
| 3157 } else { | |
| 3158 parser_add_parse_error(parser, token); | |
| 3159 state->_foster_parent_insertions = true; | |
| 3160 bool result = handle_in_body(parser, token); | |
| 3161 state->_foster_parent_insertions = false; | |
| 3162 return result; | |
| 3163 } | |
| 3164 } | |
| 3165 | |
| 3166 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext | |
| 3167 static bool handle_in_table_text(GumboParser* parser, GumboToken* token) { | |
| 3168 if (token->type == GUMBO_TOKEN_NULL) { | |
| 3169 parser_add_parse_error(parser, token); | |
| 3170 ignore_token(parser); | |
| 3171 return false; | |
| 3172 } else if (token->type == GUMBO_TOKEN_CHARACTER || | |
| 3173 token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 3174 insert_text_token(parser, token); | |
| 3175 return true; | |
| 3176 } else { | |
| 3177 GumboParserState* state = parser->_parser_state; | |
| 3178 GumboStringBuffer* buffer = &state->_text_node._buffer; | |
| 3179 // Can't use strspn for this because GumboStringBuffers are not | |
| 3180 // null-terminated. | |
| 3181 // Note that TextNodeBuffer may contain UTF-8 characters, but the presence | |
| 3182 // of any one byte that is not whitespace means we flip the flag, so this | |
| 3183 // loop is still valid. | |
| 3184 for (unsigned int i = 0; i < buffer->length; ++i) { | |
| 3185 if (!isspace((unsigned char) buffer->data[i]) || | |
| 3186 buffer->data[i] == '\v') { | |
| 3187 state->_foster_parent_insertions = true; | |
| 3188 reconstruct_active_formatting_elements(parser); | |
| 3189 break; | |
| 3190 } | |
| 3191 } | |
| 3192 maybe_flush_text_node_buffer(parser); | |
| 3193 state->_foster_parent_insertions = false; | |
| 3194 state->_reprocess_current_token = true; | |
| 3195 state->_insertion_mode = state->_original_insertion_mode; | |
| 3196 return true; | |
| 3197 } | |
| 3198 } | |
| 3199 | |
| 3200 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption | |
| 3201 static bool handle_in_caption(GumboParser* parser, GumboToken* token) { | |
| 3202 if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) { | |
| 3203 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) { | |
| 3204 parser_add_parse_error(parser, token); | |
| 3205 ignore_token(parser); | |
| 3206 return false; | |
| 3207 } else { | |
| 3208 generate_implied_end_tags(parser, GUMBO_TAG_LAST); | |
| 3209 bool result = true; | |
| 3210 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) { | |
| 3211 parser_add_parse_error(parser, token); | |
| 3212 } | |
| 3213 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION)) | |
| 3214 ; | |
| 3215 clear_active_formatting_elements(parser); | |
| 3216 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 3217 return result; | |
| 3218 } | |
| 3219 } else if (tag_in(token, kStartTag, | |
| 3220 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), | |
| 3221 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), | |
| 3222 TAG(TR)}) || | |
| 3223 (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) { | |
| 3224 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) { | |
| 3225 parser_add_parse_error(parser, token); | |
| 3226 ignore_token(parser); | |
| 3227 return false; | |
| 3228 } | |
| 3229 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION)) | |
| 3230 ; | |
| 3231 clear_active_formatting_elements(parser); | |
| 3232 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 3233 parser->_parser_state->_reprocess_current_token = true; | |
| 3234 return true; | |
| 3235 } else if (tag_in(token, kEndTag, | |
| 3236 (gumbo_tagset){TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), | |
| 3237 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), | |
| 3238 TAG(TR)})) { | |
| 3239 parser_add_parse_error(parser, token); | |
| 3240 ignore_token(parser); | |
| 3241 return false; | |
| 3242 } else { | |
| 3243 return handle_in_body(parser, token); | |
| 3244 } | |
| 3245 } | |
| 3246 | |
| 3247 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup | |
| 3248 static bool handle_in_column_group(GumboParser* parser, GumboToken* token) { | |
| 3249 if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 3250 insert_text_token(parser, token); | |
| 3251 return true; | |
| 3252 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 3253 parser_add_parse_error(parser, token); | |
| 3254 ignore_token(parser); | |
| 3255 return false; | |
| 3256 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 3257 append_comment_node(parser, get_current_node(parser), token); | |
| 3258 return true; | |
| 3259 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 3260 return handle_in_body(parser, token); | |
| 3261 } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { | |
| 3262 insert_element_from_token(parser, token); | |
| 3263 pop_current_node(parser); | |
| 3264 acknowledge_self_closing_tag(parser); | |
| 3265 return true; | |
| 3266 } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) { | |
| 3267 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) { | |
| 3268 parser_add_parse_error(parser, token); | |
| 3269 ignore_token(parser); | |
| 3270 return false; | |
| 3271 } | |
| 3272 pop_current_node(parser); | |
| 3273 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 3274 return false; | |
| 3275 } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) { | |
| 3276 parser_add_parse_error(parser, token); | |
| 3277 ignore_token(parser); | |
| 3278 return false; | |
| 3279 } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) || | |
| 3280 tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { | |
| 3281 return handle_in_head(parser, token); | |
| 3282 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3283 return handle_in_body(parser, token); | |
| 3284 } else { | |
| 3285 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) { | |
| 3286 parser_add_parse_error(parser, token); | |
| 3287 ignore_token(parser); | |
| 3288 return false; | |
| 3289 } | |
| 3290 pop_current_node(parser); | |
| 3291 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 3292 parser->_parser_state->_reprocess_current_token = true; | |
| 3293 return true; | |
| 3294 } | |
| 3295 } | |
| 3296 | |
| 3297 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody | |
| 3298 static bool handle_in_table_body(GumboParser* parser, GumboToken* token) { | |
| 3299 if (tag_is(token, kStartTag, GUMBO_TAG_TR)) { | |
| 3300 clear_stack_to_table_body_context(parser); | |
| 3301 insert_element_from_token(parser, token); | |
| 3302 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); | |
| 3303 return true; | |
| 3304 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { | |
| 3305 parser_add_parse_error(parser, token); | |
| 3306 clear_stack_to_table_body_context(parser); | |
| 3307 insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED); | |
| 3308 parser->_parser_state->_reprocess_current_token = true; | |
| 3309 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); | |
| 3310 return false; | |
| 3311 } else if (tag_in(token, kEndTag, | |
| 3312 (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { | |
| 3313 if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { | |
| 3314 parser_add_parse_error(parser, token); | |
| 3315 ignore_token(parser); | |
| 3316 return false; | |
| 3317 } | |
| 3318 clear_stack_to_table_body_context(parser); | |
| 3319 pop_current_node(parser); | |
| 3320 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 3321 return true; | |
| 3322 } else if (tag_in(token, kStartTag, | |
| 3323 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), | |
| 3324 TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) || | |
| 3325 tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { | |
| 3326 if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) || | |
| 3327 has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) || | |
| 3328 has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) { | |
| 3329 parser_add_parse_error(parser, token); | |
| 3330 ignore_token(parser); | |
| 3331 return false; | |
| 3332 } | |
| 3333 clear_stack_to_table_body_context(parser); | |
| 3334 pop_current_node(parser); | |
| 3335 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 3336 parser->_parser_state->_reprocess_current_token = true; | |
| 3337 return true; | |
| 3338 } else if (tag_in(token, kEndTag, | |
| 3339 (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR), | |
| 3340 TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) { | |
| 3341 parser_add_parse_error(parser, token); | |
| 3342 ignore_token(parser); | |
| 3343 return false; | |
| 3344 } else { | |
| 3345 return handle_in_table(parser, token); | |
| 3346 } | |
| 3347 } | |
| 3348 | |
| 3349 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr | |
| 3350 static bool handle_in_row(GumboParser* parser, GumboToken* token) { | |
| 3351 if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TH), TAG(TD)})) { | |
| 3352 clear_stack_to_table_row_context(parser); | |
| 3353 insert_element_from_token(parser, token); | |
| 3354 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL); | |
| 3355 add_formatting_element(parser, &kActiveFormattingScopeMarker); | |
| 3356 return true; | |
| 3357 } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) { | |
| 3358 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) { | |
| 3359 parser_add_parse_error(parser, token); | |
| 3360 ignore_token(parser); | |
| 3361 return false; | |
| 3362 } else { | |
| 3363 clear_stack_to_table_row_context(parser); | |
| 3364 pop_current_node(parser); | |
| 3365 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); | |
| 3366 return true; | |
| 3367 } | |
| 3368 } else if (tag_in(token, kStartTag, | |
| 3369 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), | |
| 3370 TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)}) || | |
| 3371 tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { | |
| 3372 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) { | |
| 3373 parser_add_parse_error(parser, token); | |
| 3374 ignore_token(parser); | |
| 3375 return false; | |
| 3376 } else { | |
| 3377 clear_stack_to_table_row_context(parser); | |
| 3378 pop_current_node(parser); | |
| 3379 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); | |
| 3380 parser->_parser_state->_reprocess_current_token = true; | |
| 3381 return true; | |
| 3382 } | |
| 3383 } else if (tag_in(token, kEndTag, | |
| 3384 (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { | |
| 3385 if (!has_an_element_in_table_scope(parser, token->v.end_tag) || | |
| 3386 (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) { | |
| 3387 parser_add_parse_error(parser, token); | |
| 3388 ignore_token(parser); | |
| 3389 return false; | |
| 3390 } else { | |
| 3391 clear_stack_to_table_row_context(parser); | |
| 3392 pop_current_node(parser); | |
| 3393 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); | |
| 3394 parser->_parser_state->_reprocess_current_token = true; | |
| 3395 return true; | |
| 3396 } | |
| 3397 } else if (tag_in(token, kEndTag, | |
| 3398 (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), | |
| 3399 TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) { | |
| 3400 parser_add_parse_error(parser, token); | |
| 3401 ignore_token(parser); | |
| 3402 return false; | |
| 3403 } else { | |
| 3404 return handle_in_table(parser, token); | |
| 3405 } | |
| 3406 } | |
| 3407 | |
| 3408 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd | |
| 3409 static bool handle_in_cell(GumboParser* parser, GumboToken* token) { | |
| 3410 if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { | |
| 3411 GumboTag token_tag = token->v.end_tag; | |
| 3412 if (!has_an_element_in_table_scope(parser, token_tag)) { | |
| 3413 parser_add_parse_error(parser, token); | |
| 3414 ignore_token(parser); | |
| 3415 return false; | |
| 3416 } | |
| 3417 return close_table_cell(parser, token, token_tag); | |
| 3418 } else if (tag_in(token, kStartTag, | |
| 3419 (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), | |
| 3420 TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), | |
| 3421 TAG(TR)})) { | |
| 3422 gumbo_debug("Handling <td> in cell.\n"); | |
| 3423 if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) && | |
| 3424 !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) { | |
| 3425 gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n"); | |
| 3426 parser_add_parse_error(parser, token); | |
| 3427 ignore_token(parser); | |
| 3428 return false; | |
| 3429 } | |
| 3430 parser->_parser_state->_reprocess_current_token = true; | |
| 3431 return close_current_cell(parser, token); | |
| 3432 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(CAPTION), | |
| 3433 TAG(COL), TAG(COLGROUP), TAG(HTML)})) { | |
| 3434 parser_add_parse_error(parser, token); | |
| 3435 ignore_token(parser); | |
| 3436 return false; | |
| 3437 } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TABLE), TAG(TBODY), | |
| 3438 TAG(TFOOT), TAG(THEAD), TAG(TR)})) { | |
| 3439 if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { | |
| 3440 parser_add_parse_error(parser, token); | |
| 3441 ignore_token(parser); | |
| 3442 return false; | |
| 3443 } | |
| 3444 parser->_parser_state->_reprocess_current_token = true; | |
| 3445 return close_current_cell(parser, token); | |
| 3446 } else { | |
| 3447 return handle_in_body(parser, token); | |
| 3448 } | |
| 3449 } | |
| 3450 | |
| 3451 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect | |
| 3452 static bool handle_in_select(GumboParser* parser, GumboToken* token) { | |
| 3453 if (token->type == GUMBO_TOKEN_NULL) { | |
| 3454 parser_add_parse_error(parser, token); | |
| 3455 ignore_token(parser); | |
| 3456 return false; | |
| 3457 } else if (token->type == GUMBO_TOKEN_CHARACTER || | |
| 3458 token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 3459 insert_text_token(parser, token); | |
| 3460 return true; | |
| 3461 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 3462 parser_add_parse_error(parser, token); | |
| 3463 ignore_token(parser); | |
| 3464 return false; | |
| 3465 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 3466 append_comment_node(parser, get_current_node(parser), token); | |
| 3467 return true; | |
| 3468 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 3469 return handle_in_body(parser, token); | |
| 3470 } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) { | |
| 3471 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { | |
| 3472 pop_current_node(parser); | |
| 3473 } | |
| 3474 insert_element_from_token(parser, token); | |
| 3475 return true; | |
| 3476 } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) { | |
| 3477 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { | |
| 3478 pop_current_node(parser); | |
| 3479 } | |
| 3480 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) { | |
| 3481 pop_current_node(parser); | |
| 3482 } | |
| 3483 insert_element_from_token(parser, token); | |
| 3484 return true; | |
| 3485 } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) { | |
| 3486 GumboVector* open_elements = &parser->_parser_state->_open_elements; | |
| 3487 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) && | |
| 3488 node_html_tag_is(open_elements->data[open_elements->length - 2], | |
| 3489 GUMBO_TAG_OPTGROUP)) { | |
| 3490 pop_current_node(parser); | |
| 3491 } | |
| 3492 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) { | |
| 3493 pop_current_node(parser); | |
| 3494 return true; | |
| 3495 } else { | |
| 3496 parser_add_parse_error(parser, token); | |
| 3497 ignore_token(parser); | |
| 3498 return false; | |
| 3499 } | |
| 3500 } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) { | |
| 3501 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { | |
| 3502 pop_current_node(parser); | |
| 3503 return true; | |
| 3504 } else { | |
| 3505 parser_add_parse_error(parser, token); | |
| 3506 ignore_token(parser); | |
| 3507 return false; | |
| 3508 } | |
| 3509 } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) { | |
| 3510 if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { | |
| 3511 parser_add_parse_error(parser, token); | |
| 3512 ignore_token(parser); | |
| 3513 return false; | |
| 3514 } | |
| 3515 close_current_select(parser); | |
| 3516 return true; | |
| 3517 } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) { | |
| 3518 parser_add_parse_error(parser, token); | |
| 3519 ignore_token(parser); | |
| 3520 if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { | |
| 3521 close_current_select(parser); | |
| 3522 } | |
| 3523 return false; | |
| 3524 } else if (tag_in(token, kStartTag, | |
| 3525 (gumbo_tagset){TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})) { | |
| 3526 parser_add_parse_error(parser, token); | |
| 3527 if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { | |
| 3528 ignore_token(parser); | |
| 3529 } else { | |
| 3530 close_current_select(parser); | |
| 3531 parser->_parser_state->_reprocess_current_token = true; | |
| 3532 } | |
| 3533 return false; | |
| 3534 } else if (tag_in(token, kStartTag, | |
| 3535 (gumbo_tagset){TAG(SCRIPT), TAG(TEMPLATE)}) || | |
| 3536 tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { | |
| 3537 return handle_in_head(parser, token); | |
| 3538 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3539 return handle_in_body(parser, token); | |
| 3540 } else { | |
| 3541 parser_add_parse_error(parser, token); | |
| 3542 ignore_token(parser); | |
| 3543 return false; | |
| 3544 } | |
| 3545 } | |
| 3546 | |
| 3547 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable | |
| 3548 static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) { | |
| 3549 if (tag_in(token, kStartTag, | |
| 3550 (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), | |
| 3551 TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) { | |
| 3552 parser_add_parse_error(parser, token); | |
| 3553 close_current_select(parser); | |
| 3554 parser->_parser_state->_reprocess_current_token = true; | |
| 3555 return false; | |
| 3556 } else if (tag_in(token, kEndTag, | |
| 3557 (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), | |
| 3558 TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) { | |
| 3559 parser_add_parse_error(parser, token); | |
| 3560 if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { | |
| 3561 ignore_token(parser); | |
| 3562 return false; | |
| 3563 } else { | |
| 3564 close_current_select(parser); | |
| 3565 // close_current_select already does the | |
| 3566 // reset_insertion_mode_appropriately | |
| 3567 // reset_insertion_mode_appropriately(parser); | |
| 3568 parser->_parser_state->_reprocess_current_token = true; | |
| 3569 return false; | |
| 3570 } | |
| 3571 } else { | |
| 3572 return handle_in_select(parser, token); | |
| 3573 } | |
| 3574 } | |
| 3575 | |
| 3576 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate | |
| 3577 static bool handle_in_template(GumboParser* parser, GumboToken* token) { | |
| 3578 GumboParserState* state = parser->_parser_state; | |
| 3579 if (token->type == GUMBO_TOKEN_WHITESPACE || | |
| 3580 token->type == GUMBO_TOKEN_CHARACTER || | |
| 3581 token->type == GUMBO_TOKEN_COMMENT || token->type == GUMBO_TOKEN_NULL || | |
| 3582 token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 3583 return handle_in_body(parser, token); | |
| 3584 } else if (tag_in(token, kStartTag, | |
| 3585 (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), | |
| 3586 TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), | |
| 3587 TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) || | |
| 3588 tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { | |
| 3589 return handle_in_head(parser, token); | |
| 3590 } else if (tag_in( | |
| 3591 token, kStartTag, (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), | |
| 3592 TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { | |
| 3593 pop_template_insertion_mode(parser); | |
| 3594 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 3595 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); | |
| 3596 state->_reprocess_current_token = true; | |
| 3597 return true; | |
| 3598 } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { | |
| 3599 pop_template_insertion_mode(parser); | |
| 3600 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); | |
| 3601 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); | |
| 3602 state->_reprocess_current_token = true; | |
| 3603 return true; | |
| 3604 } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) { | |
| 3605 pop_template_insertion_mode(parser); | |
| 3606 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); | |
| 3607 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); | |
| 3608 state->_reprocess_current_token = true; | |
| 3609 return true; | |
| 3610 } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { | |
| 3611 pop_template_insertion_mode(parser); | |
| 3612 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); | |
| 3613 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); | |
| 3614 state->_reprocess_current_token = true; | |
| 3615 return true; | |
| 3616 } else if (token->type == GUMBO_TOKEN_START_TAG) { | |
| 3617 pop_template_insertion_mode(parser); | |
| 3618 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); | |
| 3619 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); | |
| 3620 state->_reprocess_current_token = true; | |
| 3621 return true; | |
| 3622 } else if (token->type == GUMBO_TOKEN_END_TAG) { | |
| 3623 parser_add_parse_error(parser, token); | |
| 3624 ignore_token(parser); | |
| 3625 return false; | |
| 3626 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3627 if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { | |
| 3628 // Stop parsing. | |
| 3629 return true; | |
| 3630 } | |
| 3631 parser_add_parse_error(parser, token); | |
| 3632 while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE)) | |
| 3633 ; | |
| 3634 clear_active_formatting_elements(parser); | |
| 3635 pop_template_insertion_mode(parser); | |
| 3636 reset_insertion_mode_appropriately(parser); | |
| 3637 state->_reprocess_current_token = true; | |
| 3638 return false; | |
| 3639 } else { | |
| 3640 assert(0); | |
| 3641 return false; | |
| 3642 } | |
| 3643 } | |
| 3644 | |
| 3645 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody | |
| 3646 static bool handle_after_body(GumboParser* parser, GumboToken* token) { | |
| 3647 if (token->type == GUMBO_TOKEN_WHITESPACE || | |
| 3648 tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 3649 return handle_in_body(parser, token); | |
| 3650 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 3651 GumboNode* html_node = parser->_output->root; | |
| 3652 assert(html_node != NULL); | |
| 3653 append_comment_node(parser, html_node, token); | |
| 3654 return true; | |
| 3655 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 3656 parser_add_parse_error(parser, token); | |
| 3657 ignore_token(parser); | |
| 3658 return false; | |
| 3659 } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { | |
| 3660 /* fragment case: ignore the closing HTML token */ | |
| 3661 if (is_fragment_parser(parser)) { | |
| 3662 parser_add_parse_error(parser, token); | |
| 3663 ignore_token(parser); | |
| 3664 return false; | |
| 3665 } | |
| 3666 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY); | |
| 3667 GumboNode* html = parser->_parser_state->_open_elements.data[0]; | |
| 3668 assert(node_html_tag_is(html, GUMBO_TAG_HTML)); | |
| 3669 record_end_of_element( | |
| 3670 parser->_parser_state->_current_token, &html->v.element); | |
| 3671 return true; | |
| 3672 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3673 return true; | |
| 3674 } else { | |
| 3675 parser_add_parse_error(parser, token); | |
| 3676 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); | |
| 3677 parser->_parser_state->_reprocess_current_token = true; | |
| 3678 return false; | |
| 3679 } | |
| 3680 } | |
| 3681 | |
| 3682 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset | |
| 3683 static bool handle_in_frameset(GumboParser* parser, GumboToken* token) { | |
| 3684 if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 3685 insert_text_token(parser, token); | |
| 3686 return true; | |
| 3687 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 3688 append_comment_node(parser, get_current_node(parser), token); | |
| 3689 return true; | |
| 3690 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 3691 parser_add_parse_error(parser, token); | |
| 3692 ignore_token(parser); | |
| 3693 return false; | |
| 3694 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 3695 return handle_in_body(parser, token); | |
| 3696 } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { | |
| 3697 insert_element_from_token(parser, token); | |
| 3698 return true; | |
| 3699 } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) { | |
| 3700 if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) { | |
| 3701 parser_add_parse_error(parser, token); | |
| 3702 ignore_token(parser); | |
| 3703 return false; | |
| 3704 } | |
| 3705 pop_current_node(parser); | |
| 3706 if (!is_fragment_parser(parser) && | |
| 3707 !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) { | |
| 3708 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET); | |
| 3709 } | |
| 3710 return true; | |
| 3711 } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) { | |
| 3712 insert_element_from_token(parser, token); | |
| 3713 pop_current_node(parser); | |
| 3714 acknowledge_self_closing_tag(parser); | |
| 3715 return true; | |
| 3716 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { | |
| 3717 return handle_in_head(parser, token); | |
| 3718 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3719 if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) { | |
| 3720 parser_add_parse_error(parser, token); | |
| 3721 return false; | |
| 3722 } | |
| 3723 return true; | |
| 3724 } else { | |
| 3725 parser_add_parse_error(parser, token); | |
| 3726 ignore_token(parser); | |
| 3727 return false; | |
| 3728 } | |
| 3729 } | |
| 3730 | |
| 3731 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset | |
| 3732 static bool handle_after_frameset(GumboParser* parser, GumboToken* token) { | |
| 3733 if (token->type == GUMBO_TOKEN_WHITESPACE) { | |
| 3734 insert_text_token(parser, token); | |
| 3735 return true; | |
| 3736 } else if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 3737 append_comment_node(parser, get_current_node(parser), token); | |
| 3738 return true; | |
| 3739 } else if (token->type == GUMBO_TOKEN_DOCTYPE) { | |
| 3740 parser_add_parse_error(parser, token); | |
| 3741 ignore_token(parser); | |
| 3742 return false; | |
| 3743 } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 3744 return handle_in_body(parser, token); | |
| 3745 } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { | |
| 3746 GumboNode* html = parser->_parser_state->_open_elements.data[0]; | |
| 3747 assert(node_html_tag_is(html, GUMBO_TAG_HTML)); | |
| 3748 record_end_of_element( | |
| 3749 parser->_parser_state->_current_token, &html->v.element); | |
| 3750 set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET); | |
| 3751 return true; | |
| 3752 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { | |
| 3753 return handle_in_head(parser, token); | |
| 3754 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3755 return true; | |
| 3756 } else { | |
| 3757 parser_add_parse_error(parser, token); | |
| 3758 ignore_token(parser); | |
| 3759 return false; | |
| 3760 } | |
| 3761 } | |
| 3762 | |
| 3763 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode | |
| 3764 static bool handle_after_after_body(GumboParser* parser, GumboToken* token) { | |
| 3765 if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 3766 append_comment_node(parser, get_document_node(parser), token); | |
| 3767 return true; | |
| 3768 } else if (token->type == GUMBO_TOKEN_DOCTYPE || | |
| 3769 token->type == GUMBO_TOKEN_WHITESPACE || | |
| 3770 tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 3771 return handle_in_body(parser, token); | |
| 3772 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3773 return true; | |
| 3774 } else { | |
| 3775 parser_add_parse_error(parser, token); | |
| 3776 set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY); | |
| 3777 parser->_parser_state->_reprocess_current_token = true; | |
| 3778 return false; | |
| 3779 } | |
| 3780 } | |
| 3781 | |
| 3782 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode | |
| 3783 static bool handle_after_after_frameset( | |
| 3784 GumboParser* parser, GumboToken* token) { | |
| 3785 if (token->type == GUMBO_TOKEN_COMMENT) { | |
| 3786 append_comment_node(parser, get_document_node(parser), token); | |
| 3787 return true; | |
| 3788 } else if (token->type == GUMBO_TOKEN_DOCTYPE || | |
| 3789 token->type == GUMBO_TOKEN_WHITESPACE || | |
| 3790 tag_is(token, kStartTag, GUMBO_TAG_HTML)) { | |
| 3791 return handle_in_body(parser, token); | |
| 3792 } else if (token->type == GUMBO_TOKEN_EOF) { | |
| 3793 return true; | |
| 3794 } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { | |
| 3795 return handle_in_head(parser, token); | |
| 3796 } else { | |
| 3797 parser_add_parse_error(parser, token); | |
| 3798 ignore_token(parser); | |
| 3799 return false; | |
| 3800 } | |
| 3801 } | |
| 3802 | |
| 3803 // Function pointers for each insertion mode. Keep in sync with | |
| 3804 // insertion_mode.h. | |
| 3805 typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token); | |
| 3806 static const TokenHandler kTokenHandlers[] = {handle_initial, | |
| 3807 handle_before_html, handle_before_head, handle_in_head, | |
| 3808 handle_in_head_noscript, handle_after_head, handle_in_body, handle_text, | |
| 3809 handle_in_table, handle_in_table_text, handle_in_caption, | |
| 3810 handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell, | |
| 3811 handle_in_select, handle_in_select_in_table, handle_in_template, | |
| 3812 handle_after_body, handle_in_frameset, handle_after_frameset, | |
| 3813 handle_after_after_body, handle_after_after_frameset}; | |
| 3814 | |
| 3815 static bool handle_html_content(GumboParser* parser, GumboToken* token) { | |
| 3816 return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode]( | |
| 3817 parser, token); | |
| 3818 } | |
| 3819 | |
| 3820 // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign | |
| 3821 static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { | |
| 3822 gumbo_debug("Handling foreign content"); | |
| 3823 switch (token->type) { | |
| 3824 case GUMBO_TOKEN_NULL: | |
| 3825 parser_add_parse_error(parser, token); | |
| 3826 token->v.character = kUtf8ReplacementChar; | |
| 3827 insert_text_token(parser, token); | |
| 3828 return false; | |
| 3829 case GUMBO_TOKEN_WHITESPACE: | |
| 3830 insert_text_token(parser, token); | |
| 3831 return true; | |
| 3832 case GUMBO_TOKEN_CDATA: | |
| 3833 case GUMBO_TOKEN_CHARACTER: | |
| 3834 insert_text_token(parser, token); | |
| 3835 set_frameset_not_ok(parser); | |
| 3836 return true; | |
| 3837 case GUMBO_TOKEN_COMMENT: | |
| 3838 append_comment_node(parser, get_current_node(parser), token); | |
| 3839 return true; | |
| 3840 case GUMBO_TOKEN_DOCTYPE: | |
| 3841 parser_add_parse_error(parser, token); | |
| 3842 ignore_token(parser); | |
| 3843 return false; | |
| 3844 default: | |
| 3845 // Fall through to the if-statements below. | |
| 3846 break; | |
| 3847 } | |
| 3848 // Order matters for these clauses. | |
| 3849 if (tag_in(token, kStartTag, | |
| 3850 (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), | |
| 3851 TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT), | |
| 3852 TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), | |
| 3853 TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI), | |
| 3854 TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P), | |
| 3855 TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG), | |
| 3856 TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U), | |
| 3857 TAG(UL), TAG(VAR)}) || | |
| 3858 (tag_is(token, kStartTag, GUMBO_TAG_FONT) && | |
| 3859 (token_has_attribute(token, "color") || | |
| 3860 token_has_attribute(token, "face") || | |
| 3861 token_has_attribute(token, "size")))) { | |
| 3862 /* Parse error */ | |
| 3863 parser_add_parse_error(parser, token); | |
| 3864 | |
| 3865 /* | |
| 3866 * Fragment case: If the parser was originally created for the HTML | |
| 3867 * fragment parsing algorithm, then act as described in the "any other | |
| 3868 * start tag" entry below. | |
| 3869 */ | |
| 3870 if (!is_fragment_parser(parser)) { | |
| 3871 do { | |
| 3872 pop_current_node(parser); | |
| 3873 } while (!(is_mathml_integration_point(get_current_node(parser)) || | |
| 3874 is_html_integration_point(get_current_node(parser)) || | |
| 3875 get_current_node(parser)->v.element.tag_namespace == | |
| 3876 GUMBO_NAMESPACE_HTML)); | |
| 3877 parser->_parser_state->_reprocess_current_token = true; | |
| 3878 return false; | |
| 3879 } | |
| 3880 | |
| 3881 assert(token->type == GUMBO_TOKEN_START_TAG); | |
| 3882 } | |
| 3883 | |
| 3884 if (token->type == GUMBO_TOKEN_START_TAG) { | |
| 3885 const GumboNamespaceEnum current_namespace = | |
| 3886 get_adjusted_current_node(parser)->v.element.tag_namespace; | |
| 3887 if (current_namespace == GUMBO_NAMESPACE_MATHML) { | |
| 3888 adjust_mathml_attributes(parser, token); | |
| 3889 } | |
| 3890 if (current_namespace == GUMBO_NAMESPACE_SVG) { | |
| 3891 // Tag adjustment is left to the gumbo_normalize_svg_tagname helper | |
| 3892 // function. | |
| 3893 adjust_svg_attributes(parser, token); | |
| 3894 } | |
| 3895 adjust_foreign_attributes(parser, token); | |
| 3896 insert_foreign_element(parser, token, current_namespace); | |
| 3897 if (token->v.start_tag.is_self_closing) { | |
| 3898 pop_current_node(parser); | |
| 3899 acknowledge_self_closing_tag(parser); | |
| 3900 } | |
| 3901 return true; | |
| 3902 // </script> tags are handled like any other end tag, putting the script's | |
| 3903 // text into a text node child and closing the current node. | |
| 3904 } else { | |
| 3905 assert(token->type == GUMBO_TOKEN_END_TAG); | |
| 3906 GumboNode* node = get_current_node(parser); | |
| 3907 assert(node != NULL); | |
| 3908 GumboStringPiece token_tagname = token->original_text; | |
| 3909 GumboStringPiece node_tagname = node->v.element.original_tag; | |
| 3910 gumbo_tag_from_original_text(&token_tagname); | |
| 3911 gumbo_tag_from_original_text(&node_tagname); | |
| 3912 | |
| 3913 bool is_success = true; | |
| 3914 if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) { | |
| 3915 parser_add_parse_error(parser, token); | |
| 3916 is_success = false; | |
| 3917 } | |
| 3918 int i = parser->_parser_state->_open_elements.length; | |
| 3919 for (--i; i > 0;) { | |
| 3920 // Here we move up the stack until we find an HTML element (in which | |
| 3921 // case we do nothing) or we find the element that we're about to | |
| 3922 // close (in which case we pop everything we've seen until that | |
| 3923 // point.) | |
| 3924 gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length, | |
| 3925 node_tagname.data, i); | |
| 3926 if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) { | |
| 3927 gumbo_debug("Matches.\n"); | |
| 3928 while (pop_current_node(parser) != node) { | |
| 3929 // Pop all the nodes below the current one. Node is guaranteed to | |
| 3930 // be an element on the stack of open elements (set below), so | |
| 3931 // this loop is guaranteed to terminate. | |
| 3932 } | |
| 3933 return is_success; | |
| 3934 } | |
| 3935 --i; | |
| 3936 node = parser->_parser_state->_open_elements.data[i]; | |
| 3937 if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) { | |
| 3938 // Must break before gumbo_tag_from_original_text to avoid passing | |
| 3939 // parser-inserted nodes through. | |
| 3940 break; | |
| 3941 } | |
| 3942 node_tagname = node->v.element.original_tag; | |
| 3943 gumbo_tag_from_original_text(&node_tagname); | |
| 3944 } | |
| 3945 assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML); | |
| 3946 // We can't call handle_token directly because the current node is still in | |
| 3947 // the SVG namespace, so it would re-enter this and result in infinite | |
| 3948 // recursion. | |
| 3949 return handle_html_content(parser, token) && is_success; | |
| 3950 } | |
| 3951 } | |
| 3952 | |
| 3953 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction | |
| 3954 static bool handle_token(GumboParser* parser, GumboToken* token) { | |
| 3955 if (parser->_parser_state->_ignore_next_linefeed && | |
| 3956 token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') { | |
| 3957 parser->_parser_state->_ignore_next_linefeed = false; | |
| 3958 ignore_token(parser); | |
| 3959 return true; | |
| 3960 } | |
| 3961 // This needs to be reset both here and in the conditional above to catch both | |
| 3962 // the case where the next token is not whitespace (so we don't ignore | |
| 3963 // whitespace in the middle of <pre> tags) and where there are multiple | |
| 3964 // whitespace tokens (so we don't ignore the second one). | |
| 3965 parser->_parser_state->_ignore_next_linefeed = false; | |
| 3966 | |
| 3967 if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) { | |
| 3968 parser->_parser_state->_closed_body_tag = true; | |
| 3969 } | |
| 3970 if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { | |
| 3971 parser->_parser_state->_closed_html_tag = true; | |
| 3972 } | |
| 3973 | |
| 3974 const GumboNode* current_node = get_adjusted_current_node(parser); | |
| 3975 assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT || | |
| 3976 current_node->type == GUMBO_NODE_TEMPLATE); | |
| 3977 if (current_node) { | |
| 3978 gumbo_debug("Current node: <%s>.\n", | |
| 3979 gumbo_normalized_tagname(current_node->v.element.tag)); | |
| 3980 } | |
| 3981 if (!current_node || | |
| 3982 current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML || | |
| 3983 (is_mathml_integration_point(current_node) && | |
| 3984 (token->type == GUMBO_TOKEN_CHARACTER || | |
| 3985 token->type == GUMBO_TOKEN_WHITESPACE || | |
| 3986 token->type == GUMBO_TOKEN_NULL || | |
| 3987 (token->type == GUMBO_TOKEN_START_TAG && | |
| 3988 !tag_in(token, kStartTag, | |
| 3989 (gumbo_tagset){TAG(MGLYPH), TAG(MALIGNMARK)})))) || | |
| 3990 (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML && | |
| 3991 node_qualified_tag_is( | |
| 3992 current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && | |
| 3993 tag_is(token, kStartTag, GUMBO_TAG_SVG)) || | |
| 3994 (is_html_integration_point(current_node) && | |
| 3995 (token->type == GUMBO_TOKEN_START_TAG || | |
| 3996 token->type == GUMBO_TOKEN_CHARACTER || | |
| 3997 token->type == GUMBO_TOKEN_NULL || | |
| 3998 token->type == GUMBO_TOKEN_WHITESPACE)) || | |
| 3999 token->type == GUMBO_TOKEN_EOF) { | |
| 4000 return handle_html_content(parser, token); | |
| 4001 } else { | |
| 4002 return handle_in_foreign_content(parser, token); | |
| 4003 } | |
| 4004 } | |
| 4005 | |
| 4006 static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx, | |
| 4007 GumboNamespaceEnum fragment_namespace) { | |
| 4008 GumboNode* root; | |
| 4009 assert(fragment_ctx != GUMBO_TAG_LAST); | |
| 4010 | |
| 4011 // 3 | |
| 4012 parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx); | |
| 4013 parser->_parser_state->_fragment_ctx->v.element.tag_namespace = | |
| 4014 fragment_namespace; | |
| 4015 | |
| 4016 // 4 | |
| 4017 if (fragment_namespace == GUMBO_NAMESPACE_HTML) { | |
| 4018 // Non-HTML namespaces always start in the DATA state. | |
| 4019 switch (fragment_ctx) { | |
| 4020 case GUMBO_TAG_TITLE: | |
| 4021 case GUMBO_TAG_TEXTAREA: | |
| 4022 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); | |
| 4023 break; | |
| 4024 | |
| 4025 case GUMBO_TAG_STYLE: | |
| 4026 case GUMBO_TAG_XMP: | |
| 4027 case GUMBO_TAG_IFRAME: | |
| 4028 case GUMBO_TAG_NOEMBED: | |
| 4029 case GUMBO_TAG_NOFRAMES: | |
| 4030 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); | |
| 4031 break; | |
| 4032 | |
| 4033 case GUMBO_TAG_SCRIPT: | |
| 4034 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); | |
| 4035 break; | |
| 4036 | |
| 4037 case GUMBO_TAG_NOSCRIPT: | |
| 4038 /* scripting is disabled in Gumbo, so leave the tokenizer | |
| 4039 * in the default data state */ | |
| 4040 break; | |
| 4041 | |
| 4042 case GUMBO_TAG_PLAINTEXT: | |
| 4043 gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT); | |
| 4044 break; | |
| 4045 | |
| 4046 default: | |
| 4047 /* default data state */ | |
| 4048 break; | |
| 4049 } | |
| 4050 } | |
| 4051 | |
| 4052 // 5. 6. 7. | |
| 4053 root = insert_element_of_tag_type( | |
| 4054 parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED); | |
| 4055 parser->_output->root = root; | |
| 4056 | |
| 4057 // 8. | |
| 4058 if (fragment_ctx == GUMBO_TAG_TEMPLATE) { | |
| 4059 push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE); | |
| 4060 } | |
| 4061 | |
| 4062 // 10. | |
| 4063 reset_insertion_mode_appropriately(parser); | |
| 4064 } | |
| 4065 | |
| 4066 GumboOutput* gumbo_parse(const char* buffer) { | |
| 4067 return gumbo_parse_with_options( | |
| 4068 &kGumboDefaultOptions, buffer, strlen(buffer)); | |
| 4069 } | |
| 4070 | |
| 4071 GumboOutput* gumbo_parse_with_options( | |
| 4072 const GumboOptions* options, const char* buffer, size_t length) { | |
| 4073 GumboParser parser; | |
| 4074 parser._options = options; | |
| 4075 output_init(&parser); | |
| 4076 gumbo_tokenizer_state_init(&parser, buffer, length); | |
| 4077 parser_state_init(&parser); | |
| 4078 | |
| 4079 if (options->fragment_context != GUMBO_TAG_LAST) { | |
| 4080 fragment_parser_init( | |
| 4081 &parser, options->fragment_context, options->fragment_namespace); | |
| 4082 } | |
| 4083 | |
| 4084 GumboParserState* state = parser._parser_state; | |
| 4085 gumbo_debug("Parsing %.*s.\n", length, buffer); | |
| 4086 | |
| 4087 // Sanity check so that infinite loops die with an assertion failure instead | |
| 4088 // of hanging the process before we ever get an error. | |
| 4089 int loop_count = 0; | |
| 4090 | |
| 4091 GumboToken token; | |
| 4092 bool has_error = false; | |
| 4093 | |
| 4094 do { | |
| 4095 if (state->_reprocess_current_token) { | |
| 4096 state->_reprocess_current_token = false; | |
| 4097 } else { | |
| 4098 GumboNode* current_node = get_current_node(&parser); | |
| 4099 gumbo_tokenizer_set_is_current_node_foreign(&parser, | |
| 4100 current_node && | |
| 4101 current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML); | |
| 4102 has_error = !gumbo_lex(&parser, &token) || has_error; | |
| 4103 } | |
| 4104 const char* token_type = "text"; | |
| 4105 switch (token.type) { | |
| 4106 case GUMBO_TOKEN_DOCTYPE: | |
| 4107 token_type = "doctype"; | |
| 4108 break; | |
| 4109 case GUMBO_TOKEN_START_TAG: | |
| 4110 token_type = gumbo_normalized_tagname(token.v.start_tag.tag); | |
| 4111 break; | |
| 4112 case GUMBO_TOKEN_END_TAG: | |
| 4113 token_type = gumbo_normalized_tagname(token.v.end_tag); | |
| 4114 break; | |
| 4115 case GUMBO_TOKEN_COMMENT: | |
| 4116 token_type = "comment"; | |
| 4117 break; | |
| 4118 default: | |
| 4119 break; | |
| 4120 } | |
| 4121 gumbo_debug("Handling %s token @%d:%d in state %d.\n", (char*) token_type, | |
| 4122 token.position.line, token.position.column, state->_insertion_mode); | |
| 4123 | |
| 4124 state->_current_token = &token; | |
| 4125 state->_self_closing_flag_acknowledged = | |
| 4126 !(token.type == GUMBO_TOKEN_START_TAG && | |
| 4127 token.v.start_tag.is_self_closing); | |
| 4128 | |
| 4129 has_error = !handle_token(&parser, &token) || has_error; | |
| 4130 | |
| 4131 // Check for memory leaks when ownership is transferred from start tag | |
| 4132 // tokens to nodes. | |
| 4133 assert(state->_reprocess_current_token || | |
| 4134 token.type != GUMBO_TOKEN_START_TAG || | |
| 4135 token.v.start_tag.attributes.data == NULL); | |
| 4136 | |
| 4137 if (!state->_self_closing_flag_acknowledged) { | |
| 4138 GumboError* error = parser_add_parse_error(&parser, &token); | |
| 4139 if (error) { | |
| 4140 error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG; | |
| 4141 } | |
| 4142 } | |
| 4143 | |
| 4144 ++loop_count; | |
| 4145 assert(loop_count < 1000000000); | |
| 4146 | |
| 4147 } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) && | |
| 4148 !(options->stop_on_first_error && has_error)); | |
| 4149 | |
| 4150 finish_parsing(&parser); | |
| 4151 // For API uniformity reasons, if the doctype still has nulls, convert them to | |
| 4152 // empty strings. | |
| 4153 GumboDocument* doc_type = &parser._output->document->v.document; | |
| 4154 if (doc_type->name == NULL) { | |
| 4155 doc_type->name = gumbo_copy_stringz(&parser, ""); | |
| 4156 } | |
| 4157 if (doc_type->public_identifier == NULL) { | |
| 4158 doc_type->public_identifier = gumbo_copy_stringz(&parser, ""); | |
| 4159 } | |
| 4160 if (doc_type->system_identifier == NULL) { | |
| 4161 doc_type->system_identifier = gumbo_copy_stringz(&parser, ""); | |
| 4162 } | |
| 4163 | |
| 4164 parser_state_destroy(&parser); | |
| 4165 gumbo_tokenizer_state_destroy(&parser); | |
| 4166 return parser._output; | |
| 4167 } | |
| 4168 | |
| 4169 void gumbo_destroy_node(GumboOptions* options, GumboNode* node) { | |
| 4170 // Need a dummy GumboParser because the allocator comes along with the | |
| 4171 // options object. | |
| 4172 GumboParser parser; | |
| 4173 parser._options = options; | |
| 4174 destroy_node(&parser, node); | |
| 4175 } | |
| 4176 | |
| 4177 void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) { | |
| 4178 // Need a dummy GumboParser because the allocator comes along with the | |
| 4179 // options object. | |
| 4180 GumboParser parser; | |
| 4181 parser._options = options; | |
| 4182 destroy_node(&parser, output->document); | |
| 4183 for (unsigned int i = 0; i < output->errors.length; ++i) { | |
| 4184 gumbo_error_destroy(&parser, output->errors.data[i]); | |
| 4185 } | |
| 4186 gumbo_vector_destroy(&parser, &output->errors); | |
| 4187 gumbo_parser_deallocate(&parser, output); | |
| 4188 } |
