Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/src/tokenizer.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // | |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 // you may not use this file except in compliance with the License. | |
| 5 // You may obtain a copy of the License at | |
| 6 // | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // | |
| 9 // Unless required by applicable law or agreed to in writing, software | |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 // See the License for the specific language governing permissions and | |
| 13 // limitations under the License. | |
| 14 // | |
| 15 // Author: jdtang@google.com (Jonathan Tang) | |
| 16 // | |
| 17 // Coding conventions specific to this file: | |
| 18 // | |
| 19 // 1. Functions that fill in a token should be named emit_*, and should be | |
| 20 // followed immediately by a return from the tokenizer (true if no error | |
| 21 // occurred, false if an error occurred). Sometimes the emit functions | |
| 22 // themselves return a boolean so that they can be combined with the return | |
| 23 // statement; in this case, they should match this convention. | |
| 24 // 2. Functions that shuffle data from temporaries to final API structures | |
| 25 // should be named finish_*, and be called just before the tokenizer exits the | |
| 26 // state that accumulates the temporary. | |
| 27 // 3. All internal data structures should be kept in an initialized state from | |
| 28 // tokenizer creation onwards, ready to accept input. When a buffer's flushed | |
| 29 // and reset, it should be deallocated and immediately reinitialized. | |
| 30 // 4. Make sure there are appropriate break statements following each state. | |
| 31 // 5. Assertions on the state of the temporary and tag buffers are usually a | |
| 32 // good idea, and should go at the entry point of each state when added. | |
| 33 // 6. Statement order within states goes: | |
| 34 // 1. Add parse errors, if appropriate. | |
| 35 // 2. Call finish_* functions to build up tag state. | |
| 36 // 3. Switch to new state. Set _reconsume flag if appropriate. | |
| 37 // 4. Perform any other temporary buffer manipulation. | |
| 38 // 5. Emit tokens | |
| 39 // 6. Return/break. | |
| 40 // This order ensures that we can verify that every emit is followed by a | |
| 41 // return, ensures that the correct state is recorded with any parse errors, and | |
| 42 // prevents parse error position from being messed up by possible mark/resets in | |
| 43 // temporary buffer manipulation. | |
| 44 | |
| 45 #include "tokenizer.h" | |
| 46 | |
| 47 #include <assert.h> | |
| 48 #include <stdbool.h> | |
| 49 #include <string.h> | |
| 50 | |
| 51 #include "attribute.h" | |
| 52 #include "char_ref.h" | |
| 53 #include "error.h" | |
| 54 #include "gumbo.h" | |
| 55 #include "parser.h" | |
| 56 #include "string_buffer.h" | |
| 57 #include "string_piece.h" | |
| 58 #include "token_type.h" | |
| 59 #include "tokenizer_states.h" | |
| 60 #include "utf8.h" | |
| 61 #include "util.h" | |
| 62 #include "vector.h" | |
| 63 | |
| 64 // Compared against _script_data_buffer to determine if we're in double-escaped | |
| 65 // script mode. | |
| 66 const GumboStringPiece kScriptTag = {"script", 6}; | |
| 67 | |
// Result code produced by each individual tokenizer state.
typedef enum {
  RETURN_ERROR = 0,    // The tokenizer should return false (an error occurred).
  RETURN_SUCCESS = 1,  // The tokenizer should return true (no error).
  NEXT_CHAR = 2        // Move on to the next character and keep lexing.
} StateResult;
| 74 | |
| 75 // This is a struct containing state necessary to build up a tag token, | |
| 76 // character by character. | |
| 77 typedef struct GumboInternalTagState { | |
| 78 // A buffer to accumulate characters for various GumboStringPiece fields. | |
| 79 GumboStringBuffer _buffer; | |
| 80 | |
| 81 // A pointer to the start of the original text corresponding to the contents | |
| 82 // of the buffer. | |
| 83 const char* _original_text; | |
| 84 | |
| 85 // The current tag enum, computed once the tag name state has finished so that | |
| 86 // the buffer can be re-used for building up attributes. | |
| 87 GumboTag _tag; | |
| 88 | |
| 89 // The starting location of the text in the buffer. | |
| 90 GumboSourcePosition _start_pos; | |
| 91 | |
| 92 // The current list of attributes. This is copied (and ownership of its data | |
| 93 // transferred) to the GumboStartTag token upon completion of the tag. New | |
| 94 // attributes are added as soon as their attribute name state is complete, and | |
| 95 // values are filled in by operating on _attributes.data[attributes.length-1]. | |
| 96 GumboVector /* GumboAttribute */ _attributes; | |
| 97 | |
| 98 // If true, the next attribute value to be finished should be dropped. This | |
| 99 // happens if a duplicate attribute name is encountered - we want to consume | |
| 100 // the attribute value, but shouldn't overwrite the existing value. | |
| 101 bool _drop_next_attr_value; | |
| 102 | |
| 103 // The state that caused the tokenizer to switch into a character reference in | |
| 104 // attribute value state. This is used to set the additional allowed | |
| 105 // character, and is switched back to on completion. Initialized as the | |
| 106 // tokenizer enters the character reference state. | |
| 107 GumboTokenizerEnum _attr_value_state; | |
| 108 | |
| 109 // The last start tag to have been emitted by the tokenizer. This is | |
| 110 // necessary to check for appropriate end tags. | |
| 111 GumboTag _last_start_tag; | |
| 112 | |
| 113 // If true, then this is a start tag. If false, it's an end tag. This is | |
| 114 // necessary to generate the appropriate token type at tag-closing time. | |
| 115 bool _is_start_tag; | |
| 116 | |
| 117 // If true, then this tag is "self-closing" and doesn't have an end tag. | |
| 118 bool _is_self_closing; | |
| 119 } GumboTagState; | |
| 120 | |
| 121 // This is the main tokenizer state struct, containing all state used by in | |
| 122 // tokenizing the input stream. | |
| 123 typedef struct GumboInternalTokenizerState { | |
| 124 // The current lexer state. Starts in GUMBO_LEX_DATA. | |
| 125 GumboTokenizerEnum _state; | |
| 126 | |
| 127 // A flag indicating whether the current input character needs to reconsumed | |
| 128 // in another state, or whether the next input character should be read for | |
| 129 // the next iteration of the state loop. This is set when the spec reads | |
| 130 // "Reconsume the current input character in..." | |
| 131 bool _reconsume_current_input; | |
| 132 | |
| 133 // A flag indicating whether the current node is a foreign element. This is | |
| 134 // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the | |
| 135 // markup declaration state. | |
| 136 bool _is_current_node_foreign; | |
| 137 | |
| 138 // A flag indicating whether the tokenizer is in a CDATA section. If so, then | |
| 139 // text tokens emitted will be GUMBO_TOKEN_CDATA. | |
| 140 bool _is_in_cdata; | |
| 141 | |
| 142 // Certain states (notably character references) may emit two character tokens | |
| 143 // at once, but the contract for lex() fills in only one token at a time. The | |
| 144 // extra character is buffered here, and then this is checked on entry to | |
| 145 // lex(). If a character is stored here, it's immediately emitted and control | |
| 146 // returns from the lexer. kGumboNoChar is used to represent 'no character | |
| 147 // stored.' | |
| 148 // | |
| 149 // Note that characters emitted through this mechanism will have their source | |
| 150 // position marked as the character under the mark, i.e. multiple characters | |
| 151 // may be emitted with the same position. This is desirable for character | |
| 152 // references, but unsuitable for many other cases. Use the _temporary_buffer | |
| 153 // mechanism if the buffered characters must have their original positions in | |
| 154 // the document. | |
| 155 int _buffered_emit_char; | |
| 156 | |
| 157 // A temporary buffer to accumulate characters, as described by the "temporary | |
| 158 // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox | |
| 159 // way: we record the specific character to go into the buffer, which may | |
| 160 // sometimes be a lowercased version of the actual input character. However, | |
| 161 // we *also* use utf8iterator_mark() to record the position at tag start. | |
| 162 // When we start flushing the temporary buffer, we set _temporary_buffer_emit | |
| 163 // to the start of it, and then increment it for each call to the tokenizer. | |
| 164 // We also call utf8iterator_reset(), and utf8iterator_next() through the | |
| 165 // input stream, so that tokens emitted by emit_char have the correct position | |
| 166 // and original text. | |
| 167 GumboStringBuffer _temporary_buffer; | |
| 168 | |
| 169 // The current cursor position we're emitting from within | |
| 170 // _temporary_buffer.data. NULL whenever we're not flushing the buffer. | |
| 171 const char* _temporary_buffer_emit; | |
| 172 | |
| 173 // The temporary buffer is also used by the spec to check whether we should | |
| 174 // enter the script data double escaped state, but we can't use the same | |
| 175 // buffer for both because we have to flush out "<s" as emits while still | |
| 176 // maintaining the context that will eventually become "script". This is a | |
| 177 // separate buffer that's used in place of the temporary buffer for states | |
| 178 // that may enter the script data double escape start state. | |
| 179 GumboStringBuffer _script_data_buffer; | |
| 180 | |
| 181 // Pointer to the beginning of the current token in the original buffer; used | |
| 182 // to record the original text. | |
| 183 const char* _token_start; | |
| 184 | |
| 185 // GumboSourcePosition recording the source location of the start of the | |
| 186 // current token. | |
| 187 GumboSourcePosition _token_start_pos; | |
| 188 | |
| 189 // Current tag state. | |
| 190 GumboTagState _tag_state; | |
| 191 | |
| 192 // Doctype state. We use the temporary buffer to accumulate characters (it's | |
| 193 // not used for anything else in the doctype states), and then freshly | |
| 194 // allocate the strings in the doctype token, then copy it over on emit. | |
| 195 GumboTokenDocType _doc_type_state; | |
| 196 | |
| 197 // The UTF8Iterator over the tokenizer input. | |
| 198 Utf8Iterator _input; | |
| 199 } GumboTokenizerState; | |
| 200 | |
| 201 // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct. | |
| 202 static void tokenizer_add_parse_error( | |
| 203 GumboParser* parser, GumboErrorType type) { | |
| 204 GumboError* error = gumbo_add_error(parser); | |
| 205 if (!error) { | |
| 206 return; | |
| 207 } | |
| 208 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 209 utf8iterator_get_position(&tokenizer->_input, &error->position); | |
| 210 error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input); | |
| 211 error->type = type; | |
| 212 error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input); | |
| 213 switch (tokenizer->_state) { | |
| 214 case GUMBO_LEX_DATA: | |
| 215 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA; | |
| 216 break; | |
| 217 case GUMBO_LEX_CHAR_REF_IN_DATA: | |
| 218 case GUMBO_LEX_CHAR_REF_IN_RCDATA: | |
| 219 case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE: | |
| 220 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF; | |
| 221 break; | |
| 222 case GUMBO_LEX_RCDATA: | |
| 223 case GUMBO_LEX_RCDATA_LT: | |
| 224 case GUMBO_LEX_RCDATA_END_TAG_OPEN: | |
| 225 case GUMBO_LEX_RCDATA_END_TAG_NAME: | |
| 226 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA; | |
| 227 break; | |
| 228 case GUMBO_LEX_RAWTEXT: | |
| 229 case GUMBO_LEX_RAWTEXT_LT: | |
| 230 case GUMBO_LEX_RAWTEXT_END_TAG_OPEN: | |
| 231 case GUMBO_LEX_RAWTEXT_END_TAG_NAME: | |
| 232 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT; | |
| 233 break; | |
| 234 case GUMBO_LEX_PLAINTEXT: | |
| 235 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT; | |
| 236 break; | |
| 237 case GUMBO_LEX_SCRIPT: | |
| 238 case GUMBO_LEX_SCRIPT_LT: | |
| 239 case GUMBO_LEX_SCRIPT_END_TAG_OPEN: | |
| 240 case GUMBO_LEX_SCRIPT_END_TAG_NAME: | |
| 241 case GUMBO_LEX_SCRIPT_ESCAPED_START: | |
| 242 case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH: | |
| 243 case GUMBO_LEX_SCRIPT_ESCAPED: | |
| 244 case GUMBO_LEX_SCRIPT_ESCAPED_DASH: | |
| 245 case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH: | |
| 246 case GUMBO_LEX_SCRIPT_ESCAPED_LT: | |
| 247 case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN: | |
| 248 case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME: | |
| 249 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START: | |
| 250 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED: | |
| 251 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH: | |
| 252 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH: | |
| 253 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT: | |
| 254 case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END: | |
| 255 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT; | |
| 256 break; | |
| 257 case GUMBO_LEX_TAG_OPEN: | |
| 258 case GUMBO_LEX_END_TAG_OPEN: | |
| 259 case GUMBO_LEX_TAG_NAME: | |
| 260 case GUMBO_LEX_BEFORE_ATTR_NAME: | |
| 261 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG; | |
| 262 break; | |
| 263 case GUMBO_LEX_SELF_CLOSING_START_TAG: | |
| 264 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG; | |
| 265 break; | |
| 266 case GUMBO_LEX_ATTR_NAME: | |
| 267 case GUMBO_LEX_AFTER_ATTR_NAME: | |
| 268 case GUMBO_LEX_BEFORE_ATTR_VALUE: | |
| 269 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME; | |
| 270 break; | |
| 271 case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED: | |
| 272 case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED: | |
| 273 case GUMBO_LEX_ATTR_VALUE_UNQUOTED: | |
| 274 case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED: | |
| 275 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE; | |
| 276 break; | |
| 277 case GUMBO_LEX_BOGUS_COMMENT: | |
| 278 case GUMBO_LEX_COMMENT_START: | |
| 279 case GUMBO_LEX_COMMENT_START_DASH: | |
| 280 case GUMBO_LEX_COMMENT: | |
| 281 case GUMBO_LEX_COMMENT_END_DASH: | |
| 282 case GUMBO_LEX_COMMENT_END: | |
| 283 case GUMBO_LEX_COMMENT_END_BANG: | |
| 284 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT; | |
| 285 break; | |
| 286 case GUMBO_LEX_MARKUP_DECLARATION: | |
| 287 case GUMBO_LEX_DOCTYPE: | |
| 288 case GUMBO_LEX_BEFORE_DOCTYPE_NAME: | |
| 289 case GUMBO_LEX_DOCTYPE_NAME: | |
| 290 case GUMBO_LEX_AFTER_DOCTYPE_NAME: | |
| 291 case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD: | |
| 292 case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID: | |
| 293 case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED: | |
| 294 case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED: | |
| 295 case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID: | |
| 296 case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID: | |
| 297 case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD: | |
| 298 case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID: | |
| 299 case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED: | |
| 300 case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED: | |
| 301 case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID: | |
| 302 case GUMBO_LEX_BOGUS_DOCTYPE: | |
| 303 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE; | |
| 304 break; | |
| 305 case GUMBO_LEX_CDATA: | |
| 306 error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA; | |
| 307 break; | |
| 308 } | |
| 309 } | |
| 310 | |
// Returns true if c is an ASCII letter ('A'-'Z' or 'a'-'z').
static bool is_alpha(int c) {
  // The ISO C isupper/islower functions are deliberately avoided: they depend
  // upon the program's locale, whereas the behavior of the HTML5 spec is
  // independent of which locale the program is run in.
  if (c >= 'a' && c <= 'z') {
    return true;
  }
  return c >= 'A' && c <= 'Z';
}
| 317 | |
// Maps 'A'-'Z' to the corresponding lowercase letter; every other value is
// returned unchanged.
static int ensure_lowercase(int c) {
  if (c < 'A' || c > 'Z') {
    return c;
  }
  return c + 0x20;
}
| 321 | |
| 322 static GumboTokenType get_char_token_type(bool is_in_cdata, int c) { | |
| 323 if (is_in_cdata && c > 0) { | |
| 324 return GUMBO_TOKEN_CDATA; | |
| 325 } | |
| 326 | |
| 327 switch (c) { | |
| 328 case '\t': | |
| 329 case '\n': | |
| 330 case '\r': | |
| 331 case '\f': | |
| 332 case ' ': | |
| 333 return GUMBO_TOKEN_WHITESPACE; | |
| 334 case 0: | |
| 335 gumbo_debug("Emitted null byte.\n"); | |
| 336 return GUMBO_TOKEN_NULL; | |
| 337 case -1: | |
| 338 return GUMBO_TOKEN_EOF; | |
| 339 default: | |
| 340 return GUMBO_TOKEN_CHARACTER; | |
| 341 } | |
| 342 } | |
| 343 | |
| 344 // Starts recording characters in the temporary buffer. | |
| 345 // Because this needs to reset the utf8iterator_mark to the beginning of the | |
| 346 // text that will eventually be emitted, it needs to be called a couple of | |
| 347 // states before the spec says "Set the temporary buffer to the empty string". | |
| 348 // In general, this should be called whenever there's a transition to a | |
| 349 // "less-than sign state". The initial < and possibly / then need to be | |
| 350 // appended to the temporary buffer, their presence needs to be accounted for in | |
| 351 // states that compare the temporary buffer against a literal value, and | |
| 352 // spec stanzas that say "emit a < and / character token along with a character | |
| 353 // token for each character in the temporary buffer" need to be adjusted to | |
| 354 // account for the presence of the < and / inside the temporary buffer. | |
| 355 static void clear_temporary_buffer(GumboParser* parser) { | |
| 356 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 357 assert(!tokenizer->_temporary_buffer_emit); | |
| 358 utf8iterator_mark(&tokenizer->_input); | |
| 359 gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer); | |
| 360 // The temporary buffer and script data buffer are the same object in the | |
| 361 // spec, so the script data buffer should be cleared as well. | |
| 362 gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer); | |
| 363 } | |
| 364 | |
| 365 // Appends a codepoint to the temporary buffer. | |
| 366 static void append_char_to_temporary_buffer( | |
| 367 GumboParser* parser, int codepoint) { | |
| 368 gumbo_string_buffer_append_codepoint( | |
| 369 parser, codepoint, &parser->_tokenizer_state->_temporary_buffer); | |
| 370 } | |
| 371 | |
| 372 // Checks to see if the temporary buffer equals a certain string. | |
| 373 // Make sure this remains side-effect free; it's used in assertions. | |
| 374 #ifndef NDEBUG | |
| 375 static bool temporary_buffer_equals(GumboParser* parser, const char* text) { | |
| 376 GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer; | |
| 377 // TODO(jdtang): See if the extra strlen is a performance problem, and replace | |
| 378 // it with an explicit sizeof(literal) if necessary. I don't think it will | |
| 379 // be, as this is only used in a couple of rare states. | |
| 380 size_t text_len = strlen(text); | |
| 381 return text_len == buffer->length && | |
| 382 memcmp(buffer->data, text, text_len) == 0; | |
| 383 } | |
| 384 #endif | |
| 385 | |
| 386 static void doc_type_state_init(GumboParser* parser) { | |
| 387 GumboTokenDocType* doc_type_state = | |
| 388 &parser->_tokenizer_state->_doc_type_state; | |
| 389 // We initialize these to NULL here so that we don't end up leaking memory if | |
| 390 // we never see a doctype token. When we do see a doctype token, we reset | |
| 391 // them to a freshly-allocated empty string so that we can present a uniform | |
| 392 // interface to client code and not make them check for null. Ownership is | |
| 393 // transferred to the doctype token when it's emitted. | |
| 394 doc_type_state->name = NULL; | |
| 395 doc_type_state->public_identifier = NULL; | |
| 396 doc_type_state->system_identifier = NULL; | |
| 397 doc_type_state->force_quirks = false; | |
| 398 doc_type_state->has_public_identifier = false; | |
| 399 doc_type_state->has_system_identifier = false; | |
| 400 } | |
| 401 | |
| 402 // Sets the token original_text and position to the current iterator position. | |
| 403 // This is necessary because [CDATA[ sections may include text that is ignored | |
| 404 // by the tokenizer. | |
| 405 static void reset_token_start_point(GumboTokenizerState* tokenizer) { | |
| 406 tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input); | |
| 407 utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos); | |
| 408 } | |
| 409 | |
| 410 // Sets the tag buffer original text and start point to the current iterator | |
| 411 // position. This is necessary because attribute names & values may have | |
| 412 // whitespace preceeding them, and so we can't assume that the actual token | |
| 413 // starting point was the end of the last tag buffer usage. | |
| 414 static void reset_tag_buffer_start_point(GumboParser* parser) { | |
| 415 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 416 GumboTagState* tag_state = &tokenizer->_tag_state; | |
| 417 | |
| 418 utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos); | |
| 419 tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input); | |
| 420 } | |
| 421 | |
| 422 // Moves the temporary buffer contents over to the specified output string, | |
| 423 // and clears the temporary buffer. | |
| 424 static void finish_temporary_buffer(GumboParser* parser, const char** output) { | |
| 425 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 426 *output = | |
| 427 gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer); | |
| 428 clear_temporary_buffer(parser); | |
| 429 } | |
| 430 | |
| 431 // Advances the iterator past the end of the token, and then fills in the | |
| 432 // relevant position fields. It's assumed that after every emit, the tokenizer | |
| 433 // will immediately return (letting the tree-construction stage read the filled | |
| 434 // in Token). Thus, it's safe to advance the input stream here, since it will | |
| 435 // bypass the advance at the bottom of the state machine loop. | |
| 436 // | |
| 437 // Since this advances the iterator and resets the current input, make sure to | |
| 438 // call it after you've recorded any other data you need for the token. | |
| 439 static void finish_token(GumboParser* parser, GumboToken* token) { | |
| 440 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 441 if (!tokenizer->_reconsume_current_input) { | |
| 442 utf8iterator_next(&tokenizer->_input); | |
| 443 } | |
| 444 | |
| 445 token->position = tokenizer->_token_start_pos; | |
| 446 token->original_text.data = tokenizer->_token_start; | |
| 447 reset_token_start_point(tokenizer); | |
| 448 token->original_text.length = | |
| 449 tokenizer->_token_start - token->original_text.data; | |
| 450 if (token->original_text.length > 0 && | |
| 451 token->original_text.data[token->original_text.length - 1] == '\r') { | |
| 452 // The UTF8 iterator will ignore carriage returns in the input stream, which | |
| 453 // means that the next token may start one past a \r character. The pointer | |
| 454 // arithmetic above results in that \r being appended to the original text | |
| 455 // of the preceding token, so we have to adjust its length here to chop the | |
| 456 // \r off. | |
| 457 --token->original_text.length; | |
| 458 } | |
| 459 } | |
| 460 | |
| 461 // Records the doctype public ID, assumed to be in the temporary buffer. | |
| 462 // Convenience method that also sets has_public_identifier to true. | |
| 463 static void finish_doctype_public_id(GumboParser* parser) { | |
| 464 GumboTokenDocType* doc_type_state = | |
| 465 &parser->_tokenizer_state->_doc_type_state; | |
| 466 gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier); | |
| 467 finish_temporary_buffer(parser, &doc_type_state->public_identifier); | |
| 468 doc_type_state->has_public_identifier = true; | |
| 469 } | |
| 470 | |
| 471 // Records the doctype system ID, assumed to be in the temporary buffer. | |
| 472 // Convenience method that also sets has_system_identifier to true. | |
| 473 static void finish_doctype_system_id(GumboParser* parser) { | |
| 474 GumboTokenDocType* doc_type_state = | |
| 475 &parser->_tokenizer_state->_doc_type_state; | |
| 476 gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier); | |
| 477 finish_temporary_buffer(parser, &doc_type_state->system_identifier); | |
| 478 doc_type_state->has_system_identifier = true; | |
| 479 } | |
| 480 | |
| 481 // Writes a single specified character to the output token. | |
| 482 static void emit_char(GumboParser* parser, int c, GumboToken* output) { | |
| 483 output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c); | |
| 484 output->v.character = c; | |
| 485 finish_token(parser, output); | |
| 486 } | |
| 487 | |
| 488 // Writes a replacement character token and records a parse error. | |
| 489 // Always returns RETURN_ERROR, per gumbo_lex return value. | |
| 490 static StateResult emit_replacement_char( | |
| 491 GumboParser* parser, GumboToken* output) { | |
| 492 // In all cases, this is because of a null byte in the input stream. | |
| 493 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 494 emit_char(parser, kUtf8ReplacementChar, output); | |
| 495 return RETURN_ERROR; | |
| 496 } | |
| 497 | |
| 498 // Writes an EOF character token. Always returns RETURN_SUCCESS. | |
| 499 static StateResult emit_eof(GumboParser* parser, GumboToken* output) { | |
| 500 emit_char(parser, -1, output); | |
| 501 return RETURN_SUCCESS; | |
| 502 } | |
| 503 | |
| 504 // Writes the current input character out as a character token. | |
| 505 // Always returns RETURN_SUCCESS. | |
| 506 static bool emit_current_char(GumboParser* parser, GumboToken* output) { | |
| 507 emit_char( | |
| 508 parser, utf8iterator_current(&parser->_tokenizer_state->_input), output); | |
| 509 return RETURN_SUCCESS; | |
| 510 } | |
| 511 | |
| 512 // Writes out a doctype token, copying it from the tokenizer state. | |
| 513 static void emit_doctype(GumboParser* parser, GumboToken* output) { | |
| 514 output->type = GUMBO_TOKEN_DOCTYPE; | |
| 515 output->v.doc_type = parser->_tokenizer_state->_doc_type_state; | |
| 516 finish_token(parser, output); | |
| 517 doc_type_state_init(parser); | |
| 518 } | |
| 519 | |
| 520 // Debug-only function that explicitly sets the attribute vector data to NULL so | |
| 521 // it can be asserted on tag creation, verifying that there are no memory leaks. | |
| 522 static void mark_tag_state_as_empty(GumboTagState* tag_state) { | |
| 523 #ifndef NDEBUG | |
| 524 tag_state->_attributes = kGumboEmptyVector; | |
| 525 #endif | |
| 526 } | |
| 527 | |
| 528 // Writes out the current tag as a start or end tag token. | |
| 529 // Always returns RETURN_SUCCESS. | |
| 530 static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) { | |
| 531 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state; | |
| 532 if (tag_state->_is_start_tag) { | |
| 533 output->type = GUMBO_TOKEN_START_TAG; | |
| 534 output->v.start_tag.tag = tag_state->_tag; | |
| 535 output->v.start_tag.attributes = tag_state->_attributes; | |
| 536 output->v.start_tag.is_self_closing = tag_state->_is_self_closing; | |
| 537 tag_state->_last_start_tag = tag_state->_tag; | |
| 538 mark_tag_state_as_empty(tag_state); | |
| 539 gumbo_debug( | |
| 540 "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag)); | |
| 541 } else { | |
| 542 output->type = GUMBO_TOKEN_END_TAG; | |
| 543 output->v.end_tag = tag_state->_tag; | |
| 544 // In end tags, ownership of the attributes vector is not transferred to the | |
| 545 // token, but it's still initialized as normal, so it must be manually | |
| 546 // deallocated. There may also be attributes to destroy, in certain broken | |
| 547 // cases like </div</th> (the "th" is an attribute there). | |
| 548 for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) { | |
| 549 gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]); | |
| 550 } | |
| 551 gumbo_parser_deallocate(parser, tag_state->_attributes.data); | |
| 552 mark_tag_state_as_empty(tag_state); | |
| 553 gumbo_debug( | |
| 554 "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag)); | |
| 555 } | |
| 556 gumbo_string_buffer_destroy(parser, &tag_state->_buffer); | |
| 557 finish_token(parser, output); | |
| 558 gumbo_debug("Original text = %.*s.\n", output->original_text.length, | |
| 559 output->original_text.data); | |
| 560 assert(output->original_text.length >= 2); | |
| 561 assert(output->original_text.data[0] == '<'); | |
| 562 assert(output->original_text.data[output->original_text.length - 1] == '>'); | |
| 563 return RETURN_SUCCESS; | |
| 564 } | |
| 565 | |
| 566 // In some states, we speculatively start a tag, but don't know whether it'll be | |
| 567 // emitted as tag token or as a series of character tokens until we finish it. | |
| 568 // We need to abandon the tag we'd started & free its memory in that case to | |
| 569 // avoid a memory leak. | |
| 570 static void abandon_current_tag(GumboParser* parser) { | |
| 571 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state; | |
| 572 for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) { | |
| 573 gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]); | |
| 574 } | |
| 575 gumbo_parser_deallocate(parser, tag_state->_attributes.data); | |
| 576 mark_tag_state_as_empty(tag_state); | |
| 577 gumbo_string_buffer_destroy(parser, &tag_state->_buffer); | |
| 578 gumbo_debug("Abandoning current tag.\n"); | |
| 579 } | |
| 580 | |
| 581 // Wraps the consume_char_ref function to handle its output and make the | |
| 582 // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse | |
| 583 // error occurred, RETURN_SUCCESS otherwise. | |
| 584 static StateResult emit_char_ref(GumboParser* parser, | |
| 585 int additional_allowed_char, bool is_in_attribute, GumboToken* output) { | |
| 586 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 587 OneOrTwoCodepoints char_ref; | |
| 588 bool status = consume_char_ref( | |
| 589 parser, &tokenizer->_input, additional_allowed_char, false, &char_ref); | |
| 590 if (char_ref.first != kGumboNoChar) { | |
| 591 // consume_char_ref ends with the iterator pointing at the next character, | |
| 592 // so we need to be sure not advance it again before reading the next token. | |
| 593 tokenizer->_reconsume_current_input = true; | |
| 594 emit_char(parser, char_ref.first, output); | |
| 595 tokenizer->_buffered_emit_char = char_ref.second; | |
| 596 } else { | |
| 597 emit_char(parser, '&', output); | |
| 598 } | |
| 599 return status ? RETURN_SUCCESS : RETURN_ERROR; | |
| 600 } | |
| 601 | |
| 602 // Emits a comment token. Comments use the temporary buffer to accumulate their | |
| 603 // data, and then it's copied over and released to the 'text' field of the | |
| 604 // GumboToken union. Always returns RETURN_SUCCESS. | |
| 605 static StateResult emit_comment(GumboParser* parser, GumboToken* output) { | |
| 606 output->type = GUMBO_TOKEN_COMMENT; | |
| 607 finish_temporary_buffer(parser, &output->v.text); | |
| 608 finish_token(parser, output); | |
| 609 return RETURN_SUCCESS; | |
| 610 } | |
| 611 | |
| 612 // Checks to see we should be flushing accumulated characters in the temporary | |
| 613 // buffer, and fills the output token with the next output character if so. | |
| 614 // Returns true if a character has been emitted and the tokenizer should | |
| 615 // immediately return, false if we're at the end of the temporary buffer and | |
| 616 // should resume normal operation. | |
| 617 static bool maybe_emit_from_temporary_buffer( | |
| 618 GumboParser* parser, GumboToken* output) { | |
| 619 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 620 const char* c = tokenizer->_temporary_buffer_emit; | |
| 621 GumboStringBuffer* buffer = &tokenizer->_temporary_buffer; | |
| 622 | |
| 623 if (!c || c >= buffer->data + buffer->length) { | |
| 624 tokenizer->_temporary_buffer_emit = NULL; | |
| 625 return false; | |
| 626 } | |
| 627 | |
| 628 assert(*c == utf8iterator_current(&tokenizer->_input)); | |
| 629 // emit_char also advances the input stream. We need to do some juggling of | |
| 630 // the _reconsume_current_input flag to get the proper behavior when emitting | |
| 631 // previous tokens. Basically, _reconsume_current_input should *never* be set | |
| 632 // when emitting anything from the temporary buffer, since those characters | |
| 633 // have already been advanced past. However, it should be preserved so that | |
| 634 // when the *next* character is encountered again, the tokenizer knows not to | |
| 635 // advance past it. | |
| 636 bool saved_reconsume_state = tokenizer->_reconsume_current_input; | |
| 637 tokenizer->_reconsume_current_input = false; | |
| 638 emit_char(parser, *c, output); | |
| 639 ++tokenizer->_temporary_buffer_emit; | |
| 640 tokenizer->_reconsume_current_input = saved_reconsume_state; | |
| 641 return true; | |
| 642 } | |
| 643 | |
| 644 // Sets up the tokenizer to begin flushing the temporary buffer. | |
| 645 // This resets the input iterator stream to the start of the last tag, sets up | |
| 646 // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits | |
| 647 // the first character in it. It returns true if a character was emitted, false | |
| 648 // otherwise. | |
| 649 static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) { | |
| 650 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 651 assert(tokenizer->_temporary_buffer.data); | |
| 652 utf8iterator_reset(&tokenizer->_input); | |
| 653 tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data; | |
| 654 return maybe_emit_from_temporary_buffer(parser, output); | |
| 655 } | |
| 656 | |
| 657 // Appends a codepoint to the current tag buffer. If | |
| 658 // reinitilize_position_on_first is set, this also initializes the tag buffer | |
| 659 // start point; the only time you would *not* want to pass true for this | |
| 660 // parameter is if you want the original_text to include character (like an | |
| 661 // opening quote) that doesn't appear in the value. | |
| 662 static void append_char_to_tag_buffer( | |
| 663 GumboParser* parser, int codepoint, bool reinitilize_position_on_first) { | |
| 664 GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer; | |
| 665 if (buffer->length == 0 && reinitilize_position_on_first) { | |
| 666 reset_tag_buffer_start_point(parser); | |
| 667 } | |
| 668 gumbo_string_buffer_append_codepoint(parser, codepoint, buffer); | |
| 669 } | |
| 670 | |
| 671 // (Re-)initialize the tag buffer. This also resets the original_text pointer | |
| 672 // and _start_pos field to point to the current position. | |
| 673 static void initialize_tag_buffer(GumboParser* parser) { | |
| 674 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 675 GumboTagState* tag_state = &tokenizer->_tag_state; | |
| 676 | |
| 677 gumbo_string_buffer_init(parser, &tag_state->_buffer); | |
| 678 reset_tag_buffer_start_point(parser); | |
| 679 } | |
| 680 | |
| 681 // Initializes the tag_state to start a new tag, keeping track of the opening | |
| 682 // positions and original text. Takes a boolean indicating whether this is a | |
| 683 // start or end tag. | |
| 684 static void start_new_tag(GumboParser* parser, bool is_start_tag) { | |
| 685 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 686 GumboTagState* tag_state = &tokenizer->_tag_state; | |
| 687 int c = utf8iterator_current(&tokenizer->_input); | |
| 688 assert(is_alpha(c)); | |
| 689 c = ensure_lowercase(c); | |
| 690 assert(is_alpha(c)); | |
| 691 | |
| 692 initialize_tag_buffer(parser); | |
| 693 gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer); | |
| 694 | |
| 695 assert(tag_state->_attributes.data == NULL); | |
| 696 // Initial size chosen by statistical analysis of a corpus of 60k webpages. | |
| 697 // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These | |
| 698 // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1 | |
| 699 // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs. | |
| 700 gumbo_vector_init(parser, 1, &tag_state->_attributes); | |
| 701 tag_state->_drop_next_attr_value = false; | |
| 702 tag_state->_is_start_tag = is_start_tag; | |
| 703 tag_state->_is_self_closing = false; | |
| 704 gumbo_debug("Starting new tag.\n"); | |
| 705 } | |
| 706 | |
| 707 // Fills in the specified char* with the contents of the tag buffer. | |
| 708 static void copy_over_tag_buffer(GumboParser* parser, const char** output) { | |
| 709 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 710 GumboTagState* tag_state = &tokenizer->_tag_state; | |
| 711 *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer); | |
| 712 } | |
| 713 | |
| 714 // Fills in: | |
| 715 // * The original_text GumboStringPiece with the portion of the original | |
| 716 // buffer that corresponds to the tag buffer. | |
| 717 // * The start_pos GumboSourcePosition with the start position of the tag | |
| 718 // buffer. | |
| 719 // * The end_pos GumboSourcePosition with the current source position. | |
| 720 static void copy_over_original_tag_text(GumboParser* parser, | |
| 721 GumboStringPiece* original_text, GumboSourcePosition* start_pos, | |
| 722 GumboSourcePosition* end_pos) { | |
| 723 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 724 GumboTagState* tag_state = &tokenizer->_tag_state; | |
| 725 | |
| 726 original_text->data = tag_state->_original_text; | |
| 727 original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) - | |
| 728 tag_state->_original_text; | |
| 729 if (original_text->data[original_text->length - 1] == '\r') { | |
| 730 // Since \r is skipped by the UTF-8 iterator, it can sometimes end up | |
| 731 // appended to the end of original text even when it's really the first part | |
| 732 // of the next character. If we detect this situation, shrink the length of | |
| 733 // the original text by 1 to remove the carriage return. | |
| 734 --original_text->length; | |
| 735 } | |
| 736 *start_pos = tag_state->_start_pos; | |
| 737 utf8iterator_get_position(&tokenizer->_input, end_pos); | |
| 738 } | |
| 739 | |
| 740 // Releases and then re-initializes the tag buffer. | |
| 741 static void reinitialize_tag_buffer(GumboParser* parser) { | |
| 742 gumbo_parser_deallocate( | |
| 743 parser, parser->_tokenizer_state->_tag_state._buffer.data); | |
| 744 initialize_tag_buffer(parser); | |
| 745 } | |
| 746 | |
| 747 // Moves some data from the temporary buffer over the the tag-based fields in | |
| 748 // TagState. | |
| 749 static void finish_tag_name(GumboParser* parser) { | |
| 750 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 751 GumboTagState* tag_state = &tokenizer->_tag_state; | |
| 752 | |
| 753 tag_state->_tag = | |
| 754 gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length); | |
| 755 reinitialize_tag_buffer(parser); | |
| 756 } | |
| 757 | |
| 758 // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct. | |
| 759 static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name, | |
| 760 int original_index, int new_index) { | |
| 761 GumboError* error = gumbo_add_error(parser); | |
| 762 if (!error) { | |
| 763 return; | |
| 764 } | |
| 765 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state; | |
| 766 error->type = GUMBO_ERR_DUPLICATE_ATTR; | |
| 767 error->position = tag_state->_start_pos; | |
| 768 error->original_text = tag_state->_original_text; | |
| 769 error->v.duplicate_attr.original_index = original_index; | |
| 770 error->v.duplicate_attr.new_index = new_index; | |
| 771 copy_over_tag_buffer(parser, &error->v.duplicate_attr.name); | |
| 772 reinitialize_tag_buffer(parser); | |
| 773 } | |
| 774 | |
| 775 // Creates a new attribute in the current tag, copying the current tag buffer to | |
| 776 // the attribute's name. The attribute's value starts out as the empty string | |
| 777 // (following the "Boolean attributes" section of the spec) and is only | |
| 778 // overwritten on finish_attribute_value(). If the attribute has already been | |
| 779 // specified, the new attribute is dropped, a parse error is added, and the | |
| 780 // function returns false. Otherwise, this returns true. | |
| 781 static bool finish_attribute_name(GumboParser* parser) { | |
| 782 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 783 GumboTagState* tag_state = &tokenizer->_tag_state; | |
| 784 // May've been set by a previous attribute without a value; reset it here. | |
| 785 tag_state->_drop_next_attr_value = false; | |
| 786 assert(tag_state->_attributes.data); | |
| 787 assert(tag_state->_attributes.capacity); | |
| 788 | |
| 789 GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes; | |
| 790 for (unsigned int i = 0; i < attributes->length; ++i) { | |
| 791 GumboAttribute* attr = attributes->data[i]; | |
| 792 if (strlen(attr->name) == tag_state->_buffer.length && | |
| 793 memcmp(attr->name, tag_state->_buffer.data, | |
| 794 tag_state->_buffer.length) == 0) { | |
| 795 // Identical attribute; bail. | |
| 796 add_duplicate_attr_error(parser, attr->name, i, attributes->length); | |
| 797 tag_state->_drop_next_attr_value = true; | |
| 798 return false; | |
| 799 } | |
| 800 } | |
| 801 | |
| 802 GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute)); | |
| 803 attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE; | |
| 804 copy_over_tag_buffer(parser, &attr->name); | |
| 805 copy_over_original_tag_text( | |
| 806 parser, &attr->original_name, &attr->name_start, &attr->name_end); | |
| 807 attr->value = gumbo_copy_stringz(parser, ""); | |
| 808 copy_over_original_tag_text( | |
| 809 parser, &attr->original_value, &attr->name_start, &attr->name_end); | |
| 810 gumbo_vector_add(parser, attr, attributes); | |
| 811 reinitialize_tag_buffer(parser); | |
| 812 return true; | |
| 813 } | |
| 814 | |
| 815 // Finishes an attribute value. This sets the value of the most recently added | |
| 816 // attribute to the current contents of the tag buffer. | |
| 817 static void finish_attribute_value(GumboParser* parser) { | |
| 818 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state; | |
| 819 if (tag_state->_drop_next_attr_value) { | |
| 820 // Duplicate attribute name detected in an earlier state, so we have to | |
| 821 // ignore the value. | |
| 822 tag_state->_drop_next_attr_value = false; | |
| 823 reinitialize_tag_buffer(parser); | |
| 824 return; | |
| 825 } | |
| 826 | |
| 827 GumboAttribute* attr = | |
| 828 tag_state->_attributes.data[tag_state->_attributes.length - 1]; | |
| 829 gumbo_parser_deallocate(parser, (void*) attr->value); | |
| 830 copy_over_tag_buffer(parser, &attr->value); | |
| 831 copy_over_original_tag_text( | |
| 832 parser, &attr->original_value, &attr->value_start, &attr->value_end); | |
| 833 reinitialize_tag_buffer(parser); | |
| 834 } | |
| 835 | |
| 836 // Returns true if the current end tag matches the last start tag emitted. | |
| 837 static bool is_appropriate_end_tag(GumboParser* parser) { | |
| 838 GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state; | |
| 839 assert(!tag_state->_is_start_tag); | |
| 840 return tag_state->_last_start_tag != GUMBO_TAG_LAST && | |
| 841 tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data, | |
| 842 tag_state->_buffer.length); | |
| 843 } | |
| 844 | |
| 845 void gumbo_tokenizer_state_init( | |
| 846 GumboParser* parser, const char* text, size_t text_length) { | |
| 847 GumboTokenizerState* tokenizer = | |
| 848 gumbo_parser_allocate(parser, sizeof(GumboTokenizerState)); | |
| 849 parser->_tokenizer_state = tokenizer; | |
| 850 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 851 tokenizer->_reconsume_current_input = false; | |
| 852 tokenizer->_is_current_node_foreign = false; | |
| 853 tokenizer->_is_in_cdata = false; | |
| 854 tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST; | |
| 855 | |
| 856 tokenizer->_buffered_emit_char = kGumboNoChar; | |
| 857 gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer); | |
| 858 tokenizer->_temporary_buffer_emit = NULL; | |
| 859 | |
| 860 mark_tag_state_as_empty(&tokenizer->_tag_state); | |
| 861 | |
| 862 gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer); | |
| 863 tokenizer->_token_start = text; | |
| 864 utf8iterator_init(parser, text, text_length, &tokenizer->_input); | |
| 865 utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos); | |
| 866 doc_type_state_init(parser); | |
| 867 } | |
| 868 | |
| 869 void gumbo_tokenizer_state_destroy(GumboParser* parser) { | |
| 870 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 871 assert(tokenizer->_doc_type_state.name == NULL); | |
| 872 assert(tokenizer->_doc_type_state.public_identifier == NULL); | |
| 873 assert(tokenizer->_doc_type_state.system_identifier == NULL); | |
| 874 gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer); | |
| 875 gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer); | |
| 876 gumbo_parser_deallocate(parser, tokenizer); | |
| 877 } | |
| 878 | |
| 879 void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) { | |
| 880 parser->_tokenizer_state->_state = state; | |
| 881 } | |
| 882 | |
| 883 void gumbo_tokenizer_set_is_current_node_foreign( | |
| 884 GumboParser* parser, bool is_foreign) { | |
| 885 if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) { | |
| 886 gumbo_debug("Toggling is_current_node_foreign to %s.\n", | |
| 887 is_foreign ? "true" : "false"); | |
| 888 } | |
| 889 parser->_tokenizer_state->_is_current_node_foreign = is_foreign; | |
| 890 } | |
| 891 | |
| 892 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state | |
| 893 static StateResult handle_data_state(GumboParser* parser, | |
| 894 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 895 switch (c) { | |
| 896 case '&': | |
| 897 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA); | |
| 898 // The char_ref machinery expects to be on the & so it can mark that | |
| 899 // and return to it if the text isn't a char ref, so we need to | |
| 900 // reconsume it. | |
| 901 tokenizer->_reconsume_current_input = true; | |
| 902 return NEXT_CHAR; | |
| 903 case '<': | |
| 904 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN); | |
| 905 clear_temporary_buffer(parser); | |
| 906 append_char_to_temporary_buffer(parser, '<'); | |
| 907 return NEXT_CHAR; | |
| 908 case '\0': | |
| 909 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 910 emit_char(parser, c, output); | |
| 911 return RETURN_ERROR; | |
| 912 default: | |
| 913 return emit_current_char(parser, output); | |
| 914 } | |
| 915 } | |
| 916 | |
| 917 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state | |
| 918 static StateResult handle_char_ref_in_data_state(GumboParser* parser, | |
| 919 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 920 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 921 return emit_char_ref(parser, ' ', false, output); | |
| 922 } | |
| 923 | |
| 924 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state | |
| 925 static StateResult handle_rcdata_state(GumboParser* parser, | |
| 926 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 927 switch (c) { | |
| 928 case '&': | |
| 929 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA); | |
| 930 tokenizer->_reconsume_current_input = true; | |
| 931 return NEXT_CHAR; | |
| 932 case '<': | |
| 933 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT); | |
| 934 clear_temporary_buffer(parser); | |
| 935 append_char_to_temporary_buffer(parser, '<'); | |
| 936 return NEXT_CHAR; | |
| 937 case '\0': | |
| 938 return emit_replacement_char(parser, output); | |
| 939 case -1: | |
| 940 return emit_eof(parser, output); | |
| 941 default: | |
| 942 return emit_current_char(parser, output); | |
| 943 } | |
| 944 } | |
| 945 | |
| 946 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state | |
| 947 static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser, | |
| 948 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 949 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); | |
| 950 return emit_char_ref(parser, ' ', false, output); | |
| 951 } | |
| 952 | |
| 953 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state | |
| 954 static StateResult handle_rawtext_state(GumboParser* parser, | |
| 955 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 956 switch (c) { | |
| 957 case '<': | |
| 958 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT); | |
| 959 clear_temporary_buffer(parser); | |
| 960 append_char_to_temporary_buffer(parser, '<'); | |
| 961 return NEXT_CHAR; | |
| 962 case '\0': | |
| 963 return emit_replacement_char(parser, output); | |
| 964 case -1: | |
| 965 return emit_eof(parser, output); | |
| 966 default: | |
| 967 return emit_current_char(parser, output); | |
| 968 } | |
| 969 } | |
| 970 | |
| 971 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state | |
| 972 static StateResult handle_script_state(GumboParser* parser, | |
| 973 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 974 switch (c) { | |
| 975 case '<': | |
| 976 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT); | |
| 977 clear_temporary_buffer(parser); | |
| 978 append_char_to_temporary_buffer(parser, '<'); | |
| 979 return NEXT_CHAR; | |
| 980 case '\0': | |
| 981 return emit_replacement_char(parser, output); | |
| 982 case -1: | |
| 983 return emit_eof(parser, output); | |
| 984 default: | |
| 985 return emit_current_char(parser, output); | |
| 986 } | |
| 987 } | |
| 988 | |
| 989 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state | |
| 990 static StateResult handle_plaintext_state(GumboParser* parser, | |
| 991 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 992 switch (c) { | |
| 993 case '\0': | |
| 994 return emit_replacement_char(parser, output); | |
| 995 case -1: | |
| 996 return emit_eof(parser, output); | |
| 997 default: | |
| 998 return emit_current_char(parser, output); | |
| 999 } | |
| 1000 } | |
| 1001 | |
| 1002 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state | |
| 1003 static StateResult handle_tag_open_state(GumboParser* parser, | |
| 1004 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1005 assert(temporary_buffer_equals(parser, "<")); | |
| 1006 switch (c) { | |
| 1007 case '!': | |
| 1008 gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION); | |
| 1009 clear_temporary_buffer(parser); | |
| 1010 return NEXT_CHAR; | |
| 1011 case '/': | |
| 1012 gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN); | |
| 1013 append_char_to_temporary_buffer(parser, '/'); | |
| 1014 return NEXT_CHAR; | |
| 1015 case '?': | |
| 1016 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT); | |
| 1017 clear_temporary_buffer(parser); | |
| 1018 append_char_to_temporary_buffer(parser, '?'); | |
| 1019 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION); | |
| 1020 return NEXT_CHAR; | |
| 1021 default: | |
| 1022 if (is_alpha(c)) { | |
| 1023 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME); | |
| 1024 start_new_tag(parser, true); | |
| 1025 return NEXT_CHAR; | |
| 1026 } else { | |
| 1027 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID); | |
| 1028 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1029 emit_temporary_buffer(parser, output); | |
| 1030 return RETURN_ERROR; | |
| 1031 } | |
| 1032 } | |
| 1033 } | |
| 1034 | |
| 1035 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state | |
| 1036 static StateResult handle_end_tag_open_state(GumboParser* parser, | |
| 1037 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1038 assert(temporary_buffer_equals(parser, "</")); | |
| 1039 switch (c) { | |
| 1040 case '>': | |
| 1041 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY); | |
| 1042 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1043 return NEXT_CHAR; | |
| 1044 case -1: | |
| 1045 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF); | |
| 1046 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1047 return emit_temporary_buffer(parser, output); | |
| 1048 default: | |
| 1049 if (is_alpha(c)) { | |
| 1050 gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME); | |
| 1051 start_new_tag(parser, false); | |
| 1052 } else { | |
| 1053 tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID); | |
| 1054 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT); | |
| 1055 clear_temporary_buffer(parser); | |
| 1056 append_char_to_temporary_buffer(parser, c); | |
| 1057 } | |
| 1058 return NEXT_CHAR; | |
| 1059 } | |
| 1060 } | |
| 1061 | |
| 1062 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state | |
| 1063 static StateResult handle_tag_name_state(GumboParser* parser, | |
| 1064 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1065 switch (c) { | |
| 1066 case '\t': | |
| 1067 case '\n': | |
| 1068 case '\f': | |
| 1069 case ' ': | |
| 1070 finish_tag_name(parser); | |
| 1071 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1072 return NEXT_CHAR; | |
| 1073 case '/': | |
| 1074 finish_tag_name(parser); | |
| 1075 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1076 return NEXT_CHAR; | |
| 1077 case '>': | |
| 1078 finish_tag_name(parser); | |
| 1079 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1080 return emit_current_tag(parser, output); | |
| 1081 case '\0': | |
| 1082 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 1083 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true); | |
| 1084 return NEXT_CHAR; | |
| 1085 case -1: | |
| 1086 tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF); | |
| 1087 abandon_current_tag(parser); | |
| 1088 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1089 return NEXT_CHAR; | |
| 1090 default: | |
| 1091 append_char_to_tag_buffer(parser, ensure_lowercase(c), true); | |
| 1092 return NEXT_CHAR; | |
| 1093 } | |
| 1094 } | |
| 1095 | |
| 1096 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state | |
| 1097 static StateResult handle_rcdata_lt_state(GumboParser* parser, | |
| 1098 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1099 assert(temporary_buffer_equals(parser, "<")); | |
| 1100 if (c == '/') { | |
| 1101 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN); | |
| 1102 append_char_to_temporary_buffer(parser, '/'); | |
| 1103 return NEXT_CHAR; | |
| 1104 } else { | |
| 1105 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); | |
| 1106 tokenizer->_reconsume_current_input = true; | |
| 1107 return emit_temporary_buffer(parser, output); | |
| 1108 } | |
| 1109 } | |
| 1110 | |
| 1111 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state | |
| 1112 static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser, | |
| 1113 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1114 assert(temporary_buffer_equals(parser, "</")); | |
| 1115 if (is_alpha(c)) { | |
| 1116 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME); | |
| 1117 start_new_tag(parser, false); | |
| 1118 append_char_to_temporary_buffer(parser, c); | |
| 1119 return NEXT_CHAR; | |
| 1120 } else { | |
| 1121 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); | |
| 1122 return emit_temporary_buffer(parser, output); | |
| 1123 } | |
| 1124 return true; | |
| 1125 } | |
| 1126 | |
| 1127 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state | |
| 1128 static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser, | |
| 1129 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1130 assert(tokenizer->_temporary_buffer.length >= 2); | |
| 1131 if (is_alpha(c)) { | |
| 1132 append_char_to_tag_buffer(parser, ensure_lowercase(c), true); | |
| 1133 append_char_to_temporary_buffer(parser, c); | |
| 1134 return NEXT_CHAR; | |
| 1135 } else if (is_appropriate_end_tag(parser)) { | |
| 1136 switch (c) { | |
| 1137 case '\t': | |
| 1138 case '\n': | |
| 1139 case '\f': | |
| 1140 case ' ': | |
| 1141 finish_tag_name(parser); | |
| 1142 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1143 return NEXT_CHAR; | |
| 1144 case '/': | |
| 1145 finish_tag_name(parser); | |
| 1146 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1147 return NEXT_CHAR; | |
| 1148 case '>': | |
| 1149 finish_tag_name(parser); | |
| 1150 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1151 return emit_current_tag(parser, output); | |
| 1152 } | |
| 1153 } | |
| 1154 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); | |
| 1155 abandon_current_tag(parser); | |
| 1156 return emit_temporary_buffer(parser, output); | |
| 1157 } | |
| 1158 | |
| 1159 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state | |
| 1160 static StateResult handle_rawtext_lt_state(GumboParser* parser, | |
| 1161 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1162 assert(temporary_buffer_equals(parser, "<")); | |
| 1163 if (c == '/') { | |
| 1164 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN); | |
| 1165 append_char_to_temporary_buffer(parser, '/'); | |
| 1166 return NEXT_CHAR; | |
| 1167 } else { | |
| 1168 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); | |
| 1169 tokenizer->_reconsume_current_input = true; | |
| 1170 return emit_temporary_buffer(parser, output); | |
| 1171 } | |
| 1172 } | |
| 1173 | |
| 1174 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state | |
| 1175 static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser, | |
| 1176 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1177 assert(temporary_buffer_equals(parser, "</")); | |
| 1178 if (is_alpha(c)) { | |
| 1179 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME); | |
| 1180 start_new_tag(parser, false); | |
| 1181 append_char_to_temporary_buffer(parser, c); | |
| 1182 return NEXT_CHAR; | |
| 1183 } else { | |
| 1184 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); | |
| 1185 return emit_temporary_buffer(parser, output); | |
| 1186 } | |
| 1187 } | |
| 1188 | |
| 1189 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state | |
| 1190 static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser, | |
| 1191 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1192 assert(tokenizer->_temporary_buffer.length >= 2); | |
| 1193 gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length, | |
| 1194 tokenizer->_tag_state._buffer.data); | |
| 1195 if (is_alpha(c)) { | |
| 1196 append_char_to_tag_buffer(parser, ensure_lowercase(c), true); | |
| 1197 append_char_to_temporary_buffer(parser, c); | |
| 1198 return NEXT_CHAR; | |
| 1199 } else if (is_appropriate_end_tag(parser)) { | |
| 1200 gumbo_debug("Is an appropriate end tag.\n"); | |
| 1201 switch (c) { | |
| 1202 case '\t': | |
| 1203 case '\n': | |
| 1204 case '\f': | |
| 1205 case ' ': | |
| 1206 finish_tag_name(parser); | |
| 1207 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1208 return NEXT_CHAR; | |
| 1209 case '/': | |
| 1210 finish_tag_name(parser); | |
| 1211 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1212 return NEXT_CHAR; | |
| 1213 case '>': | |
| 1214 finish_tag_name(parser); | |
| 1215 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1216 return emit_current_tag(parser, output); | |
| 1217 } | |
| 1218 } | |
| 1219 gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT); | |
| 1220 abandon_current_tag(parser); | |
| 1221 return emit_temporary_buffer(parser, output); | |
| 1222 } | |
| 1223 | |
| 1224 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state | |
| 1225 static StateResult handle_script_lt_state(GumboParser* parser, | |
| 1226 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1227 assert(temporary_buffer_equals(parser, "<")); | |
| 1228 if (c == '/') { | |
| 1229 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN); | |
| 1230 append_char_to_temporary_buffer(parser, '/'); | |
| 1231 return NEXT_CHAR; | |
| 1232 } else if (c == '!') { | |
| 1233 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START); | |
| 1234 append_char_to_temporary_buffer(parser, '!'); | |
| 1235 return emit_temporary_buffer(parser, output); | |
| 1236 } else { | |
| 1237 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); | |
| 1238 tokenizer->_reconsume_current_input = true; | |
| 1239 return emit_temporary_buffer(parser, output); | |
| 1240 } | |
| 1241 } | |
| 1242 | |
| 1243 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state | |
| 1244 static StateResult handle_script_end_tag_open_state(GumboParser* parser, | |
| 1245 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1246 assert(temporary_buffer_equals(parser, "</")); | |
| 1247 if (is_alpha(c)) { | |
| 1248 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME); | |
| 1249 start_new_tag(parser, false); | |
| 1250 append_char_to_temporary_buffer(parser, c); | |
| 1251 return NEXT_CHAR; | |
| 1252 } else { | |
| 1253 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); | |
| 1254 return emit_temporary_buffer(parser, output); | |
| 1255 } | |
| 1256 } | |
| 1257 | |
| 1258 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state | |
| 1259 static StateResult handle_script_end_tag_name_state(GumboParser* parser, | |
| 1260 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1261 assert(tokenizer->_temporary_buffer.length >= 2); | |
| 1262 if (is_alpha(c)) { | |
| 1263 append_char_to_tag_buffer(parser, ensure_lowercase(c), true); | |
| 1264 append_char_to_temporary_buffer(parser, c); | |
| 1265 return NEXT_CHAR; | |
| 1266 } else if (is_appropriate_end_tag(parser)) { | |
| 1267 switch (c) { | |
| 1268 case '\t': | |
| 1269 case '\n': | |
| 1270 case '\f': | |
| 1271 case ' ': | |
| 1272 finish_tag_name(parser); | |
| 1273 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1274 return NEXT_CHAR; | |
| 1275 case '/': | |
| 1276 finish_tag_name(parser); | |
| 1277 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1278 return NEXT_CHAR; | |
| 1279 case '>': | |
| 1280 finish_tag_name(parser); | |
| 1281 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1282 return emit_current_tag(parser, output); | |
| 1283 } | |
| 1284 } | |
| 1285 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); | |
| 1286 abandon_current_tag(parser); | |
| 1287 return emit_temporary_buffer(parser, output); | |
| 1288 } | |
| 1289 | |
| 1290 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state | |
| 1291 static StateResult handle_script_escaped_start_state(GumboParser* parser, | |
| 1292 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1293 if (c == '-') { | |
| 1294 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH); | |
| 1295 return emit_current_char(parser, output); | |
| 1296 } else { | |
| 1297 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); | |
| 1298 tokenizer->_reconsume_current_input = true; | |
| 1299 return NEXT_CHAR; | |
| 1300 } | |
| 1301 } | |
| 1302 | |
| 1303 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state | |
| 1304 static StateResult handle_script_escaped_start_dash_state(GumboParser* parser, | |
| 1305 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1306 if (c == '-') { | |
| 1307 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH); | |
| 1308 return emit_current_char(parser, output); | |
| 1309 } else { | |
| 1310 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); | |
| 1311 tokenizer->_reconsume_current_input = true; | |
| 1312 return NEXT_CHAR; | |
| 1313 } | |
| 1314 } | |
| 1315 | |
| 1316 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state | |
| 1317 static StateResult handle_script_escaped_state(GumboParser* parser, | |
| 1318 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1319 switch (c) { | |
| 1320 case '-': | |
| 1321 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH); | |
| 1322 return emit_current_char(parser, output); | |
| 1323 case '<': | |
| 1324 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT); | |
| 1325 clear_temporary_buffer(parser); | |
| 1326 append_char_to_temporary_buffer(parser, c); | |
| 1327 return NEXT_CHAR; | |
| 1328 case '\0': | |
| 1329 return emit_replacement_char(parser, output); | |
| 1330 case -1: | |
| 1331 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF); | |
| 1332 return emit_eof(parser, output); | |
| 1333 default: | |
| 1334 return emit_current_char(parser, output); | |
| 1335 } | |
| 1336 } | |
| 1337 | |
| 1338 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state | |
| 1339 static StateResult handle_script_escaped_dash_state(GumboParser* parser, | |
| 1340 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1341 switch (c) { | |
| 1342 case '-': | |
| 1343 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH); | |
| 1344 return emit_current_char(parser, output); | |
| 1345 case '<': | |
| 1346 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT); | |
| 1347 clear_temporary_buffer(parser); | |
| 1348 append_char_to_temporary_buffer(parser, c); | |
| 1349 return NEXT_CHAR; | |
| 1350 case '\0': | |
| 1351 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1352 return emit_replacement_char(parser, output); | |
| 1353 case -1: | |
| 1354 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF); | |
| 1355 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1356 return NEXT_CHAR; | |
| 1357 default: | |
| 1358 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1359 return emit_current_char(parser, output); | |
| 1360 } | |
| 1361 } | |
| 1362 | |
| 1363 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state | |
| 1364 static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser, | |
| 1365 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1366 switch (c) { | |
| 1367 case '-': | |
| 1368 return emit_current_char(parser, output); | |
| 1369 case '<': | |
| 1370 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT); | |
| 1371 clear_temporary_buffer(parser); | |
| 1372 append_char_to_temporary_buffer(parser, c); | |
| 1373 return NEXT_CHAR; | |
| 1374 case '>': | |
| 1375 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); | |
| 1376 return emit_current_char(parser, output); | |
| 1377 case '\0': | |
| 1378 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1379 return emit_replacement_char(parser, output); | |
| 1380 case -1: | |
| 1381 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF); | |
| 1382 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1383 return NEXT_CHAR; | |
| 1384 default: | |
| 1385 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1386 return emit_current_char(parser, output); | |
| 1387 } | |
| 1388 } | |
| 1389 | |
| 1390 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state | |
| 1391 static StateResult handle_script_escaped_lt_state(GumboParser* parser, | |
| 1392 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1393 assert(temporary_buffer_equals(parser, "<")); | |
| 1394 assert(!tokenizer->_script_data_buffer.length); | |
| 1395 if (c == '/') { | |
| 1396 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN); | |
| 1397 append_char_to_temporary_buffer(parser, c); | |
| 1398 return NEXT_CHAR; | |
| 1399 } else if (is_alpha(c)) { | |
| 1400 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START); | |
| 1401 append_char_to_temporary_buffer(parser, c); | |
| 1402 gumbo_string_buffer_append_codepoint( | |
| 1403 parser, ensure_lowercase(c), &tokenizer->_script_data_buffer); | |
| 1404 return emit_temporary_buffer(parser, output); | |
| 1405 } else { | |
| 1406 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1407 return emit_temporary_buffer(parser, output); | |
| 1408 } | |
| 1409 } | |
| 1410 | |
| 1411 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state | |
| 1412 static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser, | |
| 1413 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1414 assert(temporary_buffer_equals(parser, "</")); | |
| 1415 if (is_alpha(c)) { | |
| 1416 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME); | |
| 1417 start_new_tag(parser, false); | |
| 1418 append_char_to_temporary_buffer(parser, c); | |
| 1419 return NEXT_CHAR; | |
| 1420 } else { | |
| 1421 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1422 return emit_temporary_buffer(parser, output); | |
| 1423 } | |
| 1424 } | |
| 1425 | |
| 1426 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state | |
| 1427 static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser, | |
| 1428 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1429 assert(tokenizer->_temporary_buffer.length >= 2); | |
| 1430 if (is_alpha(c)) { | |
| 1431 append_char_to_tag_buffer(parser, ensure_lowercase(c), true); | |
| 1432 append_char_to_temporary_buffer(parser, c); | |
| 1433 return NEXT_CHAR; | |
| 1434 } else if (is_appropriate_end_tag(parser)) { | |
| 1435 switch (c) { | |
| 1436 case '\t': | |
| 1437 case '\n': | |
| 1438 case '\f': | |
| 1439 case ' ': | |
| 1440 finish_tag_name(parser); | |
| 1441 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1442 return NEXT_CHAR; | |
| 1443 case '/': | |
| 1444 finish_tag_name(parser); | |
| 1445 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1446 return NEXT_CHAR; | |
| 1447 case '>': | |
| 1448 finish_tag_name(parser); | |
| 1449 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1450 return emit_current_tag(parser, output); | |
| 1451 } | |
| 1452 } | |
| 1453 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1454 abandon_current_tag(parser); | |
| 1455 return emit_temporary_buffer(parser, output); | |
| 1456 } | |
| 1457 | |
| 1458 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state | |
| 1459 static StateResult handle_script_double_escaped_start_state(GumboParser* parser, | |
| 1460 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1461 switch (c) { | |
| 1462 case '\t': | |
| 1463 case '\n': | |
| 1464 case '\f': | |
| 1465 case ' ': | |
| 1466 case '/': | |
| 1467 case '>': | |
| 1468 gumbo_tokenizer_set_state( | |
| 1469 parser, gumbo_string_equals(&kScriptTag, | |
| 1470 (GumboStringPiece*) &tokenizer->_script_data_buffer) | |
| 1471 ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED | |
| 1472 : GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1473 return emit_current_char(parser, output); | |
| 1474 default: | |
| 1475 if (is_alpha(c)) { | |
| 1476 gumbo_string_buffer_append_codepoint( | |
| 1477 parser, ensure_lowercase(c), &tokenizer->_script_data_buffer); | |
| 1478 return emit_current_char(parser, output); | |
| 1479 } else { | |
| 1480 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); | |
| 1481 tokenizer->_reconsume_current_input = true; | |
| 1482 return NEXT_CHAR; | |
| 1483 } | |
| 1484 } | |
| 1485 } | |
| 1486 | |
| 1487 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state | |
| 1488 static StateResult handle_script_double_escaped_state(GumboParser* parser, | |
| 1489 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1490 switch (c) { | |
| 1491 case '-': | |
| 1492 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH); | |
| 1493 return emit_current_char(parser, output); | |
| 1494 case '<': | |
| 1495 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT); | |
| 1496 return emit_current_char(parser, output); | |
| 1497 case '\0': | |
| 1498 return emit_replacement_char(parser, output); | |
| 1499 case -1: | |
| 1500 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF); | |
| 1501 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1502 return NEXT_CHAR; | |
| 1503 default: | |
| 1504 return emit_current_char(parser, output); | |
| 1505 } | |
| 1506 } | |
| 1507 | |
| 1508 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state | |
| 1509 static StateResult handle_script_double_escaped_dash_state(GumboParser* parser, | |
| 1510 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1511 switch (c) { | |
| 1512 case '-': | |
| 1513 gumbo_tokenizer_set_state( | |
| 1514 parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH); | |
| 1515 return emit_current_char(parser, output); | |
| 1516 case '<': | |
| 1517 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT); | |
| 1518 return emit_current_char(parser, output); | |
| 1519 case '\0': | |
| 1520 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); | |
| 1521 return emit_replacement_char(parser, output); | |
| 1522 case -1: | |
| 1523 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF); | |
| 1524 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1525 return NEXT_CHAR; | |
| 1526 default: | |
| 1527 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); | |
| 1528 return emit_current_char(parser, output); | |
| 1529 } | |
| 1530 } | |
| 1531 | |
| 1532 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state | |
| 1533 static StateResult handle_script_double_escaped_dash_dash_state( | |
| 1534 GumboParser* parser, GumboTokenizerState* tokenizer, int c, | |
| 1535 GumboToken* output) { | |
| 1536 switch (c) { | |
| 1537 case '-': | |
| 1538 return emit_current_char(parser, output); | |
| 1539 case '<': | |
| 1540 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT); | |
| 1541 return emit_current_char(parser, output); | |
| 1542 case '>': | |
| 1543 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT); | |
| 1544 return emit_current_char(parser, output); | |
| 1545 case '\0': | |
| 1546 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); | |
| 1547 return emit_replacement_char(parser, output); | |
| 1548 case -1: | |
| 1549 tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF); | |
| 1550 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1551 return NEXT_CHAR; | |
| 1552 default: | |
| 1553 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); | |
| 1554 return emit_current_char(parser, output); | |
| 1555 } | |
| 1556 } | |
| 1557 | |
| 1558 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state | |
| 1559 static StateResult handle_script_double_escaped_lt_state(GumboParser* parser, | |
| 1560 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1561 if (c == '/') { | |
| 1562 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END); | |
| 1563 gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer); | |
| 1564 return emit_current_char(parser, output); | |
| 1565 } else { | |
| 1566 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); | |
| 1567 tokenizer->_reconsume_current_input = true; | |
| 1568 return NEXT_CHAR; | |
| 1569 } | |
| 1570 } | |
| 1571 | |
| 1572 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state | |
| 1573 static StateResult handle_script_double_escaped_end_state(GumboParser* parser, | |
| 1574 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1575 switch (c) { | |
| 1576 case '\t': | |
| 1577 case '\n': | |
| 1578 case '\f': | |
| 1579 case ' ': | |
| 1580 case '/': | |
| 1581 case '>': | |
| 1582 gumbo_tokenizer_set_state( | |
| 1583 parser, gumbo_string_equals(&kScriptTag, | |
| 1584 (GumboStringPiece*) &tokenizer->_script_data_buffer) | |
| 1585 ? GUMBO_LEX_SCRIPT_ESCAPED | |
| 1586 : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); | |
| 1587 return emit_current_char(parser, output); | |
| 1588 default: | |
| 1589 if (is_alpha(c)) { | |
| 1590 gumbo_string_buffer_append_codepoint( | |
| 1591 parser, ensure_lowercase(c), &tokenizer->_script_data_buffer); | |
| 1592 return emit_current_char(parser, output); | |
| 1593 } else { | |
| 1594 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); | |
| 1595 tokenizer->_reconsume_current_input = true; | |
| 1596 return NEXT_CHAR; | |
| 1597 } | |
| 1598 } | |
| 1599 } | |
| 1600 | |
| 1601 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state | |
| 1602 static StateResult handle_before_attr_name_state(GumboParser* parser, | |
| 1603 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1604 switch (c) { | |
| 1605 case '\t': | |
| 1606 case '\n': | |
| 1607 case '\f': | |
| 1608 case ' ': | |
| 1609 return NEXT_CHAR; | |
| 1610 case '/': | |
| 1611 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1612 return NEXT_CHAR; | |
| 1613 case '>': | |
| 1614 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1615 return emit_current_tag(parser, output); | |
| 1616 case '\0': | |
| 1617 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 1618 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME); | |
| 1619 append_char_to_temporary_buffer(parser, 0xfffd); | |
| 1620 return NEXT_CHAR; | |
| 1621 case -1: | |
| 1622 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF); | |
| 1623 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1624 abandon_current_tag(parser); | |
| 1625 return NEXT_CHAR; | |
| 1626 case '"': | |
| 1627 case '\'': | |
| 1628 case '<': | |
| 1629 case '=': | |
| 1630 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID); | |
| 1631 // Fall through. | |
| 1632 default: | |
| 1633 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME); | |
| 1634 append_char_to_tag_buffer(parser, ensure_lowercase(c), true); | |
| 1635 return NEXT_CHAR; | |
| 1636 } | |
| 1637 } | |
| 1638 | |
| 1639 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state | |
| 1640 static StateResult handle_attr_name_state(GumboParser* parser, | |
| 1641 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1642 switch (c) { | |
| 1643 case '\t': | |
| 1644 case '\n': | |
| 1645 case '\f': | |
| 1646 case ' ': | |
| 1647 finish_attribute_name(parser); | |
| 1648 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME); | |
| 1649 return NEXT_CHAR; | |
| 1650 case '/': | |
| 1651 finish_attribute_name(parser); | |
| 1652 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1653 return NEXT_CHAR; | |
| 1654 case '=': | |
| 1655 finish_attribute_name(parser); | |
| 1656 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE); | |
| 1657 return NEXT_CHAR; | |
| 1658 case '>': | |
| 1659 finish_attribute_name(parser); | |
| 1660 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1661 return emit_current_tag(parser, output); | |
| 1662 case '\0': | |
| 1663 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 1664 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true); | |
| 1665 return NEXT_CHAR; | |
| 1666 case -1: | |
| 1667 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1668 abandon_current_tag(parser); | |
| 1669 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF); | |
| 1670 return NEXT_CHAR; | |
| 1671 case '"': | |
| 1672 case '\'': | |
| 1673 case '<': | |
| 1674 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID); | |
| 1675 // Fall through. | |
| 1676 default: | |
| 1677 append_char_to_tag_buffer(parser, ensure_lowercase(c), true); | |
| 1678 return NEXT_CHAR; | |
| 1679 } | |
| 1680 } | |
| 1681 | |
| 1682 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state | |
| 1683 static StateResult handle_after_attr_name_state(GumboParser* parser, | |
| 1684 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1685 switch (c) { | |
| 1686 case '\t': | |
| 1687 case '\n': | |
| 1688 case '\f': | |
| 1689 case ' ': | |
| 1690 return NEXT_CHAR; | |
| 1691 case '/': | |
| 1692 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1693 return NEXT_CHAR; | |
| 1694 case '=': | |
| 1695 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE); | |
| 1696 return NEXT_CHAR; | |
| 1697 case '>': | |
| 1698 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1699 return emit_current_tag(parser, output); | |
| 1700 case '\0': | |
| 1701 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 1702 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME); | |
| 1703 append_char_to_temporary_buffer(parser, 0xfffd); | |
| 1704 return NEXT_CHAR; | |
| 1705 case -1: | |
| 1706 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF); | |
| 1707 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1708 abandon_current_tag(parser); | |
| 1709 return NEXT_CHAR; | |
| 1710 case '"': | |
| 1711 case '\'': | |
| 1712 case '<': | |
| 1713 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID); | |
| 1714 // Fall through. | |
| 1715 default: | |
| 1716 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME); | |
| 1717 append_char_to_tag_buffer(parser, ensure_lowercase(c), true); | |
| 1718 return NEXT_CHAR; | |
| 1719 } | |
| 1720 } | |
| 1721 | |
| 1722 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state | |
| 1723 static StateResult handle_before_attr_value_state(GumboParser* parser, | |
| 1724 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1725 switch (c) { | |
| 1726 case '\t': | |
| 1727 case '\n': | |
| 1728 case '\f': | |
| 1729 case ' ': | |
| 1730 return NEXT_CHAR; | |
| 1731 case '"': | |
| 1732 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED); | |
| 1733 reset_tag_buffer_start_point(parser); | |
| 1734 return NEXT_CHAR; | |
| 1735 case '&': | |
| 1736 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED); | |
| 1737 tokenizer->_reconsume_current_input = true; | |
| 1738 return NEXT_CHAR; | |
| 1739 case '\'': | |
| 1740 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED); | |
| 1741 reset_tag_buffer_start_point(parser); | |
| 1742 return NEXT_CHAR; | |
| 1743 case '\0': | |
| 1744 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 1745 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED); | |
| 1746 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true); | |
| 1747 return NEXT_CHAR; | |
| 1748 case -1: | |
| 1749 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF); | |
| 1750 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1751 abandon_current_tag(parser); | |
| 1752 tokenizer->_reconsume_current_input = true; | |
| 1753 return NEXT_CHAR; | |
| 1754 case '>': | |
| 1755 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET); | |
| 1756 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1757 emit_current_tag(parser, output); | |
| 1758 return RETURN_ERROR; | |
| 1759 case '<': | |
| 1760 case '=': | |
| 1761 case '`': | |
| 1762 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS); | |
| 1763 // Fall through. | |
| 1764 default: | |
| 1765 gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED); | |
| 1766 append_char_to_tag_buffer(parser, c, true); | |
| 1767 return NEXT_CHAR; | |
| 1768 } | |
| 1769 } | |
| 1770 | |
| 1771 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state | |
| 1772 static StateResult handle_attr_value_double_quoted_state(GumboParser* parser, | |
| 1773 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1774 switch (c) { | |
| 1775 case '"': | |
| 1776 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED); | |
| 1777 return NEXT_CHAR; | |
| 1778 case '&': | |
| 1779 tokenizer->_tag_state._attr_value_state = tokenizer->_state; | |
| 1780 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE); | |
| 1781 tokenizer->_reconsume_current_input = true; | |
| 1782 return NEXT_CHAR; | |
| 1783 case '\0': | |
| 1784 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 1785 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false); | |
| 1786 return NEXT_CHAR; | |
| 1787 case -1: | |
| 1788 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF); | |
| 1789 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1790 abandon_current_tag(parser); | |
| 1791 tokenizer->_reconsume_current_input = true; | |
| 1792 return NEXT_CHAR; | |
| 1793 default: | |
| 1794 append_char_to_tag_buffer(parser, c, false); | |
| 1795 return NEXT_CHAR; | |
| 1796 } | |
| 1797 } | |
| 1798 | |
| 1799 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state | |
| 1800 static StateResult handle_attr_value_single_quoted_state(GumboParser* parser, | |
| 1801 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1802 switch (c) { | |
| 1803 case '\'': | |
| 1804 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED); | |
| 1805 return NEXT_CHAR; | |
| 1806 case '&': | |
| 1807 tokenizer->_tag_state._attr_value_state = tokenizer->_state; | |
| 1808 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE); | |
| 1809 tokenizer->_reconsume_current_input = true; | |
| 1810 return NEXT_CHAR; | |
| 1811 case '\0': | |
| 1812 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 1813 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false); | |
| 1814 return NEXT_CHAR; | |
| 1815 case -1: | |
| 1816 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF); | |
| 1817 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1818 abandon_current_tag(parser); | |
| 1819 tokenizer->_reconsume_current_input = true; | |
| 1820 return NEXT_CHAR; | |
| 1821 default: | |
| 1822 append_char_to_tag_buffer(parser, c, false); | |
| 1823 return NEXT_CHAR; | |
| 1824 } | |
| 1825 } | |
| 1826 | |
| 1827 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state | |
| 1828 static StateResult handle_attr_value_unquoted_state(GumboParser* parser, | |
| 1829 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1830 switch (c) { | |
| 1831 case '\t': | |
| 1832 case '\n': | |
| 1833 case '\f': | |
| 1834 case ' ': | |
| 1835 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1836 finish_attribute_value(parser); | |
| 1837 return NEXT_CHAR; | |
| 1838 case '&': | |
| 1839 tokenizer->_tag_state._attr_value_state = tokenizer->_state; | |
| 1840 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE); | |
| 1841 tokenizer->_reconsume_current_input = true; | |
| 1842 return NEXT_CHAR; | |
| 1843 case '>': | |
| 1844 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1845 finish_attribute_value(parser); | |
| 1846 return emit_current_tag(parser, output); | |
| 1847 case '\0': | |
| 1848 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 1849 append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true); | |
| 1850 return NEXT_CHAR; | |
| 1851 case -1: | |
| 1852 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF); | |
| 1853 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1854 tokenizer->_reconsume_current_input = true; | |
| 1855 abandon_current_tag(parser); | |
| 1856 return NEXT_CHAR; | |
| 1857 case '<': | |
| 1858 case '=': | |
| 1859 case '"': | |
| 1860 case '\'': | |
| 1861 case '`': | |
| 1862 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS); | |
| 1863 // Fall through. | |
| 1864 default: | |
| 1865 append_char_to_tag_buffer(parser, c, true); | |
| 1866 return NEXT_CHAR; | |
| 1867 } | |
| 1868 } | |
| 1869 | |
| 1870 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state | |
| 1871 static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser, | |
| 1872 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1873 OneOrTwoCodepoints char_ref; | |
| 1874 int allowed_char; | |
| 1875 bool is_unquoted = false; | |
| 1876 switch (tokenizer->_tag_state._attr_value_state) { | |
| 1877 case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED: | |
| 1878 allowed_char = '"'; | |
| 1879 break; | |
| 1880 case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED: | |
| 1881 allowed_char = '\''; | |
| 1882 break; | |
| 1883 case GUMBO_LEX_ATTR_VALUE_UNQUOTED: | |
| 1884 allowed_char = '>'; | |
| 1885 is_unquoted = true; | |
| 1886 break; | |
| 1887 default: | |
| 1888 // -Wmaybe-uninitialized is a little overzealous here, and doesn't | |
| 1889 // get that the assert(0) means this codepath will never happen. | |
| 1890 allowed_char = ' '; | |
| 1891 assert(0); | |
| 1892 } | |
| 1893 | |
| 1894 // Ignore the status, since we don't have a convenient way of signalling that | |
| 1895 // a parser error has occurred when the error occurs in the middle of a | |
| 1896 // multi-state token. We'd need a flag inside the TokenizerState to do this, | |
| 1897 // but that's a low priority fix. | |
| 1898 consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref); | |
| 1899 if (char_ref.first != kGumboNoChar) { | |
| 1900 tokenizer->_reconsume_current_input = true; | |
| 1901 append_char_to_tag_buffer(parser, char_ref.first, is_unquoted); | |
| 1902 if (char_ref.second != kGumboNoChar) { | |
| 1903 append_char_to_tag_buffer(parser, char_ref.second, is_unquoted); | |
| 1904 } | |
| 1905 } else { | |
| 1906 append_char_to_tag_buffer(parser, '&', is_unquoted); | |
| 1907 } | |
| 1908 gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state); | |
| 1909 return NEXT_CHAR; | |
| 1910 } | |
| 1911 | |
| 1912 // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state | |
| 1913 static StateResult handle_after_attr_value_quoted_state(GumboParser* parser, | |
| 1914 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1915 finish_attribute_value(parser); | |
| 1916 switch (c) { | |
| 1917 case '\t': | |
| 1918 case '\n': | |
| 1919 case '\f': | |
| 1920 case ' ': | |
| 1921 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1922 return NEXT_CHAR; | |
| 1923 case '/': | |
| 1924 gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG); | |
| 1925 return NEXT_CHAR; | |
| 1926 case '>': | |
| 1927 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1928 return emit_current_tag(parser, output); | |
| 1929 case -1: | |
| 1930 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF); | |
| 1931 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1932 abandon_current_tag(parser); | |
| 1933 tokenizer->_reconsume_current_input = true; | |
| 1934 return NEXT_CHAR; | |
| 1935 default: | |
| 1936 tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID); | |
| 1937 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1938 tokenizer->_reconsume_current_input = true; | |
| 1939 return NEXT_CHAR; | |
| 1940 } | |
| 1941 } | |
| 1942 | |
| 1943 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state | |
| 1944 static StateResult handle_self_closing_start_tag_state(GumboParser* parser, | |
| 1945 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1946 switch (c) { | |
| 1947 case '>': | |
| 1948 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1949 tokenizer->_tag_state._is_self_closing = true; | |
| 1950 return emit_current_tag(parser, output); | |
| 1951 case -1: | |
| 1952 tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF); | |
| 1953 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1954 abandon_current_tag(parser); | |
| 1955 return NEXT_CHAR; | |
| 1956 default: | |
| 1957 tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID); | |
| 1958 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME); | |
| 1959 tokenizer->_reconsume_current_input = true; | |
| 1960 return NEXT_CHAR; | |
| 1961 } | |
| 1962 } | |
| 1963 | |
| 1964 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state | |
| 1965 static StateResult handle_bogus_comment_state(GumboParser* parser, | |
| 1966 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1967 while (c != '>' && c != -1) { | |
| 1968 if (c == '\0') { | |
| 1969 c = 0xFFFD; | |
| 1970 } | |
| 1971 append_char_to_temporary_buffer(parser, c); | |
| 1972 utf8iterator_next(&tokenizer->_input); | |
| 1973 c = utf8iterator_current(&tokenizer->_input); | |
| 1974 } | |
| 1975 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 1976 return emit_comment(parser, output); | |
| 1977 } | |
| 1978 | |
| 1979 // Markup declaration open state. Tries, in order, to consume "--" (comment start), "DOCTYPE" (doctype) and, only when the current node is foreign, "[CDATA[" (CDATA section); if none matches, reports GUMBO_ERR_DASHES_OR_DOCTYPE and falls back to the bogus comment state with an emptied temporary buffer. Every branch sets the reconsume flag and the function always returns NEXT_CHAR. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state | |
| 1980 static StateResult handle_markup_declaration_state(GumboParser* parser, | |
| 1981 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 1982 if (utf8iterator_maybe_consume_match( | |
| 1983 &tokenizer->_input, "--", sizeof("--") - 1, true)) { /* NOTE(review): last argument presumably selects case-sensitive matching - confirm against utf8iterator_maybe_consume_match */ | |
| 1984 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START); | |
| 1985 tokenizer->_reconsume_current_input = true; | |
| 1986 } else if (utf8iterator_maybe_consume_match( | |
| 1987 &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) { | |
| 1988 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE); | |
| 1989 tokenizer->_reconsume_current_input = true; | |
| 1990 // If we get here, we know we'll eventually emit a doctype token, so now is | |
| 1991 // the time to initialize the doctype strings. (Not in doctype_state_init, | |
| 1992 // since then they'll leak if ownership never gets transferred to the | |
| 1993 // doctype token.) | |
| 1994 tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, ""); | |
| 1995 tokenizer->_doc_type_state.public_identifier = | |
| 1996 gumbo_copy_stringz(parser, ""); | |
| 1997 tokenizer->_doc_type_state.system_identifier = | |
| 1998 gumbo_copy_stringz(parser, ""); | |
| 1999 } else if (tokenizer->_is_current_node_foreign && | |
| 2000 utf8iterator_maybe_consume_match( | |
| 2001 &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) { | |
| 2002 gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA); | |
| 2003 tokenizer->_is_in_cdata = true; | |
| 2004 tokenizer->_reconsume_current_input = true; | |
| 2005 } else { | |
| 2006 tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE); | |
| 2007 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT); | |
| 2008 tokenizer->_reconsume_current_input = true; | |
| 2009 clear_temporary_buffer(parser); /* the bogus comment state accumulates its text in this buffer */ | |
| 2010 } | |
| 2011 return NEXT_CHAR; | |
| 2012 } | |
| 2013 | |
| 2014 // Comment start state. '-' moves to the comment-start-dash state; NUL is a parse error stored as the replacement character; '>' and -1 (end of input) are parse errors that emit the comment collected so far from the data state and return RETURN_ERROR; any other character is appended to the temporary buffer and the comment state is entered. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state | |
| 2015 static StateResult handle_comment_start_state(GumboParser* parser, | |
| 2016 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2017 switch (c) { | |
| 2018 case '-': | |
| 2019 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH); | |
| 2020 return NEXT_CHAR; | |
| 2021 case '\0': | |
| 2022 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2023 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2024 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2025 return NEXT_CHAR; | |
| 2026 case '>': | |
| 2027 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID); | |
| 2028 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2029 emit_comment(parser, output); /* token is still produced; the error is signalled through the return value */ | |
| 2030 return RETURN_ERROR; | |
| 2031 case -1: /* end of input */ | |
| 2032 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF); | |
| 2033 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2034 emit_comment(parser, output); | |
| 2035 return RETURN_ERROR; | |
| 2036 default: | |
| 2037 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2038 append_char_to_temporary_buffer(parser, c); | |
| 2039 return NEXT_CHAR; | |
| 2040 } | |
| 2041 } | |
| 2042 | |
| 2043 // Comment start dash state (one '-' already seen). A second '-' moves to the comment-end state; '>' and -1 (end of input) are parse errors that emit the comment and return RETURN_ERROR; NUL and any other character put the pending '-' plus the character (U+FFFD for NUL) into the temporary buffer and enter the comment state. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state | |
| 2044 static StateResult handle_comment_start_dash_state(GumboParser* parser, | |
| 2045 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2046 switch (c) { | |
| 2047 case '-': | |
| 2048 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END); | |
| 2049 return NEXT_CHAR; | |
| 2050 case '\0': | |
| 2051 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2052 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2053 append_char_to_temporary_buffer(parser, '-'); /* the dash that led into this state */ | |
| 2054 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2055 return NEXT_CHAR; | |
| 2056 case '>': | |
| 2057 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID); | |
| 2058 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2059 emit_comment(parser, output); | |
| 2060 return RETURN_ERROR; | |
| 2061 case -1: /* end of input */ | |
| 2062 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF); | |
| 2063 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2064 emit_comment(parser, output); | |
| 2065 return RETURN_ERROR; | |
| 2066 default: | |
| 2067 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2068 append_char_to_temporary_buffer(parser, '-'); /* the dash that led into this state */ | |
| 2069 append_char_to_temporary_buffer(parser, c); | |
| 2070 return NEXT_CHAR; | |
| 2071 } | |
| 2072 } | |
| 2073 | |
| 2074 // Comment state. '-' moves to the comment-end-dash state; NUL is a parse error stored as the replacement character; -1 (end of input) is a parse error that emits the comment from the data state and returns RETURN_ERROR; every other character is appended to the temporary buffer. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state | |
| 2075 static StateResult handle_comment_state(GumboParser* parser, | |
| 2076 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2077 switch (c) { | |
| 2078 case '-': | |
| 2079 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH); | |
| 2080 return NEXT_CHAR; | |
| 2081 case '\0': | |
| 2082 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2083 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2084 return NEXT_CHAR; | |
| 2085 case -1: /* end of input */ | |
| 2086 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF); | |
| 2087 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2088 emit_comment(parser, output); | |
| 2089 return RETURN_ERROR; | |
| 2090 default: | |
| 2091 append_char_to_temporary_buffer(parser, c); | |
| 2092 return NEXT_CHAR; | |
| 2093 } | |
| 2094 } | |
| 2095 | |
| 2096 // Comment end dash state (one '-' seen inside a comment). A second '-' moves to the comment-end state; -1 (end of input) is a parse error that emits the comment and returns RETURN_ERROR; NUL and any other character put the pending '-' plus the character (U+FFFD for NUL) into the temporary buffer and return to the comment state. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state | |
| 2097 static StateResult handle_comment_end_dash_state(GumboParser* parser, | |
| 2098 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2099 switch (c) { | |
| 2100 case '-': | |
| 2101 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END); | |
| 2102 return NEXT_CHAR; | |
| 2103 case '\0': | |
| 2104 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2105 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2106 append_char_to_temporary_buffer(parser, '-'); /* the dash that led into this state */ | |
| 2107 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2108 return NEXT_CHAR; | |
| 2109 case -1: /* end of input */ | |
| 2110 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF); | |
| 2111 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2112 emit_comment(parser, output); | |
| 2113 return RETURN_ERROR; | |
| 2114 default: | |
| 2115 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2116 append_char_to_temporary_buffer(parser, '-'); /* the dash that led into this state */ | |
| 2117 append_char_to_temporary_buffer(parser, c); | |
| 2118 return NEXT_CHAR; | |
| 2119 } | |
| 2120 } | |
| 2121 | |
| 2122 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state | |
| 2123 static StateResult handle_comment_end_state(GumboParser* parser, | |
| 2124 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2125 switch (c) { | |
| 2126 case '>': | |
| 2127 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2128 return emit_comment(parser, output); | |
| 2129 case '\0': | |
| 2130 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2131 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2132 append_char_to_temporary_buffer(parser, '-'); | |
| 2133 append_char_to_temporary_buffer(parser, '-'); | |
| 2134 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2135 return NEXT_CHAR; | |
| 2136 case '!': | |
| 2137 tokenizer_add_parse_error( | |
| 2138 parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH); | |
| 2139 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG); | |
| 2140 return NEXT_CHAR; | |
| 2141 case '-': | |
| 2142 tokenizer_add_parse_error( | |
| 2143 parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH); | |
| 2144 append_char_to_temporary_buffer(parser, '-'); | |
| 2145 return NEXT_CHAR; | |
| 2146 case -1: | |
| 2147 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2148 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2149 emit_comment(parser, output); | |
| 2150 return RETURN_ERROR; | |
| 2151 default: | |
| 2152 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID); | |
| 2153 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2154 append_char_to_temporary_buffer(parser, '-'); | |
| 2155 append_char_to_temporary_buffer(parser, '-'); | |
| 2156 append_char_to_temporary_buffer(parser, c); | |
| 2157 return NEXT_CHAR; | |
| 2158 } | |
| 2159 } | |
| 2160 | |
| 2161 // Comment end bang state ("--!" seen inside a comment). '>' emits the comment from the data state; -1 (end of input) reports GUMBO_ERR_COMMENT_END_BANG_EOF, emits the comment and returns RETURN_ERROR; '-' stores the pending "--!" and moves to the comment-end-dash state; NUL and any other character store "--!" plus the character (U+FFFD for NUL) and return to the comment state. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state | |
| 2162 static StateResult handle_comment_end_bang_state(GumboParser* parser, | |
| 2163 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2164 switch (c) { | |
| 2165 case '-': | |
| 2166 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH); | |
| 2167 append_char_to_temporary_buffer(parser, '-'); | |
| 2168 append_char_to_temporary_buffer(parser, '-'); | |
| 2169 append_char_to_temporary_buffer(parser, '!'); /* the new '-' itself is not stored here */ | |
| 2170 return NEXT_CHAR; | |
| 2171 case '>': | |
| 2172 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2173 return emit_comment(parser, output); | |
| 2174 case '\0': | |
| 2175 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2176 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2177 append_char_to_temporary_buffer(parser, '-'); | |
| 2178 append_char_to_temporary_buffer(parser, '-'); | |
| 2179 append_char_to_temporary_buffer(parser, '!'); | |
| 2180 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2181 return NEXT_CHAR; | |
| 2182 case -1: /* end of input */ | |
| 2183 tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF); | |
| 2184 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2185 emit_comment(parser, output); | |
| 2186 return RETURN_ERROR; | |
| 2187 default: | |
| 2188 gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT); | |
| 2189 append_char_to_temporary_buffer(parser, '-'); | |
| 2190 append_char_to_temporary_buffer(parser, '-'); | |
| 2191 append_char_to_temporary_buffer(parser, '!'); | |
| 2192 append_char_to_temporary_buffer(parser, c); | |
| 2193 return NEXT_CHAR; | |
| 2194 } | |
| 2195 } | |
| 2196 | |
| 2197 // DOCTYPE state. Asserts that the temporary buffer is empty on entry. Whitespace (tab, LF, FF, space) moves to the before-DOCTYPE-name state; -1 (end of input) reports GUMBO_ERR_DOCTYPE_EOF, forces quirks mode, emits the doctype and returns RETURN_ERROR; any other character reports GUMBO_ERR_DOCTYPE_SPACE, sets force_quirks and is reconsumed in the before-DOCTYPE-name state. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state | |
| 2198 static StateResult handle_doctype_state(GumboParser* parser, | |
| 2199 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2200 assert(!tokenizer->_temporary_buffer.length); | |
| 2201 switch (c) { | |
| 2202 case '\t': | |
| 2203 case '\n': | |
| 2204 case '\f': | |
| 2205 case ' ': | |
| 2206 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME); | |
| 2207 return NEXT_CHAR; | |
| 2208 case -1: /* end of input */ | |
| 2209 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2210 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2211 tokenizer->_doc_type_state.force_quirks = true; | |
| 2212 emit_doctype(parser, output); | |
| 2213 return RETURN_ERROR; | |
| 2214 default: | |
| 2215 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE); | |
| 2216 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME); | |
| 2217 tokenizer->_reconsume_current_input = true; | |
| 2218 tokenizer->_doc_type_state.force_quirks = true; /* the before-DOCTYPE-name default branch resets this to false once a name character is read */ | |
| 2219 return NEXT_CHAR; | |
| 2220 } | |
| 2221 } | |
| 2222 | |
| 2223 // Before DOCTYPE name state. Whitespace is skipped; NUL is a parse error that starts the name with the replacement character and forces quirks mode; '>' and -1 (end of input) are parse errors that force quirks mode, emit the doctype from the data state and return RETURN_ERROR; any other character is lowercased, appended to the temporary buffer as the first name character, and clears force_quirks. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state | |
| 2224 static StateResult handle_before_doctype_name_state(GumboParser* parser, | |
| 2225 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2226 switch (c) { | |
| 2227 case '\t': | |
| 2228 case '\n': | |
| 2229 case '\f': | |
| 2230 case ' ': | |
| 2231 return NEXT_CHAR; | |
| 2232 case '\0': | |
| 2233 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2234 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME); | |
| 2235 tokenizer->_doc_type_state.force_quirks = true; | |
| 2236 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2237 return NEXT_CHAR; | |
| 2238 case '>': | |
| 2239 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET); | |
| 2240 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2241 tokenizer->_doc_type_state.force_quirks = true; | |
| 2242 emit_doctype(parser, output); | |
| 2243 return RETURN_ERROR; | |
| 2244 case -1: /* end of input */ | |
| 2245 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2246 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2247 tokenizer->_doc_type_state.force_quirks = true; | |
| 2248 emit_doctype(parser, output); | |
| 2249 return RETURN_ERROR; | |
| 2250 default: | |
| 2251 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME); | |
| 2252 tokenizer->_doc_type_state.force_quirks = false; | |
| 2253 append_char_to_temporary_buffer(parser, ensure_lowercase(c)); | |
| 2254 return NEXT_CHAR; | |
| 2255 } | |
| 2256 } | |
| 2257 | |
| 2258 // DOCTYPE name state. Name characters are lowercased and accumulated in the temporary buffer (U+FFFD for NUL, with a parse error). Whitespace, '>' and -1 (end of input) end the name: the previous _doc_type_state.name string is deallocated and finish_temporary_buffer is called on that field; whitespace then moves to the after-DOCTYPE-name state, '>' emits the doctype and returns RETURN_SUCCESS, and end of input forces quirks mode, emits and returns RETURN_ERROR. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state | |
| 2259 static StateResult handle_doctype_name_state(GumboParser* parser, | |
| 2260 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2261 switch (c) { | |
| 2262 case '\t': | |
| 2263 case '\n': | |
| 2264 case '\f': | |
| 2265 case ' ': | |
| 2266 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME); | |
| 2267 gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name); /* release the old string before the field is overwritten (presumably with the buffered name - confirm in finish_temporary_buffer) */ | |
| 2268 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name); | |
| 2269 return NEXT_CHAR; | |
| 2270 case '>': | |
| 2271 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2272 gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name); | |
| 2273 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name); | |
| 2274 emit_doctype(parser, output); | |
| 2275 return RETURN_SUCCESS; | |
| 2276 case '\0': | |
| 2277 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2278 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2279 return NEXT_CHAR; | |
| 2280 case -1: /* end of input */ | |
| 2281 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2282 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2283 tokenizer->_doc_type_state.force_quirks = true; | |
| 2284 gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name); | |
| 2285 finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name); | |
| 2286 emit_doctype(parser, output); | |
| 2287 return RETURN_ERROR; | |
| 2288 default: | |
| 2289 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME); /* re-selects the state this handler belongs to */ | |
| 2290 tokenizer->_doc_type_state.force_quirks = false; | |
| 2291 append_char_to_temporary_buffer(parser, ensure_lowercase(c)); | |
| 2292 return NEXT_CHAR; | |
| 2293 } | |
| 2294 } | |
| 2295 | |
| 2296 // After DOCTYPE name state. Whitespace is skipped; '>' emits the doctype from the data state and returns RETURN_SUCCESS; -1 (end of input) is a parse error that forces quirks mode, emits and returns RETURN_ERROR. Otherwise the input is matched against "PUBLIC" and then "SYSTEM", each selecting its after-keyword state with the reconsume flag set; if neither matches, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET is reported, quirks mode is forced and the bogus doctype state is entered without emitting. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state | |
| 2297 static StateResult handle_after_doctype_name_state(GumboParser* parser, | |
| 2298 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2299 switch (c) { | |
| 2300 case '\t': | |
| 2301 case '\n': | |
| 2302 case '\f': | |
| 2303 case ' ': | |
| 2304 return NEXT_CHAR; | |
| 2305 case '>': | |
| 2306 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2307 emit_doctype(parser, output); | |
| 2308 return RETURN_SUCCESS; | |
| 2309 case -1: /* end of input */ | |
| 2310 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2311 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2312 tokenizer->_doc_type_state.force_quirks = true; | |
| 2313 emit_doctype(parser, output); | |
| 2314 return RETURN_ERROR; | |
| 2315 default: | |
| 2316 if (utf8iterator_maybe_consume_match( | |
| 2317 &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) { /* NOTE(review): false presumably requests case-insensitive matching - confirm against utf8iterator_maybe_consume_match */ | |
| 2318 gumbo_tokenizer_set_state( | |
| 2319 parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD); | |
| 2320 tokenizer->_reconsume_current_input = true; | |
| 2321 } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM", | |
| 2322 sizeof("SYSTEM") - 1, false)) { | |
| 2323 gumbo_tokenizer_set_state( | |
| 2324 parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD); | |
| 2325 tokenizer->_reconsume_current_input = true; | |
| 2326 } else { | |
| 2327 tokenizer_add_parse_error( | |
| 2328 parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET); | |
| 2329 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE); | |
| 2330 tokenizer->_doc_type_state.force_quirks = true; | |
| 2331 } | |
| 2332 return NEXT_CHAR; | |
| 2333 } | |
| 2334 } | |
| 2335 | |
| 2336 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state | |
| 2337 static StateResult handle_after_doctype_public_keyword_state( | |
| 2338 GumboParser* parser, GumboTokenizerState* tokenizer, int c, | |
| 2339 GumboToken* output) { | |
| 2340 switch (c) { | |
| 2341 case '\t': | |
| 2342 case '\n': | |
| 2343 case '\f': | |
| 2344 case ' ': | |
| 2345 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID); | |
| 2346 return NEXT_CHAR; | |
| 2347 case '"': | |
| 2348 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2349 assert(temporary_buffer_equals(parser, "")); | |
| 2350 gumbo_tokenizer_set_state( | |
| 2351 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED); | |
| 2352 return NEXT_CHAR; | |
| 2353 case '\'': | |
| 2354 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2355 assert(temporary_buffer_equals(parser, "")); | |
| 2356 gumbo_tokenizer_set_state( | |
| 2357 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED); | |
| 2358 return NEXT_CHAR; | |
| 2359 case '>': | |
| 2360 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET); | |
| 2361 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2362 tokenizer->_doc_type_state.force_quirks = true; | |
| 2363 emit_doctype(parser, output); | |
| 2364 return RETURN_ERROR; | |
| 2365 case -1: | |
| 2366 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2367 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2368 tokenizer->_doc_type_state.force_quirks = true; | |
| 2369 emit_doctype(parser, output); | |
| 2370 return RETURN_ERROR; | |
| 2371 default: | |
| 2372 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2373 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE); | |
| 2374 tokenizer->_doc_type_state.force_quirks = true; | |
| 2375 emit_doctype(parser, output); | |
| 2376 return RETURN_ERROR; | |
| 2377 } | |
| 2378 } | |
| 2379 | |
| 2380 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state | |
| 2381 static StateResult handle_before_doctype_public_id_state(GumboParser* parser, | |
| 2382 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2383 switch (c) { | |
| 2384 case '\t': | |
| 2385 case '\n': | |
| 2386 case '\f': | |
| 2387 case ' ': | |
| 2388 return NEXT_CHAR; | |
| 2389 case '"': | |
| 2390 assert(temporary_buffer_equals(parser, "")); | |
| 2391 gumbo_tokenizer_set_state( | |
| 2392 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED); | |
| 2393 return NEXT_CHAR; | |
| 2394 case '\'': | |
| 2395 assert(temporary_buffer_equals(parser, "")); | |
| 2396 gumbo_tokenizer_set_state( | |
| 2397 parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED); | |
| 2398 return NEXT_CHAR; | |
| 2399 case '>': | |
| 2400 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END); | |
| 2401 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2402 tokenizer->_doc_type_state.force_quirks = true; | |
| 2403 emit_doctype(parser, output); | |
| 2404 return RETURN_ERROR; | |
| 2405 case -1: | |
| 2406 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2407 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2408 tokenizer->_doc_type_state.force_quirks = true; | |
| 2409 emit_doctype(parser, output); | |
| 2410 return RETURN_ERROR; | |
| 2411 default: | |
| 2412 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2413 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE); | |
| 2414 tokenizer->_doc_type_state.force_quirks = true; | |
| 2415 emit_doctype(parser, output); | |
| 2416 return RETURN_ERROR; | |
| 2417 } | |
| 2418 } | |
| 2419 | |
| 2420 // DOCTYPE public identifier (double-quoted) state. Characters are accumulated in the temporary buffer (U+FFFD for NUL, with a parse error). The closing '"' calls finish_doctype_public_id and moves to the after-public-identifier state; '>' and -1 (end of input) are parse errors that force quirks mode, finish the identifier, emit the doctype from the data state and return RETURN_ERROR. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state | |
| 2421 static StateResult handle_doctype_public_id_double_quoted_state( | |
| 2422 GumboParser* parser, GumboTokenizerState* tokenizer, int c, | |
| 2423 GumboToken* output) { | |
| 2424 switch (c) { | |
| 2425 case '"': | |
| 2426 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID); | |
| 2427 finish_doctype_public_id(parser); | |
| 2428 return NEXT_CHAR; | |
| 2429 case '\0': | |
| 2430 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2431 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2432 return NEXT_CHAR; | |
| 2433 case '>': | |
| 2434 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END); | |
| 2435 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2436 tokenizer->_doc_type_state.force_quirks = true; | |
| 2437 finish_doctype_public_id(parser); /* finish the partial identifier before the token is emitted */ | |
| 2438 emit_doctype(parser, output); | |
| 2439 return RETURN_ERROR; | |
| 2440 case -1: /* end of input */ | |
| 2441 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2442 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2443 tokenizer->_doc_type_state.force_quirks = true; | |
| 2444 finish_doctype_public_id(parser); | |
| 2445 emit_doctype(parser, output); | |
| 2446 return RETURN_ERROR; | |
| 2447 default: | |
| 2448 append_char_to_temporary_buffer(parser, c); | |
| 2449 return NEXT_CHAR; | |
| 2450 } | |
| 2451 } | |
| 2452 | |
| 2453 // DOCTYPE public identifier (single-quoted) state. Characters are accumulated in the temporary buffer (U+FFFD for NUL, with a parse error). The closing '\'' calls finish_doctype_public_id and moves to the after-public-identifier state; '>' and -1 (end of input) are parse errors that force quirks mode, finish the identifier, emit the doctype from the data state and return RETURN_ERROR. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state | |
| 2454 static StateResult handle_doctype_public_id_single_quoted_state( | |
| 2455 GumboParser* parser, GumboTokenizerState* tokenizer, int c, | |
| 2456 GumboToken* output) { | |
| 2457 switch (c) { | |
| 2458 case '\'': | |
| 2459 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID); | |
| 2460 finish_doctype_public_id(parser); | |
| 2461 return NEXT_CHAR; | |
| 2462 case '\0': | |
| 2463 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2464 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2465 return NEXT_CHAR; | |
| 2466 case '>': | |
| 2467 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END); | |
| 2468 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2469 tokenizer->_doc_type_state.force_quirks = true; | |
| 2470 finish_doctype_public_id(parser); /* finish the partial identifier before the token is emitted */ | |
| 2471 emit_doctype(parser, output); | |
| 2472 return RETURN_ERROR; | |
| 2473 case -1: /* end of input */ | |
| 2474 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2475 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2476 tokenizer->_doc_type_state.force_quirks = true; | |
| 2477 finish_doctype_public_id(parser); | |
| 2478 emit_doctype(parser, output); | |
| 2479 return RETURN_ERROR; | |
| 2480 default: | |
| 2481 append_char_to_temporary_buffer(parser, c); | |
| 2482 return NEXT_CHAR; | |
| 2483 } | |
| 2484 } | |
| 2485 | |
| 2486 // After DOCTYPE public identifier state. Whitespace moves to the between-public-and-system-identifiers state; '>' emits the doctype from the data state and returns RETURN_SUCCESS; a quote character here is a parse error but still opens the matching quoted system-identifier state; -1 (end of input) is a parse error that forces quirks mode, emits and returns RETURN_ERROR; any other character is a parse error that forces quirks mode and enters the bogus doctype state. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state | |
| 2487 static StateResult handle_after_doctype_public_id_state(GumboParser* parser, | |
| 2488 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2489 switch (c) { | |
| 2490 case '\t': | |
| 2491 case '\n': | |
| 2492 case '\f': | |
| 2493 case ' ': | |
| 2494 gumbo_tokenizer_set_state( | |
| 2495 parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID); | |
| 2496 return NEXT_CHAR; | |
| 2497 case '>': | |
| 2498 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2499 emit_doctype(parser, output); | |
| 2500 return RETURN_SUCCESS; | |
| 2501 case '"': | |
| 2502 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2503 assert(temporary_buffer_equals(parser, "")); /* the system identifier is accumulated in the temporary buffer, which must start empty */ | |
| 2504 gumbo_tokenizer_set_state( | |
| 2505 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED); | |
| 2506 return NEXT_CHAR; | |
| 2507 case '\'': | |
| 2508 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2509 assert(temporary_buffer_equals(parser, "")); | |
| 2510 gumbo_tokenizer_set_state( | |
| 2511 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED); | |
| 2512 return NEXT_CHAR; | |
| 2513 case -1: /* end of input */ | |
| 2514 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2515 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2516 tokenizer->_reconsume_current_input = true; /* NOTE(review): the other doctype EOF branches nearby do not set the reconsume flag - confirm this one is intended */ | |
| 2517 tokenizer->_doc_type_state.force_quirks = true; | |
| 2518 emit_doctype(parser, output); | |
| 2519 return RETURN_ERROR; | |
| 2520 default: | |
| 2521 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2522 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE); | |
| 2523 tokenizer->_doc_type_state.force_quirks = true; | |
| 2524 return NEXT_CHAR; | |
| 2525 } | |
| 2526 } | |
| 2527 | |
| 2528 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state | |
| 2529 static StateResult handle_between_doctype_public_system_id_state( | |
| 2530 GumboParser* parser, GumboTokenizerState* tokenizer, int c, | |
| 2531 GumboToken* output) { | |
| 2532 switch (c) { | |
| 2533 case '\t': | |
| 2534 case '\n': | |
| 2535 case '\f': | |
| 2536 case ' ': | |
| 2537 return NEXT_CHAR; | |
| 2538 case '>': | |
| 2539 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2540 emit_doctype(parser, output); | |
| 2541 return RETURN_SUCCESS; | |
| 2542 case '"': | |
| 2543 assert(temporary_buffer_equals(parser, "")); | |
| 2544 gumbo_tokenizer_set_state( | |
| 2545 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED); | |
| 2546 return NEXT_CHAR; | |
| 2547 case '\'': | |
| 2548 assert(temporary_buffer_equals(parser, "")); | |
| 2549 gumbo_tokenizer_set_state( | |
| 2550 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED); | |
| 2551 return NEXT_CHAR; | |
| 2552 case -1: | |
| 2553 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2554 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2555 tokenizer->_doc_type_state.force_quirks = true; | |
| 2556 emit_doctype(parser, output); | |
| 2557 return RETURN_ERROR; | |
| 2558 default: | |
| 2559 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2560 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE); | |
| 2561 tokenizer->_doc_type_state.force_quirks = true; | |
| 2562 emit_doctype(parser, output); | |
| 2563 return RETURN_ERROR; | |
| 2564 } | |
| 2565 } | |
| 2566 | |
| 2567 // After DOCTYPE system keyword state. Whitespace moves to the before-system-identifier state; a quote character directly after the keyword is a parse error but still opens the matching quoted system-identifier state; '>' and -1 (end of input) are parse errors that force quirks mode, emit the doctype from the data state and return RETURN_ERROR; any other character is a parse error that forces quirks mode and enters the bogus doctype state. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state | |
| 2568 static StateResult handle_after_doctype_system_keyword_state( | |
| 2569 GumboParser* parser, GumboTokenizerState* tokenizer, int c, | |
| 2570 GumboToken* output) { | |
| 2571 switch (c) { | |
| 2572 case '\t': | |
| 2573 case '\n': | |
| 2574 case '\f': | |
| 2575 case ' ': | |
| 2576 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID); | |
| 2577 return NEXT_CHAR; | |
| 2578 case '"': | |
| 2579 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2580 assert(temporary_buffer_equals(parser, "")); /* the identifier is accumulated in the temporary buffer, which must start empty */ | |
| 2581 gumbo_tokenizer_set_state( | |
| 2582 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED); | |
| 2583 return NEXT_CHAR; | |
| 2584 case '\'': | |
| 2585 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2586 assert(temporary_buffer_equals(parser, "")); | |
| 2587 gumbo_tokenizer_set_state( | |
| 2588 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED); | |
| 2589 return NEXT_CHAR; | |
| 2590 case '>': | |
| 2591 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END); | |
| 2592 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2593 tokenizer->_doc_type_state.force_quirks = true; | |
| 2594 emit_doctype(parser, output); | |
| 2595 return RETURN_ERROR; | |
| 2596 case -1: /* end of input */ | |
| 2597 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2598 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2599 tokenizer->_doc_type_state.force_quirks = true; | |
| 2600 emit_doctype(parser, output); | |
| 2601 return RETURN_ERROR; | |
| 2602 default: | |
| 2603 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2604 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE); | |
| 2605 tokenizer->_doc_type_state.force_quirks = true; | |
| 2606 return NEXT_CHAR; | |
| 2607 } | |
| 2608 } | |
| 2609 | |
| 2610 // Before DOCTYPE system identifier state. Whitespace is skipped; a double or single quote opens the matching quoted system-identifier state; '>' and -1 (end of input) are parse errors that force quirks mode, emit the doctype from the data state and return RETURN_ERROR; any other character is a parse error that forces quirks mode and enters the bogus doctype state. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state | |
| 2611 static StateResult handle_before_doctype_system_id_state(GumboParser* parser, | |
| 2612 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2613 switch (c) { | |
| 2614 case '\t': | |
| 2615 case '\n': | |
| 2616 case '\f': | |
| 2617 case ' ': | |
| 2618 return NEXT_CHAR; | |
| 2619 case '"': | |
| 2620 assert(temporary_buffer_equals(parser, "")); /* the identifier is accumulated in the temporary buffer, which must start empty */ | |
| 2621 gumbo_tokenizer_set_state( | |
| 2622 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED); | |
| 2623 return NEXT_CHAR; | |
| 2624 case '\'': | |
| 2625 assert(temporary_buffer_equals(parser, "")); | |
| 2626 gumbo_tokenizer_set_state( | |
| 2627 parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED); | |
| 2628 return NEXT_CHAR; | |
| 2629 case '>': | |
| 2630 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END); | |
| 2631 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2632 tokenizer->_doc_type_state.force_quirks = true; | |
| 2633 emit_doctype(parser, output); | |
| 2634 return RETURN_ERROR; | |
| 2635 case -1: /* end of input */ | |
| 2636 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2637 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2638 tokenizer->_doc_type_state.force_quirks = true; | |
| 2639 emit_doctype(parser, output); | |
| 2640 return RETURN_ERROR; | |
| 2641 default: | |
| 2642 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2643 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE); | |
| 2644 tokenizer->_doc_type_state.force_quirks = true; | |
| 2645 return NEXT_CHAR; | |
| 2646 } | |
| 2647 } | |
| 2648 | |
| 2649 // DOCTYPE system identifier (double-quoted) state. Characters are accumulated in the temporary buffer (U+FFFD for NUL, with a parse error). The closing '"' calls finish_doctype_system_id and moves to the after-system-identifier state; '>' and -1 (end of input) are parse errors that force quirks mode, finish the identifier, emit the doctype from the data state and return RETURN_ERROR. Spec: http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state | |
| 2650 static StateResult handle_doctype_system_id_double_quoted_state( | |
| 2651 GumboParser* parser, GumboTokenizerState* tokenizer, int c, | |
| 2652 GumboToken* output) { | |
| 2653 switch (c) { | |
| 2654 case '"': | |
| 2655 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID); | |
| 2656 finish_doctype_system_id(parser); | |
| 2657 return NEXT_CHAR; | |
| 2658 case '\0': | |
| 2659 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2660 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2661 return NEXT_CHAR; | |
| 2662 case '>': | |
| 2663 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END); | |
| 2664 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2665 tokenizer->_doc_type_state.force_quirks = true; | |
| 2666 finish_doctype_system_id(parser); /* finish the partial identifier before the token is emitted */ | |
| 2667 emit_doctype(parser, output); | |
| 2668 return RETURN_ERROR; | |
| 2669 case -1: /* end of input */ | |
| 2670 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2671 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2672 tokenizer->_doc_type_state.force_quirks = true; | |
| 2673 finish_doctype_system_id(parser); | |
| 2674 emit_doctype(parser, output); | |
| 2675 return RETURN_ERROR; | |
| 2676 default: | |
| 2677 append_char_to_temporary_buffer(parser, c); | |
| 2678 return NEXT_CHAR; | |
| 2679 } | |
| 2680 } | |
| 2681 | |
| 2682 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state | |
| 2683 static StateResult handle_doctype_system_id_single_quoted_state( | |
| 2684 GumboParser* parser, GumboTokenizerState* tokenizer, int c, | |
| 2685 GumboToken* output) { | |
| 2686 switch (c) { | |
| 2687 case '\'': | |
| 2688 gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID); | |
| 2689 finish_doctype_system_id(parser); | |
| 2690 return NEXT_CHAR; | |
| 2691 case '\0': | |
| 2692 tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); | |
| 2693 append_char_to_temporary_buffer(parser, kUtf8ReplacementChar); | |
| 2694 return NEXT_CHAR; | |
| 2695 case '>': | |
| 2696 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END); | |
| 2697 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2698 tokenizer->_doc_type_state.force_quirks = true; | |
| 2699 finish_doctype_system_id(parser); | |
| 2700 emit_doctype(parser, output); | |
| 2701 return RETURN_ERROR; | |
| 2702 case -1: | |
| 2703 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2704 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2705 tokenizer->_doc_type_state.force_quirks = true; | |
| 2706 finish_doctype_system_id(parser); | |
| 2707 emit_doctype(parser, output); | |
| 2708 return RETURN_ERROR; | |
| 2709 default: | |
| 2710 append_char_to_temporary_buffer(parser, c); | |
| 2711 return NEXT_CHAR; | |
| 2712 } | |
| 2713 } | |
| 2714 | |
| 2715 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state | |
| 2716 static StateResult handle_after_doctype_system_id_state(GumboParser* parser, | |
| 2717 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2718 switch (c) { | |
| 2719 case '\t': | |
| 2720 case '\n': | |
| 2721 case '\f': | |
| 2722 case ' ': | |
| 2723 return NEXT_CHAR; | |
| 2724 case '>': | |
| 2725 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2726 emit_doctype(parser, output); | |
| 2727 return RETURN_SUCCESS; | |
| 2728 case -1: | |
| 2729 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); | |
| 2730 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2731 tokenizer->_doc_type_state.force_quirks = true; | |
| 2732 emit_doctype(parser, output); | |
| 2733 return RETURN_ERROR; | |
| 2734 default: | |
| 2735 tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); | |
| 2736 gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE); | |
| 2737 return NEXT_CHAR; | |
| 2738 } | |
| 2739 } | |
| 2740 | |
| 2741 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state | |
| 2742 static StateResult handle_bogus_doctype_state(GumboParser* parser, | |
| 2743 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2744 if (c == '>' || c == -1) { | |
| 2745 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2746 emit_doctype(parser, output); | |
| 2747 return RETURN_ERROR; | |
| 2748 } | |
| 2749 return NEXT_CHAR; | |
| 2750 } | |
| 2751 | |
| 2752 // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state | |
| 2753 static StateResult handle_cdata_state(GumboParser* parser, | |
| 2754 GumboTokenizerState* tokenizer, int c, GumboToken* output) { | |
| 2755 if (c == -1 || utf8iterator_maybe_consume_match( | |
| 2756 &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) { | |
| 2757 tokenizer->_reconsume_current_input = true; | |
| 2758 reset_token_start_point(tokenizer); | |
| 2759 gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); | |
| 2760 tokenizer->_is_in_cdata = false; | |
| 2761 return NEXT_CHAR; | |
| 2762 } else { | |
| 2763 return emit_current_char(parser, output); | |
| 2764 } | |
| 2765 } | |
| 2766 | |
| 2767 typedef StateResult (*GumboLexerStateFunction)( | |
| 2768 GumboParser*, GumboTokenizerState*, int, GumboToken*); | |
| 2769 | |
| 2770 static GumboLexerStateFunction dispatch_table[] = {handle_data_state, | |
| 2771 handle_char_ref_in_data_state, handle_rcdata_state, | |
| 2772 handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state, | |
| 2773 handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state, | |
| 2774 handle_tag_name_state, handle_rcdata_lt_state, | |
| 2775 handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state, | |
| 2776 handle_rawtext_lt_state, handle_rawtext_end_tag_open_state, | |
| 2777 handle_rawtext_end_tag_name_state, handle_script_lt_state, | |
| 2778 handle_script_end_tag_open_state, handle_script_end_tag_name_state, | |
| 2779 handle_script_escaped_start_state, handle_script_escaped_start_dash_state, | |
| 2780 handle_script_escaped_state, handle_script_escaped_dash_state, | |
| 2781 handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state, | |
| 2782 handle_script_escaped_end_tag_open_state, | |
| 2783 handle_script_escaped_end_tag_name_state, | |
| 2784 handle_script_double_escaped_start_state, | |
| 2785 handle_script_double_escaped_state, handle_script_double_escaped_dash_state, | |
| 2786 handle_script_double_escaped_dash_dash_state, | |
| 2787 handle_script_double_escaped_lt_state, | |
| 2788 handle_script_double_escaped_end_state, handle_before_attr_name_state, | |
| 2789 handle_attr_name_state, handle_after_attr_name_state, | |
| 2790 handle_before_attr_value_state, handle_attr_value_double_quoted_state, | |
| 2791 handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state, | |
| 2792 handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state, | |
| 2793 handle_self_closing_start_tag_state, handle_bogus_comment_state, | |
| 2794 handle_markup_declaration_state, handle_comment_start_state, | |
| 2795 handle_comment_start_dash_state, handle_comment_state, | |
| 2796 handle_comment_end_dash_state, handle_comment_end_state, | |
| 2797 handle_comment_end_bang_state, handle_doctype_state, | |
| 2798 handle_before_doctype_name_state, handle_doctype_name_state, | |
| 2799 handle_after_doctype_name_state, handle_after_doctype_public_keyword_state, | |
| 2800 handle_before_doctype_public_id_state, | |
| 2801 handle_doctype_public_id_double_quoted_state, | |
| 2802 handle_doctype_public_id_single_quoted_state, | |
| 2803 handle_after_doctype_public_id_state, | |
| 2804 handle_between_doctype_public_system_id_state, | |
| 2805 handle_after_doctype_system_keyword_state, | |
| 2806 handle_before_doctype_system_id_state, | |
| 2807 handle_doctype_system_id_double_quoted_state, | |
| 2808 handle_doctype_system_id_single_quoted_state, | |
| 2809 handle_after_doctype_system_id_state, handle_bogus_doctype_state, | |
| 2810 handle_cdata_state}; | |
| 2811 | |
| 2812 bool gumbo_lex(GumboParser* parser, GumboToken* output) { | |
| 2813 // Because of the spec requirements that... | |
| 2814 // | |
| 2815 // 1. Tokens be handled immediately by the parser upon emission. | |
| 2816 // 2. Some states (eg. CDATA, or various error conditions) require the | |
| 2817 // emission of multiple tokens in the same states. | |
| 2818 // 3. The tokenizer often has to reconsume the same character in a different | |
| 2819 // state. | |
| 2820 // | |
| 2821 // ...all state must be held in the GumboTokenizer struct instead of in local | |
| 2822 // variables in this function. That allows us to return from this method with | |
| 2823 // a token, and then immediately jump back to the same state with the same | |
| 2824 // input if we need to return a different token. The various emit_* functions | |
| 2825 // are responsible for changing state (eg. flushing the chardata buffer, | |
| 2826 // reading the next input character) to avoid an infinite loop. | |
| 2827 GumboTokenizerState* tokenizer = parser->_tokenizer_state; | |
| 2828 | |
| 2829 if (tokenizer->_buffered_emit_char != kGumboNoChar) { | |
| 2830 tokenizer->_reconsume_current_input = true; | |
| 2831 emit_char(parser, tokenizer->_buffered_emit_char, output); | |
| 2832 // And now that we've avoided advancing the input, make sure we set | |
| 2833 // _reconsume_current_input back to false to make sure the *next* character | |
| 2834 // isn't consumed twice. | |
| 2835 tokenizer->_reconsume_current_input = false; | |
| 2836 tokenizer->_buffered_emit_char = kGumboNoChar; | |
| 2837 return true; | |
| 2838 } | |
| 2839 | |
| 2840 if (maybe_emit_from_temporary_buffer(parser, output)) { | |
| 2841 return true; | |
| 2842 } | |
| 2843 | |
| 2844 while (1) { | |
| 2845 assert(!tokenizer->_temporary_buffer_emit); | |
| 2846 assert(tokenizer->_buffered_emit_char == kGumboNoChar); | |
| 2847 int c = utf8iterator_current(&tokenizer->_input); | |
| 2848 gumbo_debug( | |
| 2849 "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state); | |
| 2850 StateResult result = | |
| 2851 dispatch_table[tokenizer->_state](parser, tokenizer, c, output); | |
| 2852 // We need to clear reconsume_current_input before returning to prevent | |
| 2853 // certain infinite loop states. | |
| 2854 bool should_advance = !tokenizer->_reconsume_current_input; | |
| 2855 tokenizer->_reconsume_current_input = false; | |
| 2856 | |
| 2857 if (result == RETURN_SUCCESS) { | |
| 2858 return true; | |
| 2859 } else if (result == RETURN_ERROR) { | |
| 2860 return false; | |
| 2861 } | |
| 2862 | |
| 2863 if (should_advance) { | |
| 2864 utf8iterator_next(&tokenizer->_input); | |
| 2865 } | |
| 2866 } | |
| 2867 } | |
| 2868 | |
| 2869 void gumbo_token_destroy(GumboParser* parser, GumboToken* token) { | |
| 2870 if (!token) return; | |
| 2871 | |
| 2872 switch (token->type) { | |
| 2873 case GUMBO_TOKEN_DOCTYPE: | |
| 2874 gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name); | |
| 2875 gumbo_parser_deallocate( | |
| 2876 parser, (void*) token->v.doc_type.public_identifier); | |
| 2877 gumbo_parser_deallocate( | |
| 2878 parser, (void*) token->v.doc_type.system_identifier); | |
| 2879 return; | |
| 2880 case GUMBO_TOKEN_START_TAG: | |
| 2881 for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) { | |
| 2882 GumboAttribute* attr = token->v.start_tag.attributes.data[i]; | |
| 2883 if (attr) { | |
| 2884 // May have been nulled out if this token was merged with another. | |
| 2885 gumbo_destroy_attribute(parser, attr); | |
| 2886 } | |
| 2887 } | |
| 2888 gumbo_parser_deallocate( | |
| 2889 parser, (void*) token->v.start_tag.attributes.data); | |
| 2890 return; | |
| 2891 case GUMBO_TOKEN_COMMENT: | |
| 2892 gumbo_parser_deallocate(parser, (void*) token->v.text); | |
| 2893 return; | |
| 2894 default: | |
| 2895 return; | |
| 2896 } | |
| 2897 } |
