Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/src/tokenizer.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // | |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 // you may not use this file except in compliance with the License. | |
| 5 // You may obtain a copy of the License at | |
| 6 // | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // | |
| 9 // Unless required by applicable law or agreed to in writing, software | |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 // See the License for the specific language governing permissions and | |
| 13 // limitations under the License. | |
| 14 // | |
| 15 // Author: jdtang@google.com (Jonathan Tang) | |
| 16 // | |
| 17 // This contains an implementation of a tokenizer for HTML5. It consumes a | |
| 18 // buffer of UTF-8 characters, and then emits a stream of tokens. | |
| 19 | |
| 20 #ifndef GUMBO_TOKENIZER_H_ | |
| 21 #define GUMBO_TOKENIZER_H_ | |
| 22 | |
| 23 #include <stdbool.h> | |
| 24 #include <stddef.h> | |
| 25 | |
| 26 #include "gumbo.h" | |
| 27 #include "token_type.h" | |
| 28 #include "tokenizer_states.h" | |
| 29 | |
| 30 #ifdef __cplusplus | |
| 31 extern "C" { | |
| 32 #endif | |
| 33 | |
| 34 struct GumboInternalParser; | |
| 35 | |
| 36 // Struct containing all information pertaining to doctype tokens. | |
| 37 typedef struct GumboInternalTokenDocType { | |
| 38 const char* name; | |
| 39 const char* public_identifier; | |
| 40 const char* system_identifier; | |
| 41 bool force_quirks; | |
| 42 // There's no way to tell a 0-length public or system ID apart from the | |
| 43 // absence of a public or system ID, but they're handled differently by the | |
| 44 // spec, so we need bool flags for them. | |
| 45 bool has_public_identifier; | |
| 46 bool has_system_identifier; | |
| 47 } GumboTokenDocType; | |
| 48 | |
| 49 // Struct containing all information pertaining to start tag tokens. | |
| 50 typedef struct GumboInternalTokenStartTag { | |
| 51 GumboTag tag; | |
| 52 GumboVector /* GumboAttribute */ attributes; | |
| 53 bool is_self_closing; | |
| 54 } GumboTokenStartTag; | |
| 55 | |
| 56 // A data structure representing a single token in the input stream. This | |
| 57 // contains an enum for the type, the source position, a GumboStringPiece | |
| 58 // pointing to the original text, and then a union for any parsed data. | |
| 59 typedef struct GumboInternalToken { | |
| 60 GumboTokenType type; | |
| 61 GumboSourcePosition position; | |
| 62 GumboStringPiece original_text; | |
| 63 union { | |
| 64 GumboTokenDocType doc_type; | |
| 65 GumboTokenStartTag start_tag; | |
| 66 GumboTag end_tag; | |
| 67 const char* text; // For comments. | |
| 68 int character; // For character, whitespace, null, and EOF tokens. | |
| 69 } v; | |
| 70 } GumboToken; | |
| 71 | |
| 72 // Initializes the tokenizer state within the GumboParser object, setting up a | |
| 73 // parse of the specified text. | |
| 74 void gumbo_tokenizer_state_init( | |
| 75 struct GumboInternalParser* parser, const char* text, size_t text_length); | |
| 76 | |
| 77 // Destroys the tokenizer state within the GumboParser object, freeing any | |
| 78 // dynamically-allocated structures within it. | |
| 79 void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser); | |
| 80 | |
| 81 // Sets the tokenizer state to the specified value. This is needed by some | |
| 82 // parser states, which alter the state of the tokenizer in response to tags | |
| 83 // seen. | |
| 84 void gumbo_tokenizer_set_state( | |
| 85 struct GumboInternalParser* parser, GumboTokenizerEnum state); | |
| 86 | |
| 87 // Flags whether the current node is a foreign content element. This is | |
| 88 // necessary for the markup declaration open state, where the tokenizer must be | |
| 89 // aware of the state of the parser to properly tokenize bad comment tags. | |
| 90 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state | |
| 91 void gumbo_tokenizer_set_is_current_node_foreign( | |
| 92 struct GumboInternalParser* parser, bool is_foreign); | |
| 93 | |
| 94 // Lexes a single token from the specified buffer, filling the output with the | |
| 95 // parsed GumboToken data structure. Returns true for a successful | |
| 96 // tokenization, false if a parse error occurs. | |
| 97 // | |
| 98 // Example: | |
| 99 // struct GumboInternalParser parser; | |
| 100 // GumboToken output; | |
| 101 // gumbo_tokenizer_state_init(&parser, text, strlen(text)); | |
| 102 // while (gumbo_lex(&parser, &output)) { | |
| 103 // ...do stuff with output. | |
| 104 // gumbo_token_destroy(&parser, &output); | |
| 105 // } | |
| 106 // gumbo_tokenizer_state_destroy(&parser); | |
| 107 bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output); | |
| 108 | |
| 109 // Frees the internally-allocated pointers within a GumboToken. Note that this | |
| 110 // doesn't free the token itself, since oftentimes it will be allocated on the | |
| 111 // stack. A simple call to free() (or GumboParser->deallocator, if | |
| 112 // appropriate) can handle that. | |
| 113 // | |
| 114 // Note that if you are handing over ownership of the internal strings to some | |
| 115 // other data structure - for example, a parse tree - these do not need to be | |
| 116 // freed. | |
| 117 void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token); | |
| 118 | |
| 119 #ifdef __cplusplus | |
| 120 } | |
| 121 #endif | |
| 122 | |
| 123 #endif // GUMBO_TOKENIZER_H_ |
