comparison mupdf-source/thirdparty/gumbo-parser/src/tokenizer.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // This contains an implementation of a tokenizer for HTML5. It consumes a
18 // buffer of UTF-8 characters, and then emits a stream of tokens.
19
20 #ifndef GUMBO_TOKENIZER_H_
21 #define GUMBO_TOKENIZER_H_
22
23 #include <stdbool.h>
24 #include <stddef.h>
25
26 #include "gumbo.h"
27 #include "token_type.h"
28 #include "tokenizer_states.h"
29
30 #ifdef __cplusplus
31 extern "C" {
32 #endif
33
34 struct GumboInternalParser;
35
36 // Struct containing all information pertaining to doctype tokens.
37 typedef struct GumboInternalTokenDocType {
38 const char* name;
39 const char* public_identifier;
40 const char* system_identifier;
41 bool force_quirks;
42 // There's no way to tell a 0-length public or system ID apart from the
43 // absence of a public or system ID, but they're handled differently by the
44 // spec, so we need bool flags for them.
45 bool has_public_identifier;
46 bool has_system_identifier;
47 } GumboTokenDocType;
48
49 // Struct containing all information pertaining to start tag tokens: the tag, its attribute vector, and the self-closing flag.
50 typedef struct GumboInternalTokenStartTag {
51 GumboTag tag;
52 GumboVector /* GumboAttribute */ attributes;
53 bool is_self_closing;
54 } GumboTokenStartTag;
55
56 // A data structure representing a single token in the input stream. This
57 // contains an enum for the type, the source position, a GumboStringPiece
58 // pointing to the original text, and then a union for any parsed data.
59 typedef struct GumboInternalToken {
60 GumboTokenType type;
61 GumboSourcePosition position;
62 GumboStringPiece original_text;
63 union { // Parsed data; which member is meaningful depends on the token type.
64 GumboTokenDocType doc_type; // For doctype tokens.
65 GumboTokenStartTag start_tag; // For start tag tokens.
66 GumboTag end_tag; // For end tag tokens; only the tag itself is stored.
67 const char* text; // For comments.
68 int character; // For character, whitespace, null, and EOF tokens.
69 } v;
70 } GumboToken;
71
72 // Initializes the tokenizer state within the GumboParser object, setting up a
73 // parse of the specified text (the buffer `text`, with its length passed as `text_length`).
74 void gumbo_tokenizer_state_init(
75 struct GumboInternalParser* parser, const char* text, size_t text_length);
76
77 // Destroys the tokenizer state within the GumboParser object, freeing any
78 // dynamically-allocated structures within it. Paired with gumbo_tokenizer_state_init() in the gumbo_lex() usage example.
79 void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser);
80
81 // Sets the tokenizer state to the specified value (`state`). This is needed by some
82 // parser states, which alter the state of the tokenizer in response to tags
83 // seen.
84 void gumbo_tokenizer_set_state(
85 struct GumboInternalParser* parser, GumboTokenizerEnum state);
86
87 // Flags whether the current node is a foreign content element. This is
88 // necessary for the markup declaration open state, where the tokenizer must be
89 // aware of the state of the parser to properly tokenize bad comment tags.
90 // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
91 void gumbo_tokenizer_set_is_current_node_foreign(
92 struct GumboInternalParser* parser, bool is_foreign);
93
94 // Lexes a single token from the specified buffer, filling the output with the
95 // parsed GumboToken data structure. Returns true for a successful
96 // tokenization, false if a parse error occurs.
97 //
98 // Example:
99 // struct GumboInternalParser parser;
100 // GumboToken output;
101 // gumbo_tokenizer_state_init(&parser, text, strlen(text));
102 // while (gumbo_lex(&parser, &output)) {
103 // ...do stuff with output.
104 // gumbo_token_destroy(&parser, &output);
105 // }
106 // gumbo_tokenizer_state_destroy(&parser);
107 bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
108
109 // Frees the internally-allocated pointers within a GumboToken. Note that this
110 // doesn't free the token itself, since oftentimes it will be allocated on the
111 // stack. A simple call to free() (or GumboParser->deallocator, if
112 // appropriate) can handle that.
113 //
114 // Note that if you are handing over ownership of the internal strings to some
115 // other data structure - for example, a parse tree - these do not need to be
116 // freed.
117 void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token);
118
119 #ifdef __cplusplus
120 }
121 #endif
122
123 #endif // GUMBO_TOKENIZER_H_