Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/src/gumbo.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // | |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 // you may not use this file except in compliance with the License. | |
| 5 // You may obtain a copy of the License at | |
| 6 // | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // | |
| 9 // Unless required by applicable law or agreed to in writing, software | |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 // See the License for the specific language governing permissions and | |
| 13 // limitations under the License. | |
| 14 // | |
| 15 // Author: jdtang@google.com (Jonathan Tang) | |
| 16 // | |
| 17 // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and | |
| 18 // GUMBO_ as a prefix for enum constants (static constants get the Google-style | |
| 19 // kGumbo prefix). | |
| 20 | |
| 21 /** | |
| 22 * @file | |
| 23 * @mainpage Gumbo HTML Parser | |
| 24 * | |
| 25 * This provides a conformant, no-dependencies implementation of the HTML5 | |
| 26 * parsing algorithm. It supports only UTF8; if you need to parse a different | |
| 27 * encoding, run a preprocessing step to convert to UTF8. It returns a parse | |
| 28 * tree made of the structs in this file. | |
| 29 * | |
| 30 * Example: | |
| 31 * @code | |
| 32 * GumboOutput* output = gumbo_parse(input); | |
| 33 * do_something_with_doctype(output->document); | |
| 34 * do_something_with_html_tree(output->root); | |
| 35 * gumbo_destroy_output(&kGumboDefaultOptions, output); | |
| 36 * @endcode | |
| 37 * HTML5 Spec: | |
| 38 * | |
| 39 * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html | |
| 40 */ | |
| 41 | |
| 42 #ifndef GUMBO_GUMBO_H_ | |
| 43 #define GUMBO_GUMBO_H_ | |
| 44 | |
| 45 #ifdef _MSC_VER | |
| 46 #ifndef _CRT_SECURE_NO_WARNINGS | |
| 47 #define _CRT_SECURE_NO_WARNINGS | |
| 48 #endif | |
| 49 #define fileno _fileno | |
| 50 #endif | |
| 51 | |
| 52 #include <stdbool.h> | |
| 53 #include <stddef.h> | |
| 54 | |
| 55 #ifdef __cplusplus | |
| 56 extern "C" { | |
| 57 #endif | |
| 58 | |
| 59 /** | |
| 60 * A struct representing a character position within the original text buffer. | |
| 61 * Line and column numbers are 1-based and offsets are 0-based, which matches | |
| 62 * how most editors and command-line tools work. Also, columns measure | |
| 63 * positions in terms of characters while offsets measure by bytes; this is | |
| 64 * because the offset field is often used to pull out a particular region of | |
| 65 * text (which in most languages that bind to C implies pointer arithmetic on a | |
| 66 * buffer of bytes), while the column field is often used to reference a | |
| 67 * particular column on a printable display, which nowadays is usually UTF-8. | |
| 68 */ | |
| 69 typedef struct { | |
| 70 unsigned int line; | |
| 71 unsigned int column; | |
| 72 unsigned int offset; | |
| 73 } GumboSourcePosition; | |
| 74 | |
| 75 /** | |
| 76 * A SourcePosition used for elements that have no source position, i.e. | |
| 77 * parser-inserted elements. | |
| 78 */ | |
| 79 extern const GumboSourcePosition kGumboEmptySourcePosition; | |
| 80 | |
| 81 /** | |
| 82 * A struct representing a string or part of a string. Strings within the | |
| 83 * parser are represented by a char* and a length; the char* points into | |
| 84 * an existing data buffer owned by some other code (often the original input). | |
| 85 * GumboStringPieces are assumed (by convention) to be immutable, because they | |
| 86 * may share data. Use GumboStringBuffer if you need to construct a string. | |
| 87 * Clients should assume that it is not NUL-terminated, and should always use | |
| 88 * explicit lengths when manipulating them. | |
| 89 */ | |
| 90 typedef struct { | |
| 91 /** A pointer to the beginning of the string. NULL iff length == 0. */ | |
| 92 const char* data; | |
| 93 | |
| 94 /** The length of the string fragment, in bytes. May be zero. */ | |
| 95 size_t length; | |
| 96 } GumboStringPiece; | |
| 97 | |
| 98 /** A constant to represent a 0-length null string. */ | |
| 99 extern const GumboStringPiece kGumboEmptyString; | |
| 100 | |
| 101 /** | |
| 102 * Compares two GumboStringPieces, and returns true if they're equal or false | |
| 103 * otherwise. | |
| 104 */ | |
| 105 bool gumbo_string_equals( | |
| 106 const GumboStringPiece* str1, const GumboStringPiece* str2); | |
| 107 | |
| 108 /** | |
| 109 * Compares two GumboStringPieces ignoring case, and returns true if they're | |
| 110 * equal or false otherwise. | |
| 111 */ | |
| 112 bool gumbo_string_equals_ignore_case( | |
| 113 const GumboStringPiece* str1, const GumboStringPiece* str2); | |
| 114 | |
| 115 /** | |
| 116 * A simple vector implementation. This stores a pointer to a data array and a | |
| 117 * length. All elements are stored as void*; client code must cast to the | |
| 118 * appropriate type. Overflows upon addition result in reallocation of the data | |
| 119 * array, with the size doubling to maintain O(1) amortized cost. There is no | |
| 120 * removal function, as this isn't needed for any of the operations within this | |
| 121 * library. Iteration can be done through inspecting the structure directly in | |
| 122 * a for-loop. | |
| 123 */ | |
| 124 typedef struct { | |
| 125 /** Data elements. This points to a dynamically-allocated array of capacity | |
| 126 * elements, each a void* to the element itself. | |
| 127 */ | |
| 128 void** data; | |
| 129 | |
| 130 /** Number of elements currently in the vector. */ | |
| 131 unsigned int length; | |
| 132 | |
| 133 /** Current array capacity. */ | |
| 134 unsigned int capacity; | |
| 135 } GumboVector; | |
| 136 | |
| 137 /** An empty (0-length, 0-capacity) GumboVector. */ | |
| 138 extern const GumboVector kGumboEmptyVector; | |
| 139 | |
| 140 /** | |
| 141 * Returns the first index at which an element appears in this vector (testing | |
| 142 * by pointer equality), or -1 if it never does. | |
| 143 */ | |
| 144 int gumbo_vector_index_of(GumboVector* vector, const void* element); | |
| 145 | |
| 146 /** | |
| 147 * An enum for all the tags defined in the HTML5 standard. These correspond to | |
| 148 * the tag names themselves. Enum constants exist only for tags which appear in | |
| 149 * the spec itself (or for tags with special handling in the SVG and MathML | |
| 150 * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag | |
| 151 * name can be obtained through original_tag. | |
| 152 * | |
| 153 * This is mostly for API convenience, so that clients of this library don't | |
| 154 * need to perform a strcasecmp to find the normalized tag name. It also has | |
| 155 * efficiency benefits, by letting the parser work with enums instead of | |
| 156 * strings. | |
| 157 */ | |
| 158 typedef enum { | |
| 159 // Load all the tags from an external source, generated from tag.in. | |
| 160 #include "tag_enum.h" | |
| 161 // Used for all tags that don't have special handling in HTML. Add new tags | |
| 162 // to the end of tag.in so as to preserve backwards-compatibility. | |
| 163 GUMBO_TAG_UNKNOWN, | |
| 164 // A marker value to indicate the end of the enum, for iterating over it. | |
| 165 // Also used as the terminator for varargs functions that take tags. | |
| 166 GUMBO_TAG_LAST, | |
| 167 } GumboTag; | |
| 168 | |
| 169 /** | |
| 170 * Returns the normalized (usually all-lowercased, except for foreign content) | |
| 171 * tag name for a GumboTag enum. Return value is static data owned by the | |
| 172 * library. | |
| 173 */ | |
| 174 const char* gumbo_normalized_tagname(GumboTag tag); | |
| 175 | |
| 176 /** | |
| 177 * Extracts the tag name from the original_text field of an element or token by | |
| 178 * stripping off </> characters and attributes and adjusting the passed-in | |
| 179 * GumboStringPiece appropriately. The tag name is in the original case and | |
| 180 * shares a buffer with the original text, to simplify memory management. | |
| 181 * Behavior is undefined if a string-piece that doesn't represent an HTML tag | |
| 182 * (<tagname> or </tagname>) is passed in. If the string piece is completely | |
| 183 * empty (NULL data pointer), then this function will exit successfully as a | |
| 184 * no-op. | |
| 185 */ | |
| 186 void gumbo_tag_from_original_text(GumboStringPiece* text); | |
| 187 | |
| 188 /** | |
| 189 * Fixes the case of SVG elements that are not all lowercase. | |
| 190 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign | |
| 191 * This is not done at parse time because there's no place to store a mutated | |
| 192 * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags | |
| 193 * without special handling), while original_tag_name is a pointer into the | |
| 194 * original buffer. Instead, we provide this helper function that clients can | |
| 195 * use to rename SVG tags as appropriate. | |
| 196 * Returns the case-normalized SVG tagname if a replacement is found, or NULL if | |
| 197 * no normalization is called for. The return value is static data and owned by | |
| 198 * the library. | |
| 199 */ | |
| 200 const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname); | |
| 201 | |
| 202 /** | |
| 203 * Converts a tag name string (which may be in upper or mixed case) to a tag | |
| 204 * enum. The `gumbo_tag_enum` version expects `tagname` to be NUL-terminated. | |
| 205 */ | |
| 206 GumboTag gumbo_tag_enum(const char* tagname); | |
| 207 GumboTag gumbo_tagn_enum(const char* tagname, size_t length); | |
| 208 | |
| 209 /** | |
| 210 * Attribute namespaces. | |
| 211 * HTML includes special handling for XLink, XML, and XMLNS namespaces on | |
| 212 * attributes. Everything else goes in the generic "NONE" namespace. | |
| 213 */ | |
| 214 typedef enum { | |
| 215 GUMBO_ATTR_NAMESPACE_NONE, | |
| 216 GUMBO_ATTR_NAMESPACE_XLINK, | |
| 217 GUMBO_ATTR_NAMESPACE_XML, | |
| 218 GUMBO_ATTR_NAMESPACE_XMLNS, | |
| 219 } GumboAttributeNamespaceEnum; | |
| 220 | |
| 221 /** | |
| 222 * A struct representing a single attribute on an HTML tag. This is a | |
| 223 * name-value pair, but also includes information about source locations and | |
| 224 * original source text. | |
| 225 */ | |
| 226 typedef struct { | |
| 227 /** | |
| 228 * The namespace for the attribute. This will usually be | |
| 229 * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special | |
| 230 * values, per: | |
| 231 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes | |
| 232 */ | |
| 233 GumboAttributeNamespaceEnum attr_namespace; | |
| 234 | |
| 235 /** | |
| 236 * The name of the attribute. This is in a freshly-allocated buffer to deal | |
| 237 * with case-normalization, and is null-terminated. | |
| 238 */ | |
| 239 const char* name; | |
| 240 | |
| 241 /** | |
| 242 * The original text of the attribute name, as a pointer into the original | |
| 243 * source buffer. | |
| 244 */ | |
| 245 GumboStringPiece original_name; | |
| 246 | |
| 247 /** | |
| 248 * The value of the attribute. This is in a freshly-allocated buffer to deal | |
| 249 * with unescaping, and is null-terminated. It does not include any quotes | |
| 250 * that surround the attribute. If the attribute has no value (for example, | |
| 251 * 'selected' on a checkbox), this will be an empty string. | |
| 252 */ | |
| 253 const char* value; | |
| 254 | |
| 255 /** | |
| 256 * The original text of the value of the attribute. This points into the | |
| 257 * original source buffer. It includes any quotes that surround the | |
| 258 * attribute, and you can look at original_value.data[0] and | |
| 259 * original_value.data[original_value.length - 1] to determine what the quote | |
| 260 * characters were. If the attribute has no value, this will be a 0-length | |
| 261 * string. | |
| 262 */ | |
| 263 GumboStringPiece original_value; | |
| 264 | |
| 265 /** The starting position of the attribute name. */ | |
| 266 GumboSourcePosition name_start; | |
| 267 | |
| 268 /** | |
| 269 * The ending position of the attribute name. This is not always derivable | |
| 270 * from the starting position of the value because of the possibility of | |
| 271 * whitespace around the = sign. | |
| 272 */ | |
| 273 GumboSourcePosition name_end; | |
| 274 | |
| 275 /** The starting position of the attribute value. */ | |
| 276 GumboSourcePosition value_start; | |
| 277 | |
| 278 /** The ending position of the attribute value. */ | |
| 279 GumboSourcePosition value_end; | |
| 280 } GumboAttribute; | |
| 281 | |
| 282 /** | |
| 283 * Given a vector of GumboAttributes, look up the one with the specified name | |
| 284 * and return it, or NULL if no such attribute exists. This uses a | |
| 285 * case-insensitive match, as HTML is case-insensitive. | |
| 286 */ | |
| 287 GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name); | |
| 288 | |
| 289 /** | |
| 290 * Enum denoting the type of node. This determines the type of the node.v | |
| 291 * union. | |
| 292 */ | |
| 293 typedef enum { | |
| 294 /** Document node. v will be a GumboDocument. */ | |
| 295 GUMBO_NODE_DOCUMENT, | |
| 296 /** Element node. v will be a GumboElement. */ | |
| 297 GUMBO_NODE_ELEMENT, | |
| 298 /** Text node. v will be a GumboText. */ | |
| 299 GUMBO_NODE_TEXT, | |
| 300 /** CDATA node. v will be a GumboText. */ | |
| 301 GUMBO_NODE_CDATA, | |
| 302 /** Comment node. v will be a GumboText, excluding comment delimiters. */ | |
| 303 GUMBO_NODE_COMMENT, | |
| 304 /** Text node, where all contents is whitespace. v will be a GumboText. */ | |
| 305 GUMBO_NODE_WHITESPACE, | |
| 306 /** Template node. This is separate from GUMBO_NODE_ELEMENT because many | |
| 307 * client libraries will want to ignore the contents of template nodes, as | |
| 308 * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing | |
| 309 * here, while clients that want to include template contents should also | |
| 310 * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */ | |
| 311 GUMBO_NODE_TEMPLATE | |
| 312 } GumboNodeType; | |
| 313 | |
| 314 /** | |
| 315 * Forward declaration of GumboNode so it can be used recursively in | |
| 316 * GumboNode.parent. | |
| 317 */ | |
| 318 typedef struct GumboInternalNode GumboNode; | |
| 319 | |
| 320 /** | |
| 321 * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode | |
| 322 */ | |
| 323 typedef enum { | |
| 324 GUMBO_DOCTYPE_NO_QUIRKS, | |
| 325 GUMBO_DOCTYPE_QUIRKS, | |
| 326 GUMBO_DOCTYPE_LIMITED_QUIRKS | |
| 327 } GumboQuirksModeEnum; | |
| 328 | |
| 329 /** | |
| 330 * Namespaces. | |
| 331 * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather, | |
| 332 * anything inside an <svg> tag is in the SVG namespace, anything inside the | |
| 333 * <math> tag is in the MathML namespace, and anything else is inside the HTML | |
| 334 * namespace. No other namespaces are supported, so this can be an enum only. | |
| 335 */ | |
| 336 typedef enum { | |
| 337 GUMBO_NAMESPACE_HTML, | |
| 338 GUMBO_NAMESPACE_SVG, | |
| 339 GUMBO_NAMESPACE_MATHML | |
| 340 } GumboNamespaceEnum; | |
| 341 | |
| 342 /** | |
| 343 * Parse flags. | |
| 344 * We track the reasons for parser insertion of nodes and store them in a | |
| 345 * bitvector in the node itself. This lets client code optimize out nodes that | |
| 346 * are implied by the HTML structure of the document, or flag constructs that | |
| 347 * may not be allowed by a style guide, or track the prevalence of incorrect or | |
| 348 * tricky HTML code. | |
| 349 */ | |
| 350 typedef enum { | |
| 351 /** | |
| 352 * A normal node - both start and end tags appear in the source, nothing has | |
| 353 * been reparented. | |
| 354 */ | |
| 355 GUMBO_INSERTION_NORMAL = 0, | |
| 356 | |
| 357 /** | |
| 358 * A node inserted by the parser to fulfill some implicit insertion rule. | |
| 359 * This is usually set in addition to some other flag giving a more specific | |
| 360 * insertion reason; it's a generic catch-all term meaning "The start tag for | |
| 361 * this node did not appear in the document source". | |
| 362 */ | |
| 363 GUMBO_INSERTION_BY_PARSER = 1 << 0, | |
| 364 | |
| 365 /** | |
| 366 * A flag indicating that the end tag for this node did not appear in the | |
| 367 * document source. Note that in some cases, you can still have | |
| 368 * parser-inserted nodes with an explicit end tag: for example, "Text</html>" | |
| 369 * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but | |
| 370 * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually | |
| 371 * exists. This flag will be set only if the end tag is completely missing; | |
| 372 * in some cases, the end tag may be misplaced (eg. a </body> tag with text | |
| 373 * afterwards), which will leave this flag unset and require clients to | |
| 374 * inspect the parse errors for that case. | |
| 375 */ | |
| 376 GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1, | |
| 377 | |
| 378 // Value 1 << 2 was for a flag that has since been removed. | |
| 379 | |
| 380 /** | |
| 381 * A flag for nodes that are inserted because their presence is implied by | |
| 382 * other tags, eg. <html>, <head>, <body>, <tbody>, etc. | |
| 383 */ | |
| 384 GUMBO_INSERTION_IMPLIED = 1 << 3, | |
| 385 | |
| 386 /** | |
| 387 * A flag for nodes that are converted from their end tag equivalents. For | |
| 388 * example, </p> when no paragraph is open implies that the parser should | |
| 389 * create a <p> tag and immediately close it, while </br> means the same thing | |
| 390 * as <br>. | |
| 391 */ | |
| 392 GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4, | |
| 393 | |
| 394 /** A flag for nodes that are converted from the parse of an <isindex> tag. */ | |
| 395 GUMBO_INSERTION_FROM_ISINDEX = 1 << 5, | |
| 396 | |
| 397 /** A flag for <image> tags that are rewritten as <img>. */ | |
| 398 GUMBO_INSERTION_FROM_IMAGE = 1 << 6, | |
| 399 | |
| 400 /** | |
| 401 * A flag for nodes that are cloned as a result of the reconstruction of | |
| 402 * active formatting elements. This is set only on the clone; the initial | |
| 403 * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG. | |
| 404 */ | |
| 405 GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7, | |
| 406 | |
| 407 /** A flag for nodes that are cloned by the adoption agency algorithm. */ | |
| 408 GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8, | |
| 409 | |
| 410 /** A flag for nodes that are moved by the adoption agency algorithm. */ | |
| 411 GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9, | |
| 412 | |
| 413 /** | |
| 414 * A flag for nodes that have been foster-parented out of a table (or | |
| 415 * should've been foster-parented, if verbatim mode is set). | |
| 416 */ | |
| 417 GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10, | |
| 418 } GumboParseFlags; | |
| 419 | |
| 420 /** | |
| 421 * Information specific to document nodes. | |
| 422 */ | |
| 423 typedef struct { | |
| 424 /** | |
| 425 * An array of GumboNodes, containing the children of this document. This will | |
| 426 * normally consist of the <html> element and any comment nodes found. | |
| 427 * Pointers are owned. | |
| 428 */ | |
| 429 GumboVector /* GumboNode* */ children; | |
| 430 | |
| 431 // True if there was an explicit doctype token as opposed to it being omitted. | |
| 432 bool has_doctype; | |
| 433 | |
| 434 // Fields from the doctype token, copied verbatim. | |
| 435 const char* name; | |
| 436 const char* public_identifier; | |
| 437 const char* system_identifier; | |
| 438 | |
| 439 /** | |
| 440 * Whether or not the document is in QuirksMode, as determined by the values | |
| 441 * in the GumboTokenDocType template. | |
| 442 */ | |
| 443 GumboQuirksModeEnum doc_type_quirks_mode; | |
| 444 } GumboDocument; | |
| 445 | |
| 446 /** | |
| 447 * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE nodes. | |
| 448 * This contains just a block of text and its position. | |
| 449 */ | |
| 450 typedef struct { | |
| 451 /** | |
| 452 * The text of this node, after entities have been parsed and decoded. For | |
| 453 * comment/cdata nodes, this does not include the comment delimiters. | |
| 454 */ | |
| 455 const char* text; | |
| 456 | |
| 457 /** | |
| 458 * The original text of this node, as a pointer into the original buffer. For | |
| 459 * comment/cdata nodes, this includes the comment delimiters. | |
| 460 */ | |
| 461 GumboStringPiece original_text; | |
| 462 | |
| 463 /** | |
| 464 * The starting position of this node. This corresponds to the position of | |
| 465 * original_text, before entities are decoded. | |
| 466 * */ | |
| 467 GumboSourcePosition start_pos; | |
| 468 } GumboText; | |
| 469 | |
| 470 /** | |
| 471 * The struct used to represent all HTML elements. This contains information | |
| 472 * about the tag, attributes, and child nodes. | |
| 473 */ | |
| 474 typedef struct { | |
| 475 /** | |
| 476 * An array of GumboNodes, containing the children of this element. Pointers | |
| 477 * are owned. | |
| 478 */ | |
| 479 GumboVector /* GumboNode* */ children; | |
| 480 | |
| 481 /** The GumboTag enum for this element. */ | |
| 482 GumboTag tag; | |
| 483 | |
| 484 /** The GumboNamespaceEnum for this element. */ | |
| 485 GumboNamespaceEnum tag_namespace; | |
| 486 | |
| 487 /** | |
| 488 * A GumboStringPiece pointing to the original tag text for this element, | |
| 489 * pointing directly into the source buffer. If the tag was inserted | |
| 490 * algorithmically (for example, <head> or <tbody> insertion), this will be a | |
| 491 * zero-length string. | |
| 492 */ | |
| 493 GumboStringPiece original_tag; | |
| 494 | |
| 495 /** | |
| 496 * A GumboStringPiece pointing to the original end tag text for this element. | |
| 497 * If the end tag was inserted algorithmically (for example, closing a | |
| 498 * self-closing tag), this will be a zero-length string. | |
| 499 */ | |
| 500 GumboStringPiece original_end_tag; | |
| 501 | |
| 502 /** The source position for the start of the start tag. */ | |
| 503 GumboSourcePosition start_pos; | |
| 504 | |
| 505 /** The source position for the start of the end tag. */ | |
| 506 GumboSourcePosition end_pos; | |
| 507 | |
| 508 /** | |
| 509 * An array of GumboAttributes, containing the attributes for this tag in the | |
| 510 * order that they were parsed. Pointers are owned. | |
| 511 */ | |
| 512 GumboVector /* GumboAttribute* */ attributes; | |
| 513 } GumboElement; | |
| 514 | |
| 515 /** | |
| 516 * A supertype for GumboDocument, GumboElement and GumboText, so that we can use | |
| 517 * one generic type in lists of children and cast as necessary to subtypes. | |
| 518 */ | |
| 519 struct GumboInternalNode { | |
| 520 /** The type of node that this is. */ | |
| 521 GumboNodeType type; | |
| 522 | |
| 523 /** Pointer back to parent node. Not owned. */ | |
| 524 GumboNode* parent; | |
| 525 | |
| 526 /** The index within the parent's children vector of this node. */ | |
| 527 size_t index_within_parent; | |
| 528 | |
| 529 /** | |
| 530 * A bitvector of flags containing information about why this element was | |
| 531 * inserted into the parse tree, including a variety of special parse | |
| 532 * situations. | |
| 533 */ | |
| 534 GumboParseFlags parse_flags; | |
| 535 | |
| 536 /** The actual node data. */ | |
| 537 union { | |
| 538 GumboDocument document; // For GUMBO_NODE_DOCUMENT. | |
| 539 GumboElement element; // For GUMBO_NODE_ELEMENT. | |
| 540 GumboText text; // For everything else. | |
| 541 } v; | |
| 542 }; | |
| 543 | |
| 544 /** | |
| 545 * The type for an allocator function. Takes the 'userdata' member of the | |
| 546 * GumboOptions struct as its first argument. Semantics should be the same as | |
| 547 * malloc, i.e. return a block of size_t bytes on success or NULL on failure. | |
| 548 * Allocating a block of 0 bytes behaves as per malloc. | |
| 549 */ | |
| 550 // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition. | |
| 551 typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size); | |
| 552 | |
| 553 /** | |
| 554 * The type for a deallocator function. Takes the 'userdata' member of the | |
| 555 * GumboOptions struct as its first argument. | |
| 556 */ | |
| 557 typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr); | |
| 558 | |
| 559 /** | |
| 560 * Input struct containing configuration options for the parser. | |
| 561 * These let you specify alternate memory managers, provide different error | |
| 562 * handling, etc. | |
| 563 * Use kGumboDefaultOptions for sensible defaults, and only set what you need. | |
| 564 */ | |
| 565 typedef struct GumboInternalOptions { | |
| 566 /** A memory allocator function. Default: malloc. */ | |
| 567 GumboAllocatorFunction allocator; | |
| 568 | |
| 569 /** A memory deallocator function. Default: free. */ | |
| 570 GumboDeallocatorFunction deallocator; | |
| 571 | |
| 572 /** | |
| 573 * An opaque object that's passed in as the first argument to all callbacks | |
| 574 * used by this library. Default: NULL. | |
| 575 */ | |
| 576 void* userdata; | |
| 577 | |
| 578 /** | |
| 579 * The tab-stop size, for computing positions in source code that uses tabs. | |
| 580 * Default: 8. | |
| 581 */ | |
| 582 int tab_stop; | |
| 583 | |
| 584 /** | |
| 585 * Whether or not to stop parsing when the first error is encountered. | |
| 586 * Default: false. | |
| 587 */ | |
| 588 bool stop_on_first_error; | |
| 589 | |
| 590 /** | |
| 591 * The maximum number of errors before the parser stops recording them. This | |
| 592 * is provided so that if the page is totally borked, we don't completely fill | |
| 593 * up the errors vector and exhaust memory with useless redundant errors. Set | |
| 594 * to -1 to disable the limit. | |
| 595 * Default: -1 | |
| 596 */ | |
| 597 int max_errors; | |
| 598 | |
| 599 /** | |
| 600 * The fragment context for parsing: | |
| 601 * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments | |
| 602 * | |
| 603 * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e. | |
| 604 * the regular parsing algorithm. Otherwise, pass the tag enum for the | |
| 605 * intended parent of the parsed fragment. We use just the tag enum rather | |
| 606 * than a full node because that's enough to set all the parsing context we | |
| 607 * need, and it provides some additional flexibility for client code to act as | |
| 608 * if parsing a fragment even when a full HTML tree isn't available. | |
| 609 * | |
| 610 * Default: GUMBO_TAG_LAST | |
| 611 */ | |
| 612 GumboTag fragment_context; | |
| 613 | |
| 614 /** | |
| 615 * The namespace for the fragment context. This lets client code | |
| 616 * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in | |
| 617 * HTML. | |
| 618 * Default: GUMBO_NAMESPACE_HTML | |
| 619 */ | |
| 620 GumboNamespaceEnum fragment_namespace; | |
| 621 } GumboOptions; | |
| 622 | |
| 623 /** Default options struct; use this with gumbo_parse_with_options. */ | |
| 624 extern const GumboOptions kGumboDefaultOptions; | |
| 625 | |
| 626 /** The output struct containing the results of the parse. */ | |
| 627 typedef struct GumboInternalOutput { | |
| 628 /** | |
| 629 * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT | |
| 630 * that contains the entire document as its child. | |
| 631 */ | |
| 632 GumboNode* document; | |
| 633 | |
| 634 /** | |
| 635 * Pointer to the root node. This is the <html> tag that forms the root of the | |
| 636 * document. | |
| 637 */ | |
| 638 GumboNode* root; | |
| 639 | |
| 640 /** | |
| 641 * A list of errors that occurred during the parse. | |
| 642 * NOTE: In version 1.0 of this library, the API for errors hasn't been fully | |
| 643 * fleshed out and may change in the future. For this reason, the GumboError | |
| 644 * header isn't part of the public API. Contact us if you need errors | |
| 645 * reported so we can work out something appropriate for your use-case. | |
| 646 */ | |
| 647 GumboVector /* GumboError */ errors; | |
| 648 } GumboOutput; | |
| 649 | |
| 650 /** | |
| 651 * Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must | |
| 652 * live at least as long as the parse tree, as some fields (eg. original_text) | |
| 653 * point directly into the original buffer. | |
| 654 * | |
| 655 * This doesn't support buffers longer than 4 gigabytes. | |
| 656 */ | |
| 657 GumboOutput* gumbo_parse(const char* buffer); | |
| 658 | |
| 659 /** | |
| 660 * Extended version of gumbo_parse that takes an explicit options structure, | |
| 661 * buffer, and length. | |
| 662 */ | |
| 663 GumboOutput* gumbo_parse_with_options( | |
| 664 const GumboOptions* options, const char* buffer, size_t buffer_length); | |
| 665 | |
| 666 /** Release the memory used for the parse tree & parse errors. */ | |
| 667 void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output); | |
| 668 | |
| 669 #ifdef __cplusplus | |
| 670 } | |
| 671 #endif | |
| 672 | |
| 673 #endif // GUMBO_GUMBO_H_ |
