view mupdf-source/thirdparty/gumbo-parser/src/error.h @ 20:eb3dd22fef2c

FIX: the new "sdist" build target is PHONY also
author Franz Glasner <fzglas.hg@dom66.de>
date Thu, 18 Sep 2025 22:04:13 +0200
parents b50eed0cc0ef
children
line wrap: on
line source

// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
//
// Error types, enums, and handling functions.

#ifndef GUMBO_ERROR_H_
#define GUMBO_ERROR_H_
#ifdef _MSC_VER
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#endif
#include <stdint.h>

#include "gumbo.h"
#include "insertion_mode.h"
#include "string_buffer.h"
#include "token_type.h"

#ifdef __cplusplus
extern "C" {
#endif

// Forward declaration only: every use of the parser in this header is through
// a pointer, so the complete definition is not needed here.
struct GumboInternalParser;

// Identifies the kind of error recorded in a GumboError; this is the value
// stored in GumboError.type.  The type also indicates which member of the
// GumboError 'v' union carries the type-specific data (see the per-member
// comments on that union).
//
// NOTE(review): the enumerators have no explicit values, so they are numbered
// from 0 in declaration order.  Confirm that nothing depends on the numeric
// values before inserting or reordering entries.
typedef enum {
  GUMBO_ERR_UTF8_INVALID,
  GUMBO_ERR_UTF8_TRUNCATED,
  GUMBO_ERR_UTF8_NULL,
  GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
  GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
  GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
  GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
  GUMBO_ERR_NAMED_CHAR_REF_INVALID,
  GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
  GUMBO_ERR_TAG_EOF,
  GUMBO_ERR_TAG_INVALID,
  GUMBO_ERR_CLOSE_TAG_EMPTY,
  GUMBO_ERR_CLOSE_TAG_EOF,
  GUMBO_ERR_CLOSE_TAG_INVALID,
  GUMBO_ERR_SCRIPT_EOF,
  GUMBO_ERR_ATTR_NAME_EOF,
  GUMBO_ERR_ATTR_NAME_INVALID,
  GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
  GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
  GUMBO_ERR_ATTR_UNQUOTED_EOF,
  GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
  GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
  GUMBO_ERR_ATTR_AFTER_EOF,
  GUMBO_ERR_ATTR_AFTER_INVALID,
  GUMBO_ERR_DUPLICATE_ATTR,
  GUMBO_ERR_SOLIDUS_EOF,
  GUMBO_ERR_SOLIDUS_INVALID,
  GUMBO_ERR_DASHES_OR_DOCTYPE,
  GUMBO_ERR_COMMENT_EOF,
  GUMBO_ERR_COMMENT_INVALID,
  GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_END_BANG_EOF,
  GUMBO_ERR_DOCTYPE_EOF,
  GUMBO_ERR_DOCTYPE_INVALID,
  GUMBO_ERR_DOCTYPE_SPACE,
  GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
  GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
  GUMBO_ERR_DOCTYPE_END,
  GUMBO_ERR_PARSER,
  GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
} GumboErrorType;

// Additional data for duplicated attributes.  This is the payload held in the
// 'duplicate_attr' member of the GumboError union, used for
// GUMBO_ERR_DUPLICATE_ATTR.
typedef struct GumboInternalDuplicateAttrError {
  // The name of the attribute.  Owned by this struct.
  // NOTE(review): presumably released when the enclosing error is destroyed
  // (gumbo_error_destroy) - confirm against the implementation.
  const char* name;

  // The (0-based) index within the attributes vector of the original
  // occurrence.
  unsigned int original_index;

  // The (0-based) index where the new occurrence would be.
  unsigned int new_index;
} GumboDuplicateAttrError;

// A simplified representation of the tokenizer state, designed to be more
// useful to clients of this library than the internal representation.  This
// condenses the actual states used in the tokenizer state machine into a few
// values that will be familiar to users of HTML.
//
// Values of this type are recorded in GumboTokenizerError.state.
typedef enum {
  GUMBO_ERR_TOKENIZER_DATA,
  GUMBO_ERR_TOKENIZER_CHAR_REF,
  GUMBO_ERR_TOKENIZER_RCDATA,
  GUMBO_ERR_TOKENIZER_RAWTEXT,
  GUMBO_ERR_TOKENIZER_PLAINTEXT,
  GUMBO_ERR_TOKENIZER_SCRIPT,
  GUMBO_ERR_TOKENIZER_TAG,
  GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
  GUMBO_ERR_TOKENIZER_ATTR_NAME,
  GUMBO_ERR_TOKENIZER_ATTR_VALUE,
  GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
  GUMBO_ERR_TOKENIZER_COMMENT,
  GUMBO_ERR_TOKENIZER_DOCTYPE,
  GUMBO_ERR_TOKENIZER_CDATA,
} GumboTokenizerErrorState;

// Additional data for tokenizer errors.
// This records the current state and codepoint encountered - this is usually
// enough to reconstruct what went wrong and provide a friendly error message.
// It is the payload held in the 'tokenizer' member of the GumboError union.
typedef struct GumboInternalTokenizerError {
  // The bad codepoint encountered.
  // NOTE(review): this is a plain int, whereas the standalone 'codepoint'
  // member of the GumboError union is a uint64_t; the two are distinct union
  // members and must not be read interchangeably.
  int codepoint;

  // The state that the tokenizer was in at the time.
  GumboTokenizerErrorState state;
} GumboTokenizerError;

// Additional data for parse errors.  This is the payload held in the 'parser'
// member of the GumboError union, used for GUMBO_ERR_PARSER and
// GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG.
typedef struct GumboInternalParserError {
  // The type of input token that resulted in this error.
  GumboTokenType input_type;

  // The HTML tag of the input token.  TAG_UNKNOWN if this was not a tag token.
  GumboTag input_tag;

  // The insertion mode that the parser was in at the time.
  GumboInsertionMode parser_state;

  // The tag stack at the point of the error.  Note that this is a GumboVector
  // of GumboTags *stored by value* - cast the void* to a GumboTag directly to
  // get at the tag (do not dereference it).
  GumboVector /* GumboTag */ tag_stack;
} GumboParserError;

// The overall error struct representing an error in decoding/tokenizing/parsing
// the HTML.  This contains an enumerated type flag, a source position, and then
// a union of fields containing data specific to the error.
typedef struct GumboInternalError {
  // The type of error.  Use this to decide which member of 'v' to read.
  GumboErrorType type;

  // The position within the source file where the error occurred.
  GumboSourcePosition position;

  // A pointer to the byte within the original source file text where the error
  // occurred (note that this is not the same as position.offset, as that gives
  // character-based instead of byte-based offsets).
  const char* original_text;

  // Type-specific error information.  Only the member matching 'type' is
  // meaningful.
  union {
    // The code point we encountered, for:
    // * GUMBO_ERR_UTF8_INVALID
    // * GUMBO_ERR_UTF8_TRUNCATED
    // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
    // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
    uint64_t codepoint;

    // Tokenizer errors.
    GumboTokenizerError tokenizer;

    // Short textual data, for:
    // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
    // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
    GumboStringPiece text;

    // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
    GumboDuplicateAttrError duplicate_attr;

    // Parser state, for GUMBO_ERR_PARSER and
    // GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG.
    // (struct GumboInternalParserError is the same type as GumboParserError.)
    struct GumboInternalParserError parser;
  } v;
} GumboError;

// Adds a new error to the parser's error list, and returns a pointer to it so
// that clients can fill out the rest of its fields.
//
// May return NULL if we're already over the max_errors field specified in
// GumboOptions, so callers need to check the result before writing through it.
GumboError* gumbo_add_error(struct GumboInternalParser* parser);

// Initializes the errors vector in the given parser.
//
// parser: the parser whose error list is to be set up.
void gumbo_init_errors(struct GumboInternalParser* parser);

// Frees all the errors in the 'errors_' field of the given parser.
//
// parser: the parser whose recorded errors are to be released.
void gumbo_destroy_errors(struct GumboInternalParser* parser);

// Frees the memory used for a single GumboError.
//
// NOTE(review): the parser argument presumably supplies the allocator used to
// release the error - confirm against the implementation.
void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);

// Prints an error to a string.  This fills an empty GumboStringBuffer with a
// freshly-allocated buffer containing the error message text.  The caller is
// responsible for deleting the buffer.  (Note that the buffer is allocated with
// the allocator specified in the GumboParser config and hence should be freed
// by gumbo_parser_deallocate().)
//
// parser: the parser whose configured allocator is used for the buffer.
// error:  the error to describe; not modified.
// output: an empty string buffer that receives the message text.
void gumbo_error_to_string(struct GumboInternalParser* parser,
    const GumboError* error, GumboStringBuffer* output);

// Prints a caret diagnostic to a string.  This fills an empty GumboStringBuffer
// with a freshly-allocated buffer containing the error message text.  The
// caller is responsible for deleting the buffer.  (Note that the buffer is
// allocated with the allocator specified in the GumboParser config and hence
// should be freed by gumbo_parser_deallocate().)
//
// parser:      the parser whose configured allocator is used for the buffer.
// error:       the error to describe; not modified.
// source_text: the source text the diagnostic refers to; not modified.
//              NOTE(review): presumably the same text that was parsed - confirm.
// output:      an empty string buffer that receives the diagnostic text.
void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
    const GumboError* error, const char* source_text,
    GumboStringBuffer* output);

// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
// of writing to a string, so there is no output buffer for the caller to free.
void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
    const GumboError* error, const char* source_text);

#ifdef __cplusplus
}
#endif

#endif  // GUMBO_ERROR_H_