comparison mupdf-source/thirdparty/gumbo-parser/src/error.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16
17 #include "error.h"
18
19 #include <assert.h>
20 #include <stdarg.h>
21 #include <stdio.h>
22 #include <string.h>
23
24 #include "gumbo.h"
25 #include "parser.h"
26 #include "string_buffer.h"
27 #include "util.h"
28 #include "vector.h"
29
30 // Prints a formatted message to a StringBuffer. This automatically resizes the
31 // StringBuffer as necessary to fit the message. Returns the number of bytes
32 // written.
33 static int print_message(
34 GumboParser* parser, GumboStringBuffer* output, const char* format, ...) {
35 va_list args;
36 size_t remaining_capacity = output->capacity - output->length;
37 va_start(args, format);
38 int bytes_written = vsnprintf(
39 output->data + output->length, remaining_capacity, format, args);
40 va_end(args);
41 #ifdef _MSC_VER
42 if (bytes_written == -1) {
43 // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
44 // returning the number of bytes that would've been written had there been
45 // enough. In this case, we'll double the buffer size and hope it fits when
46 // we retry (letting it fail and returning 0 if it doesn't), since there's
47 // no way to smartly resize the buffer.
48 gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
49 va_start(args, format);
50 int result = vsnprintf(
51 output->data + output->length, remaining_capacity, format, args);
52 va_end(args);
53 return result == -1 ? 0 : result;
54 }
55 #else
56 // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
57 if (bytes_written == -1) {
58 return 0;
59 }
60 #endif
61
62 if (bytes_written > remaining_capacity) {
63 gumbo_string_buffer_reserve(
64 parser, output->capacity + bytes_written, output);
65 remaining_capacity = output->capacity - output->length;
66 va_start(args, format);
67 bytes_written = vsnprintf(
68 output->data + output->length, remaining_capacity, format, args);
69 va_end(args);
70 }
71 output->length += bytes_written;
72 return bytes_written;
73 }
74
75 static void print_tag_stack(GumboParser* parser, const GumboParserError* error,
76 GumboStringBuffer* output) {
77 print_message(parser, output, " Currently open tags: ");
78 for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
79 if (i) {
80 print_message(parser, output, ", ");
81 }
82 GumboTag tag = (GumboTag) (intptr_t) error->tag_stack.data[i];
83 print_message(parser, output, gumbo_normalized_tagname(tag));
84 }
85 gumbo_string_buffer_append_codepoint(parser, '.', output);
86 }
87
88 static void handle_parser_error(GumboParser* parser,
89 const GumboParserError* error, GumboStringBuffer* output) {
90 if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
91 error->input_type != GUMBO_TOKEN_DOCTYPE) {
92 print_message(
93 parser, output, "The doctype must be the first token in the document");
94 return;
95 }
96
97 switch (error->input_type) {
98 case GUMBO_TOKEN_DOCTYPE:
99 print_message(parser, output, "This is not a legal doctype");
100 return;
101 case GUMBO_TOKEN_COMMENT:
102 // Should never happen; comments are always legal.
103 assert(0);
104 // But just in case...
105 print_message(parser, output, "Comments aren't legal here");
106 return;
107 case GUMBO_TOKEN_CDATA:
108 case GUMBO_TOKEN_WHITESPACE:
109 case GUMBO_TOKEN_CHARACTER:
110 print_message(parser, output, "Character tokens aren't legal here");
111 return;
112 case GUMBO_TOKEN_NULL:
113 print_message(parser, output, "Null bytes are not allowed in HTML5");
114 return;
115 case GUMBO_TOKEN_EOF:
116 if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
117 print_message(parser, output, "You must provide a doctype");
118 } else {
119 print_message(parser, output, "Premature end of file");
120 print_tag_stack(parser, error, output);
121 }
122 return;
123 case GUMBO_TOKEN_START_TAG:
124 case GUMBO_TOKEN_END_TAG:
125 print_message(parser, output, "That tag isn't allowed here");
126 print_tag_stack(parser, error, output);
127 // TODO(jdtang): Give more specific messaging.
128 return;
129 }
130 }
131
// Finds the start of the line that contains error_location within the
// original source buffer.
//
//   original_text:  start of the source buffer.
//   error_location: position of the error; must not precede original_text.
//
// Returns a pointer to the character after the newline preceding the error's
// line, or original_text if the error is on the first line. The result is
// never greater than error_location, so it can safely be subtracted from the
// end of the line.
static const char* find_last_newline(
    const char* original_text, const char* error_location) {
  assert(error_location >= original_text);
  const char* c = error_location;
  // A newline at the error location terminates the error's own line; step
  // over it so the backwards scan looks for the newline *before* that line
  // instead of stopping immediately.
  if (c != original_text && *c == '\n') {
    --c;
  }
  for (; c != original_text && *c != '\n'; --c) {
    // There may be an error at EOF, which would be a nul byte.
    assert(*c || c == error_location);
  }
  // c is now either the newline preceding the line or the start of the text.
  // The start of the text can itself be a newline that precedes the line,
  // unless it is the error location (then the first line is the empty one).
  if (*c == '\n' && c != error_location) {
    return c + 1;
  }
  return c;
}
145
// Locates the end of the line containing error_location: scans forward from
// error_location and returns a pointer to the first newline found, or to the
// terminating null byte when no newline follows. original_text is accepted
// for symmetry with find_last_newline but is not consulted.
static const char* find_next_newline(
    const char* original_text, const char* error_location) {
  (void) original_text;
  // strcspn counts the characters before the first '\n' or the terminator.
  const size_t line_tail = strcspn(error_location, "\n");
  return error_location + line_tail;
}
156
157 GumboError* gumbo_add_error(GumboParser* parser) {
158 int max_errors = parser->_options->max_errors;
159 if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
160 return NULL;
161 }
162 GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
163 gumbo_vector_add(parser, error, &parser->_output->errors);
164 return error;
165 }
166
167 void gumbo_error_to_string(
168 GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
169 print_message(
170 parser, output, "@%d:%d: ", error->position.line, error->position.column);
171 switch (error->type) {
172 case GUMBO_ERR_UTF8_INVALID:
173 print_message(
174 parser, output, "Invalid UTF8 character 0x%x", error->v.codepoint);
175 break;
176 case GUMBO_ERR_UTF8_TRUNCATED:
177 print_message(parser, output,
178 "Input stream ends with a truncated UTF8 character 0x%x",
179 error->v.codepoint);
180 break;
181 case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
182 print_message(
183 parser, output, "No digits after &# in numeric character reference");
184 break;
185 case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
186 print_message(parser, output,
187 "The numeric character reference &#%d should be followed "
188 "by a semicolon",
189 error->v.codepoint);
190 break;
191 case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
192 print_message(parser, output,
193 "The numeric character reference &#%d; encodes an invalid "
194 "unicode codepoint",
195 error->v.codepoint);
196 break;
197 case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
198 // The textual data came from one of the literal strings in the table, and
199 // so it'll be null-terminated.
200 print_message(parser, output,
201 "The named character reference &%.*s should be followed by a "
202 "semicolon",
203 (int) error->v.text.length, error->v.text.data);
204 break;
205 case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
206 print_message(parser, output,
207 "The named character reference &%.*s; is not a valid entity name",
208 (int) error->v.text.length, error->v.text.data);
209 break;
210 case GUMBO_ERR_DUPLICATE_ATTR:
211 print_message(parser, output,
212 "Attribute %s occurs multiple times, at positions %d and %d",
213 error->v.duplicate_attr.name, error->v.duplicate_attr.original_index,
214 error->v.duplicate_attr.new_index);
215 break;
216 case GUMBO_ERR_PARSER:
217 case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
218 handle_parser_error(parser, &error->v.parser, output);
219 break;
220 default:
221 print_message(parser, output,
222 "Tokenizer error with an unimplemented error message");
223 break;
224 }
225 gumbo_string_buffer_append_codepoint(parser, '.', output);
226 }
227
228 void gumbo_caret_diagnostic_to_string(GumboParser* parser,
229 const GumboError* error, const char* source_text,
230 GumboStringBuffer* output) {
231 gumbo_error_to_string(parser, error, output);
232
233 const char* line_start = find_last_newline(source_text, error->original_text);
234 const char* line_end = find_next_newline(source_text, error->original_text);
235 GumboStringPiece original_line;
236 original_line.data = line_start;
237 original_line.length = line_end - line_start;
238
239 gumbo_string_buffer_append_codepoint(parser, '\n', output);
240 gumbo_string_buffer_append_string(parser, &original_line, output);
241 gumbo_string_buffer_append_codepoint(parser, '\n', output);
242 gumbo_string_buffer_reserve(
243 parser, output->length + error->position.column, output);
244 int num_spaces = error->position.column - 1;
245 memset(output->data + output->length, ' ', num_spaces);
246 output->length += num_spaces;
247 gumbo_string_buffer_append_codepoint(parser, '^', output);
248 gumbo_string_buffer_append_codepoint(parser, '\n', output);
249 }
250
251 void gumbo_print_caret_diagnostic(
252 GumboParser* parser, const GumboError* error, const char* source_text) {
253 GumboStringBuffer text;
254 gumbo_string_buffer_init(parser, &text);
255 gumbo_caret_diagnostic_to_string(parser, error, source_text, &text);
256 printf("%.*s", (int) text.length, text.data);
257 gumbo_string_buffer_destroy(parser, &text);
258 }
259
260 void gumbo_error_destroy(GumboParser* parser, GumboError* error) {
261 if (error->type == GUMBO_ERR_PARSER ||
262 error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) {
263 gumbo_vector_destroy(parser, &error->v.parser.tag_stack);
264 } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
265 gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name);
266 }
267 gumbo_parser_deallocate(parser, error);
268 }
269
270 void gumbo_init_errors(GumboParser* parser) {
271 gumbo_vector_init(parser, 5, &parser->_output->errors);
272 }
273
274 void gumbo_destroy_errors(GumboParser* parser) {
275 for (unsigned int i = 0; i < parser->_output->errors.length; ++i) {
276 gumbo_error_destroy(parser, parser->_output->errors.data[i]);
277 }
278 gumbo_vector_destroy(parser, &parser->_output->errors);
279 }