Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/src/utf8.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // | |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 // you may not use this file except in compliance with the License. | |
| 5 // You may obtain a copy of the License at | |
| 6 // | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // | |
| 9 // Unless required by applicable law or agreed to in writing, software | |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 // See the License for the specific language governing permissions and | |
| 13 // limitations under the License. | |
| 14 // | |
| 15 // Author: jdtang@google.com (Jonathan Tang) | |
| 16 | |
| 17 #include "utf8.h" | |
| 18 | |
| 19 #include <assert.h> | |
| 20 #include <stdint.h> | |
| 21 #include <string.h> | |
| 22 #include <strings.h> // For strncasecmp. | |
| 23 | |
| 24 #include "error.h" | |
| 25 #include "gumbo.h" | |
| 26 #include "parser.h" | |
| 27 #include "util.h" | |
| 28 #include "vector.h" | |
| 29 | |
// U+FFFD: the code point this file stores in the iterator's current character
// in place of invalid, forbidden, or truncated input.
const int kUtf8ReplacementChar = 0xFFFD;
| 31 | |
| 32 // Reference material: | |
| 33 // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description | |
| 34 // RFC 3629: http://tools.ietf.org/html/rfc3629 | |
| 35 // HTML5 Unicode handling: | |
| 36 // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream | |
| 37 // | |
| 38 // This implementation is based on a DFA-based decoder by Bjoern Hoehrmann | |
| 39 // <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our | |
| 40 // own handling for newlines, tabs, invalid continuation bytes, and other | |
| 41 // conditions that the HTML5 spec fully specifies but normal UTF8 decoders do | |
| 42 // not handle. | |
| 43 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of | |
| 44 // the license agreement and code follows. | |
| 45 | |
| 46 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
| 47 | |
| 48 // Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 49 // of this software and associated documentation files (the "Software"), to deal | |
| 50 // in the Software without restriction, including without limitation the rights | |
| 51 // to | |
| 52 // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
| 53 // of the Software, and to permit persons to whom the Software is furnished to | |
| 54 // do | |
| 55 // so, subject to the following conditions: | |
| 56 | |
| 57 // The above copyright notice and this permission notice shall be included in | |
| 58 // all copies or substantial portions of the Software. | |
| 59 | |
// DFA states that callers test for. Any other state value means the decoder
// is in the middle of a multi-byte sequence and needs more input.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
    // The first part of the table (256 entries, indexed by byte value) maps
    // bytes to character classes, to reduce the size of the transition table
    // and create bitmasks.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00..0x0F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10..0x1F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20..0x2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40..0x4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50..0x5F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60..0x6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70..0x7F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80..0x8F
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,  // 0x90..0x9F
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  // 0xA0..0xAF
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  // 0xB0..0xBF
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0..0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0..0xDF
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,  // 0xE0..0xEF
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // 0xF0..0xFF

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    // Each row below is one state (12 classes per row), so state values are
    // multiples of 12 and can be added directly to the class number.
    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,   // state 0 (ACCEPT)
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  // state 12 (REJECT)
    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12,     // state 24
    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,  // state 36
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,  // state 48
    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,  // state 60
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,  // state 72
    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,  // state 84
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  // state 96
};

// Feeds one byte into the UTF-8 decoding automaton.
//
//   state - in/out DFA state; start a new character with UTF8_ACCEPT.
//   codep - in/out code point accumulator; holds the decoded code point once
//           the returned state is UTF8_ACCEPT.
//   byte  - the next input byte. It indexes the 256-entry class table, so it
//           must be in the range 0..255 (callers cast through unsigned char).
//
// Returns the new value of *state.
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  const uint32_t type = utf8d[byte];

  if (*state != UTF8_ACCEPT) {
    // Continuation byte: append its low six payload bits.
    *codep = (byte & 0x3fu) | (*codep << 6);
  } else {
    // Lead byte: the class number doubles as the shift that masks off the
    // length-marker bits.
    *codep = (0xffu >> type) & byte;
  }

  *state = utf8d[256 + *state + type];
  return *state;
}
| 97 | |
| 98 // END COPIED CODE. | |
| 99 | |
| 100 // Adds a decoding error to the parser's error list, based on the current state | |
| 101 // of the Utf8Iterator. | |
| 102 static void add_error(Utf8Iterator* iter, GumboErrorType type) { | |
| 103 GumboParser* parser = iter->_parser; | |
| 104 | |
| 105 GumboError* error = gumbo_add_error(parser); | |
| 106 if (!error) { | |
| 107 return; | |
| 108 } | |
| 109 error->type = type; | |
| 110 error->position = iter->_pos; | |
| 111 error->original_text = iter->_start; | |
| 112 | |
| 113 // At the point the error is recorded, the code point hasn't been computed | |
| 114 // yet (and can't be, because it's invalid), so we need to build up the raw | |
| 115 // hex value from the bytes under the cursor. | |
| 116 uint64_t code_point = 0; | |
| 117 for (int i = 0; i < iter->_width; ++i) { | |
| 118 code_point = (code_point << 8) | (unsigned char) iter->_start[i]; | |
| 119 } | |
| 120 error->v.codepoint = code_point; | |
| 121 } | |
| 122 | |
| 123 // Reads the next UTF-8 character in the iter. | |
| 124 // This assumes that iter->_start points to the beginning of the character. | |
| 125 // When this method returns, iter->_width and iter->_current will be set | |
| 126 // appropriately, as well as any error flags. | |
| 127 static void read_char(Utf8Iterator* iter) { | |
| 128 if (iter->_start >= iter->_end) { | |
| 129 // No input left to consume; emit an EOF and set width = 0. | |
| 130 iter->_current = -1; | |
| 131 iter->_width = 0; | |
| 132 return; | |
| 133 } | |
| 134 | |
| 135 uint32_t code_point = 0; | |
| 136 uint32_t state = UTF8_ACCEPT; | |
| 137 for (const char* c = iter->_start; c < iter->_end; ++c) { | |
| 138 decode(&state, &code_point, (uint32_t)(unsigned char) (*c)); | |
| 139 if (state == UTF8_ACCEPT) { | |
| 140 iter->_width = c - iter->_start + 1; | |
| 141 // This is the special handling for carriage returns that is mandated by | |
| 142 // the HTML5 spec. Since we're looking for particular 7-bit literal | |
| 143 // characters, we operate in terms of chars and only need a check for iter | |
| 144 // overrun, instead of having to read in a full next code point. | |
| 145 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream | |
| 146 if (code_point == '\r') { | |
| 147 assert(iter->_width == 1); | |
| 148 const char* next = c + 1; | |
| 149 if (next < iter->_end && *next == '\n') { | |
| 150 // Advance the iter, as if the carriage return didn't exist. | |
| 151 ++iter->_start; | |
| 152 // Preserve the true offset, since other tools that look at it may be | |
| 153 // unaware of HTML5's rules for converting \r into \n. | |
| 154 ++iter->_pos.offset; | |
| 155 } | |
| 156 code_point = '\n'; | |
| 157 } | |
| 158 if (utf8_is_invalid_code_point(code_point)) { | |
| 159 add_error(iter, GUMBO_ERR_UTF8_INVALID); | |
| 160 code_point = kUtf8ReplacementChar; | |
| 161 } | |
| 162 iter->_current = code_point; | |
| 163 return; | |
| 164 } else if (state == UTF8_REJECT) { | |
| 165 // We don't want to consume the invalid continuation byte of a multi-byte | |
| 166 // run, but we do want to skip past an invalid first byte. | |
| 167 iter->_width = c - iter->_start + (c == iter->_start); | |
| 168 iter->_current = kUtf8ReplacementChar; | |
| 169 add_error(iter, GUMBO_ERR_UTF8_INVALID); | |
| 170 return; | |
| 171 } | |
| 172 } | |
| 173 // If we got here without exiting early, then we've reached the end of the | |
| 174 // iterator. Add an error for truncated input, set the width to consume the | |
| 175 // rest of the iterator, and emit a replacement character. The next time we | |
| 176 // enter this method, it will detect that there's no input to consume and | |
| 177 // output an EOF. | |
| 178 iter->_current = kUtf8ReplacementChar; | |
| 179 iter->_width = iter->_end - iter->_start; | |
| 180 add_error(iter, GUMBO_ERR_UTF8_TRUNCATED); | |
| 181 } | |
| 182 | |
| 183 static void update_position(Utf8Iterator* iter) { | |
| 184 iter->_pos.offset += iter->_width; | |
| 185 if (iter->_current == '\n') { | |
| 186 ++iter->_pos.line; | |
| 187 iter->_pos.column = 1; | |
| 188 } else if (iter->_current == '\t') { | |
| 189 int tab_stop = iter->_parser->_options->tab_stop; | |
| 190 iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop; | |
| 191 } else if (iter->_current != -1) { | |
| 192 ++iter->_pos.column; | |
| 193 } | |
| 194 } | |
| 195 | |
// Reports whether code point `c` is one of the characters the HTML5 spec
// forbids in the input stream, such as undefined control characters.
bool utf8_is_invalid_code_point(int c) {
  // C0 controls: everything from U+0001 to U+001F is forbidden except tab,
  // line feed, form feed and carriage return.
  if (c >= 0x1 && c <= 0x1F) {
    return c != 0x9 && c != 0xA && c != 0xC && c != 0xD;
  }
  // DEL and the C1 controls.
  if (c >= 0x7F && c <= 0x9F) {
    return true;
  }
  // The U+FDD0..U+FDEF range.
  if (c >= 0xFDD0 && c <= 0xFDEF) {
    return true;
  }
  // Any value whose low 16 bits are 0xFFFE or 0xFFFF.
  const int low16 = c & 0xFFFF;
  return low16 == 0xFFFE || low16 == 0xFFFF;
}
| 203 | |
| 204 void utf8iterator_init(GumboParser* parser, const char* source, | |
| 205 size_t source_length, Utf8Iterator* iter) { | |
| 206 iter->_start = source; | |
| 207 iter->_end = source + source_length; | |
| 208 iter->_pos.line = 1; | |
| 209 iter->_pos.column = 1; | |
| 210 iter->_pos.offset = 0; | |
| 211 iter->_parser = parser; | |
| 212 read_char(iter); | |
| 213 } | |
| 214 | |
| 215 void utf8iterator_next(Utf8Iterator* iter) { | |
| 216 // We update positions based on the *last* character read, so that the first | |
| 217 // character following a newline is at column 1 in the next line. | |
| 218 update_position(iter); | |
| 219 iter->_start += iter->_width; | |
| 220 read_char(iter); | |
| 221 } | |
| 222 | |
| 223 int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; } | |
| 224 | |
| 225 void utf8iterator_get_position( | |
| 226 const Utf8Iterator* iter, GumboSourcePosition* output) { | |
| 227 *output = iter->_pos; | |
| 228 } | |
| 229 | |
| 230 const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) { | |
| 231 return iter->_start; | |
| 232 } | |
| 233 | |
| 234 const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) { | |
| 235 return iter->_end; | |
| 236 } | |
| 237 | |
| 238 bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix, | |
| 239 size_t length, bool case_sensitive) { | |
| 240 bool matched = (iter->_start + length <= iter->_end) && | |
| 241 (case_sensitive ? !strncmp(iter->_start, prefix, length) | |
| 242 : !strncasecmp(iter->_start, prefix, length)); | |
| 243 if (matched) { | |
| 244 for (unsigned int i = 0; i < length; ++i) { | |
| 245 utf8iterator_next(iter); | |
| 246 } | |
| 247 return true; | |
| 248 } else { | |
| 249 return false; | |
| 250 } | |
| 251 } | |
| 252 | |
| 253 void utf8iterator_mark(Utf8Iterator* iter) { | |
| 254 iter->_mark = iter->_start; | |
| 255 iter->_mark_pos = iter->_pos; | |
| 256 } | |
| 257 | |
| 258 // Returns the current input stream position to the mark. | |
| 259 void utf8iterator_reset(Utf8Iterator* iter) { | |
| 260 iter->_start = iter->_mark; | |
| 261 iter->_pos = iter->_mark_pos; | |
| 262 read_char(iter); | |
| 263 } | |
| 264 | |
| 265 // Sets the position and original text fields of an error to the value at the | |
| 266 // mark. | |
| 267 void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) { | |
| 268 error->position = iter->_mark_pos; | |
| 269 error->original_text = iter->_mark; | |
| 270 } |
