Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/src/utf8.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // | |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 // you may not use this file except in compliance with the License. | |
| 5 // You may obtain a copy of the License at | |
| 6 // | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // | |
| 9 // Unless required by applicable law or agreed to in writing, software | |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 // See the License for the specific language governing permissions and | |
| 13 // limitations under the License. | |
| 14 // | |
| 15 // Author: jdtang@google.com (Jonathan Tang) | |
| 16 // | |
| 17 // This contains an implementation of a UTF8 iterator and decoder suitable for | |
| 18 // an HTML5 parser. This does a bit more than straight UTF-8 decoding. The | |
| 19 // HTML5 spec specifies that: | |
| 20 // 1. Decoding errors are parse errors. | |
| 21 // 2. Certain other codepoints (e.g. control characters) are parse errors. | |
| 22 // 3. Carriage returns and CR/LF groups are converted to line feeds. | |
| 23 // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling | |
| 24 // | |
| 25 // Also, we want to keep track of source positions for error handling. As a | |
| 26 // result, we fold all that functionality into this decoder, and can't use an | |
| 27 // off-the-shelf library. | |
| 28 // | |
| 29 // This header is internal-only, which is why we prefix functions with only | |
| 30 // utf8_ or utf8iterator_ instead of gumbo_utf8_. | |
| 31 | |
| 32 #ifndef GUMBO_UTF8_H_ | |
| 33 #define GUMBO_UTF8_H_ | |
| 34 | |
| 35 #include <stdbool.h> | |
| 36 #include <stddef.h> | |
| 37 | |
| 38 #include "gumbo.h" | |
| 39 | |
| 40 #ifdef __cplusplus | |
| 41 extern "C" { | |
| 42 #endif | |
| 43 | |
| 44 struct GumboInternalError; | |
| 45 struct GumboInternalParser; | |
| 46 | |
| 47 // Unicode replacement char. | |
| 48 extern const int kUtf8ReplacementChar; | |
| 49 | |
| 50 typedef struct GumboInternalUtf8Iterator { | |
| 51 // Points at the start of the code point most recently read into '_current'. | |
| 52 const char* _start; | |
| 53 | |
| 54 // Points at the mark. The mark is initially set to the beginning of the | |
| 55 // input. | |
| 56 const char* _mark; | |
| 57 | |
| 58 // Points past the end of the iter, like a past-the-end iterator in the STL. | |
| 59 const char* _end; | |
| 60 | |
| 61 // The code point under the cursor. | |
| 62 int _current; | |
| 63 | |
| 64 // The width in bytes of the code point held in '_current'. | |
| 65 int _width; | |
| 66 | |
| 67 // The GumboSourcePosition for the current location. | |
| 68 GumboSourcePosition _pos; | |
| 69 | |
| 70 // The GumboSourcePosition for the mark. | |
| 71 GumboSourcePosition _mark_pos; | |
| 72 | |
| 73 // Pointer back to the GumboParser instance, for configuration options and | |
| 74 // error recording. | |
| 75 struct GumboInternalParser* _parser; | |
| 76 } Utf8Iterator; | |
| 77 | |
| 78 // Returns true if this Unicode code point is in the list of characters | |
| 79 // forbidden by the HTML5 spec, such as NUL bytes and undefined control chars. | |
| 80 bool utf8_is_invalid_code_point(int c); | |
| 81 | |
| 82 // Initializes a new Utf8Iterator from the given byte buffer. The source does | |
| 83 // not have to be NUL-terminated, but the length must be passed in explicitly. | |
| 84 void utf8iterator_init(struct GumboInternalParser* parser, const char* source, | |
| 85 size_t source_length, Utf8Iterator* iter); | |
| 86 | |
| 87 // Advances the current position by one code point. | |
| 88 void utf8iterator_next(Utf8Iterator* iter); | |
| 89 | |
| 90 // Returns the current code point as an integer. | |
| 91 int utf8iterator_current(const Utf8Iterator* iter); | |
| 92 | |
| 93 // Retrieves and fills the output parameter with the current source position. | |
| 94 void utf8iterator_get_position( | |
| 95 const Utf8Iterator* iter, GumboSourcePosition* output); | |
| 96 | |
| 97 // Retrieves a character pointer to the start of the current character. | |
| 98 const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter); | |
| 99 | |
| 100 // Retrieves a character pointer to 1 past the end of the buffer. This is | |
| 101 // necessary for certain state machines and string comparisons that would like | |
| 102 // to look directly for ASCII text in the buffer without going through the | |
| 103 // decoder. | |
| 104 const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter); | |
| 105 | |
| 106 // If the upcoming text in the buffer matches the specified prefix (which has | |
| 107 // length 'length'), consume it and return true. Otherwise, return false with | |
| 108 // no other effects. If the length of the string would overflow the buffer, | |
| 109 // this returns false. Note that prefix should not contain null bytes because | |
| 110 // of the use of strncmp/strncasecmp internally. All existing use-cases adhere | |
| 111 // to this. | |
| 112 bool utf8iterator_maybe_consume_match( | |
| 113 Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive); | |
| 114 | |
| 115 // "Marks" a particular location of interest in the input stream, so that the | |
| 116 // iterator can later be moved back to it with utf8iterator_reset(). An error | |
| 117 // can also be recorded at the marked point, which is often more useful than | |
| 118 // the last character before the error was detected. | |
| 119 void utf8iterator_mark(Utf8Iterator* iter); | |
| 120 | |
| 121 // Moves the current input stream position back to the mark. | |
| 122 void utf8iterator_reset(Utf8Iterator* iter); | |
| 123 | |
| 124 // Sets the position and original text fields of an error to the value at the | |
| 125 // mark. | |
| 126 void utf8iterator_fill_error_at_mark( | |
| 127 Utf8Iterator* iter, struct GumboInternalError* error); | |
| 128 | |
| 129 #ifdef __cplusplus | |
| 130 } | |
| 131 #endif | |
| 132 #endif // GUMBO_UTF8_H_ |
