Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/brotli/c/enc/utf8_util.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /* Copyright 2013 Google Inc. All Rights Reserved. | |
| 2 | |
| 3 Distributed under MIT license. | |
| 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT | |
| 5 */ | |
| 6 | |
| 7 /* Heuristics for deciding about the UTF8-ness of strings. */ | |
| 8 | |
| 9 #include "utf8_util.h" | |
| 10 | |
| 11 #include <brotli/types.h> | |
| 12 | |
| 13 #if defined(__cplusplus) || defined(c_plusplus) | |
| 14 extern "C" { | |
| 15 #endif | |
| 16 | |
| 17 static size_t BrotliParseAsUTF8( | |
| 18 int* symbol, const uint8_t* input, size_t size) { | |
| 19 /* ASCII */ | |
| 20 if ((input[0] & 0x80) == 0) { | |
| 21 *symbol = input[0]; | |
| 22 if (*symbol > 0) { | |
| 23 return 1; | |
| 24 } | |
| 25 } | |
| 26 /* 2-byte UTF8 */ | |
| 27 if (size > 1u && | |
| 28 (input[0] & 0xE0) == 0xC0 && | |
| 29 (input[1] & 0xC0) == 0x80) { | |
| 30 *symbol = (((input[0] & 0x1F) << 6) | | |
| 31 (input[1] & 0x3F)); | |
| 32 if (*symbol > 0x7F) { | |
| 33 return 2; | |
| 34 } | |
| 35 } | |
| 36 /* 3-byte UTF8 */ | |
| 37 if (size > 2u && | |
| 38 (input[0] & 0xF0) == 0xE0 && | |
| 39 (input[1] & 0xC0) == 0x80 && | |
| 40 (input[2] & 0xC0) == 0x80) { | |
| 41 *symbol = (((input[0] & 0x0F) << 12) | | |
| 42 ((input[1] & 0x3F) << 6) | | |
| 43 (input[2] & 0x3F)); | |
| 44 if (*symbol > 0x7FF) { | |
| 45 return 3; | |
| 46 } | |
| 47 } | |
| 48 /* 4-byte UTF8 */ | |
| 49 if (size > 3u && | |
| 50 (input[0] & 0xF8) == 0xF0 && | |
| 51 (input[1] & 0xC0) == 0x80 && | |
| 52 (input[2] & 0xC0) == 0x80 && | |
| 53 (input[3] & 0xC0) == 0x80) { | |
| 54 *symbol = (((input[0] & 0x07) << 18) | | |
| 55 ((input[1] & 0x3F) << 12) | | |
| 56 ((input[2] & 0x3F) << 6) | | |
| 57 (input[3] & 0x3F)); | |
| 58 if (*symbol > 0xFFFF && *symbol <= 0x10FFFF) { | |
| 59 return 4; | |
| 60 } | |
| 61 } | |
| 62 /* Not UTF8, emit a special symbol above the UTF8-code space */ | |
| 63 *symbol = 0x110000 | input[0]; | |
| 64 return 1; | |
| 65 } | |
| 66 | |
| 67 /* Returns 1 if more than min_fraction of the data is UTF8-encoded. */ | |
| 68 BROTLI_BOOL BrotliIsMostlyUTF8( | |
| 69 const uint8_t* data, const size_t pos, const size_t mask, | |
| 70 const size_t length, const double min_fraction) { | |
| 71 size_t size_utf8 = 0; | |
| 72 size_t i = 0; | |
| 73 while (i < length) { | |
| 74 int symbol; | |
| 75 size_t bytes_read = | |
| 76 BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i); | |
| 77 i += bytes_read; | |
| 78 if (symbol < 0x110000) size_utf8 += bytes_read; | |
| 79 } | |
| 80 return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length); | |
| 81 } | |
| 82 | |
| 83 #if defined(__cplusplus) || defined(c_plusplus) | |
| 84 } /* extern "C" */ | |
| 85 #endif |
