diff mupdf-source/thirdparty/brotli/c/enc/utf8_util.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/brotli/c/enc/utf8_util.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,85 @@
+/* Copyright 2013 Google Inc. All Rights Reserved.
+
+   Distributed under MIT license.
+   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
+*/
+
+/* Heuristics for deciding about the UTF8-ness of strings. */
+
+#include "utf8_util.h"
+
+#include <brotli/types.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+/* Tries to decode one UTF8 sequence starting at input[0].
+
+   symbol: out; receives the decoded code point, or 0x110000 | input[0] when
+           no accepted sequence starts here.
+   input:  bytes to decode; input[0] is read unconditionally, so the caller
+           must supply at least one readable byte.
+   size:   number of bytes available at input; bounds the 2-, 3- and 4-byte
+           attempts.
+
+   Returns the number of bytes consumed: 1..4 for a decoded sequence, 1 for
+   the fallback. Overlong encodings fail the range check of their branch and
+   end up in the fallback. */
+static size_t BrotliParseAsUTF8(
+    int* symbol, const uint8_t* input, size_t size) {
+  /* ASCII. A zero byte sets *symbol to 0 but is not accepted; it matches no
+     multi-byte lead pattern below and is reported through the fallback. */
+  if ((input[0] & 0x80) == 0) {
+    *symbol = input[0];
+    if (*symbol > 0) {
+      return 1;
+    }
+  }
+  /* 2-byte UTF8: 110xxxxx 10xxxxxx. Values up to 0x7F are overlong and are
+     not accepted. */
+  if (size > 1u &&
+      (input[0] & 0xE0) == 0xC0 &&
+      (input[1] & 0xC0) == 0x80) {
+    *symbol = (((input[0] & 0x1F) << 6) |
+               (input[1] & 0x3F));
+    if (*symbol > 0x7F) {
+      return 2;
+    }
+  }
+  /* 3-byte UTF8: 1110xxxx 10xxxxxx 10xxxxxx. Values up to 0x7FF are overlong
+     and are not accepted. The surrogate range 0xD800..0xDFFF is not
+     excluded by this check. */
+  if (size > 2u &&
+      (input[0] & 0xF0) == 0xE0 &&
+      (input[1] & 0xC0) == 0x80 &&
+      (input[2] & 0xC0) == 0x80) {
+    *symbol = (((input[0] & 0x0F) << 12) |
+               ((input[1] & 0x3F) << 6) |
+               (input[2] & 0x3F));
+    if (*symbol > 0x7FF) {
+      return 3;
+    }
+  }
+  /* 4-byte UTF8: 11110xxx followed by three 10xxxxxx bytes. Accepted only
+     for values in 0x10000..0x10FFFF. */
+  if (size > 3u &&
+      (input[0] & 0xF8) == 0xF0 &&
+      (input[1] & 0xC0) == 0x80 &&
+      (input[2] & 0xC0) == 0x80 &&
+      (input[3] & 0xC0) == 0x80) {
+    *symbol = (((input[0] & 0x07) << 18) |
+               ((input[1] & 0x3F) << 12) |
+               ((input[2] & 0x3F) << 6) |
+               (input[3] & 0x3F));
+    if (*symbol > 0xFFFF && *symbol <= 0x10FFFF) {
+      return 4;
+    }
+  }
+  /* Not UTF8, emit a special symbol above the UTF8-code space. The low byte
+     of the symbol carries input[0]; exactly one byte is consumed. */
+  *symbol = 0x110000 | input[0];
+  return 1;
+}
+
+/* Returns true (as converted by TO_BROTLI_BOOL) if more than min_fraction of
+   the data is UTF8-encoded.
+
+   Parsing covers length bytes; the start of each sequence is addressed as
+   data[(pos + i) & mask]. A byte counts as UTF8 when it belongs to a
+   sequence that BrotliParseAsUTF8 decodes to a symbol below 0x110000. The
+   comparison is strict: the UTF8 byte count must exceed
+   min_fraction * length.
+
+   NOTE(review): only the first byte of each sequence is addressed through
+   mask; BrotliParseAsUTF8 reads any following bytes contiguously from that
+   pointer. Presumably the caller's buffer stays readable a few bytes past
+   index mask -- confirm against callers. */
+BROTLI_BOOL BrotliIsMostlyUTF8(
+    const uint8_t* data, const size_t pos, const size_t mask,
+    const size_t length, const double min_fraction) {
+  size_t size_utf8 = 0;
+  size_t i = 0;
+  while (i < length) {
+    int symbol;
+    size_t bytes_read =
+        BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
+    i += bytes_read;  /* the parser returns at least 1, so i always advances */
+    if (symbol < 0x110000) size_utf8 += bytes_read;  /* skip fallback symbols */
+  }
+  return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length);
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  /* extern "C" */
+#endif