Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/zxing-cpp/core/src/Utf.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/zxing-cpp/core/src/Utf.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,271 @@
+/*
+* Copyright 2016 Nu-book Inc.
+* Copyright 2021 gitlost
+* Copyright 2022 Axel Waggershauser
+*/
+// SPDX-License-Identifier: Apache-2.0
+
+#include "Utf.h"
+
+#include "ZXTestSupport.h"
+#include "ZXAlgorithms.h"
+
+#include <iomanip>
+#include <cstdint>
+#include <sstream>
+
+namespace ZXing {
+
+// Code-unit type for UTF-8 data. C++20 has a built-in char8_t; for older
+// language standards it is aliased to uint8_t below, unless ZXING_HAS_CHAR8 is
+// defined (presumably set by the build when char8_t already exists -- confirm).
+#ifndef ZXING_HAS_CHAR8
+#if __cplusplus <= 201703L
+using char8_t = uint8_t;
+#endif
+#endif
+// Non-owning view over a sequence of UTF-8 code units.
+using utf8_t = std::basic_string_view<char8_t>;
+
+// State of the UTF-8 decoding automaton driven by Utf8Decode().
+using state_t = uint8_t;
+// State in which no multi-byte sequence is pending (also the initial state).
+constexpr state_t kAccepted = 0;
+// Error state: the transition-table row at offset 12 only ever leads back to 12.
+constexpr state_t kRejected [[maybe_unused]] = 12;
+
+// Feed one byte of UTF-8 input into the table-driven decoder.
+//
+// byte:  next input code unit.
+// state: automaton state carried between calls; start with kAccepted.
+// codep: code point under construction. When the call leaves `state` at
+//        kAccepted, `codep` holds a completely decoded code point.
+// Returns the updated value of `state`.
+inline char32_t Utf8Decode(char8_t byte, state_t& state, char32_t& codep)
+{
+	// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+	// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+	static constexpr const state_t kUtf8Data[] = {
+		/* The first part of the table maps bytes to character classes that
+		 * reduce the size of the transition table and create bitmasks. */
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+		/* The second part is a transition table that maps a combination
+		 * of a state of the automaton and a character class to a state. */
+		0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+		12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+		12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+		12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+		12,36,12,12,12,12,12,12,12,12,12,12,
+	};
+
+	// Character class of the incoming byte (first 256 table entries).
+	state_t type = kUtf8Data[byte];
+	// Inside a sequence: shift in the low 6 payload bits of the byte.
+	// At the start of a sequence: the class doubles as a shift count that
+	// masks the length-marker bits off the lead byte.
+	codep = (state != kAccepted) ? (byte & 0x3fu) | (codep << 6) : (0xff >> type) & (byte);
+	// The transition table starts at offset 256 and is indexed by state + class.
+	state = kUtf8Data[256 + state + type];
+	return state;
+}
+
+static_assert(sizeof(wchar_t) == 4 || sizeof(wchar_t) == 2, "wchar_t needs to be 2 or 4 bytes wide");
+
+// True if wchar_t is 16 bits wide and `str` begins with a high surrogate
+// (0xD800..0xDBFF) immediately followed by a low surrogate (0xDC00..0xDFFF).
+// Always false when wchar_t is 32 bits wide.
+inline bool IsUtf16SurrogatePair(std::wstring_view str)
+{
+	if (sizeof(wchar_t) != 2 || str.size() < 2)
+		return false;
+
+	const bool leadIsHigh = (str[0] & 0xfc00) == 0xd800;
+	const bool trailIsLow = (str[1] & 0xfc00) == 0xdc00;
+	return leadIsHigh && trailIsLow;
+}
+
+// Combine the surrogate pair str[0] (high) / str[1] (low) into one code point.
+// The caller must make sure `str` holds at least two code units.
+inline char32_t Utf32FromUtf16Surrogates(std::wstring_view str)
+{
+	// (0xD800 << 10) + 0xDC00 - 0x10000: removes both surrogate markers and
+	// adds the supplementary-plane offset in a single subtraction.
+	constexpr char32_t kSurrogateBias = 0x35fdc00;
+
+	const char32_t high = static_cast<char32_t>(str[0]);
+	const char32_t low = static_cast<char32_t>(str[1]);
+	return (high << 10) + low - kSurrogateBias;
+}
+
+// Count the code points in `utf8` by looking at lead bytes only. The input is
+// not validated: the length announced by a lead byte is skipped blindly, and a
+// run of stray continuation bytes is counted as one code point.
+static size_t Utf8CountCodePoints(utf8_t utf8)
+{
+	const size_t length = utf8.size();
+	size_t count = 0;
+	size_t pos = 0;
+
+	while (pos < length) {
+		const auto lead = utf8[pos];
+		if (lead < 128) {
+			pos += 1; // ASCII
+		} else if ((lead & 0xe0) == 0xc0) {
+			pos += 2; // 0xC0..0xDF
+		} else if ((lead & 0xf0) == 0xe0) {
+			pos += 3; // 0xE0..0xEF
+		} else if ((lead & 0xf0) == 0xf0) {
+			pos += 4; // 0xF0..0xFF
+		} else {
+			// 0x80..0xBF: we are in the middle of a sequence, skip the rest of it
+			do {
+				++pos;
+			} while (pos < length && (utf8[pos] & 0xc0) == 0x80);
+		}
+		++count;
+	}
+
+	return count;
+}
+
+// Decode `utf8` and append the resulting characters to `buffer`.
+//
+// When wchar_t is 16 bits wide, code points above U+FFFF are appended as a
+// UTF-16 surrogate pair. Malformed input is skipped: invalid bytes and
+// unfinished sequences produce no output, and decoding resumes with the next
+// byte that can start a character.
+static void AppendFromUtf8(utf8_t utf8, std::wstring& buffer)
+{
+	buffer.reserve(buffer.size() + Utf8CountCodePoints(utf8));
+
+	char32_t codePoint = 0;
+	state_t state = kAccepted;
+
+	for (auto b : utf8) {
+		const state_t previous = state;
+		if (Utf8Decode(b, state, codePoint) == kRejected) {
+			// The reject state is a sink: without a reset every following byte
+			// would be rejected too and the rest of the input silently dropped.
+			state = kAccepted;
+			// A byte that is invalid at the start of a character is dropped. A
+			// byte that broke off an unfinished multi-byte sequence may itself
+			// start a new character, so it is decoded again from the clean state.
+			if (previous == kAccepted || Utf8Decode(b, state, codePoint) == kRejected) {
+				state = kAccepted;
+				continue;
+			}
+		}
+
+		if (state != kAccepted)
+			continue; // still in the middle of a multi-byte sequence
+
+		if (sizeof(wchar_t) == 2 && codePoint > 0xffff) { // surrogate pair
+			buffer.push_back(narrow_cast<wchar_t>(0xd7c0 + (codePoint >> 10)));
+			buffer.push_back(narrow_cast<wchar_t>(0xdc00 + (codePoint & 0x3ff)));
+		} else {
+			buffer.push_back(narrow_cast<wchar_t>(codePoint));
+		}
+	}
+}
+
+// Convert a UTF-8 encoded byte string into a wide string.
+std::wstring FromUtf8(std::string_view utf8)
+{
+	// Reinterpret the char data as UTF-8 code units without copying.
+	const auto* codeUnits = reinterpret_cast<const char8_t*>(utf8.data());
+
+	std::wstring wide;
+	AppendFromUtf8(utf8_t(codeUnits, utf8.size()), wide);
+	return wide;
+}
+
+#if __cplusplus > 201703L
+// C++20 overload: convert a char8_t based UTF-8 string into a wide string.
+std::wstring FromUtf8(std::u8string_view utf8)
+{
+	std::wstring wide;
+	AppendFromUtf8(utf8, wide);
+	return wide;
+}
+#endif
+
+// Count the number of bytes required to store given code points in UTF-8.
+// When wchar_t is 16 bits wide, a surrogate pair is counted as one 4-byte
+// sequence; any other code unit of 0x800 and above is counted as 3 bytes.
+static size_t Utf8CountBytes(std::wstring_view str)
+{
+	// Accumulate in the return type: an `int` counter could overflow (undefined
+	// behaviour) and would be sign-converted on return.
+	size_t result = 0;
+	for (; str.size(); str.remove_prefix(1)) {
+		if (str.front() < 0x80)
+			result += 1;
+		else if (str.front() < 0x800)
+			result += 2;
+		else if (sizeof(wchar_t) == 4) {
+			if (str.front() < 0x10000)
+				result += 3;
+			else
+				result += 4;
+		} else {
+			if (IsUtf16SurrogatePair(str)) {
+				result += 4;
+				str.remove_prefix(1); // skip the low surrogate as well
+			} else
+				result += 3;
+		}
+	}
+	return result;
+}
+
+// Encode the code point `utf32` as UTF-8 into `out` and return the number of
+// bytes written (1 to 4). `out` must have room for that many bytes. No range
+// check is performed on `utf32`.
+ZXING_EXPORT_TEST_ONLY
+int Utf32ToUtf8(char32_t utf32, char* out)
+{
+	// Continuation byte (10xxxxxx) carrying the 6 bits of `utf32` starting at bit `shift`.
+	const auto trail = [utf32](int shift) { return narrow_cast<char8_t>(((utf32 >> shift) & 0x3f) | 0x80); };
+
+	int length = 0;
+	if (utf32 < 0x80) {
+		out[length++] = narrow_cast<char8_t>(utf32);
+	} else if (utf32 < 0x800) {
+		out[length++] = narrow_cast<char8_t>((utf32 >> 6) | 0xc0);
+		out[length++] = trail(0);
+	} else if (utf32 < 0x10000) {
+		out[length++] = narrow_cast<char8_t>((utf32 >> 12) | 0xe0);
+		out[length++] = trail(6);
+		out[length++] = trail(0);
+	} else {
+		out[length++] = narrow_cast<char8_t>((utf32 >> 18) | 0xf0);
+		out[length++] = trail(12);
+		out[length++] = trail(6);
+		out[length++] = trail(0);
+	}
+	return length;
+}
+
+// Encode the wide string `str` as UTF-8 and append the bytes to `utf8`.
+// A UTF-16 surrogate pair (16-bit wchar_t only) is combined into a single code
+// point before it is encoded.
+static void AppendToUtf8(std::wstring_view str, std::string& utf8)
+{
+	utf8.reserve(utf8.size() + Utf8CountBytes(str));
+
+	char encoded[4];
+	while (!str.empty()) {
+		uint32_t codePoint = static_cast<uint32_t>(str.front());
+		size_t consumed = 1;
+		if (IsUtf16SurrogatePair(str)) {
+			codePoint = Utf32FromUtf16Surrogates(str);
+			consumed = 2;
+		}
+
+		const auto encodedLength = Utf32ToUtf8(codePoint, encoded);
+		utf8.append(encoded, encodedLength);
+		str.remove_prefix(consumed);
+	}
+}
+
+// Convert a wide string into a UTF-8 encoded byte string.
+std::string ToUtf8(std::wstring_view str)
+{
+	std::string encoded;
+	AppendToUtf8(str, encoded);
+	return encoded;
+}
+
+// Locale-independent test whether `wc` is a graphical (visible) character.
+static bool iswgraph(wchar_t wc)
+{
+	/* Consider all legal codepoints as graphical except for:
+	 * - whitespace
+	 * - C0 and C1 control characters
+	 * - U+2028 and U+2029 (line/para break)
+	 * - U+FFF9 through U+FFFB (interlinear annotation controls)
+	 * The following code is based on musl libc's implementation */
+
+	// The range checks below rely on unsigned wrap-around ("x - lo < hi - lo").
+	// wchar_t may be signed, or promote to a signed int, in which case a
+	// negative difference would wrongly satisfy the comparison. Do all
+	// arithmetic on an unsigned copy instead.
+	const uint32_t cp = static_cast<uint32_t>(wc);
+
+	if (cp == U' ' || cp - 0x09u < 5u) // space, \t \n \v \f \r
+		return false;
+	if (cp < 0xffu)
+		return ((cp + 1u) & 0x7fu) >= 0x21u; // rejects C0, DEL and C1 controls
+	if (cp < 0x2028u || cp - 0x202au < 0xd800u - 0x202au || cp - 0xe000u < 0xfff9u - 0xe000u)
+		return true;
+	// Left over: U+2028/9, surrogates, U+FFF9..U+FFFB, values above U+10FFFF
+	// and the xxFFFE/xxFFFF non-characters.
+	if (cp - 0xfffcu > 0x10ffffu - 0xfffcu || (cp & 0xfffeu) == 0xfffeu)
+		return false;
+	return true;
+}
+
+// Return a copy of `str` in which non-graphical characters are replaced by a
+// readable tag: "<NUL>", "<LF>", "<DEL>", ... for ASCII control characters and
+// "<U+XXXX>" (upper-case hex, at least 2 or 4 digits) for everything else
+// that is not considered graphical. Graphical characters and valid surrogate
+// pairs are copied unchanged.
+std::wstring EscapeNonGraphical(std::wstring_view str)
+{
+	static const char* const ascii_nongraphs[33] = {
+		"NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
+		"BS",  "HT",  "LF",  "VT",  "FF",  "CR",  "SO",  "SI",
+		"DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
+		"CAN",  "EM", "SUB", "ESC",  "FS",  "GS",  "RS",  "US",
+		"DEL",
+	};
+
+	std::wostringstream ws;
+	ws.fill(L'0');
+
+	for (; str.size(); str.remove_prefix(1)) {
+		const wchar_t wc = str.front();
+		// Classify on an unsigned copy: where wchar_t is signed, a negative
+		// code unit would otherwise pass the "< 32" test and index the name
+		// table out of bounds.
+		const uint32_t cu = static_cast<uint32_t>(wc);
+
+		if (cu < 32 || cu == 127) // Non-graphical ASCII, excluding space
+			ws << "<" << ascii_nongraphs[cu == 127 ? 32 : cu] << ">";
+		else if (cu < 128) // ASCII
+			ws << wc;
+		else if (IsUtf16SurrogatePair(str)) {
+			ws.write(str.data(), 2);
+			str.remove_prefix(1);
+		}
+		// Exclude unpaired surrogates, U+00A0 (NBSP), U+2007 (figure space),
+		// U+2000 (en quad) and U+FFFD (replacement character)
+		else if ((cu < 0xd800 || cu >= 0xe000) && (iswgraph(wc) && cu != 0xA0 && cu != 0x2007 && cu != 0x2000 && cu != 0xfffd))
+			ws << wc;
+		else // Non-graphical Unicode
+			ws << "<U+" << std::setw(cu < 256 ? 2 : 4) << std::uppercase << std::hex << cu << ">";
+	}
+
+	return ws.str();
+}
+
+// UTF-8 convenience overload: decode to a wide string, escape the
+// non-graphical characters there and encode the result as UTF-8 again.
+std::string EscapeNonGraphical(std::string_view utf8)
+{
+	const std::wstring wide = FromUtf8(utf8);
+	const std::wstring escaped = EscapeNonGraphical(wide);
+	return ToUtf8(escaped);
+}
+
+} // namespace ZXing
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children