diff mupdf-source/source/html/txt.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/html/txt.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,256 @@
+// Copyright (C) 2023-2024 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+#include "mupdf/html.h"
+
+enum { ENCODING_ASCII, ENCODING_UTF8, ENCODING_UTF8_BOM, ENCODING_UTF16_LE, ENCODING_UTF16_BE };
+
+static int
+detect_txt_encoding(fz_context *ctx, fz_buffer *buf)
+{
+	const uint8_t *d = buf->data;
+	size_t len = buf->len;
+	const uint8_t *end = buf->data + len;
+	int count_tabs = 0;
+	int count_hi = 0;
+	int count_controls = 0;
+	int plausibly_utf8 = 1;
+
+	/* If we find a BOM, believe it. */
+	if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xBF)
+		return ENCODING_UTF8_BOM;
+	else if (len >= 2 && d[0] == 0xff && d[1] == 0xfe)
+		return ENCODING_UTF16_LE;
+	else if (len >= 2 && d[0] == 0xfe && d[1] == 0xff)
+		return ENCODING_UTF16_BE;
+
+	while (d < end)
+	{
+		uint8_t c = *d++;
+		if (c == 9)
+			count_tabs++;
+		else if (c == 12)
+		{
+			/* Form feed. Ignore that. */
+		}
+		else if (c == 10)
+		{
+			if (d < end && d[0] == 13)
+				d++;
+		}
+		else if (c == 13)
+		{
+			if (d < end && d[0] == 10)
+				d++;
+		}
+		else if (c < 32 || c == 0x7f)
+			count_controls++;
+		else if (c < 0x7f)
+		{
+			/* Reasonable ASCII value */
+		}
+		else
+		{
+			count_hi++;
+			if ((c & 0xf8) == 0xF0)
+			{
+				/* Could be UTF8 with 3 following bytes */
+				if (d+2 >= end ||
+					(d[0] & 0xC0) != 0x80 ||
+					(d[1] & 0xC0) != 0x80 ||
+					(d[2] & 0xC0) != 0x80)
+					plausibly_utf8 = 0;
+				else
+					d += 3;
+			}
+			else if ((c & 0xf0) == 0xE0)
+			{
+				/* Could be UTF8 with 2 following bytes */
+				if (d+1 >= end ||
+					(d[0] & 0xC0) != 0x80 ||
+					(d[1] & 0xC0) != 0x80)
+					plausibly_utf8 = 0;
+				else
+					d += 2;
+			}
+			else if ((c & 0xE0) == 0xC0)
+			{
+				/* Could be UTF8 with 1 following bytes */
+				if (d+1 >= end ||
+					(d[0] & 0xC0) != 0x80)
+					plausibly_utf8 = 0;
+				else
+					d++;
+			}
+			else
+				plausibly_utf8 = 0;
+		}
+	}
+
+	(void)count_tabs;
+	(void)count_hi;
+	(void)count_controls;
+
+	if (plausibly_utf8)
+		return ENCODING_UTF8;
+	return ENCODING_ASCII;
+}
+
+fz_buffer *
+fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in)
+{
+	int encoding = detect_txt_encoding(ctx, in);
+	fz_stream *stream = fz_open_buffer(ctx, in);
+	fz_buffer *outbuf = NULL;
+	fz_output *out = NULL;
+	int col = 0;
+
+	fz_var(outbuf);
+	fz_var(out);
+
+	fz_try(ctx)
+	{
+		outbuf = fz_new_buffer(ctx, 1024);
+		out = fz_new_output_with_buffer(ctx, outbuf);
+
+		fz_write_string(ctx, out, "<!doctype html><style>body{margin:0}pre{page-break-before:always;margin:0;white-space:pre-wrap;}</style><pre>");
+
+		if (encoding == ENCODING_UTF16_LE || encoding == ENCODING_UTF16_BE)
+		{
+			fz_read_byte(ctx, stream);
+			fz_read_byte(ctx, stream);
+		}
+		else if (encoding == ENCODING_UTF8_BOM)
+		{
+			fz_read_byte(ctx, stream);
+			fz_read_byte(ctx, stream);
+			fz_read_byte(ctx, stream);
+		}
+
+		while (!fz_is_eof(ctx, stream))
+		{
+			int c;
+			switch (encoding)
+			{
+			default:
+			case ENCODING_ASCII:
+				c = fz_read_byte(ctx, stream);
+				break;
+			case ENCODING_UTF8:
+			case ENCODING_UTF8_BOM:
+				c = fz_read_rune(ctx, stream);
+				break;
+			case ENCODING_UTF16_LE:
+				c = fz_read_utf16_le(ctx, stream);
+				break;
+			case ENCODING_UTF16_BE:
+				c = fz_read_utf16_be(ctx, stream);
+			}
+
+			if (c == 10 || c == 13)
+			{
+				col = -1;
+				fz_write_byte(ctx, out, c);
+			}
+			else if (c == 9)
+			{
+				int n = (8 - col) & 7;
+				if (n == 0)
+					n = 8;
+				col += n-1;
+				while (n--)
+					fz_write_byte(ctx, out, ' ');
+			}
+			else if (c == 12)
+			{
+				col = -1;
+				fz_write_string(ctx, out, "</pre><pre>\n");
+			}
+			else if (c == '<')
+				fz_write_string(ctx, out, "&lt;");
+			else if (c == '>')
+				fz_write_string(ctx, out, "&gt;");
+			else if (c == '"')
+				fz_write_string(ctx, out, "&quot;");
+			else
+				fz_write_rune(ctx, out, c);
+
+			++col;
+		}
+
+		fz_close_output(ctx, out);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_stream(ctx, stream);
+		fz_drop_output(ctx, out);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_buffer(ctx, outbuf);
+		fz_rethrow(ctx);
+	}
+
+	return outbuf;
+}
+
+static fz_buffer *
+txt_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
+{
+	return fz_txt_buffer_to_html(ctx, buf);
+}
+
+static const fz_htdoc_format_t fz_htdoc_txt =
+{
+	"Text",
+	txt_to_html,
+	0, 1, 0
+};
+
+static fz_document *
+txt_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
+{
+	return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_txt);
+}
+
+static const char *txt_extensions[] =
+{
+	"txt",
+	"text",
+	"log",
+	NULL
+};
+
+static const char *txt_mimetypes[] =
+{
+	"text.plain",
+	NULL
+};
+
+fz_document_handler txt_document_handler =
+{
+	NULL,
+	txt_open_document,
+	txt_extensions,
+	txt_mimetypes
+};