Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/html/txt.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/html/txt.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,256 @@ +// Copyright (C) 2023-2024 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" +#include "mupdf/html.h" + +enum { ENCODING_ASCII, ENCODING_UTF8, ENCODING_UTF8_BOM, ENCODING_UTF16_LE, ENCODING_UTF16_BE }; + +static int +detect_txt_encoding(fz_context *ctx, fz_buffer *buf) +{ + const uint8_t *d = buf->data; + size_t len = buf->len; + const uint8_t *end = buf->data + len; + int count_tabs = 0; + int count_hi = 0; + int count_controls = 0; + int plausibly_utf8 = 1; + + /* If we find a BOM, believe it. */ + if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xBF) + return ENCODING_UTF8_BOM; + else if (len >= 2 && d[0] == 0xff && d[1] == 0xfe) + return ENCODING_UTF16_LE; + else if (len >= 2 && d[0] == 0xfe && d[1] == 0xff) + return ENCODING_UTF16_BE; + + while (d < end) + { + uint8_t c = *d++; + if (c == 9) + count_tabs++; + else if (c == 12) + { + /* Form feed. Ignore that. */ + } + else if (c == 10) + { + if (d < end && d[0] == 13) + d++; + } + else if (c == 13) + { + if (d < end && d[0] == 10) + d++; + } + else if (c < 32 || c == 0x7f) + count_controls++; + else if (c < 0x7f) + { + /* Reasonable ASCII value */ + } + else + { + count_hi++; + if ((c & 0xf8) == 0xF0) + { + /* Could be UTF8 with 3 following bytes */ + if (d+2 >= end || + (d[0] & 0xC0) != 0x80 || + (d[1] & 0xC0) != 0x80 || + (d[2] & 0xC0) != 0x80) + plausibly_utf8 = 0; + else + d += 3; + } + else if ((c & 0xf0) == 0xE0) + { + /* Could be UTF8 with 2 following bytes */ + if (d+1 >= end || + (d[0] & 0xC0) != 0x80 || + (d[1] & 0xC0) != 0x80) + plausibly_utf8 = 0; + else + d += 2; + } + else if ((c & 0xE0) == 0xC0) + { + /* Could be UTF8 with 1 following bytes */ + if (d+1 >= end || + (d[0] & 0xC0) != 0x80) + plausibly_utf8 = 0; + else + d++; + } + else + plausibly_utf8 = 0; + } + } + + (void)count_tabs; + (void)count_hi; + (void)count_controls; + + if (plausibly_utf8) + return ENCODING_UTF8; + return ENCODING_ASCII; +} + +fz_buffer * +fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in) +{ + int encoding = detect_txt_encoding(ctx, in); + fz_stream *stream = fz_open_buffer(ctx, in); + fz_buffer *outbuf = NULL; + fz_output *out = NULL; + int col = 0; + + fz_var(outbuf); + fz_var(out); + + fz_try(ctx) + { + outbuf = fz_new_buffer(ctx, 1024); + out = fz_new_output_with_buffer(ctx, outbuf); + + fz_write_string(ctx, out, "<!doctype html><style>body{margin:0}pre{page-break-before:always;margin:0;white-space:pre-wrap;}</style><pre>"); + + if (encoding == ENCODING_UTF16_LE || encoding == ENCODING_UTF16_BE) + { + fz_read_byte(ctx, stream); + fz_read_byte(ctx, stream); + } + else if (encoding == ENCODING_UTF8_BOM) + { + fz_read_byte(ctx, stream); + fz_read_byte(ctx, stream); + fz_read_byte(ctx, stream); + } + + while (!fz_is_eof(ctx, stream)) + { + int c; + switch (encoding) + { + default: + case ENCODING_ASCII: + c = fz_read_byte(ctx, stream); + break; + case ENCODING_UTF8: + case ENCODING_UTF8_BOM: + c = fz_read_rune(ctx, stream); + break; + case ENCODING_UTF16_LE: + c = fz_read_utf16_le(ctx, stream); + break; + case ENCODING_UTF16_BE: + c = fz_read_utf16_be(ctx, stream); + } + + if (c == 10 || c == 13) + { + col = -1; + fz_write_byte(ctx, out, c); + } + else if (c == 9) + { + int n = (8 - col) & 7; + if (n == 0) + n = 8; + col += n-1; + while (n--) + fz_write_byte(ctx, out, ' '); + } + else if (c == 12) + { + col = -1; + fz_write_string(ctx, out, "</pre><pre>\n"); + } + else if (c == '<') + fz_write_string(ctx, out, "<"); + else if (c == '>') + fz_write_string(ctx, out, ">"); + else if (c == '"') + fz_write_string(ctx, out, """); + else + fz_write_rune(ctx, out, c); + + ++col; + } + + fz_close_output(ctx, out); + } + fz_always(ctx) + { + fz_drop_stream(ctx, stream); + fz_drop_output(ctx, out); + } + fz_catch(ctx) + { + fz_drop_buffer(ctx, outbuf); + fz_rethrow(ctx); + } + + return outbuf; +} + +static fz_buffer * +txt_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css) +{ + return fz_txt_buffer_to_html(ctx, buf); +} + +static const fz_htdoc_format_t fz_htdoc_txt = +{ + "Text", + txt_to_html, + 0, 1, 0 +}; + +static fz_document * +txt_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state) +{ + return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_txt); +} + +static const char *txt_extensions[] = +{ + "txt", + "text", + "log", + NULL +}; + +static const char *txt_mimetypes[] = +{ + "text.plain", + NULL +}; + +fz_document_handler txt_document_handler = +{ + NULL, + txt_open_document, + txt_extensions, + txt_mimetypes +};
