comparison mupdf-source/source/html/txt.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 // Copyright (C) 2023-2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "mupdf/html.h"
25
/* Text encodings distinguished by detect_txt_encoding(). */
enum
{
	/* No BOM and not well-formed UTF-8: read one byte per character. */
	ENCODING_ASCII,
	/* No BOM, but every high byte forms a well-formed UTF-8 sequence. */
	ENCODING_UTF8,
	/* Starts with the UTF-8 byte order mark EF BB BF. */
	ENCODING_UTF8_BOM,
	/* Starts with the byte order mark FF FE. */
	ENCODING_UTF16_LE,
	/* Starts with the byte order mark FE FF. */
	ENCODING_UTF16_BE
};
27
28 static int
29 detect_txt_encoding(fz_context *ctx, fz_buffer *buf)
30 {
31 const uint8_t *d = buf->data;
32 size_t len = buf->len;
33 const uint8_t *end = buf->data + len;
34 int count_tabs = 0;
35 int count_hi = 0;
36 int count_controls = 0;
37 int plausibly_utf8 = 1;
38
39 /* If we find a BOM, believe it. */
40 if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xBF)
41 return ENCODING_UTF8_BOM;
42 else if (len >= 2 && d[0] == 0xff && d[1] == 0xfe)
43 return ENCODING_UTF16_LE;
44 else if (len >= 2 && d[0] == 0xfe && d[1] == 0xff)
45 return ENCODING_UTF16_BE;
46
47 while (d < end)
48 {
49 uint8_t c = *d++;
50 if (c == 9)
51 count_tabs++;
52 else if (c == 12)
53 {
54 /* Form feed. Ignore that. */
55 }
56 else if (c == 10)
57 {
58 if (d < end && d[0] == 13)
59 d++;
60 }
61 else if (c == 13)
62 {
63 if (d < end && d[0] == 10)
64 d++;
65 }
66 else if (c < 32 || c == 0x7f)
67 count_controls++;
68 else if (c < 0x7f)
69 {
70 /* Reasonable ASCII value */
71 }
72 else
73 {
74 count_hi++;
75 if ((c & 0xf8) == 0xF0)
76 {
77 /* Could be UTF8 with 3 following bytes */
78 if (d+2 >= end ||
79 (d[0] & 0xC0) != 0x80 ||
80 (d[1] & 0xC0) != 0x80 ||
81 (d[2] & 0xC0) != 0x80)
82 plausibly_utf8 = 0;
83 else
84 d += 3;
85 }
86 else if ((c & 0xf0) == 0xE0)
87 {
88 /* Could be UTF8 with 2 following bytes */
89 if (d+1 >= end ||
90 (d[0] & 0xC0) != 0x80 ||
91 (d[1] & 0xC0) != 0x80)
92 plausibly_utf8 = 0;
93 else
94 d += 2;
95 }
96 else if ((c & 0xE0) == 0xC0)
97 {
98 /* Could be UTF8 with 1 following bytes */
99 if (d+1 >= end ||
100 (d[0] & 0xC0) != 0x80)
101 plausibly_utf8 = 0;
102 else
103 d++;
104 }
105 else
106 plausibly_utf8 = 0;
107 }
108 }
109
110 (void)count_tabs;
111 (void)count_hi;
112 (void)count_controls;
113
114 if (plausibly_utf8)
115 return ENCODING_UTF8;
116 return ENCODING_ASCII;
117 }
118
119 fz_buffer *
120 fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in)
121 {
122 int encoding = detect_txt_encoding(ctx, in);
123 fz_stream *stream = fz_open_buffer(ctx, in);
124 fz_buffer *outbuf = NULL;
125 fz_output *out = NULL;
126 int col = 0;
127
128 fz_var(outbuf);
129 fz_var(out);
130
131 fz_try(ctx)
132 {
133 outbuf = fz_new_buffer(ctx, 1024);
134 out = fz_new_output_with_buffer(ctx, outbuf);
135
136 fz_write_string(ctx, out, "<!doctype html><style>body{margin:0}pre{page-break-before:always;margin:0;white-space:pre-wrap;}</style><pre>");
137
138 if (encoding == ENCODING_UTF16_LE || encoding == ENCODING_UTF16_BE)
139 {
140 fz_read_byte(ctx, stream);
141 fz_read_byte(ctx, stream);
142 }
143 else if (encoding == ENCODING_UTF8_BOM)
144 {
145 fz_read_byte(ctx, stream);
146 fz_read_byte(ctx, stream);
147 fz_read_byte(ctx, stream);
148 }
149
150 while (!fz_is_eof(ctx, stream))
151 {
152 int c;
153 switch (encoding)
154 {
155 default:
156 case ENCODING_ASCII:
157 c = fz_read_byte(ctx, stream);
158 break;
159 case ENCODING_UTF8:
160 case ENCODING_UTF8_BOM:
161 c = fz_read_rune(ctx, stream);
162 break;
163 case ENCODING_UTF16_LE:
164 c = fz_read_utf16_le(ctx, stream);
165 break;
166 case ENCODING_UTF16_BE:
167 c = fz_read_utf16_be(ctx, stream);
168 }
169
170 if (c == 10 || c == 13)
171 {
172 col = -1;
173 fz_write_byte(ctx, out, c);
174 }
175 else if (c == 9)
176 {
177 int n = (8 - col) & 7;
178 if (n == 0)
179 n = 8;
180 col += n-1;
181 while (n--)
182 fz_write_byte(ctx, out, ' ');
183 }
184 else if (c == 12)
185 {
186 col = -1;
187 fz_write_string(ctx, out, "</pre><pre>\n");
188 }
189 else if (c == '<')
190 fz_write_string(ctx, out, "&lt;");
191 else if (c == '>')
192 fz_write_string(ctx, out, "&gt;");
193 else if (c == '"')
194 fz_write_string(ctx, out, "&quot;");
195 else
196 fz_write_rune(ctx, out, c);
197
198 ++col;
199 }
200
201 fz_close_output(ctx, out);
202 }
203 fz_always(ctx)
204 {
205 fz_drop_stream(ctx, stream);
206 fz_drop_output(ctx, out);
207 }
208 fz_catch(ctx)
209 {
210 fz_drop_buffer(ctx, outbuf);
211 fz_rethrow(ctx);
212 }
213
214 return outbuf;
215 }
216
217 static fz_buffer *
218 txt_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
219 {
220 return fz_txt_buffer_to_html(ctx, buf);
221 }
222
223 static const fz_htdoc_format_t fz_htdoc_txt =
224 {
225 "Text",
226 txt_to_html,
227 0, 1, 0
228 };
229
230 static fz_document *
231 txt_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
232 {
233 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_txt);
234 }
235
/* NULL-terminated list of file extensions listed in txt_document_handler. */
static const char *txt_extensions[] = { "txt", "text", "log", NULL };
243
/*
	NULL-terminated list of MIME types listed in txt_document_handler.

	"text/plain" is the registered media type for plain text. The
	historical "text.plain" spelling is kept as well so that any caller
	that relied on it keeps working.
*/
static const char *txt_mimetypes[] =
{
	"text/plain",
	"text.plain",
	NULL
};
249
250 fz_document_handler txt_document_handler =
251 {
252 NULL,
253 txt_open_document,
254 txt_extensions,
255 txt_mimetypes
256 };