Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/html/txt.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 // Copyright (C) 2023-2024 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 #include "mupdf/html.h" | |
| 25 | |
/*
	Text encodings distinguished by detect_txt_encoding.
	The numeric values are the implicit 0..4 of the original
	single-line declaration.
*/
enum
{
	ENCODING_ASCII,		/* no BOM and not valid UTF-8: read byte by byte */
	ENCODING_UTF8,		/* no BOM, but every high byte forms valid UTF-8 */
	ENCODING_UTF8_BOM,	/* starts with the UTF-8 BOM (EF BB BF) */
	ENCODING_UTF16_LE,	/* starts with the UTF-16 little-endian BOM (FF FE) */
	ENCODING_UTF16_BE	/* starts with the UTF-16 big-endian BOM (FE FF) */
};
| 27 | |
| 28 static int | |
| 29 detect_txt_encoding(fz_context *ctx, fz_buffer *buf) | |
| 30 { | |
| 31 const uint8_t *d = buf->data; | |
| 32 size_t len = buf->len; | |
| 33 const uint8_t *end = buf->data + len; | |
| 34 int count_tabs = 0; | |
| 35 int count_hi = 0; | |
| 36 int count_controls = 0; | |
| 37 int plausibly_utf8 = 1; | |
| 38 | |
| 39 /* If we find a BOM, believe it. */ | |
| 40 if (len >= 3 && d[0] == 0xef && d[1] == 0xbb && d[2] == 0xBF) | |
| 41 return ENCODING_UTF8_BOM; | |
| 42 else if (len >= 2 && d[0] == 0xff && d[1] == 0xfe) | |
| 43 return ENCODING_UTF16_LE; | |
| 44 else if (len >= 2 && d[0] == 0xfe && d[1] == 0xff) | |
| 45 return ENCODING_UTF16_BE; | |
| 46 | |
| 47 while (d < end) | |
| 48 { | |
| 49 uint8_t c = *d++; | |
| 50 if (c == 9) | |
| 51 count_tabs++; | |
| 52 else if (c == 12) | |
| 53 { | |
| 54 /* Form feed. Ignore that. */ | |
| 55 } | |
| 56 else if (c == 10) | |
| 57 { | |
| 58 if (d < end && d[0] == 13) | |
| 59 d++; | |
| 60 } | |
| 61 else if (c == 13) | |
| 62 { | |
| 63 if (d < end && d[0] == 10) | |
| 64 d++; | |
| 65 } | |
| 66 else if (c < 32 || c == 0x7f) | |
| 67 count_controls++; | |
| 68 else if (c < 0x7f) | |
| 69 { | |
| 70 /* Reasonable ASCII value */ | |
| 71 } | |
| 72 else | |
| 73 { | |
| 74 count_hi++; | |
| 75 if ((c & 0xf8) == 0xF0) | |
| 76 { | |
| 77 /* Could be UTF8 with 3 following bytes */ | |
| 78 if (d+2 >= end || | |
| 79 (d[0] & 0xC0) != 0x80 || | |
| 80 (d[1] & 0xC0) != 0x80 || | |
| 81 (d[2] & 0xC0) != 0x80) | |
| 82 plausibly_utf8 = 0; | |
| 83 else | |
| 84 d += 3; | |
| 85 } | |
| 86 else if ((c & 0xf0) == 0xE0) | |
| 87 { | |
| 88 /* Could be UTF8 with 2 following bytes */ | |
| 89 if (d+1 >= end || | |
| 90 (d[0] & 0xC0) != 0x80 || | |
| 91 (d[1] & 0xC0) != 0x80) | |
| 92 plausibly_utf8 = 0; | |
| 93 else | |
| 94 d += 2; | |
| 95 } | |
| 96 else if ((c & 0xE0) == 0xC0) | |
| 97 { | |
| 98 /* Could be UTF8 with 1 following bytes */ | |
| 99 if (d+1 >= end || | |
| 100 (d[0] & 0xC0) != 0x80) | |
| 101 plausibly_utf8 = 0; | |
| 102 else | |
| 103 d++; | |
| 104 } | |
| 105 else | |
| 106 plausibly_utf8 = 0; | |
| 107 } | |
| 108 } | |
| 109 | |
| 110 (void)count_tabs; | |
| 111 (void)count_hi; | |
| 112 (void)count_controls; | |
| 113 | |
| 114 if (plausibly_utf8) | |
| 115 return ENCODING_UTF8; | |
| 116 return ENCODING_ASCII; | |
| 117 } | |
| 118 | |
| 119 fz_buffer * | |
| 120 fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in) | |
| 121 { | |
| 122 int encoding = detect_txt_encoding(ctx, in); | |
| 123 fz_stream *stream = fz_open_buffer(ctx, in); | |
| 124 fz_buffer *outbuf = NULL; | |
| 125 fz_output *out = NULL; | |
| 126 int col = 0; | |
| 127 | |
| 128 fz_var(outbuf); | |
| 129 fz_var(out); | |
| 130 | |
| 131 fz_try(ctx) | |
| 132 { | |
| 133 outbuf = fz_new_buffer(ctx, 1024); | |
| 134 out = fz_new_output_with_buffer(ctx, outbuf); | |
| 135 | |
| 136 fz_write_string(ctx, out, "<!doctype html><style>body{margin:0}pre{page-break-before:always;margin:0;white-space:pre-wrap;}</style><pre>"); | |
| 137 | |
| 138 if (encoding == ENCODING_UTF16_LE || encoding == ENCODING_UTF16_BE) | |
| 139 { | |
| 140 fz_read_byte(ctx, stream); | |
| 141 fz_read_byte(ctx, stream); | |
| 142 } | |
| 143 else if (encoding == ENCODING_UTF8_BOM) | |
| 144 { | |
| 145 fz_read_byte(ctx, stream); | |
| 146 fz_read_byte(ctx, stream); | |
| 147 fz_read_byte(ctx, stream); | |
| 148 } | |
| 149 | |
| 150 while (!fz_is_eof(ctx, stream)) | |
| 151 { | |
| 152 int c; | |
| 153 switch (encoding) | |
| 154 { | |
| 155 default: | |
| 156 case ENCODING_ASCII: | |
| 157 c = fz_read_byte(ctx, stream); | |
| 158 break; | |
| 159 case ENCODING_UTF8: | |
| 160 case ENCODING_UTF8_BOM: | |
| 161 c = fz_read_rune(ctx, stream); | |
| 162 break; | |
| 163 case ENCODING_UTF16_LE: | |
| 164 c = fz_read_utf16_le(ctx, stream); | |
| 165 break; | |
| 166 case ENCODING_UTF16_BE: | |
| 167 c = fz_read_utf16_be(ctx, stream); | |
| 168 } | |
| 169 | |
| 170 if (c == 10 || c == 13) | |
| 171 { | |
| 172 col = -1; | |
| 173 fz_write_byte(ctx, out, c); | |
| 174 } | |
| 175 else if (c == 9) | |
| 176 { | |
| 177 int n = (8 - col) & 7; | |
| 178 if (n == 0) | |
| 179 n = 8; | |
| 180 col += n-1; | |
| 181 while (n--) | |
| 182 fz_write_byte(ctx, out, ' '); | |
| 183 } | |
| 184 else if (c == 12) | |
| 185 { | |
| 186 col = -1; | |
| 187 fz_write_string(ctx, out, "</pre><pre>\n"); | |
| 188 } | |
| 189 else if (c == '<') | |
| 190 fz_write_string(ctx, out, "<"); | |
| 191 else if (c == '>') | |
| 192 fz_write_string(ctx, out, ">"); | |
| 193 else if (c == '"') | |
| 194 fz_write_string(ctx, out, """); | |
| 195 else | |
| 196 fz_write_rune(ctx, out, c); | |
| 197 | |
| 198 ++col; | |
| 199 } | |
| 200 | |
| 201 fz_close_output(ctx, out); | |
| 202 } | |
| 203 fz_always(ctx) | |
| 204 { | |
| 205 fz_drop_stream(ctx, stream); | |
| 206 fz_drop_output(ctx, out); | |
| 207 } | |
| 208 fz_catch(ctx) | |
| 209 { | |
| 210 fz_drop_buffer(ctx, outbuf); | |
| 211 fz_rethrow(ctx); | |
| 212 } | |
| 213 | |
| 214 return outbuf; | |
| 215 } | |
| 216 | |
| 217 static fz_buffer * | |
| 218 txt_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css) | |
| 219 { | |
| 220 return fz_txt_buffer_to_html(ctx, buf); | |
| 221 } | |
| 222 | |
| 223 static const fz_htdoc_format_t fz_htdoc_txt = | |
| 224 { | |
| 225 "Text", | |
| 226 txt_to_html, | |
| 227 0, 1, 0 | |
| 228 }; | |
| 229 | |
| 230 static fz_document * | |
| 231 txt_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state) | |
| 232 { | |
| 233 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_txt); | |
| 234 } | |
| 235 | |
/* NULL-terminated list of file extensions listed for the text handler. */
static const char *txt_extensions[] =
{
	"txt",
	"text",
	"log",
	NULL
};
| 243 | |
/*
	NULL-terminated list of MIME types listed for the text handler.
	"text/plain" is the registered media type. The historical
	"text.plain" spelling is kept so that any caller already passing
	it continues to work.
*/
static const char *txt_mimetypes[] =
{
	"text/plain",
	"text.plain",
	NULL
};
| 249 | |
| 250 fz_document_handler txt_document_handler = | |
| 251 { | |
| 252 NULL, | |
| 253 txt_open_document, | |
| 254 txt_extensions, | |
| 255 txt_mimetypes | |
| 256 }; |
