Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/html/mobi.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 #include "html-imp.h" | |
| 25 | |
| 26 #include <string.h> | |
| 27 | |
// How the decoded text is treated: as ready-made HTML, or as plain text
// that gets wrapped in a minimal HTML document.
enum
{
	FORMAT_HTML = 1,
	FORMAT_TEXT = 2
};

// Values of the compression field in the PalmDOC header.
// Only NONE and PALMDOC are accepted by mobi_read_data.
enum
{
	COMPRESSION_NONE = 1,
	COMPRESSION_PALMDOC = 2,
	COMPRESSION_HUFF_CDIC = 17480
};

// Values of the text encoding field in the optional MOBI header.
// LATIN_1 is what mobi_read_data assumes when no MOBI header is present.
enum
{
	TEXT_ENCODING_LATIN_1 = 0,
	TEXT_ENCODING_1252 = 1252,
	TEXT_ENCODING_UTF8 = 65001
};
| 38 | |
| 39 static void | |
| 40 skip_bytes(fz_context *ctx, fz_stream *stm, size_t len) | |
| 41 { | |
| 42 size_t skipped = fz_skip(ctx, stm, len); | |
| 43 if (skipped < len) | |
| 44 fz_throw(ctx, FZ_ERROR_FORMAT, "premature end in data"); | |
| 45 } | |
| 46 | |
| 47 static void | |
| 48 mobi_read_text_none(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size) | |
| 49 { | |
| 50 unsigned char buf[4096]; | |
| 51 size_t n; | |
| 52 if (size > 4096) | |
| 53 fz_throw(ctx, FZ_ERROR_FORMAT, "text block too large"); | |
| 54 n = fz_read(ctx, stm, buf, size); | |
| 55 if (n < size) | |
| 56 fz_warn(ctx, "premature end in mobi uncompressed text data"); | |
| 57 fz_append_data(ctx, out, buf, n); | |
| 58 } | |
| 59 | |
| 60 static void | |
| 61 mobi_read_text_palmdoc(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size) | |
| 62 { | |
| 63 // https://wiki.mobileread.com/wiki/PalmDOC | |
| 64 size_t end = out->len + size; | |
| 65 while (out->len < end) | |
| 66 { | |
| 67 int c = fz_read_byte(ctx, stm); | |
| 68 if (c == EOF) | |
| 69 break; | |
| 70 if (c >= 0x01 && c <= 0x08) | |
| 71 { | |
| 72 unsigned char buf[8]; | |
| 73 size_t n = fz_read(ctx, stm, buf, c); | |
| 74 fz_append_data(ctx, out, buf, n); | |
| 75 if (n < (size_t) c) | |
| 76 break; | |
| 77 } | |
| 78 else if (c <= 0x7f) | |
| 79 { | |
| 80 fz_append_byte(ctx, out, c); | |
| 81 } | |
| 82 else if (c >= 0x80 && c <= 0xbf) | |
| 83 { | |
| 84 int cc, x, distance, length; | |
| 85 cc = fz_read_byte(ctx, stm); | |
| 86 if (cc == EOF) | |
| 87 break; | |
| 88 x = (c << 8) | cc; | |
| 89 distance = (x >> 3) & 0x7ff; | |
| 90 length = (x & 7) + 3; | |
| 91 if (distance > 0 && (size_t)distance <= out->len) | |
| 92 { | |
| 93 int i; | |
| 94 int p = (int)(out->len - distance); | |
| 95 for (i = 0; i < length; ++i) | |
| 96 fz_append_byte(ctx, out, out->data[p + i]); | |
| 97 } | |
| 98 } | |
| 99 else if (c >= 0xc0 && c <= 0xff) | |
| 100 { | |
| 101 fz_append_byte(ctx, out, ' '); | |
| 102 fz_append_byte(ctx, out, c ^ 0x80); | |
| 103 } | |
| 104 } | |
| 105 | |
| 106 if (out->len < end) | |
| 107 fz_warn(ctx, "premature end in mobi palmdoc data"); | |
| 108 } | |
| 109 | |
| 110 static uint32_t | |
| 111 mobi_read_data(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t *offset, uint32_t total_count, int format) | |
| 112 { | |
| 113 // https://wiki.mobileread.com/wiki/MOBI | |
| 114 uint32_t compression, text_length, record_count, text_encoding, i; | |
| 115 unsigned char buf[4]; | |
| 116 fz_range range = { 0 }; | |
| 117 fz_stream *rec = NULL; | |
| 118 size_t n; | |
| 119 | |
| 120 fz_var(rec); | |
| 121 | |
| 122 fz_try(ctx) | |
| 123 { | |
| 124 range.offset = offset[0]; | |
| 125 range.length = offset[1] - offset[0]; | |
| 126 rec = fz_open_range_filter(ctx, stm, &range, 1); | |
| 127 | |
| 128 // PalmDOC header | |
| 129 compression = fz_read_uint16(ctx, rec); | |
| 130 skip_bytes(ctx, rec, 2); | |
| 131 text_length = fz_read_uint32(ctx, rec); | |
| 132 record_count = fz_read_uint16(ctx, rec); | |
| 133 skip_bytes(ctx, rec, 2); | |
| 134 skip_bytes(ctx, rec, 2); // encryption | |
| 135 skip_bytes(ctx, rec, 2); | |
| 136 | |
| 137 // Optional MOBI header | |
| 138 text_encoding = TEXT_ENCODING_LATIN_1; | |
| 139 n = fz_read(ctx, rec, buf, 4); | |
| 140 if (n == 4 && !memcmp(buf, "MOBI", 4)) | |
| 141 { | |
| 142 skip_bytes(ctx, rec, 4); | |
| 143 skip_bytes(ctx, rec, 4); | |
| 144 text_encoding = fz_read_uint32(ctx, rec); | |
| 145 } | |
| 146 } | |
| 147 fz_always(ctx) | |
| 148 fz_drop_stream(ctx, rec); | |
| 149 fz_catch(ctx) | |
| 150 fz_rethrow(ctx); | |
| 151 | |
| 152 if (compression != COMPRESSION_NONE && compression != COMPRESSION_PALMDOC) | |
| 153 fz_throw(ctx, FZ_ERROR_FORMAT, "unknown compression method"); | |
| 154 if (text_encoding != TEXT_ENCODING_LATIN_1 && | |
| 155 text_encoding != TEXT_ENCODING_1252 && | |
| 156 text_encoding != TEXT_ENCODING_UTF8) | |
| 157 fz_throw(ctx, FZ_ERROR_FORMAT, "unknown text encoding"); | |
| 158 | |
| 159 for (i = 1; i <= record_count && i < total_count; ++i) | |
| 160 { | |
| 161 uint32_t remain = text_length - (uint32_t)out->len; | |
| 162 uint32_t size = remain < 4096 ? remain : 4096; | |
| 163 | |
| 164 fz_try(ctx) | |
| 165 { | |
| 166 range.offset = offset[i]; | |
| 167 range.length = offset[i + 1] - offset[i]; | |
| 168 rec = fz_open_range_filter(ctx, stm, &range, 1); | |
| 169 | |
| 170 if (compression == COMPRESSION_NONE) | |
| 171 mobi_read_text_none(ctx, out, rec, size); | |
| 172 else | |
| 173 mobi_read_text_palmdoc(ctx, out, rec, size); | |
| 174 } | |
| 175 fz_always(ctx) | |
| 176 fz_drop_stream(ctx, rec); | |
| 177 fz_catch(ctx) | |
| 178 fz_rethrow(ctx); | |
| 179 } | |
| 180 | |
| 181 if (format == FORMAT_TEXT && out->len > 6) | |
| 182 { | |
| 183 if (!memcmp(out->data, "<html>", 6) || !memcmp(out->data, "<HTML>", 6)) | |
| 184 format = FORMAT_HTML; | |
| 185 } | |
| 186 | |
| 187 if (text_encoding != TEXT_ENCODING_UTF8 || format == FORMAT_TEXT) | |
| 188 { | |
| 189 unsigned char *p; | |
| 190 size_t j, z = fz_buffer_extract(ctx, out, &p); | |
| 191 fz_resize_buffer(ctx, out, 0); | |
| 192 if (format == FORMAT_TEXT) | |
| 193 fz_append_string(ctx, out, "<html><head><style>body{white-space:pre-wrap}</style></head><body>"); | |
| 194 for (j = 0; j < z; ++j) | |
| 195 { | |
| 196 int c = p[j]; | |
| 197 if (format == FORMAT_TEXT && (c == '<' || c == '>' || c == '&')) | |
| 198 { | |
| 199 if (c == '<') | |
| 200 fz_append_string(ctx, out, "<"); | |
| 201 else if (c == '>') | |
| 202 fz_append_string(ctx, out, ">"); | |
| 203 else if (c == '&') | |
| 204 fz_append_string(ctx, out, "&"); | |
| 205 } | |
| 206 else | |
| 207 { | |
| 208 switch (text_encoding) | |
| 209 { | |
| 210 case TEXT_ENCODING_UTF8: | |
| 211 fz_append_byte(ctx, out, c); | |
| 212 break; | |
| 213 case TEXT_ENCODING_LATIN_1: | |
| 214 fz_append_rune(ctx, out, c); | |
| 215 break; | |
| 216 case TEXT_ENCODING_1252: | |
| 217 fz_append_rune(ctx, out, fz_unicode_from_windows_1252[c]); | |
| 218 break; | |
| 219 } | |
| 220 } | |
| 221 } | |
| 222 if (format == FORMAT_TEXT) | |
| 223 fz_append_string(ctx, out, "</body></html>"); | |
| 224 fz_free(ctx, p); | |
| 225 } | |
| 226 | |
| 227 return record_count; | |
| 228 } | |
| 229 | |
| 230 static void drop_tree_entry(fz_context *ctx, void *ent) | |
| 231 { | |
| 232 fz_drop_buffer(ctx, ent); | |
| 233 } | |
| 234 | |
| 235 fz_archive * | |
| 236 fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi) | |
| 237 { | |
| 238 fz_stream *stm = NULL; | |
| 239 fz_buffer *buffer = NULL; | |
| 240 fz_tree *tree = NULL; | |
| 241 uint32_t *offsets = NULL; | |
| 242 char buf[32]; | |
| 243 uint32_t i, k, extra; | |
| 244 uint32_t recindex; | |
| 245 uint32_t minoffset, maxoffset; | |
| 246 int format = FORMAT_TEXT; | |
| 247 size_t n; | |
| 248 | |
| 249 // https://wiki.mobileread.com/wiki/PalmDOC | |
| 250 | |
| 251 fz_var(stm); | |
| 252 fz_var(buffer); | |
| 253 fz_var(offsets); | |
| 254 fz_var(tree); | |
| 255 | |
| 256 fz_try(ctx) | |
| 257 { | |
| 258 stm = fz_open_buffer(ctx, mobi); | |
| 259 | |
| 260 skip_bytes(ctx, stm, 32); // database name | |
| 261 skip_bytes(ctx, stm, 28); // database attributes, version, dates, etc | |
| 262 | |
| 263 n = fz_read(ctx, stm, (unsigned char *)buf, 8); // database type and creator | |
| 264 buf[8] = 0; | |
| 265 | |
| 266 if (n == 8 && !memcmp(buf, "BOOKMOBI", 8)) | |
| 267 format = FORMAT_HTML; | |
| 268 else if (n == 8 && !memcmp(buf, "TEXtREAd", 8)) | |
| 269 format = FORMAT_TEXT; | |
| 270 else if (n != 8) | |
| 271 fz_warn(ctx, "premature end in data"); | |
| 272 else | |
| 273 fz_warn(ctx, "Unknown MOBI/PRC format: %s.", buf); | |
| 274 | |
| 275 skip_bytes(ctx, stm, 8); // database internal fields | |
| 276 | |
| 277 // record info list count | |
| 278 n = fz_read_uint16(ctx, stm); | |
| 279 | |
| 280 minoffset = (uint32_t)(fz_tell(ctx, stm) + n * 2 * sizeof (uint32_t) - 1); | |
| 281 maxoffset = (uint32_t)mobi->len; | |
| 282 | |
| 283 // record info list | |
| 284 offsets = fz_malloc_array(ctx, n + 1, uint32_t); | |
| 285 for (i = 0, k = 0; i < n; ++i) | |
| 286 { | |
| 287 uint32_t offset = fz_read_uint32(ctx, stm); | |
| 288 if (offset <= minoffset) | |
| 289 continue; | |
| 290 if (offset >= maxoffset) | |
| 291 continue; | |
| 292 minoffset = offsets[k++] = offset; | |
| 293 skip_bytes(ctx, stm, 4); | |
| 294 } | |
| 295 offsets[k] = (uint32_t)mobi->len; | |
| 296 | |
| 297 // adjust n in case some out of bound offsets were skipped | |
| 298 n = k; | |
| 299 if (n == 0) | |
| 300 fz_throw(ctx, FZ_ERROR_FORMAT, "no mobi records to read"); | |
| 301 | |
| 302 // decompress text data | |
| 303 buffer = fz_new_buffer(ctx, 128 << 10); | |
| 304 extra = mobi_read_data(ctx, buffer, stm, offsets, (uint32_t)n, format); | |
| 305 fz_terminate_buffer(ctx, buffer); | |
| 306 | |
| 307 #ifndef NDEBUG | |
| 308 if (fz_atoi(getenv("FZ_DEBUG_MOBI"))) | |
| 309 fz_save_buffer(ctx, buffer, "mobi.xhtml"); | |
| 310 #endif | |
| 311 | |
| 312 tree = fz_tree_insert(ctx, tree, "index.html", buffer); | |
| 313 buffer = NULL; | |
| 314 | |
| 315 // copy image data records into tree | |
| 316 recindex = 1; | |
| 317 for (i = extra; i < n; ++i) | |
| 318 { | |
| 319 uint32_t size = offsets[i+1] - offsets[i]; | |
| 320 if (size > 8) | |
| 321 { | |
| 322 unsigned char *data = mobi->data + offsets[i]; | |
| 323 if (fz_recognize_image_format(ctx, data)) | |
| 324 { | |
| 325 buffer = fz_new_buffer_from_copied_data(ctx, data, size); | |
| 326 fz_snprintf(buf, sizeof buf, "%05d", recindex); | |
| 327 tree = fz_tree_insert(ctx, tree, buf, buffer); | |
| 328 buffer = NULL; | |
| 329 recindex++; | |
| 330 } | |
| 331 } | |
| 332 } | |
| 333 } | |
| 334 fz_always(ctx) | |
| 335 { | |
| 336 fz_drop_stream(ctx, stm); | |
| 337 fz_free(ctx, offsets); | |
| 338 } | |
| 339 fz_catch(ctx) | |
| 340 { | |
| 341 fz_drop_buffer(ctx, buffer); | |
| 342 fz_drop_tree(ctx, tree, drop_tree_entry); | |
| 343 fz_rethrow(ctx); | |
| 344 } | |
| 345 | |
| 346 return fz_new_tree_archive(ctx, tree); | |
| 347 } |
