comparison mupdf-source/source/html/mobi.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "html-imp.h"
25
26 #include <string.h>
27
/* Flavour of the decoded text: markup to be used as-is, or plain text that must be wrapped. */
enum
{
	FORMAT_HTML = 1,
	FORMAT_TEXT = 2
};

/* Values compared against the 16-bit compression field read from the PalmDOC header. */
enum
{
	COMPRESSION_NONE = 1,
	COMPRESSION_PALMDOC = 2,
	COMPRESSION_HUFF_CDIC = 17480
};

/* Values compared against the 32-bit text encoding field read from the MOBI header. */
enum
{
	TEXT_ENCODING_LATIN_1 = 0,
	TEXT_ENCODING_1252 = 1252,
	TEXT_ENCODING_UTF8 = 65001
};
38
39 static void
40 skip_bytes(fz_context *ctx, fz_stream *stm, size_t len)
41 {
42 size_t skipped = fz_skip(ctx, stm, len);
43 if (skipped < len)
44 fz_throw(ctx, FZ_ERROR_FORMAT, "premature end in data");
45 }
46
47 static void
48 mobi_read_text_none(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
49 {
50 unsigned char buf[4096];
51 size_t n;
52 if (size > 4096)
53 fz_throw(ctx, FZ_ERROR_FORMAT, "text block too large");
54 n = fz_read(ctx, stm, buf, size);
55 if (n < size)
56 fz_warn(ctx, "premature end in mobi uncompressed text data");
57 fz_append_data(ctx, out, buf, n);
58 }
59
60 static void
61 mobi_read_text_palmdoc(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
62 {
63 // https://wiki.mobileread.com/wiki/PalmDOC
64 size_t end = out->len + size;
65 while (out->len < end)
66 {
67 int c = fz_read_byte(ctx, stm);
68 if (c == EOF)
69 break;
70 if (c >= 0x01 && c <= 0x08)
71 {
72 unsigned char buf[8];
73 size_t n = fz_read(ctx, stm, buf, c);
74 fz_append_data(ctx, out, buf, n);
75 if (n < (size_t) c)
76 break;
77 }
78 else if (c <= 0x7f)
79 {
80 fz_append_byte(ctx, out, c);
81 }
82 else if (c >= 0x80 && c <= 0xbf)
83 {
84 int cc, x, distance, length;
85 cc = fz_read_byte(ctx, stm);
86 if (cc == EOF)
87 break;
88 x = (c << 8) | cc;
89 distance = (x >> 3) & 0x7ff;
90 length = (x & 7) + 3;
91 if (distance > 0 && (size_t)distance <= out->len)
92 {
93 int i;
94 int p = (int)(out->len - distance);
95 for (i = 0; i < length; ++i)
96 fz_append_byte(ctx, out, out->data[p + i]);
97 }
98 }
99 else if (c >= 0xc0 && c <= 0xff)
100 {
101 fz_append_byte(ctx, out, ' ');
102 fz_append_byte(ctx, out, c ^ 0x80);
103 }
104 }
105
106 if (out->len < end)
107 fz_warn(ctx, "premature end in mobi palmdoc data");
108 }
109
110 static uint32_t
111 mobi_read_data(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t *offset, uint32_t total_count, int format)
112 {
113 // https://wiki.mobileread.com/wiki/MOBI
114 uint32_t compression, text_length, record_count, text_encoding, i;
115 unsigned char buf[4];
116 fz_range range = { 0 };
117 fz_stream *rec = NULL;
118 size_t n;
119
120 fz_var(rec);
121
122 fz_try(ctx)
123 {
124 range.offset = offset[0];
125 range.length = offset[1] - offset[0];
126 rec = fz_open_range_filter(ctx, stm, &range, 1);
127
128 // PalmDOC header
129 compression = fz_read_uint16(ctx, rec);
130 skip_bytes(ctx, rec, 2);
131 text_length = fz_read_uint32(ctx, rec);
132 record_count = fz_read_uint16(ctx, rec);
133 skip_bytes(ctx, rec, 2);
134 skip_bytes(ctx, rec, 2); // encryption
135 skip_bytes(ctx, rec, 2);
136
137 // Optional MOBI header
138 text_encoding = TEXT_ENCODING_LATIN_1;
139 n = fz_read(ctx, rec, buf, 4);
140 if (n == 4 && !memcmp(buf, "MOBI", 4))
141 {
142 skip_bytes(ctx, rec, 4);
143 skip_bytes(ctx, rec, 4);
144 text_encoding = fz_read_uint32(ctx, rec);
145 }
146 }
147 fz_always(ctx)
148 fz_drop_stream(ctx, rec);
149 fz_catch(ctx)
150 fz_rethrow(ctx);
151
152 if (compression != COMPRESSION_NONE && compression != COMPRESSION_PALMDOC)
153 fz_throw(ctx, FZ_ERROR_FORMAT, "unknown compression method");
154 if (text_encoding != TEXT_ENCODING_LATIN_1 &&
155 text_encoding != TEXT_ENCODING_1252 &&
156 text_encoding != TEXT_ENCODING_UTF8)
157 fz_throw(ctx, FZ_ERROR_FORMAT, "unknown text encoding");
158
159 for (i = 1; i <= record_count && i < total_count; ++i)
160 {
161 uint32_t remain = text_length - (uint32_t)out->len;
162 uint32_t size = remain < 4096 ? remain : 4096;
163
164 fz_try(ctx)
165 {
166 range.offset = offset[i];
167 range.length = offset[i + 1] - offset[i];
168 rec = fz_open_range_filter(ctx, stm, &range, 1);
169
170 if (compression == COMPRESSION_NONE)
171 mobi_read_text_none(ctx, out, rec, size);
172 else
173 mobi_read_text_palmdoc(ctx, out, rec, size);
174 }
175 fz_always(ctx)
176 fz_drop_stream(ctx, rec);
177 fz_catch(ctx)
178 fz_rethrow(ctx);
179 }
180
181 if (format == FORMAT_TEXT && out->len > 6)
182 {
183 if (!memcmp(out->data, "<html>", 6) || !memcmp(out->data, "<HTML>", 6))
184 format = FORMAT_HTML;
185 }
186
187 if (text_encoding != TEXT_ENCODING_UTF8 || format == FORMAT_TEXT)
188 {
189 unsigned char *p;
190 size_t j, z = fz_buffer_extract(ctx, out, &p);
191 fz_resize_buffer(ctx, out, 0);
192 if (format == FORMAT_TEXT)
193 fz_append_string(ctx, out, "<html><head><style>body{white-space:pre-wrap}</style></head><body>");
194 for (j = 0; j < z; ++j)
195 {
196 int c = p[j];
197 if (format == FORMAT_TEXT && (c == '<' || c == '>' || c == '&'))
198 {
199 if (c == '<')
200 fz_append_string(ctx, out, "&lt;");
201 else if (c == '>')
202 fz_append_string(ctx, out, "&gt;");
203 else if (c == '&')
204 fz_append_string(ctx, out, "&amp;");
205 }
206 else
207 {
208 switch (text_encoding)
209 {
210 case TEXT_ENCODING_UTF8:
211 fz_append_byte(ctx, out, c);
212 break;
213 case TEXT_ENCODING_LATIN_1:
214 fz_append_rune(ctx, out, c);
215 break;
216 case TEXT_ENCODING_1252:
217 fz_append_rune(ctx, out, fz_unicode_from_windows_1252[c]);
218 break;
219 }
220 }
221 }
222 if (format == FORMAT_TEXT)
223 fz_append_string(ctx, out, "</body></html>");
224 fz_free(ctx, p);
225 }
226
227 return record_count;
228 }
229
230 static void drop_tree_entry(fz_context *ctx, void *ent)
231 {
232 fz_drop_buffer(ctx, ent);
233 }
234
235 fz_archive *
236 fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi)
237 {
238 fz_stream *stm = NULL;
239 fz_buffer *buffer = NULL;
240 fz_tree *tree = NULL;
241 uint32_t *offsets = NULL;
242 char buf[32];
243 uint32_t i, k, extra;
244 uint32_t recindex;
245 uint32_t minoffset, maxoffset;
246 int format = FORMAT_TEXT;
247 size_t n;
248
249 // https://wiki.mobileread.com/wiki/PalmDOC
250
251 fz_var(stm);
252 fz_var(buffer);
253 fz_var(offsets);
254 fz_var(tree);
255
256 fz_try(ctx)
257 {
258 stm = fz_open_buffer(ctx, mobi);
259
260 skip_bytes(ctx, stm, 32); // database name
261 skip_bytes(ctx, stm, 28); // database attributes, version, dates, etc
262
263 n = fz_read(ctx, stm, (unsigned char *)buf, 8); // database type and creator
264 buf[8] = 0;
265
266 if (n == 8 && !memcmp(buf, "BOOKMOBI", 8))
267 format = FORMAT_HTML;
268 else if (n == 8 && !memcmp(buf, "TEXtREAd", 8))
269 format = FORMAT_TEXT;
270 else if (n != 8)
271 fz_warn(ctx, "premature end in data");
272 else
273 fz_warn(ctx, "Unknown MOBI/PRC format: %s.", buf);
274
275 skip_bytes(ctx, stm, 8); // database internal fields
276
277 // record info list count
278 n = fz_read_uint16(ctx, stm);
279
280 minoffset = (uint32_t)(fz_tell(ctx, stm) + n * 2 * sizeof (uint32_t) - 1);
281 maxoffset = (uint32_t)mobi->len;
282
283 // record info list
284 offsets = fz_malloc_array(ctx, n + 1, uint32_t);
285 for (i = 0, k = 0; i < n; ++i)
286 {
287 uint32_t offset = fz_read_uint32(ctx, stm);
288 if (offset <= minoffset)
289 continue;
290 if (offset >= maxoffset)
291 continue;
292 minoffset = offsets[k++] = offset;
293 skip_bytes(ctx, stm, 4);
294 }
295 offsets[k] = (uint32_t)mobi->len;
296
297 // adjust n in case some out of bound offsets were skipped
298 n = k;
299 if (n == 0)
300 fz_throw(ctx, FZ_ERROR_FORMAT, "no mobi records to read");
301
302 // decompress text data
303 buffer = fz_new_buffer(ctx, 128 << 10);
304 extra = mobi_read_data(ctx, buffer, stm, offsets, (uint32_t)n, format);
305 fz_terminate_buffer(ctx, buffer);
306
307 #ifndef NDEBUG
308 if (fz_atoi(getenv("FZ_DEBUG_MOBI")))
309 fz_save_buffer(ctx, buffer, "mobi.xhtml");
310 #endif
311
312 tree = fz_tree_insert(ctx, tree, "index.html", buffer);
313 buffer = NULL;
314
315 // copy image data records into tree
316 recindex = 1;
317 for (i = extra; i < n; ++i)
318 {
319 uint32_t size = offsets[i+1] - offsets[i];
320 if (size > 8)
321 {
322 unsigned char *data = mobi->data + offsets[i];
323 if (fz_recognize_image_format(ctx, data))
324 {
325 buffer = fz_new_buffer_from_copied_data(ctx, data, size);
326 fz_snprintf(buf, sizeof buf, "%05d", recindex);
327 tree = fz_tree_insert(ctx, tree, buf, buffer);
328 buffer = NULL;
329 recindex++;
330 }
331 }
332 }
333 }
334 fz_always(ctx)
335 {
336 fz_drop_stream(ctx, stm);
337 fz_free(ctx, offsets);
338 }
339 fz_catch(ctx)
340 {
341 fz_drop_buffer(ctx, buffer);
342 fz_drop_tree(ctx, tree, drop_tree_entry);
343 fz_rethrow(ctx);
344 }
345
346 return fz_new_tree_archive(ctx, tree);
347 }