comparison mupdf-source/source/html/epub-doc.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "html-imp.h"
25
26 #include <string.h>
27 #include <math.h>
28
29 #include <zlib.h> /* for crc32 */
30
/* Indices into a four-entry margin array (used below on html->page_margin);
 * presumably top, right, bottom, left. */
enum { T = 0, R = 1, B = 2, L = 3 };

/* Forward declarations; both structures are defined further down. */
typedef struct epub_page epub_page;
typedef struct epub_chapter epub_chapter;
35
/* Cached per-chapter page counts, together with the layout settings
 * (page size, em, CSS checksum, document-CSS flag) they belong to. */
typedef struct
{
	/* Allocated length of pages_in_chapter. */
	int max_chapters;
	/* Number of leading entries of pages_in_chapter that are in use. */
	int num_chapters;
	float layout_w;
	float layout_h;
	float layout_em;
	uint32_t css_sum;
	int use_doc_css;
	/* Page count per chapter number; -1 marks an unknown count. */
	int *pages_in_chapter;
} epub_accelerator;
47
48 typedef struct
49 {
50 fz_document super;
51 fz_archive *zip;
52 fz_html_font_set *set;
53 int count;
54 epub_chapter *spine;
55 fz_outline *outline;
56 char *dc_title, *dc_creator;
57 float layout_w, layout_h, layout_em;
58 epub_accelerator *accel;
59 uint32_t css_sum;
60
61 /* A common pattern of use is for us to open a document,
62 * load a page, draw it, drop it, load the next page,
63 * draw it, drop it etc. This means that the HTML for
64 * a chapter might get thrown away between the drop and
65 * the the next load (if the chapter is large, and the
66 * store size is low). Accordingly, we store a handle
67 * to the most recently used html block here, thus
68 * ensuring that the stored copy won't be evicted. */
69 fz_html *most_recent_html;
70 } epub_document;
71
72 struct epub_chapter
73 {
74 epub_document *doc;
75 char *path;
76 int number;
77 epub_chapter *next;
78 };
79
80 struct epub_page
81 {
82 fz_page super;
83 epub_chapter *ch;
84 int number;
85 fz_html *html;
86 };
87
88 static uint32_t
89 user_css_sum(fz_context *ctx)
90 {
91 uint32_t sum = 0;
92 const char *css = fz_user_css(ctx);
93 sum = crc32(0, NULL, 0);
94 if (css)
95 sum = crc32(sum, (Byte*)css, (int)strlen(css));
96 return sum;
97 }
98
99 static int dummy = 1;
100
101 struct encrypted {
102 fz_archive super;
103 fz_archive *chain;
104 fz_tree *info;
105 };
106
107 static int has_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
108 {
109 struct encrypted *arch = (struct encrypted *)arch_;
110 return fz_has_archive_entry(ctx, arch->chain, name);
111 }
112
113 static fz_stream *open_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
114 {
115 struct encrypted *arch = (struct encrypted *)arch_;
116 if (fz_tree_lookup(ctx, arch->info, name))
117 return NULL;
118 return fz_open_archive_entry(ctx, arch->chain, name);
119 }
120
121 static fz_buffer *read_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
122 {
123 struct encrypted *arch = (struct encrypted *)arch_;
124 if (fz_tree_lookup(ctx, arch->info, name))
125 return NULL;
126 return fz_read_archive_entry(ctx, arch->chain, name);
127 }
128
129 static void drop_encrypted_archive(fz_context *ctx, fz_archive *arch_)
130 {
131 struct encrypted *arch = (struct encrypted *)arch_;
132 fz_drop_tree(ctx, arch->info, NULL);
133 fz_drop_archive(ctx, arch->chain);
134 }
135
136 static fz_archive *new_encrypted_archive(fz_context *ctx, fz_archive *chain, fz_tree *info)
137 {
138 struct encrypted *arch;
139
140 arch = fz_new_derived_archive(ctx, NULL, struct encrypted);
141 arch->super.format = "encrypted";
142 arch->super.has_entry = has_encrypted_entry;
143 arch->super.read_entry = read_encrypted_entry;
144 arch->super.open_entry = open_encrypted_entry;
145 arch->super.drop_archive = drop_encrypted_archive;
146 arch->chain = chain;
147 arch->info = info;
148
149 return &arch->super;
150 }
151
152 static void
153 epub_parse_encryption(fz_context *ctx, epub_document *doc, fz_xml *root)
154 {
155 fz_tree *info = NULL;
156 fz_xml *edata;
157
158 for (edata = fz_xml_find_down(root, "EncryptedData"); edata; edata = fz_xml_find_next(edata, "EncryptedData"))
159 {
160 fz_xml *cdata = fz_xml_find_down(edata, "CipherData");
161 fz_xml *cref = fz_xml_find_down(cdata, "CipherReference");
162 char *uri = fz_xml_att(cref, "URI");
163 if (uri)
164 {
165 // TODO: Support reading EncryptedKey and EncryptionMethod to decrypt content.
166 info = fz_tree_insert(ctx, info, uri, &dummy);
167 }
168 }
169
170 if (info)
171 {
172 doc->zip = new_encrypted_archive(ctx, doc->zip, info);
173 }
174 }
175
176 static fz_html *epub_get_laid_out_html(fz_context *ctx, epub_document *doc, epub_chapter *ch);
177
178 static int count_laid_out_pages(fz_html *html)
179 {
180 if (html->tree.root->s.layout.b > 0)
181 return ceilf(html->tree.root->s.layout.b / html->page_h);
182 return 1;
183 }
184
185 static void
186 invalidate_accelerator(fz_context *ctx, epub_accelerator *acc)
187 {
188 int i;
189
190 for (i = 0; i < acc->max_chapters; i++)
191 acc->pages_in_chapter[i] = -1;
192 }
193
194 static int count_chapter_pages(fz_context *ctx, epub_document *doc, epub_chapter *ch)
195 {
196 epub_accelerator *acc = doc->accel;
197 int use_doc_css = fz_use_document_css(ctx);
198
199 if (use_doc_css != acc->use_doc_css || doc->css_sum != acc->css_sum)
200 {
201 acc->use_doc_css = use_doc_css;
202 acc->css_sum = doc->css_sum;
203 invalidate_accelerator(ctx, acc);
204 }
205
206 if (ch->number < acc->num_chapters && acc->pages_in_chapter[ch->number] != -1)
207 return acc->pages_in_chapter[ch->number];
208
209 fz_drop_html(ctx, epub_get_laid_out_html(ctx, doc, ch));
210 return acc->pages_in_chapter[ch->number];
211 }
212
213 static fz_link_dest
214 epub_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest)
215 {
216 epub_document *doc = (epub_document*)doc_;
217 epub_chapter *ch;
218 int i;
219
220 const char *s = strchr(dest, '#');
221 size_t n = s ? (size_t)(s - dest) : strlen(dest);
222 if (s && s[1] == 0)
223 s = NULL;
224
225 for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
226 {
227 if (!strncmp(ch->path, dest, n) && ch->path[n] == 0)
228 {
229 if (s)
230 {
231 float y;
232 fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
233 int ph = html->page_h;
234
235 /* Search for a matching fragment */
236 y = fz_find_html_target(ctx, html, s+1);
237 fz_drop_html(ctx, html);
238 if (y >= 0)
239 {
240 int page = y / ph;
241 return fz_make_link_dest_xyz(i, page, 0, y - page * ph, 0);
242 }
243 return fz_make_link_dest_none();
244 }
245 return fz_make_link_dest_xyz(i, 0, 0, 0, 0);
246 }
247 }
248
249 return fz_make_link_dest_none();
250 }
251
252 static void
253 epub_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em)
254 {
255 epub_document *doc = (epub_document*)doc_;
256 uint32_t css_sum = user_css_sum(ctx);
257 int use_doc_css = fz_use_document_css(ctx);
258
259 if (doc->layout_w == w && doc->layout_h == h && doc->layout_em == em && doc->css_sum == css_sum)
260 return;
261 doc->layout_w = w;
262 doc->layout_h = h;
263 doc->layout_em = em;
264
265 if (doc->accel == NULL)
266 return;
267
268 /* When we load the saved accelerator, doc->accel
269 * can be populated with different values than doc.
270 * This is really useful as doc starts out with the
271 * values being 0. If we've got the right values
272 * already, then don't bin the data! */
273 if (doc->accel->layout_w == w &&
274 doc->accel->layout_h == h &&
275 doc->accel->layout_em == em &&
276 doc->accel->use_doc_css == use_doc_css &&
277 doc->accel->css_sum == css_sum)
278 return;
279
280 doc->accel->layout_w = w;
281 doc->accel->layout_h = h;
282 doc->accel->layout_em = em;
283 doc->accel->use_doc_css = use_doc_css;
284 doc->accel->css_sum = css_sum;
285 invalidate_accelerator(ctx, doc->accel);
286 }
287
288 static int
289 epub_count_chapters(fz_context *ctx, fz_document *doc_)
290 {
291 epub_document *doc = (epub_document*)doc_;
292 epub_chapter *ch;
293 int count = 0;
294 for (ch = doc->spine; ch; ch = ch->next)
295 ++count;
296 return count;
297 }
298
299 static int
300 epub_count_pages(fz_context *ctx, fz_document *doc_, int chapter)
301 {
302 epub_document *doc = (epub_document*)doc_;
303 epub_chapter *ch;
304 int i;
305 for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
306 {
307 if (i == chapter)
308 {
309 return count_chapter_pages(ctx, doc, ch);
310 }
311 }
312 return 0;
313 }
314
/* Header words of a saved accelerator, written and read as little-endian
 * 32-bit integers in this order. */
/* Generic accelerator signature. */
#define MAGIC_ACCELERATOR 0xacce1e7a
/* EPUB-specific signature; its little-endian bytes are the ASCII "ePub". */
#define MAGIC_ACCEL_EPUB 0x62755065
/* Version word; a file carrying any other value is ignored. */
#define ACCEL_VERSION 0x00010001
318
319 static void epub_load_accelerator(fz_context *ctx, epub_document *doc, fz_stream *accel)
320 {
321 int v;
322 float w, h, em;
323 int num_chapters;
324 epub_accelerator *acc = NULL;
325 uint32_t css_sum;
326 int use_doc_css;
327 int make_new = (accel == NULL);
328
329 fz_var(acc);
330
331 if (accel)
332 {
333 /* Try to read the accelerator data. If we fail silently give up. */
334 fz_try(ctx)
335 {
336 v = fz_read_int32_le(ctx, accel);
337 if (v != (int32_t)MAGIC_ACCELERATOR)
338 {
339 make_new = 1;
340 break;
341 }
342
343 v = fz_read_int32_le(ctx, accel);
344 if (v != MAGIC_ACCEL_EPUB)
345 {
346 make_new = 1;
347 break;
348 }
349
350 v = fz_read_int32_le(ctx, accel);
351 if (v != ACCEL_VERSION)
352 {
353 make_new = 1;
354 break;
355 }
356
357 w = fz_read_float_le(ctx, accel);
358 h = fz_read_float_le(ctx, accel);
359 em = fz_read_float_le(ctx, accel);
360 css_sum = fz_read_uint32_le(ctx, accel);
361 use_doc_css = fz_read_int32_le(ctx, accel);
362
363 num_chapters = fz_read_int32_le(ctx, accel);
364 if (num_chapters <= 0)
365 {
366 make_new = 1;
367 break;
368 }
369
370 acc = fz_malloc_struct(ctx, epub_accelerator);
371 acc->pages_in_chapter = Memento_label(fz_malloc_array(ctx, num_chapters, int), "accel_pages_in_chapter");
372 acc->max_chapters = acc->num_chapters = num_chapters;
373 acc->layout_w = w;
374 acc->layout_h = h;
375 acc->layout_em = em;
376 acc->css_sum = css_sum;
377 acc->use_doc_css = use_doc_css;
378
379 for (v = 0; v < num_chapters; v++)
380 acc->pages_in_chapter[v] = fz_read_int32_le(ctx, accel);
381 }
382 fz_catch(ctx)
383 {
384 if (acc)
385 fz_free(ctx, acc->pages_in_chapter);
386 fz_free(ctx, acc);
387 /* Swallow the error and run unaccelerated */
388 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
389 fz_report_error(ctx);
390 make_new = 1;
391 }
392 }
393
394 /* If we aren't given an accelerator to load (or the one we're given
395 * is bad) create a blank stub and we can fill it out as we go. */
396 if (make_new)
397 {
398 acc = fz_malloc_struct(ctx, epub_accelerator);
399 acc->css_sum = doc->css_sum;
400 acc->use_doc_css = fz_use_document_css(ctx);
401 }
402
403 doc->accel = acc;
404 }
405
406 static void
407 accelerate_chapter(fz_context *ctx, epub_document *doc, epub_chapter *ch, fz_html *html)
408 {
409 epub_accelerator *acc = doc->accel;
410 int p = count_laid_out_pages(html);
411
412 if (ch->number < acc->num_chapters)
413 {
414 if (acc->pages_in_chapter[ch->number] != p && acc->pages_in_chapter[ch->number] != -1)
415 {
416 fz_warn(ctx, "Invalidating stale accelerator data.");
417 invalidate_accelerator(ctx, doc->accel);
418 }
419 acc->pages_in_chapter[ch->number] = p;
420 return;
421 }
422
423 if (ch->number >= acc->max_chapters)
424 {
425 int n = acc->max_chapters;
426 int i;
427 if (n == 0)
428 n = 4;
429 while (n <= ch->number)
430 n *= 2;
431
432 acc->pages_in_chapter = fz_realloc_array(ctx, acc->pages_in_chapter, n, int);
433 for (i = acc->max_chapters; i < n; i++)
434 acc->pages_in_chapter[i] = -1;
435 acc->max_chapters = n;
436 }
437 acc->pages_in_chapter[ch->number] = p;
438 if (acc->num_chapters < ch->number+1)
439 acc->num_chapters = ch->number+1;
440 }
441
442 static void
443 epub_drop_page(fz_context *ctx, fz_page *page_)
444 {
445 epub_page *page = (epub_page *)page_;
446 fz_drop_html(ctx, page->html);
447 }
448
449 static epub_chapter *
450 epub_load_chapter(fz_context *ctx, epub_document *doc, const char *path, int i)
451 {
452 epub_chapter *ch;
453
454 ch = fz_malloc_struct(ctx, epub_chapter);
455 fz_try(ctx)
456 {
457 ch->path = Memento_label(fz_strdup(ctx, path), "chapter_path");
458 ch->number = i;
459 }
460 fz_catch(ctx)
461 {
462 fz_free(ctx, ch);
463 fz_rethrow(ctx);
464 }
465
466 return ch;
467 }
468
469 static fz_html *
470 epub_parse_chapter(fz_context *ctx, epub_document *doc, epub_chapter *ch)
471 {
472 fz_archive *zip = doc->zip;
473 fz_buffer *buf;
474 char base_uri[2048];
475 fz_html *html;
476
477 /* Look for one we made earlier */
478 html = fz_find_html(ctx, doc, ch->number);
479 if (html)
480 return html;
481
482 fz_dirname(base_uri, ch->path, sizeof base_uri);
483
484 buf = fz_read_archive_entry(ctx, zip, ch->path);
485 fz_try(ctx)
486 html = fz_parse_html(ctx, doc->set, zip, base_uri, buf, fz_user_css(ctx), 1, 1, 0);
487 fz_always(ctx)
488 fz_drop_buffer(ctx, buf);
489 fz_catch(ctx)
490 fz_rethrow(ctx);
491
492 return fz_store_html(ctx, html, doc, ch->number);
493 }
494
495 static fz_html *
496 epub_get_laid_out_html(fz_context *ctx, epub_document *doc, epub_chapter *ch)
497 {
498 fz_html *html = epub_parse_chapter(ctx, doc, ch);
499 fz_try(ctx)
500 {
501 fz_layout_html(ctx, html, doc->layout_w, doc->layout_h, doc->layout_em);
502 accelerate_chapter(ctx, doc, ch, html);
503 }
504 fz_catch(ctx)
505 {
506 fz_drop_html(ctx, html);
507 fz_rethrow(ctx);
508 }
509
510 fz_drop_html(ctx, doc->most_recent_html);
511 doc->most_recent_html = fz_keep_html(ctx, html);
512
513 return html;
514 }
515
516 static fz_rect
517 epub_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box)
518 {
519 epub_document *doc = (epub_document*)page_->doc;
520 epub_page *page = (epub_page*)page_;
521 epub_chapter *ch = page->ch;
522 fz_rect bbox;
523 fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
524
525 bbox.x0 = 0;
526 bbox.y0 = 0;
527 bbox.x1 = html->page_w + html->page_margin[L] + html->page_margin[R];
528 bbox.y1 = html->page_h + html->page_margin[T] + html->page_margin[B];
529 fz_drop_html(ctx, html);
530 return bbox;
531 }
532
533 static void
534 epub_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie)
535 {
536 epub_page *page = (epub_page*)page_;
537
538 fz_draw_html(ctx, dev, ctm, page->html, page->number);
539 }
540
541 static fz_link *
542 epub_load_links(fz_context *ctx, fz_page *page_)
543 {
544 epub_page *page = (epub_page*)page_;
545 epub_chapter *ch = page->ch;
546
547 return fz_load_html_links(ctx, page->html, page->number, ch->path);
548 }
549
550 static fz_bookmark
551 epub_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc)
552 {
553 epub_document *doc = (epub_document*)doc_;
554 epub_chapter *ch;
555 int i;
556
557 for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
558 {
559 if (i == loc.chapter)
560 {
561 fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
562 fz_bookmark mark = fz_make_html_bookmark(ctx, html, loc.page);
563 fz_drop_html(ctx, html);
564 return mark;
565 }
566 }
567
568 return 0;
569 }
570
571 static fz_location
572 epub_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark)
573 {
574 epub_document *doc = (epub_document*)doc_;
575 epub_chapter *ch;
576 int i;
577
578 for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
579 {
580 fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
581 int p = fz_lookup_html_bookmark(ctx, html, mark);
582 fz_drop_html(ctx, html);
583 if (p != -1)
584 return fz_make_location(i, p);
585 }
586 return fz_make_location(-1, -1);
587 }
588
589 static fz_page *
590 epub_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number)
591 {
592 epub_document *doc = (epub_document*)doc_;
593 epub_chapter *ch;
594 int i;
595
596 if (chapter < 0)
597 fz_throw(ctx, FZ_ERROR_ARGUMENT, "invalid chapter number: %d", chapter);
598 if (number < 0)
599 fz_throw(ctx, FZ_ERROR_ARGUMENT, "invalid page number: %d", number);
600
601 for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
602 {
603 if (i == chapter)
604 {
605 epub_page *page = fz_new_derived_page(ctx, epub_page, doc_);
606 page->super.bound_page = epub_bound_page;
607 page->super.run_page_contents = epub_run_page;
608 page->super.load_links = epub_load_links;
609 page->super.drop_page = epub_drop_page;
610 page->ch = ch;
611 page->number = number;
612 page->html = epub_get_laid_out_html(ctx, doc, ch);
613 return (fz_page*)page;
614 }
615 }
616 return NULL;
617 }
618
619 static void
620 epub_page_label(fz_context *ctx, fz_document *doc_, int chapter, int number, char *buf, size_t size)
621 {
622 fz_snprintf(buf, size, "ch. %d, p. %d", chapter+1, number+1);
623 }
624
625 static void
626 epub_drop_accelerator(fz_context *ctx, epub_accelerator *acc)
627 {
628 if (acc == NULL)
629 return;
630
631 fz_free(ctx, acc->pages_in_chapter);
632 fz_free(ctx, acc);
633 }
634
635 static void
636 epub_drop_document(fz_context *ctx, fz_document *doc_)
637 {
638 epub_document *doc = (epub_document*)doc_;
639 epub_chapter *ch, *next;
640 ch = doc->spine;
641 while (ch)
642 {
643 next = ch->next;
644 fz_free(ctx, ch->path);
645 fz_free(ctx, ch);
646 ch = next;
647 }
648 epub_drop_accelerator(ctx, doc->accel);
649 fz_drop_archive(ctx, doc->zip);
650 fz_drop_html_font_set(ctx, doc->set);
651 fz_drop_outline(ctx, doc->outline);
652 fz_free(ctx, doc->dc_title);
653 fz_free(ctx, doc->dc_creator);
654 fz_drop_html(ctx, doc->most_recent_html);
655 fz_purge_stored_html(ctx, doc);
656 }
657
658 static const char *
659 rel_path_from_idref(fz_xml *manifest, const char *idref)
660 {
661 fz_xml *item;
662 if (!idref)
663 return NULL;
664 item = fz_xml_find_down(manifest, "item");
665 while (item)
666 {
667 const char *id = fz_xml_att(item, "id");
668 if (id && !strcmp(id, idref))
669 return fz_xml_att(item, "href");
670 item = fz_xml_find_next(item, "item");
671 }
672 return NULL;
673 }
674
675 static const char *
676 path_from_idref(char *path, fz_xml *manifest, const char *base_uri, const char *idref, int n)
677 {
678 const char *rel_path = rel_path_from_idref(manifest, idref);
679 if (!rel_path)
680 {
681 path[0] = 0;
682 return NULL;
683 }
684 fz_strlcpy(path, base_uri, n);
685 fz_strlcat(path, "/", n);
686 fz_strlcat(path, rel_path, n);
687 return fz_cleanname(fz_urldecode(path));
688 }
689
690 static fz_outline *
691 epub_parse_ncx_imp(fz_context *ctx, epub_document *doc, fz_xml *node, char *base_uri)
692 {
693 char path[2048];
694 fz_outline *outline, *head, **tailp;
695
696 head = NULL;
697 tailp = &head;
698
699 node = fz_xml_find_down(node, "navPoint");
700 while (node)
701 {
702 char *text = fz_xml_text(fz_xml_down(fz_xml_find_down(fz_xml_find_down(node, "navLabel"), "text")));
703 char *content = fz_xml_att(fz_xml_find_down(node, "content"), "src");
704 if (text && content)
705 {
706 fz_strlcpy(path, base_uri, sizeof path);
707 fz_strlcat(path, "/", sizeof path);
708 fz_strlcat(path, content, sizeof path);
709 fz_urldecode(path);
710 fz_cleanname(path);
711
712 fz_try(ctx)
713 {
714 *tailp = outline = fz_new_outline(ctx);
715 tailp = &(*tailp)->next;
716 outline->title = Memento_label(fz_strdup(ctx, text), "outline_title");
717 outline->uri = Memento_label(fz_strdup(ctx, path), "outline_uri");
718 outline->page = fz_make_location(-1, -1);
719 outline->down = epub_parse_ncx_imp(ctx, doc, node, base_uri);
720 outline->is_open = 1;
721 }
722 fz_catch(ctx)
723 {
724 fz_drop_outline(ctx, head);
725 fz_rethrow(ctx);
726 }
727 }
728 node = fz_xml_find_next(node, "navPoint");
729 }
730
731 return head;
732 }
733
734 static void
735 epub_parse_ncx(fz_context *ctx, epub_document *doc, const char *path)
736 {
737 fz_archive *zip = doc->zip;
738 fz_buffer *buf = NULL;
739 fz_xml_doc *ncx = NULL;
740 char base_uri[2048];
741
742 fz_var(buf);
743 fz_var(ncx);
744
745 fz_try(ctx)
746 {
747 fz_dirname(base_uri, path, sizeof base_uri);
748 buf = fz_read_archive_entry(ctx, zip, path);
749 ncx = fz_parse_xml(ctx, buf, 0);
750 doc->outline = epub_parse_ncx_imp(ctx, doc, fz_xml_find_down(fz_xml_root(ncx), "navMap"), base_uri);
751 }
752 fz_always(ctx)
753 {
754 fz_drop_buffer(ctx, buf);
755 fz_drop_xml(ctx, ncx);
756 }
757 fz_catch(ctx)
758 fz_rethrow(ctx);
759 }
760
761 static char *
762 find_metadata(fz_context *ctx, fz_xml *metadata, char *key)
763 {
764 char *text = fz_xml_text(fz_xml_down(fz_xml_find_down(metadata, key)));
765 if (text)
766 return fz_strdup(ctx, text);
767 return NULL;
768 }
769
770 static fz_buffer *
771 read_container_and_prefix(fz_context *ctx, fz_archive *zip, char *prefix, size_t prefix_len)
772 {
773 int n = fz_count_archive_entries(ctx, zip);
774 int i;
775
776 prefix[0] = 0;
777
778 /* First off, look for the container.xml at the top level. */
779 for (i = 0; i < n; i++)
780 {
781 const char *p = fz_list_archive_entry(ctx, zip, i);
782
783 if (!strcmp(p, "META-INF/container.xml"))
784 return fz_read_archive_entry(ctx, zip, "META-INF/container.xml");
785 }
786
787 /* If that failed, look for the first such file in a subdirectory. */
788 for (i = 0; i < n; i++)
789 {
790 const char *p = fz_list_archive_entry(ctx, zip, i);
791 size_t z = strlen(p);
792 size_t z0 = sizeof("META-INF/container.xml")-1;
793
794 if (z < z0)
795 continue;
796 if (!strcmp(p + z - z0, "META-INF/container.xml"))
797 {
798 if (z - z0 >= prefix_len)
799 {
800 fz_warn(ctx, "Ignoring %s as path too long.", p);
801 continue;
802 }
803 memcpy(prefix, p, z-z0);
804 prefix[z-z0] = 0;
805 return fz_read_archive_entry(ctx, zip, p);
806 }
807 }
808
809 return fz_read_archive_entry(ctx, zip, "META-INF/container.xml");
810 }
811
812 static void
813 epub_parse_header(fz_context *ctx, epub_document *doc)
814 {
815 fz_archive *zip = doc->zip;
816 fz_buffer *buf = NULL;
817 fz_xml_doc *encryption_xml = NULL;
818 fz_xml_doc *container_xml = NULL;
819 fz_xml_doc *content_opf = NULL;
820 fz_xml *container, *rootfiles, *rootfile;
821 fz_xml *package, *manifest, *spine, *itemref, *metadata;
822 char base_uri[2048];
823 const char *full_path;
824 const char *version;
825 char ncx[2048], s[2048];
826 char *prefixed_full_path = NULL;
827 size_t prefix_len;
828 epub_chapter **tailp;
829 int i;
830
831 fz_var(buf);
832 fz_var(encryption_xml);
833 fz_var(container_xml);
834 fz_var(content_opf);
835 fz_var(prefixed_full_path);
836
837 fz_try(ctx)
838 {
839 /* parse META-INF/encryption.xml to figure out which entries are encrypted */
840
841 /* parse META-INF/container.xml to find OPF */
842 /* Reuse base_uri to read the prefix. */
843 buf = read_container_and_prefix(ctx, zip, base_uri, sizeof(base_uri));
844 container_xml = fz_parse_xml(ctx, buf, 0);
845 fz_drop_buffer(ctx, buf);
846 buf = NULL;
847
848 /* Some epub files can be prefixed by a directory name. This (normally
849 * empty!) will be in base_uri. */
850 prefix_len = strlen(base_uri);
851 {
852 /* Further abuse base_uri to hold a temporary name. */
853 const size_t z0 = sizeof("META-INF/encryption.xml")-1;
854 if (sizeof(base_uri) <= prefix_len + z0)
855 fz_throw(ctx, FZ_ERROR_FORMAT, "Prefix too long in epub");
856 strcpy(base_uri + prefix_len, "META-INF/encryption.xml");
857 if (fz_has_archive_entry(ctx, zip, base_uri))
858 {
859 fz_warn(ctx, "EPUB may be locked by DRM");
860
861 buf = fz_read_archive_entry(ctx, zip, base_uri);
862 encryption_xml = fz_parse_xml(ctx, buf, 0);
863 fz_drop_buffer(ctx, buf);
864 buf = NULL;
865
866 epub_parse_encryption(ctx, doc, fz_xml_find(fz_xml_root(encryption_xml), "encryption"));
867 zip = doc->zip;
868 }
869 }
870
871 container = fz_xml_find(fz_xml_root(container_xml), "container");
872 rootfiles = fz_xml_find_down(container, "rootfiles");
873 rootfile = fz_xml_find_down(rootfiles, "rootfile");
874 full_path = fz_xml_att(rootfile, "full-path");
875 if (!full_path)
876 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find root file in EPUB");
877
878 fz_dirname(base_uri+prefix_len, full_path, sizeof(base_uri) - prefix_len);
879
880 prefixed_full_path = fz_malloc(ctx, strlen(full_path) + prefix_len + 1);
881 memcpy(prefixed_full_path, base_uri, prefix_len);
882 strcpy(prefixed_full_path + prefix_len, full_path);
883
884 /* parse OPF to find NCX and spine */
885
886 buf = fz_read_archive_entry(ctx, zip, prefixed_full_path);
887 content_opf = fz_parse_xml(ctx, buf, 0);
888 fz_drop_buffer(ctx, buf);
889 buf = NULL;
890
891 package = fz_xml_find(fz_xml_root(content_opf), "package");
892 version = fz_xml_att(package, "version");
893 if (!version || strcmp(version, "2.0"))
894 fz_warn(ctx, "unknown epub version: %s", version ? version : "<none>");
895
896 metadata = fz_xml_find_down(package, "metadata");
897 if (metadata)
898 {
899 doc->dc_title = Memento_label(find_metadata(ctx, metadata, "title"), "epub_title");
900 doc->dc_creator = Memento_label(find_metadata(ctx, metadata, "creator"), "epub_creator");
901 }
902
903 manifest = fz_xml_find_down(package, "manifest");
904 spine = fz_xml_find_down(package, "spine");
905
906 if (path_from_idref(ncx, manifest, base_uri, fz_xml_att(spine, "toc"), sizeof ncx))
907 {
908 epub_parse_ncx(ctx, doc, ncx);
909 }
910
911 doc->spine = NULL;
912 tailp = &doc->spine;
913 itemref = fz_xml_find_down(spine, "itemref");
914 i = 0;
915 while (itemref)
916 {
917 if (path_from_idref(s, manifest, base_uri, fz_xml_att(itemref, "idref"), sizeof s))
918 {
919 fz_try(ctx)
920 {
921 *tailp = epub_load_chapter(ctx, doc, s, i);
922 tailp = &(*tailp)->next;
923 i++;
924 }
925 fz_catch(ctx)
926 {
927 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
928 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
929 fz_report_error(ctx);
930 fz_warn(ctx, "ignoring chapter %s", s);
931 }
932 }
933 itemref = fz_xml_find_next(itemref, "itemref");
934 }
935 }
936 fz_always(ctx)
937 {
938 fz_drop_xml(ctx, content_opf);
939 fz_drop_xml(ctx, container_xml);
940 fz_drop_xml(ctx, encryption_xml);
941 fz_drop_buffer(ctx, buf);
942 fz_free(ctx, prefixed_full_path);
943 }
944 fz_catch(ctx)
945 fz_rethrow(ctx);
946 }
947
948 static fz_outline *
949 epub_load_outline(fz_context *ctx, fz_document *doc_)
950 {
951 epub_document *doc = (epub_document*)doc_;
952 return fz_keep_outline(ctx, doc->outline);
953 }
954
955 static int
956 epub_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size)
957 {
958 epub_document *doc = (epub_document*)doc_;
959 if (!strcmp(key, FZ_META_FORMAT))
960 return 1 + (int)fz_strlcpy(buf, "EPUB", size);
961 if (!strcmp(key, FZ_META_INFO_TITLE) && doc->dc_title)
962 return 1 + (int)fz_strlcpy(buf, doc->dc_title, size);
963 if (!strcmp(key, FZ_META_INFO_AUTHOR) && doc->dc_creator)
964 return 1 + (int)fz_strlcpy(buf, doc->dc_creator, size);
965 return -1;
966 }
967
968 static void
969 epub_output_accelerator(fz_context *ctx, fz_document *doc_, fz_output *out)
970 {
971 epub_document *doc = (epub_document*)doc_;
972 int i;
973
974 fz_try(ctx)
975 {
976 if (doc->accel == NULL)
977 fz_throw(ctx, FZ_ERROR_ARGUMENT, "No accelerator data to write");
978
979 fz_write_int32_le(ctx, out, MAGIC_ACCELERATOR);
980 fz_write_int32_le(ctx, out, MAGIC_ACCEL_EPUB);
981 fz_write_int32_le(ctx, out, ACCEL_VERSION);
982 fz_write_float_le(ctx, out, doc->accel->layout_w);
983 fz_write_float_le(ctx, out, doc->accel->layout_h);
984 fz_write_float_le(ctx, out, doc->accel->layout_em);
985 fz_write_uint32_le(ctx, out, doc->accel->css_sum);
986 fz_write_int32_le(ctx, out, doc->accel->use_doc_css);
987 fz_write_int32_le(ctx, out, doc->accel->num_chapters);
988 for (i = 0; i < doc->accel->num_chapters; i++)
989 fz_write_int32_le(ctx, out, doc->accel->pages_in_chapter[i]);
990
991 fz_close_output(ctx, out);
992 }
993 fz_always(ctx)
994 fz_drop_output(ctx, out);
995 fz_catch(ctx)
996 fz_rethrow(ctx);
997 }
998
999 /* Takes ownership of zip. Will always eventually drop it.
1000 * Never takes ownership of accel. */
1001 static fz_document *
1002 epub_init(fz_context *ctx, fz_archive *zip, fz_stream *accel)
1003 {
1004 epub_document *doc = NULL;
1005
1006 fz_var(doc);
1007 fz_var(zip);
1008
1009 fz_try(ctx)
1010 {
1011 doc = fz_new_derived_document(ctx, epub_document);
1012 doc->zip = zip;
1013 zip = NULL;
1014
1015 doc->super.drop_document = epub_drop_document;
1016 doc->super.layout = epub_layout;
1017 doc->super.load_outline = epub_load_outline;
1018 doc->super.resolve_link_dest = epub_resolve_link;
1019 doc->super.make_bookmark = epub_make_bookmark;
1020 doc->super.lookup_bookmark = epub_lookup_bookmark;
1021 doc->super.count_chapters = epub_count_chapters;
1022 doc->super.count_pages = epub_count_pages;
1023 doc->super.load_page = epub_load_page;
1024 doc->super.page_label = epub_page_label;
1025 doc->super.lookup_metadata = epub_lookup_metadata;
1026 doc->super.output_accelerator = epub_output_accelerator;
1027 doc->super.is_reflowable = 1;
1028
1029 doc->set = fz_new_html_font_set(ctx);
1030 doc->css_sum = user_css_sum(ctx);
1031 epub_load_accelerator(ctx, doc, accel);
1032 epub_parse_header(ctx, doc);
1033 }
1034 fz_catch(ctx)
1035 {
1036 fz_drop_archive(ctx, zip);
1037 fz_drop_document(ctx, &doc->super);
1038 fz_rethrow(ctx);
1039 }
1040
1041 return (fz_document*)doc;
1042 }
1043
1044 static fz_document *
1045 epub_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
1046 {
1047 fz_stream *file2 = NULL;
1048 fz_document *doc;
1049 fz_archive *zip = NULL;
1050
1051 if (file == NULL)
1052 {
1053 /* Directory case: file == NULL and dir == the directory. */
1054 if (fz_has_archive_entry(ctx, dir, "META-INF/container.xml"))
1055 file2 = file = fz_open_archive_entry(ctx, dir, "META-INF/container.xml");
1056 else
1057 file2 = file = fz_open_archive_entry(ctx, dir, "META-INF\\container.xml");
1058 if (file == NULL)
1059 fz_throw(ctx, FZ_ERROR_FORMAT, "Not an epub file");
1060 zip = fz_keep_archive(ctx, dir);
1061 }
1062 else
1063 {
1064 /* File case: file != NULL and dir can be ignored. */
1065 zip = fz_open_archive_with_stream(ctx, file);
1066 }
1067
1068
1069 fz_try(ctx)
1070 doc = epub_init(ctx, zip, file);
1071 fz_always(ctx)
1072 fz_drop_stream(ctx, file2);
1073 fz_catch(ctx)
1074 fz_rethrow(ctx);
1075
1076 return doc;
1077 }
1078
1079 static int
1080 epub_recognize(fz_context *doc, const fz_document_handler *handler, const char *magic)
1081 {
1082 if (strstr(magic, "META-INF/container.xml") || strstr(magic, "META-INF\\container.xml"))
1083 return 200;
1084 return 0;
1085 }
1086
1087 static int
1088 epub_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
1089 {
1090 fz_archive *arch = NULL;
1091 int ret = 0;
1092
1093 fz_var(arch);
1094 fz_var(ret);
1095
1096 if (state)
1097 *state = NULL;
1098 if (free_state)
1099 *free_state = NULL;
1100
1101 fz_try(ctx)
1102 {
1103 if (stream == NULL)
1104 arch = fz_keep_archive(ctx, dir);
1105 else
1106 {
1107 arch = fz_try_open_archive_with_stream(ctx, stream);
1108 if (arch == NULL)
1109 break;
1110 }
1111
1112 if (fz_has_archive_entry(ctx, arch, "META-INF/container.xml") ||
1113 fz_has_archive_entry(ctx, arch, "META-INF\\container.xml"))
1114 ret = 74; /* One less than the 75 that HWPX files are detected as. */
1115 }
1116 fz_always(ctx)
1117 fz_drop_archive(ctx, arch);
1118 fz_catch(ctx)
1119 fz_rethrow(ctx);
1120
1121 return ret;
1122 }
1123
1124 static const char *epub_extensions[] =
1125 {
1126 "epub",
1127 NULL
1128 };
1129
1130 static const char *epub_mimetypes[] =
1131 {
1132 "application/epub+zip",
1133 NULL
1134 };
1135
1136 fz_document_handler epub_document_handler =
1137 {
1138 epub_recognize,
1139 epub_open_document,
1140 epub_extensions,
1141 epub_mimetypes,
1142 epub_recognize_content
1143 };