Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/html/epub-doc.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/html/epub-doc.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1143 @@ +// Copyright (C) 2004-2025 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" +#include "html-imp.h" + +#include <string.h> +#include <math.h> + +#include <zlib.h> /* for crc32 */ + +enum { T, R, B, L }; + +typedef struct epub_chapter epub_chapter; +typedef struct epub_page epub_page; + +typedef struct +{ + int max_chapters; + int num_chapters; + float layout_w; + float layout_h; + float layout_em; + uint32_t css_sum; + int use_doc_css; + int *pages_in_chapter; +} epub_accelerator; + +typedef struct +{ + fz_document super; + fz_archive *zip; + fz_html_font_set *set; + int count; + epub_chapter *spine; + fz_outline *outline; + char *dc_title, *dc_creator; + float layout_w, layout_h, layout_em; + epub_accelerator *accel; + uint32_t css_sum; + + /* A common pattern of use is for us to open a document, + * load a page, draw it, drop it, load the next page, + * draw it, drop it etc. This means that the HTML for + * a chapter might get thrown away between the drop and + * the the next load (if the chapter is large, and the + * store size is low). Accordingly, we store a handle + * to the most recently used html block here, thus + * ensuring that the stored copy won't be evicted. */ + fz_html *most_recent_html; +} epub_document; + +struct epub_chapter +{ + epub_document *doc; + char *path; + int number; + epub_chapter *next; +}; + +struct epub_page +{ + fz_page super; + epub_chapter *ch; + int number; + fz_html *html; +}; + +static uint32_t +user_css_sum(fz_context *ctx) +{ + uint32_t sum = 0; + const char *css = fz_user_css(ctx); + sum = crc32(0, NULL, 0); + if (css) + sum = crc32(sum, (Byte*)css, (int)strlen(css)); + return sum; +} + +static int dummy = 1; + +struct encrypted { + fz_archive super; + fz_archive *chain; + fz_tree *info; +}; + +static int has_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name) +{ + struct encrypted *arch = (struct encrypted *)arch_; + return fz_has_archive_entry(ctx, arch->chain, name); +} + +static fz_stream *open_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name) +{ + struct encrypted *arch = (struct encrypted *)arch_; + if (fz_tree_lookup(ctx, arch->info, name)) + return NULL; + return fz_open_archive_entry(ctx, arch->chain, name); +} + +static fz_buffer *read_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name) +{ + struct encrypted *arch = (struct encrypted *)arch_; + if (fz_tree_lookup(ctx, arch->info, name)) + return NULL; + return fz_read_archive_entry(ctx, arch->chain, name); +} + +static void drop_encrypted_archive(fz_context *ctx, fz_archive *arch_) +{ + struct encrypted *arch = (struct encrypted *)arch_; + fz_drop_tree(ctx, arch->info, NULL); + fz_drop_archive(ctx, arch->chain); +} + +static fz_archive *new_encrypted_archive(fz_context *ctx, fz_archive *chain, fz_tree *info) +{ + struct encrypted *arch; + + arch = fz_new_derived_archive(ctx, NULL, struct encrypted); + arch->super.format = "encrypted"; + arch->super.has_entry = has_encrypted_entry; + arch->super.read_entry = read_encrypted_entry; + arch->super.open_entry = open_encrypted_entry; + arch->super.drop_archive = drop_encrypted_archive; + arch->chain = chain; + arch->info = info; + + return &arch->super; +} + +static void +epub_parse_encryption(fz_context *ctx, epub_document *doc, fz_xml *root) +{ + fz_tree *info = NULL; + fz_xml *edata; + + for (edata = fz_xml_find_down(root, "EncryptedData"); edata; edata = fz_xml_find_next(edata, "EncryptedData")) + { + fz_xml *cdata = fz_xml_find_down(edata, "CipherData"); + fz_xml *cref = fz_xml_find_down(cdata, "CipherReference"); + char *uri = fz_xml_att(cref, "URI"); + if (uri) + { + // TODO: Support reading EncryptedKey and EncryptionMethod to decrypt content. + info = fz_tree_insert(ctx, info, uri, &dummy); + } + } + + if (info) + { + doc->zip = new_encrypted_archive(ctx, doc->zip, info); + } +} + +static fz_html *epub_get_laid_out_html(fz_context *ctx, epub_document *doc, epub_chapter *ch); + +static int count_laid_out_pages(fz_html *html) +{ + if (html->tree.root->s.layout.b > 0) + return ceilf(html->tree.root->s.layout.b / html->page_h); + return 1; +} + +static void +invalidate_accelerator(fz_context *ctx, epub_accelerator *acc) +{ + int i; + + for (i = 0; i < acc->max_chapters; i++) + acc->pages_in_chapter[i] = -1; +} + +static int count_chapter_pages(fz_context *ctx, epub_document *doc, epub_chapter *ch) +{ + epub_accelerator *acc = doc->accel; + int use_doc_css = fz_use_document_css(ctx); + + if (use_doc_css != acc->use_doc_css || doc->css_sum != acc->css_sum) + { + acc->use_doc_css = use_doc_css; + acc->css_sum = doc->css_sum; + invalidate_accelerator(ctx, acc); + } + + if (ch->number < acc->num_chapters && acc->pages_in_chapter[ch->number] != -1) + return acc->pages_in_chapter[ch->number]; + + fz_drop_html(ctx, epub_get_laid_out_html(ctx, doc, ch)); + return acc->pages_in_chapter[ch->number]; +} + +static fz_link_dest +epub_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest) +{ + epub_document *doc = (epub_document*)doc_; + epub_chapter *ch; + int i; + + const char *s = strchr(dest, '#'); + size_t n = s ? (size_t)(s - dest) : strlen(dest); + if (s && s[1] == 0) + s = NULL; + + for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next) + { + if (!strncmp(ch->path, dest, n) && ch->path[n] == 0) + { + if (s) + { + float y; + fz_html *html = epub_get_laid_out_html(ctx, doc, ch); + int ph = html->page_h; + + /* Search for a matching fragment */ + y = fz_find_html_target(ctx, html, s+1); + fz_drop_html(ctx, html); + if (y >= 0) + { + int page = y / ph; + return fz_make_link_dest_xyz(i, page, 0, y - page * ph, 0); + } + return fz_make_link_dest_none(); + } + return fz_make_link_dest_xyz(i, 0, 0, 0, 0); + } + } + + return fz_make_link_dest_none(); +} + +static void +epub_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em) +{ + epub_document *doc = (epub_document*)doc_; + uint32_t css_sum = user_css_sum(ctx); + int use_doc_css = fz_use_document_css(ctx); + + if (doc->layout_w == w && doc->layout_h == h && doc->layout_em == em && doc->css_sum == css_sum) + return; + doc->layout_w = w; + doc->layout_h = h; + doc->layout_em = em; + + if (doc->accel == NULL) + return; + + /* When we load the saved accelerator, doc->accel + * can be populated with different values than doc. + * This is really useful as doc starts out with the + * values being 0. If we've got the right values + * already, then don't bin the data! */ + if (doc->accel->layout_w == w && + doc->accel->layout_h == h && + doc->accel->layout_em == em && + doc->accel->use_doc_css == use_doc_css && + doc->accel->css_sum == css_sum) + return; + + doc->accel->layout_w = w; + doc->accel->layout_h = h; + doc->accel->layout_em = em; + doc->accel->use_doc_css = use_doc_css; + doc->accel->css_sum = css_sum; + invalidate_accelerator(ctx, doc->accel); +} + +static int +epub_count_chapters(fz_context *ctx, fz_document *doc_) +{ + epub_document *doc = (epub_document*)doc_; + epub_chapter *ch; + int count = 0; + for (ch = doc->spine; ch; ch = ch->next) + ++count; + return count; +} + +static int +epub_count_pages(fz_context *ctx, fz_document *doc_, int chapter) +{ + epub_document *doc = (epub_document*)doc_; + epub_chapter *ch; + int i; + for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next) + { + if (i == chapter) + { + return count_chapter_pages(ctx, doc, ch); + } + } + return 0; +} + +#define MAGIC_ACCELERATOR 0xacce1e7a +#define MAGIC_ACCEL_EPUB 0x62755065 +#define ACCEL_VERSION 0x00010001 + +static void epub_load_accelerator(fz_context *ctx, epub_document *doc, fz_stream *accel) +{ + int v; + float w, h, em; + int num_chapters; + epub_accelerator *acc = NULL; + uint32_t css_sum; + int use_doc_css; + int make_new = (accel == NULL); + + fz_var(acc); + + if (accel) + { + /* Try to read the accelerator data. If we fail silently give up. */ + fz_try(ctx) + { + v = fz_read_int32_le(ctx, accel); + if (v != (int32_t)MAGIC_ACCELERATOR) + { + make_new = 1; + break; + } + + v = fz_read_int32_le(ctx, accel); + if (v != MAGIC_ACCEL_EPUB) + { + make_new = 1; + break; + } + + v = fz_read_int32_le(ctx, accel); + if (v != ACCEL_VERSION) + { + make_new = 1; + break; + } + + w = fz_read_float_le(ctx, accel); + h = fz_read_float_le(ctx, accel); + em = fz_read_float_le(ctx, accel); + css_sum = fz_read_uint32_le(ctx, accel); + use_doc_css = fz_read_int32_le(ctx, accel); + + num_chapters = fz_read_int32_le(ctx, accel); + if (num_chapters <= 0) + { + make_new = 1; + break; + } + + acc = fz_malloc_struct(ctx, epub_accelerator); + acc->pages_in_chapter = Memento_label(fz_malloc_array(ctx, num_chapters, int), "accel_pages_in_chapter"); + acc->max_chapters = acc->num_chapters = num_chapters; + acc->layout_w = w; + acc->layout_h = h; + acc->layout_em = em; + acc->css_sum = css_sum; + acc->use_doc_css = use_doc_css; + + for (v = 0; v < num_chapters; v++) + acc->pages_in_chapter[v] = fz_read_int32_le(ctx, accel); + } + fz_catch(ctx) + { + if (acc) + fz_free(ctx, acc->pages_in_chapter); + fz_free(ctx, acc); + /* Swallow the error and run unaccelerated */ + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + make_new = 1; + } + } + + /* If we aren't given an accelerator to load (or the one we're given + * is bad) create a blank stub and we can fill it out as we go. */ + if (make_new) + { + acc = fz_malloc_struct(ctx, epub_accelerator); + acc->css_sum = doc->css_sum; + acc->use_doc_css = fz_use_document_css(ctx); + } + + doc->accel = acc; +} + +static void +accelerate_chapter(fz_context *ctx, epub_document *doc, epub_chapter *ch, fz_html *html) +{ + epub_accelerator *acc = doc->accel; + int p = count_laid_out_pages(html); + + if (ch->number < acc->num_chapters) + { + if (acc->pages_in_chapter[ch->number] != p && acc->pages_in_chapter[ch->number] != -1) + { + fz_warn(ctx, "Invalidating stale accelerator data."); + invalidate_accelerator(ctx, doc->accel); + } + acc->pages_in_chapter[ch->number] = p; + return; + } + + if (ch->number >= acc->max_chapters) + { + int n = acc->max_chapters; + int i; + if (n == 0) + n = 4; + while (n <= ch->number) + n *= 2; + + acc->pages_in_chapter = fz_realloc_array(ctx, acc->pages_in_chapter, n, int); + for (i = acc->max_chapters; i < n; i++) + acc->pages_in_chapter[i] = -1; + acc->max_chapters = n; + } + acc->pages_in_chapter[ch->number] = p; + if (acc->num_chapters < ch->number+1) + acc->num_chapters = ch->number+1; +} + +static void +epub_drop_page(fz_context *ctx, fz_page *page_) +{ + epub_page *page = (epub_page *)page_; + fz_drop_html(ctx, page->html); +} + +static epub_chapter * +epub_load_chapter(fz_context *ctx, epub_document *doc, const char *path, int i) +{ + epub_chapter *ch; + + ch = fz_malloc_struct(ctx, epub_chapter); + fz_try(ctx) + { + ch->path = Memento_label(fz_strdup(ctx, path), "chapter_path"); + ch->number = i; + } + fz_catch(ctx) + { + fz_free(ctx, ch); + fz_rethrow(ctx); + } + + return ch; +} + +static fz_html * +epub_parse_chapter(fz_context *ctx, epub_document *doc, epub_chapter *ch) +{ + fz_archive *zip = doc->zip; + fz_buffer *buf; + char base_uri[2048]; + fz_html *html; + + /* Look for one we made earlier */ + html = fz_find_html(ctx, doc, ch->number); + if (html) + return html; + + fz_dirname(base_uri, ch->path, sizeof base_uri); + + buf = fz_read_archive_entry(ctx, zip, ch->path); + fz_try(ctx) + html = fz_parse_html(ctx, doc->set, zip, base_uri, buf, fz_user_css(ctx), 1, 1, 0); + fz_always(ctx) + fz_drop_buffer(ctx, buf); + fz_catch(ctx) + fz_rethrow(ctx); + + return fz_store_html(ctx, html, doc, ch->number); +} + +static fz_html * +epub_get_laid_out_html(fz_context *ctx, epub_document *doc, epub_chapter *ch) +{ + fz_html *html = epub_parse_chapter(ctx, doc, ch); + fz_try(ctx) + { + fz_layout_html(ctx, html, doc->layout_w, doc->layout_h, doc->layout_em); + accelerate_chapter(ctx, doc, ch, html); + } + fz_catch(ctx) + { + fz_drop_html(ctx, html); + fz_rethrow(ctx); + } + + fz_drop_html(ctx, doc->most_recent_html); + doc->most_recent_html = fz_keep_html(ctx, html); + + return html; +} + +static fz_rect +epub_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box) +{ + epub_document *doc = (epub_document*)page_->doc; + epub_page *page = (epub_page*)page_; + epub_chapter *ch = page->ch; + fz_rect bbox; + fz_html *html = epub_get_laid_out_html(ctx, doc, ch); + + bbox.x0 = 0; + bbox.y0 = 0; + bbox.x1 = html->page_w + html->page_margin[L] + html->page_margin[R]; + bbox.y1 = html->page_h + html->page_margin[T] + html->page_margin[B]; + fz_drop_html(ctx, html); + return bbox; +} + +static void +epub_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie) +{ + epub_page *page = (epub_page*)page_; + + fz_draw_html(ctx, dev, ctm, page->html, page->number); +} + +static fz_link * +epub_load_links(fz_context *ctx, fz_page *page_) +{ + epub_page *page = (epub_page*)page_; + epub_chapter *ch = page->ch; + + return fz_load_html_links(ctx, page->html, page->number, ch->path); +} + +static fz_bookmark +epub_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc) +{ + epub_document *doc = (epub_document*)doc_; + epub_chapter *ch; + int i; + + for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next) + { + if (i == loc.chapter) + { + fz_html *html = epub_get_laid_out_html(ctx, doc, ch); + fz_bookmark mark = fz_make_html_bookmark(ctx, html, loc.page); + fz_drop_html(ctx, html); + return mark; + } + } + + return 0; +} + +static fz_location +epub_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark) +{ + epub_document *doc = (epub_document*)doc_; + epub_chapter *ch; + int i; + + for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next) + { + fz_html *html = epub_get_laid_out_html(ctx, doc, ch); + int p = fz_lookup_html_bookmark(ctx, html, mark); + fz_drop_html(ctx, html); + if (p != -1) + return fz_make_location(i, p); + } + return fz_make_location(-1, -1); +} + +static fz_page * +epub_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number) +{ + epub_document *doc = (epub_document*)doc_; + epub_chapter *ch; + int i; + + if (chapter < 0) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "invalid chapter number: %d", chapter); + if (number < 0) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "invalid page number: %d", number); + + for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next) + { + if (i == chapter) + { + epub_page *page = fz_new_derived_page(ctx, epub_page, doc_); + page->super.bound_page = epub_bound_page; + page->super.run_page_contents = epub_run_page; + page->super.load_links = epub_load_links; + page->super.drop_page = epub_drop_page; + page->ch = ch; + page->number = number; + page->html = epub_get_laid_out_html(ctx, doc, ch); + return (fz_page*)page; + } + } + return NULL; +} + +static void +epub_page_label(fz_context *ctx, fz_document *doc_, int chapter, int number, char *buf, size_t size) +{ + fz_snprintf(buf, size, "ch. %d, p. %d", chapter+1, number+1); +} + +static void +epub_drop_accelerator(fz_context *ctx, epub_accelerator *acc) +{ + if (acc == NULL) + return; + + fz_free(ctx, acc->pages_in_chapter); + fz_free(ctx, acc); +} + +static void +epub_drop_document(fz_context *ctx, fz_document *doc_) +{ + epub_document *doc = (epub_document*)doc_; + epub_chapter *ch, *next; + ch = doc->spine; + while (ch) + { + next = ch->next; + fz_free(ctx, ch->path); + fz_free(ctx, ch); + ch = next; + } + epub_drop_accelerator(ctx, doc->accel); + fz_drop_archive(ctx, doc->zip); + fz_drop_html_font_set(ctx, doc->set); + fz_drop_outline(ctx, doc->outline); + fz_free(ctx, doc->dc_title); + fz_free(ctx, doc->dc_creator); + fz_drop_html(ctx, doc->most_recent_html); + fz_purge_stored_html(ctx, doc); +} + +static const char * +rel_path_from_idref(fz_xml *manifest, const char *idref) +{ + fz_xml *item; + if (!idref) + return NULL; + item = fz_xml_find_down(manifest, "item"); + while (item) + { + const char *id = fz_xml_att(item, "id"); + if (id && !strcmp(id, idref)) + return fz_xml_att(item, "href"); + item = fz_xml_find_next(item, "item"); + } + return NULL; +} + +static const char * +path_from_idref(char *path, fz_xml *manifest, const char *base_uri, const char *idref, int n) +{ + const char *rel_path = rel_path_from_idref(manifest, idref); + if (!rel_path) + { + path[0] = 0; + return NULL; + } + fz_strlcpy(path, base_uri, n); + fz_strlcat(path, "/", n); + fz_strlcat(path, rel_path, n); + return fz_cleanname(fz_urldecode(path)); +} + +static fz_outline * +epub_parse_ncx_imp(fz_context *ctx, epub_document *doc, fz_xml *node, char *base_uri) +{ + char path[2048]; + fz_outline *outline, *head, **tailp; + + head = NULL; + tailp = &head; + + node = fz_xml_find_down(node, "navPoint"); + while (node) + { + char *text = fz_xml_text(fz_xml_down(fz_xml_find_down(fz_xml_find_down(node, "navLabel"), "text"))); + char *content = fz_xml_att(fz_xml_find_down(node, "content"), "src"); + if (text && content) + { + fz_strlcpy(path, base_uri, sizeof path); + fz_strlcat(path, "/", sizeof path); + fz_strlcat(path, content, sizeof path); + fz_urldecode(path); + fz_cleanname(path); + + fz_try(ctx) + { + *tailp = outline = fz_new_outline(ctx); + tailp = &(*tailp)->next; + outline->title = Memento_label(fz_strdup(ctx, text), "outline_title"); + outline->uri = Memento_label(fz_strdup(ctx, path), "outline_uri"); + outline->page = fz_make_location(-1, -1); + outline->down = epub_parse_ncx_imp(ctx, doc, node, base_uri); + outline->is_open = 1; + } + fz_catch(ctx) + { + fz_drop_outline(ctx, head); + fz_rethrow(ctx); + } + } + node = fz_xml_find_next(node, "navPoint"); + } + + return head; +} + +static void +epub_parse_ncx(fz_context *ctx, epub_document *doc, const char *path) +{ + fz_archive *zip = doc->zip; + fz_buffer *buf = NULL; + fz_xml_doc *ncx = NULL; + char base_uri[2048]; + + fz_var(buf); + fz_var(ncx); + + fz_try(ctx) + { + fz_dirname(base_uri, path, sizeof base_uri); + buf = fz_read_archive_entry(ctx, zip, path); + ncx = fz_parse_xml(ctx, buf, 0); + doc->outline = epub_parse_ncx_imp(ctx, doc, fz_xml_find_down(fz_xml_root(ncx), "navMap"), base_uri); + } + fz_always(ctx) + { + fz_drop_buffer(ctx, buf); + fz_drop_xml(ctx, ncx); + } + fz_catch(ctx) + fz_rethrow(ctx); +} + +static char * +find_metadata(fz_context *ctx, fz_xml *metadata, char *key) +{ + char *text = fz_xml_text(fz_xml_down(fz_xml_find_down(metadata, key))); + if (text) + return fz_strdup(ctx, text); + return NULL; +} + +static fz_buffer * +read_container_and_prefix(fz_context *ctx, fz_archive *zip, char *prefix, size_t prefix_len) +{ + int n = fz_count_archive_entries(ctx, zip); + int i; + + prefix[0] = 0; + + /* First off, look for the container.xml at the top level. */ + for (i = 0; i < n; i++) + { + const char *p = fz_list_archive_entry(ctx, zip, i); + + if (!strcmp(p, "META-INF/container.xml")) + return fz_read_archive_entry(ctx, zip, "META-INF/container.xml"); + } + + /* If that failed, look for the first such file in a subdirectory. */ + for (i = 0; i < n; i++) + { + const char *p = fz_list_archive_entry(ctx, zip, i); + size_t z = strlen(p); + size_t z0 = sizeof("META-INF/container.xml")-1; + + if (z < z0) + continue; + if (!strcmp(p + z - z0, "META-INF/container.xml")) + { + if (z - z0 >= prefix_len) + { + fz_warn(ctx, "Ignoring %s as path too long.", p); + continue; + } + memcpy(prefix, p, z-z0); + prefix[z-z0] = 0; + return fz_read_archive_entry(ctx, zip, p); + } + } + + return fz_read_archive_entry(ctx, zip, "META-INF/container.xml"); +} + +static void +epub_parse_header(fz_context *ctx, epub_document *doc) +{ + fz_archive *zip = doc->zip; + fz_buffer *buf = NULL; + fz_xml_doc *encryption_xml = NULL; + fz_xml_doc *container_xml = NULL; + fz_xml_doc *content_opf = NULL; + fz_xml *container, *rootfiles, *rootfile; + fz_xml *package, *manifest, *spine, *itemref, *metadata; + char base_uri[2048]; + const char *full_path; + const char *version; + char ncx[2048], s[2048]; + char *prefixed_full_path = NULL; + size_t prefix_len; + epub_chapter **tailp; + int i; + + fz_var(buf); + fz_var(encryption_xml); + fz_var(container_xml); + fz_var(content_opf); + fz_var(prefixed_full_path); + + fz_try(ctx) + { + /* parse META-INF/encryption.xml to figure out which entries are encrypted */ + + /* parse META-INF/container.xml to find OPF */ + /* Reuse base_uri to read the prefix. */ + buf = read_container_and_prefix(ctx, zip, base_uri, sizeof(base_uri)); + container_xml = fz_parse_xml(ctx, buf, 0); + fz_drop_buffer(ctx, buf); + buf = NULL; + + /* Some epub files can be prefixed by a directory name. This (normally + * empty!) will be in base_uri. */ + prefix_len = strlen(base_uri); + { + /* Further abuse base_uri to hold a temporary name. */ + const size_t z0 = sizeof("META-INF/encryption.xml")-1; + if (sizeof(base_uri) <= prefix_len + z0) + fz_throw(ctx, FZ_ERROR_FORMAT, "Prefix too long in epub"); + strcpy(base_uri + prefix_len, "META-INF/encryption.xml"); + if (fz_has_archive_entry(ctx, zip, base_uri)) + { + fz_warn(ctx, "EPUB may be locked by DRM"); + + buf = fz_read_archive_entry(ctx, zip, base_uri); + encryption_xml = fz_parse_xml(ctx, buf, 0); + fz_drop_buffer(ctx, buf); + buf = NULL; + + epub_parse_encryption(ctx, doc, fz_xml_find(fz_xml_root(encryption_xml), "encryption")); + zip = doc->zip; + } + } + + container = fz_xml_find(fz_xml_root(container_xml), "container"); + rootfiles = fz_xml_find_down(container, "rootfiles"); + rootfile = fz_xml_find_down(rootfiles, "rootfile"); + full_path = fz_xml_att(rootfile, "full-path"); + if (!full_path) + fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find root file in EPUB"); + + fz_dirname(base_uri+prefix_len, full_path, sizeof(base_uri) - prefix_len); + + prefixed_full_path = fz_malloc(ctx, strlen(full_path) + prefix_len + 1); + memcpy(prefixed_full_path, base_uri, prefix_len); + strcpy(prefixed_full_path + prefix_len, full_path); + + /* parse OPF to find NCX and spine */ + + buf = fz_read_archive_entry(ctx, zip, prefixed_full_path); + content_opf = fz_parse_xml(ctx, buf, 0); + fz_drop_buffer(ctx, buf); + buf = NULL; + + package = fz_xml_find(fz_xml_root(content_opf), "package"); + version = fz_xml_att(package, "version"); + if (!version || strcmp(version, "2.0")) + fz_warn(ctx, "unknown epub version: %s", version ? version : "<none>"); + + metadata = fz_xml_find_down(package, "metadata"); + if (metadata) + { + doc->dc_title = Memento_label(find_metadata(ctx, metadata, "title"), "epub_title"); + doc->dc_creator = Memento_label(find_metadata(ctx, metadata, "creator"), "epub_creator"); + } + + manifest = fz_xml_find_down(package, "manifest"); + spine = fz_xml_find_down(package, "spine"); + + if (path_from_idref(ncx, manifest, base_uri, fz_xml_att(spine, "toc"), sizeof ncx)) + { + epub_parse_ncx(ctx, doc, ncx); + } + + doc->spine = NULL; + tailp = &doc->spine; + itemref = fz_xml_find_down(spine, "itemref"); + i = 0; + while (itemref) + { + if (path_from_idref(s, manifest, base_uri, fz_xml_att(itemref, "idref"), sizeof s)) + { + fz_try(ctx) + { + *tailp = epub_load_chapter(ctx, doc, s, i); + tailp = &(*tailp)->next; + i++; + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "ignoring chapter %s", s); + } + } + itemref = fz_xml_find_next(itemref, "itemref"); + } + } + fz_always(ctx) + { + fz_drop_xml(ctx, content_opf); + fz_drop_xml(ctx, container_xml); + fz_drop_xml(ctx, encryption_xml); + fz_drop_buffer(ctx, buf); + fz_free(ctx, prefixed_full_path); + } + fz_catch(ctx) + fz_rethrow(ctx); +} + +static fz_outline * +epub_load_outline(fz_context *ctx, fz_document *doc_) +{ + epub_document *doc = (epub_document*)doc_; + return fz_keep_outline(ctx, doc->outline); +} + +static int +epub_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size) +{ + epub_document *doc = (epub_document*)doc_; + if (!strcmp(key, FZ_META_FORMAT)) + return 1 + (int)fz_strlcpy(buf, "EPUB", size); + if (!strcmp(key, FZ_META_INFO_TITLE) && doc->dc_title) + return 1 + (int)fz_strlcpy(buf, doc->dc_title, size); + if (!strcmp(key, FZ_META_INFO_AUTHOR) && doc->dc_creator) + return 1 + (int)fz_strlcpy(buf, doc->dc_creator, size); + return -1; +} + +static void +epub_output_accelerator(fz_context *ctx, fz_document *doc_, fz_output *out) +{ + epub_document *doc = (epub_document*)doc_; + int i; + + fz_try(ctx) + { + if (doc->accel == NULL) + fz_throw(ctx, FZ_ERROR_ARGUMENT, "No accelerator data to write"); + + fz_write_int32_le(ctx, out, MAGIC_ACCELERATOR); + fz_write_int32_le(ctx, out, MAGIC_ACCEL_EPUB); + fz_write_int32_le(ctx, out, ACCEL_VERSION); + fz_write_float_le(ctx, out, doc->accel->layout_w); + fz_write_float_le(ctx, out, doc->accel->layout_h); + fz_write_float_le(ctx, out, doc->accel->layout_em); + fz_write_uint32_le(ctx, out, doc->accel->css_sum); + fz_write_int32_le(ctx, out, doc->accel->use_doc_css); + fz_write_int32_le(ctx, out, doc->accel->num_chapters); + for (i = 0; i < doc->accel->num_chapters; i++) + fz_write_int32_le(ctx, out, doc->accel->pages_in_chapter[i]); + + fz_close_output(ctx, out); + } + fz_always(ctx) + fz_drop_output(ctx, out); + fz_catch(ctx) + fz_rethrow(ctx); +} + +/* Takes ownership of zip. Will always eventually drop it. + * Never takes ownership of accel. */ +static fz_document * +epub_init(fz_context *ctx, fz_archive *zip, fz_stream *accel) +{ + epub_document *doc = NULL; + + fz_var(doc); + fz_var(zip); + + fz_try(ctx) + { + doc = fz_new_derived_document(ctx, epub_document); + doc->zip = zip; + zip = NULL; + + doc->super.drop_document = epub_drop_document; + doc->super.layout = epub_layout; + doc->super.load_outline = epub_load_outline; + doc->super.resolve_link_dest = epub_resolve_link; + doc->super.make_bookmark = epub_make_bookmark; + doc->super.lookup_bookmark = epub_lookup_bookmark; + doc->super.count_chapters = epub_count_chapters; + doc->super.count_pages = epub_count_pages; + doc->super.load_page = epub_load_page; + doc->super.page_label = epub_page_label; + doc->super.lookup_metadata = epub_lookup_metadata; + doc->super.output_accelerator = epub_output_accelerator; + doc->super.is_reflowable = 1; + + doc->set = fz_new_html_font_set(ctx); + doc->css_sum = user_css_sum(ctx); + epub_load_accelerator(ctx, doc, accel); + epub_parse_header(ctx, doc); + } + fz_catch(ctx) + { + fz_drop_archive(ctx, zip); + fz_drop_document(ctx, &doc->super); + fz_rethrow(ctx); + } + + return (fz_document*)doc; +} + +static fz_document * +epub_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) +{ + fz_stream *file2 = NULL; + fz_document *doc; + fz_archive *zip = NULL; + + if (file == NULL) + { + /* Directory case: file == NULL and dir == the directory. */ + if (fz_has_archive_entry(ctx, dir, "META-INF/container.xml")) + file2 = file = fz_open_archive_entry(ctx, dir, "META-INF/container.xml"); + else + file2 = file = fz_open_archive_entry(ctx, dir, "META-INF\\container.xml"); + if (file == NULL) + fz_throw(ctx, FZ_ERROR_FORMAT, "Not an epub file"); + zip = fz_keep_archive(ctx, dir); + } + else + { + /* File case: file != NULL and dir can be ignored. */ + zip = fz_open_archive_with_stream(ctx, file); + } + + + fz_try(ctx) + doc = epub_init(ctx, zip, file); + fz_always(ctx) + fz_drop_stream(ctx, file2); + fz_catch(ctx) + fz_rethrow(ctx); + + return doc; +} + +static int +epub_recognize(fz_context *doc, const fz_document_handler *handler, const char *magic) +{ + if (strstr(magic, "META-INF/container.xml") || strstr(magic, "META-INF\\container.xml")) + return 200; + return 0; +} + +static int +epub_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state) +{ + fz_archive *arch = NULL; + int ret = 0; + + fz_var(arch); + fz_var(ret); + + if (state) + *state = NULL; + if (free_state) + *free_state = NULL; + + fz_try(ctx) + { + if (stream == NULL) + arch = fz_keep_archive(ctx, dir); + else + { + arch = fz_try_open_archive_with_stream(ctx, stream); + if (arch == NULL) + break; + } + + if (fz_has_archive_entry(ctx, arch, "META-INF/container.xml") || + fz_has_archive_entry(ctx, arch, "META-INF\\container.xml")) + ret = 74; /* One less than the 75 that HWPX files are detected as. */ + } + fz_always(ctx) + fz_drop_archive(ctx, arch); + fz_catch(ctx) + fz_rethrow(ctx); + + return ret; +} + +static const char *epub_extensions[] = +{ + "epub", + NULL +}; + +static const char *epub_mimetypes[] = +{ + "application/epub+zip", + NULL +}; + +fz_document_handler epub_document_handler = +{ + epub_recognize, + epub_open_document, + epub_extensions, + epub_mimetypes, + epub_recognize_content +};
