Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/html/html-parse.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/html/html-parse.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,2392 @@ +// Copyright (C) 2004-2025 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. 
+ +#include "mupdf/fitz.h" +#include "mupdf/ucdn.h" +#include "html-imp.h" + +#include <string.h> +#include <stdio.h> +#include <assert.h> + +enum { T, R, B, L }; + +#define DEFAULT_DIR FZ_BIDI_LTR + +static const char *html_default_css = +"@page{margin:3em 2em}" +"a{color:#06C;text-decoration:underline}" +"address{display:block;font-style:italic}" +"b{font-weight:bold}" +"bdo{direction:rtl;unicode-bidi:bidi-override}" +"blockquote{display:block;margin:1em 40px}" +"body{display:block;margin:1em}" +"cite{font-style:italic}" +"code{font-family:monospace}" +"dd{display:block;margin:0 0 0 40px}" +"del{text-decoration:line-through}" +"div{display:block}" +"dl{display:block;margin:1em 0}" +"dt{display:block}" +"em{font-style:italic}" +"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}" +"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}" +"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}" +"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}" +"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}" +"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}" +"head{display:none}" +"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}" +"html{display:block}" +"i{font-style:italic}" +"ins{text-decoration:underline}" +"kbd{font-family:monospace}" +"li{display:list-item}" +"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}" +"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}" +"p{display:block;margin:1em 0}" +"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}" +"samp{font-family:monospace}" +"script{display:none}" +"small{font-size:0.83em}" +"strong{font-weight:bold}" +"style{display:none}" +"sub{font-size:0.83em;vertical-align:sub}" 
/*
 * CSS rule text for the MOBI default stylesheet.
 *
 * The rules make <pagebreak> a block that forces a page break before
 * itself, zero the margins of dl/ol/ul and p, give blockquote only a
 * horizontal 40px margin, and style the legacy <center>, <big> and
 * <strike> tags.
 *
 * NOTE(review): presumably applied on top of html_default_css for MOBI
 * input; the place where the two are combined is not visible in this
 * part of the file -- confirm against the caller.
 */
static const char *mobi_default_css =
"pagebreak{display:block;page-break-before:always}"
"dl,ol,ul{margin:0}"
"p{margin:0}"
"blockquote{margin:0 40px}"
"center{display:block;text-align:center}"
"big{font-size:1.17em}"
"strike{text-decoration:line-through}"
;
/*
 * Table of known HTML tag names.
 *
 * The entries MUST stay sorted in strcmp() order: find_known_html_tag()
 * binary-searches this table.
 */
static const char *known_html_tags[] = {
	// TODO: add known FB2 tags?
	// Sorted list of all HTML tags.
	"a", "abbr", "acronym", "address", "annotation-xml", "applet", "area",
	"article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo",
	"bgsound", "big", "blink", "blockquote", "body", "br", "button",
	"canvas", "caption", "center", "cite", "code", "col", "colgroup",
	"data", "datalist", "dd", "del", "desc", "details", "dfn", "dir",
	"div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure",
	"font", "footer", "foreignobject", "form", "frame", "frameset", "h1",
	"h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
	"i", "iframe", "image", "img", "input", "ins", "isindex", "kbd",
	"keygen", "label", "legend", "li", "link", "listing", "main",
	"malignmark", "map", "mark", "marquee", "math", "menu", "menuitem",
	"meta", "meter", "mglyph", "mi", "mn", "mo", "ms", "mtext", "multicol",
	"nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object",
	"ol", "optgroup", "option", "output", "p", "param", "plaintext", "pre",
	"progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp",
	"script", "section", "select", "small", "source", "spacer", "span",
	"strike", "strong", "style", "sub", "summary", "sup", "svg", "table",
	"tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time",
	"title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp",
};

/*
 * Table of known FictionBook (FB2) tag names.
 *
 * Sorted in strcmp() order (so the capitalised "FictionBook" comes
 * first): find_known_fb2_tag() binary-searches this table.
 */
static const char *known_fb2_tags[] = {
	"FictionBook", "a", "binary", "body", "cite", "code", "coverpage",
	"date", "description", "emphasis", "empty-line", "epigraph", "image",
	"p", "poem", "section", "stanza", "strikethrough", "strong",
	"stylesheet", "sub", "subtitle", "sup", "table", "td", "text-author",
	"th", "title", "title-info", "tr", "v",
};

/*
 * Binary search for tag in a strcmp()-sorted table of count strings.
 *
 * Returns the table's own pointer for the matching entry, or NULL if
 * the tag is not present.
 */
static const char *find_known_tag(const char *const *table, int count, const char *tag)
{
	int lo = 0;
	int hi = count - 1;

	while (lo <= hi)
	{
		int mid = lo + (hi - lo) / 2;
		int cmp = strcmp(tag, table[mid]);
		if (cmp < 0)
			hi = mid - 1;
		else if (cmp > 0)
			lo = mid + 1;
		else
			return table[mid];
	}
	return NULL;
}

/*
 * Look up tag in known_html_tags.
 *
 * Returns the static table string on a match, NULL otherwise.
 *
 * The whole table is searched. The previous upper bound was
 * "count / 2 - 1", which silently ignored the second half of the
 * table, so e.g. "table" or "xmp" were never found.
 */
static const char *find_known_html_tag(const char *tag)
{
	/* Element count via sizeof; the array is visible in this scope. */
	int count = (int)(sizeof known_html_tags / sizeof known_html_tags[0]);
	return find_known_tag(known_html_tags, count, tag);
}

/*
 * Look up tag in known_fb2_tags.
 *
 * Returns the static table string on a match, NULL otherwise.
 * As with the HTML lookup, the full table is searched.
 */
static const char *find_known_fb2_tag(const char *tag)
{
	int count = (int)(sizeof known_fb2_tags / sizeof known_fb2_tags[0]);
	return find_known_tag(known_fb2_tags, count, tag);
}
*/ + if (top == NULL || top->type != BOX_FLOW) + return NULL; + + flow = fz_pool_alloc(ctx, pool, size); + flow->type = type; + flow->expand = 0; + flow->bidi_level = 0; + flow->markup_lang = 0; + flow->breaks_line = 0; + flow->box = inline_box; + (*top->s.build.flow_tail) = flow; + top->s.build.flow_tail = &flow->next; + return flow; +} + +static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) +{ + fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0); + if (flow) + flow->expand = 1; +} + +static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) +{ + (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0); +} + +static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) +{ + (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0); +} + +static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) +{ + (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0); +} + +static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang) +{ + fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1); + if (flow == NULL) + return; + memcpy(flow->content.text, a, b - a); + flow->content.text[b - a] = 0; + flow->markup_lang = lang; +} + +static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img) +{ + fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0); + if (flow) + flow->content.image = fz_keep_image(ctx, img); +} + +static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) +{ + (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0); +} + +fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset) +{ + 
fz_html_flow *new_flow; + char *text; + size_t len; + + assert(flow->type == FLOW_WORD); + + if (offset == 0) + return flow; + text = flow->content.text; + while (*text && offset) + { + int rune; + text += fz_chartorune(&rune, text); + offset--; + } + len = strlen(text); + new_flow = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + len+1); + memcpy(new_flow, flow, offsetof(fz_html_flow, content)); + new_flow->next = flow->next; + flow->next = new_flow; + strcpy(new_flow->content.text, text); + *text = 0; + return new_flow; +} + +static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g) +{ + static const char *space = " "; + fz_pool *pool = g->pool; + if (g->emit_white) + { + int bsp = g->emit_white->style->white_space & WS_ALLOW_BREAK_SPACE; + if (!g->at_bol) + { + if (bsp) + add_flow_space(ctx, pool, flow, g->emit_white); + else + add_flow_word(ctx, pool, flow, g->emit_white, space, space+1, lang); + } + g->emit_white = 0; + } +} + +/* pair-wise lookup table for UAX#14 linebreaks +The linebreak table entries mean: +^ prohibited break + never break before A and after B, even with one or more spaces in between +% indirect break + do not break before A, unless one or more spaces follow B +_ direct break + break allowed before A +*/ +static const char *pairbrk[32] = +{ +/* -OCCQGNESIPPNAHIIHBBBZCWHHJJJREEZ- */ +/* -PLPULSXYSROULLDNYAB2WMJ23LVTIBMW- */ +/* - J- */ + "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */ + "_^^%%^^^^%%____%%%__^^^________%", /* CL close punctuation */ + "_^^%%^^^^%%%%%_%%%__^^^________%", /* CP close parenthesis */ + "^^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* QU quotation */ + "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* GL non-breaking glue */ + "_^^%%%^^^______%%%__^^^________%", /* NS nonstarters */ + "_^^%%%^^^______%%%__^^^________%", /* EX exclamation/interrogation */ + "_^^%%%^^^__%_%_%%%__^^^________%", /* SY symbols allowing break after */ + "_^^%%%^^^__%%%_%%%__^^^________%", /* IS infix 
numeric separator */ + "%^^%%%^^^__%%%%%%%__^^^%%%%%_%%%", /* PR prefix numeric */ + "%^^%%%^^^__%%%_%%%__^^^________%", /* PO postfix numeric */ + "%^^%%%^^^%%%%%_%%%__^^^________%", /* NU numeric */ + "%^^%%%^^^%%%%%_%%%__^^^________%", /* AL ordinary alphabetic and symbol characters */ + "%^^%%%^^^%%%%%_%%%__^^^________%", /* HL hebrew letter */ + "_^^%%%^^^_%____%%%__^^^________%", /* ID ideographic */ + "_^^%%%^^^______%%%__^^^________%", /* IN inseparable characters */ + "_^^%_%^^^__%___%%%__^^^________%", /* HY hyphens */ + "_^^%_%^^^______%%%__^^^________%", /* BA break after */ + "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* BB break before */ + "_^^%%%^^^______%%%_^^^^________%", /* B2 break opportunity before and after */ + "____________________^___________", /* ZW zero width space */ + "%^^%%%^^^%_%%%_%%%__^^^________%", /* CM combining mark */ + "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* WJ word joiner */ + "_^^%%%^^^_%____%%%__^^^___%%___%", /* H2 hangul leading/vowel syllable */ + "_^^%%%^^^_%____%%%__^^^____%___%", /* H3 hangul leading/vowel/trailing syllable */ + "_^^%%%^^^_%____%%%__^^^%%%%____%", /* JL hangul leading jamo */ + "_^^%%%^^^_%____%%%__^^^___%%___%", /* JV hangul vowel jamo */ + "_^^%%%^^^_%____%%%__^^^____%___%", /* JT hangul trailing jamo */ + "_^^%%%^^^______%%%__^^^_____%__%", /* RI regional indicator */ + "_^^%%%^^^_%____%%%__^^^_______%%", /* EB emoji base */ + "_^^%%%^^^_%____%%%__^^^________%", /* EM emoji modifier */ + "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* ZWJ zero width joiner */ +}; + +static fz_html_box * +find_flow_encloser(fz_context *ctx, fz_html_box *flow) +{ + /* This code was written to assume that there will always be a + * flow box enclosing callers of this. Bug 705324 shows that + * this isn't always the case. In the absence of a reproducer + * file, all I can do is try to patch around the issue so that + * we won't crash. 
*/ + while (flow->type != BOX_FLOW) + { + if (flow->up == NULL) + { + fz_warn(ctx, "Flow encloser not found. Please report this file!"); + break; + } + flow = flow->up; + } + return flow; +} + +static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g) +{ + fz_html_box *flow; + fz_pool *pool = g->pool; + int collapse = box->style->white_space & WS_COLLAPSE; + int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE; + int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE; + + static const char *space = " "; + + flow = find_flow_encloser(ctx, box); + if (flow == NULL) + return; + + while (*text) + { + if (bnl && (*text == '\n' || *text == '\r')) + { + if (text[0] == '\r' && text[1] == '\n') + text += 2; + else + text += 1; + add_flow_break(ctx, pool, flow, box); + g->at_bol = 1; + } + else if (iswhite(*text)) + { + if (collapse) + { + if (bnl) + while (*text == ' ' || *text == '\t') + ++text; + else + while (iswhite(*text)) + ++text; + g->emit_white = box; + } + else + { + // TODO: tabs + if (bsp) + add_flow_space(ctx, pool, flow, box); + else + add_flow_word(ctx, pool, flow, box, space, space+1, lang); + ++text; + } + g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */ + } + else + { + const char *prev, *mark = text; + int c; + + flush_space(ctx, flow, lang, g); + + if (g->at_bol) + g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; + + while (*text && !iswhite(*text)) + { + prev = text; + text += fz_chartorune(&c, text); + if (c == 0xAD) /* soft hyphen */ + { + if (mark != prev) + add_flow_word(ctx, pool, flow, box, mark, prev, lang); + add_flow_shyphen(ctx, pool, flow, box); + mark = text; + g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */ + } + else if (bsp) /* allow soft breaks */ + { + int this_brk_cls = ucdn_get_resolved_linebreak_class(c); + if (this_brk_cls <= UCDN_LINEBREAK_CLASS_ZWJ) + { + int brk = pairbrk[g->last_brk_cls][this_brk_cls]; + + 
/* we handle spaces elsewhere, so ignore these classes */ + if (brk == '@') brk = '^'; + if (brk == '#') brk = '^'; + if (brk == '%') brk = '^'; + + if (brk == '_') + { + if (mark != prev) + add_flow_word(ctx, pool, flow, box, mark, prev, lang); + add_flow_sbreak(ctx, pool, flow, box); + mark = prev; + } + + g->last_brk_cls = this_brk_cls; + } + } + } + if (mark != text) + add_flow_word(ctx, pool, flow, box, mark, text, lang); + + g->at_bol = 0; + } + } +} + +static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src) +{ + char path[2048]; + fz_image *img = NULL; + fz_buffer *buf = NULL; + + fz_var(img); + fz_var(buf); + + fz_try(ctx) + { + if (!strncmp(src, "data:image/jpeg;base64,", 23)) + buf = fz_new_buffer_from_base64(ctx, src+23, 0); + else if (!strncmp(src, "data:image/png;base64,", 22)) + buf = fz_new_buffer_from_base64(ctx, src+22, 0); + else if (!strncmp(src, "data:image/gif;base64,", 22)) + buf = fz_new_buffer_from_base64(ctx, src+22, 0); + else + { + fz_strlcpy(path, base_uri, sizeof path); + fz_strlcat(path, "/", sizeof path); + fz_strlcat(path, src, sizeof path); + fz_urldecode(path); + fz_cleanname(path); + buf = fz_read_archive_entry(ctx, zip, path); + } +#if FZ_ENABLE_SVG + if (strstr(src, ".svg")) + img = fz_new_image_from_svg(ctx, buf, base_uri, zip); + else +#endif + img = fz_new_image_from_buffer(ctx, buf); + } + fz_always(ctx) + fz_drop_buffer(ctx, buf); + fz_catch(ctx) + { + fz_ignore_error(ctx); + fz_warn(ctx, "html: cannot load image src='%s'", src); + } + + return img; +} + +static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, + fz_xml_doc *xmldoc, fz_xml *node) +{ + fz_image *img = NULL; +#if FZ_ENABLE_SVG + fz_try(ctx) + img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip); + fz_catch(ctx) + { + fz_ignore_error(ctx); + fz_warn(ctx, "html: cannot load embedded svg document"); + } +#endif + return img; +} + +static void generate_image(fz_context 
*ctx, fz_html_box *box, fz_image *img, struct genstate *g) +{ + fz_html_box *flow; + fz_pool *pool = g->pool; + + flow = find_flow_encloser(ctx, box); + + flush_space(ctx, flow, 0, g); + + if (!img) + { + const char *alt = "[image]"; + add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0); + } + else + { + fz_try(ctx) + { + add_flow_sbreak(ctx, pool, flow, box); + add_flow_image(ctx, pool, flow, box, img); + add_flow_sbreak(ctx, pool, flow, box); + } + fz_always(ctx) + { + fz_drop_image(ctx, img); + } + fz_catch(ctx) + fz_rethrow(ctx); + } + + g->at_bol = 0; +} + +static void fz_drop_html_box(fz_context *ctx, fz_html_box *box) +{ + while (box) + { + fz_html_box *next = box->next; + if (box->type == BOX_FLOW) + fz_drop_html_flow(ctx, box->u.flow.head); + fz_drop_html_box(ctx, box->down); + box = next; + } +} + +static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor) +{ + fz_html *html = (fz_html *)stor; + fz_drop_html_box(ctx, html->tree.root); + fz_drop_pool(ctx, html->tree.pool); +} + +static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor) +{ + fz_story *story = (fz_story *)stor; + fz_free(ctx, story->user_css); + fz_drop_html_font_set(ctx, story->font_set); + fz_drop_xml(ctx, story->dom); + fz_drop_html_box(ctx, story->tree.root); + fz_drop_buffer(ctx, story->warnings); + fz_drop_archive(ctx, story->zip); + /* The pool must be the last thing dropped. */ + fz_drop_pool(ctx, story->tree.pool); +} + +/* Drop a structure derived from an html_tree. The exact things + * freed here will depend upon the drop function with which it + * was created. 
*/ +static void +fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree) +{ + fz_defer_reap_start(ctx); + fz_drop_storable(ctx, &tree->storable); + fz_defer_reap_end(ctx); +} + +void fz_drop_html(fz_context *ctx, fz_html *html) +{ + fz_drop_html_tree(ctx, &html->tree); +} + +void fz_drop_story(fz_context *ctx, fz_story *story) +{ + if (!story) + return; + + fz_drop_html_tree(ctx, &story->tree); +} + +fz_html *fz_keep_html(fz_context *ctx, fz_html *html) +{ + return fz_keep_storable(ctx, &html->tree.storable); +} + +static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style) +{ + fz_html_box *box; + const char *tag = fz_xml_tag(node); + const char *id = fz_xml_att(node, "id"); + const char *href; + + if (type == BOX_INLINE) + box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u)); + else if (type == BOX_FLOW) + box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.flow)); + else + box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.block)); + + box->type = type; + box->is_first_flow = 0; + box->markup_dir = g->markup_dir; + box->heading = 0; + box->list_item = 0; + + box->style = fz_css_enlist(ctx, style, &g->styles, g->pool); + + if (tag) + { + box->tag = find_known_html_tag(tag); + if (!box->tag && g->is_fb2) + box->tag = find_known_fb2_tag(tag); + if (!box->tag) + box->tag = fz_pool_strdup(ctx, g->pool, tag); + } + else + { + box->tag = "#anon"; + } + + if (id) + box->id = fz_pool_strdup(ctx, g->pool, id); + + if (tag && tag[0]=='a' && tag[1]==0) + { + // Support deprecated anchor syntax with id in "name" instead of "id" attribute. 
+ if (!id) + { + const char *name = fz_xml_att(node, "name"); + if (name) + box->id = fz_pool_strdup(ctx, g->pool, name); + } + + if (g->is_fb2) + { + href = fz_xml_att(node, "l:href"); + if (!href) + href = fz_xml_att(node, "xlink:href"); + } + else + { + href = fz_xml_att(node, "href"); + } + if (href) + g->href = fz_pool_strdup(ctx, g->pool, href); + } + + if (g->href) + box->href = g->href; + + if (type == BOX_FLOW) + { + box->u.flow.head = NULL; + box->s.build.flow_tail = &box->u.flow.head; + } + + return box; +} + +static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child) +{ + child->up = parent; + if (!parent->down) + parent->down = child; + if (parent->s.build.last_child) + parent->s.build.last_child->next = child; + parent->s.build.last_child = child; +} + +static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box) +{ + while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) + box = box->up; + return box; +} + +static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box) +{ + fz_html_box *look = box; + while (look && look->type != BOX_TABLE) + look = look->up; + if (look) + return look; + fz_warn(ctx, "table-row not inside table element"); + return NULL; +} + +static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box) +{ + fz_html_box *look = box; + while (look && look->type != BOX_TABLE_ROW) + look = look->up; + if (look) + return look; + fz_warn(ctx, "table-cell not inside table-row element"); + return NULL; +} + +static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box) +{ + fz_css_style style; + fz_html_box *flow_box; + + if (box->type == BOX_FLOW || box->type == BOX_INLINE) + return box; + + // We have an inline element that is not in an existing flow/inline context. + + // Find the closest block level box to insert content into. 
+ while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) + box = box->up; + + // Concatenate onto the last open flow box if we have one. + if (box->s.build.last_child && box->s.build.last_child->type == BOX_FLOW) + return box->s.build.last_child; + + // No flow box found, create and insert one! + + // TODO: null style instead of default for flow box? + fz_default_css_style(ctx, &style); + flow_box = new_box(ctx, g, NULL, BOX_FLOW, &style); + flow_box->is_first_flow = !box->down; + g->at_bol = 1; + + append_box(ctx, box, flow_box); + + return flow_box; +} + +static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match); + +static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) +{ + fz_html_box *anon_box; + fz_css_style style; + const char *text; + int collapse; + + text = fz_xml_text(node); + collapse = root_box->style->white_space & WS_COLLAPSE; + if (collapse && is_all_white(text)) + { + g->emit_white = root_box; + } + else + { + if (root_box->type != BOX_INLINE) + { + /* Create anonymous inline box, with the same style as the top block box. 
*/ + style = *root_box->style; + + // Make sure not to recursively multiply font sizes + style.font_size.value = 1; + style.font_size.unit = N_SCALE; + + root_box = find_inline_context(ctx, g, root_box); + anon_box = new_box(ctx, g, NULL, BOX_INLINE, &style); + append_box(ctx, root_box, anon_box); + root_box = anon_box; + } + + generate_text(ctx, root_box, text, g->markup_lang, g); + } +} + +static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) +{ + fz_html_box *this_box; + fz_html_box *flow_box; + root_box = find_inline_context(ctx, g, root_box); + this_box = new_box(ctx, g, node, BOX_INLINE, style); + append_box(ctx, root_box, this_box); + if (this_box->id) + { + flow_box = find_flow_encloser(ctx, this_box); + add_flow_anchor(ctx, g->pool, flow_box, this_box); + } + return this_box; +} + +static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) +{ + fz_html_box *this_box; + fz_html_box *flow_box; + + if (root_box->type != BOX_INLINE) + { + /* Create inline box to hold the <br> tag, with the same style as containing block. */ + /* Make sure not to recursively multiply font sizes. 
*/ + fz_css_style style = *root_box->style; + style.font_size.value = 1; + style.font_size.unit = N_SCALE; + this_box = new_box(ctx, g, node, BOX_INLINE, &style); + append_box(ctx, find_inline_context(ctx, g, root_box), this_box); + } + else + { + this_box = root_box; + } + + flow_box = find_flow_encloser(ctx, this_box); + add_flow_break(ctx, g->pool, flow_box, this_box); + g->at_bol = 1; +} + +static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) +{ + fz_html_box *this_box; + root_box = find_block_context(ctx, root_box); + this_box = new_box(ctx, g, node, BOX_BLOCK, style); + append_box(ctx, root_box, this_box); + return this_box; +} + +static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) +{ + fz_html_box *this_box; + root_box = find_block_context(ctx, root_box); + this_box = new_box(ctx, g, node, BOX_TABLE, style); + append_box(ctx, root_box, this_box); + return this_box; +} + +static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) +{ + fz_html_box *this_box, *table_box; + + table_box = find_table_row_context(ctx, root_box); + if (!table_box) + return gen2_block(ctx, g, root_box, node, style); + + this_box = new_box(ctx, g, node, BOX_TABLE_ROW, style); + append_box(ctx, table_box, this_box); + return this_box; +} + +static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) +{ + fz_html_box *this_box, *row_box; + + row_box = find_table_cell_context(ctx, root_box); + if (!row_box) + return gen2_block(ctx, g, root_box, node, style); + + this_box = new_box(ctx, g, node, BOX_TABLE_CELL, style); + append_box(ctx, row_box, this_box); + return this_box; +} + +static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int 
display, fz_css_style *style) +{ + fz_html_box *img_block_box; + fz_html_box *img_inline_box; + + if (display == DIS_INLINE || display == DIS_INLINE_BLOCK) + { + root_box = find_inline_context(ctx, g, root_box); + img_inline_box = new_box(ctx, g, node, BOX_INLINE, style); + append_box(ctx, root_box, img_inline_box); + generate_image(ctx, img_inline_box, img, g); + } + else + { + root_box = find_block_context(ctx, root_box); + img_block_box = new_box(ctx, g, node, BOX_BLOCK, style); + append_box(ctx, root_box, img_block_box); + + root_box = find_inline_context(ctx, g, img_block_box); + img_inline_box = new_box(ctx, g, NULL, BOX_INLINE, style); + append_box(ctx, root_box, img_inline_box); + generate_image(ctx, img_inline_box, img, g); + } +} + +static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) +{ + const char *src = fz_xml_att(node, "src"); + if (src) + { + fz_css_style local_style = *style; + fz_image *img; + int w, h; + const char *w_att = fz_xml_att(node, "width"); + const char *h_att = fz_xml_att(node, "height"); + + if (w_att && (w = fz_atoi(w_att)) > 0) + { + local_style.width.value = w; + local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH; + } + if (h_att && (h = fz_atoi(h_att)) > 0) + { + local_style.height.value = h; + local_style.height.unit = strchr(h_att, '%') ? 
N_PERCENT : N_LENGTH; + } + + img = load_html_image(ctx, g->zip, g->base_uri, src); + gen2_image_common(ctx, g, root_box, node, img, display, &local_style); + } +} + +static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) +{ + const char *src = fz_xml_att(node, "l:href"); + if (!src) + src = fz_xml_att(node, "xlink:href"); + if (src && src[0] == '#') + { + fz_image *img = fz_tree_lookup(ctx, g->images, src+1); + gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style); + } +} + +static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) +{ + fz_image *img = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node); + gen2_image_common(ctx, g, root_box, node, img, display, style); +} + +static int get_heading_from_tag(fz_context *ctx, struct genstate *g, const char *tag) +{ + if (tag[0] == 'h' && tag[1] != 0 && tag[2] == 0) + { + switch (tag[1]) + { + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + } + } + if (g->is_fb2) + { + if (!strcmp(tag, "title") || !strcmp(tag, "subtitle")) + return fz_mini(g->section_depth, 6); + } + return 0; +} + +static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, + fz_css_match *match, int display, fz_css_style *style) +{ + fz_html_box *this_box; + const char *tag; + const char *lang_att; + const char *dir_att; + + int save_markup_dir = g->markup_dir; + int save_markup_lang = g->markup_lang; + char *save_href = g->href; + + if (display == DIS_NONE) + return; + + tag = fz_xml_tag(node); + + dir_att = fz_xml_att(node, "dir"); + if (dir_att) + { + if (!strcmp(dir_att, "auto")) + g->markup_dir = FZ_BIDI_NEUTRAL; + else if (!strcmp(dir_att, "rtl")) + g->markup_dir = FZ_BIDI_RTL; + else if (!strcmp(dir_att, "ltr")) + g->markup_dir = 
FZ_BIDI_LTR; + else + g->markup_dir = DEFAULT_DIR; + } + + lang_att = fz_xml_att(node, "lang"); + if (lang_att) + g->markup_lang = fz_text_language_from_string(lang_att); + + switch (display) + { + case DIS_INLINE_BLOCK: + // TODO handle inline block as a flow node + this_box = gen2_block(ctx, g, root_box, node, style); + break; + + case DIS_BLOCK: + this_box = gen2_block(ctx, g, root_box, node, style); + this_box->heading = get_heading_from_tag(ctx, g, tag); + break; + + case DIS_LIST_ITEM: + this_box = gen2_block(ctx, g, root_box, node, style); + this_box->list_item = ++g->list_counter; + break; + + // TODO: https://www.w3.org/TR/CSS2/tables.html#anonymous-boxes + // + // The table generation code should insert and create anonymous boxes + // for any missing child/parent elements. + // + // MISSING CHILDREN: + // 1: Wrap consecutive BLOCK found in a TABLE in an anon TABLE_ROW. + // 2: Wrap consecutive BLOCK found in a TABLE_ROW in an anon TABLE_CELL. + // + // MISSING PARENTS: + // 1: Wrap consecutive TABLE_CELL found outside TABLE_ROW in an anon TABLE_ROW + // 2: Wrap consecutive TABLE_ROW found outside TABLE in an anon TABLE + // + // For now we ignore this and treat any such elements that are out of + // context as plain block elements. 
+ + case DIS_TABLE: + this_box = gen2_table(ctx, g, root_box, node, style); + break; + case DIS_TABLE_GROUP: + // no box for table-row-group elements + this_box = root_box; + break; + case DIS_TABLE_ROW: + this_box = gen2_table_row(ctx, g, root_box, node, style); + break; + case DIS_TABLE_CELL: + this_box = gen2_table_cell(ctx, g, root_box, node, style); + break; + + case DIS_INLINE: + default: + this_box = gen2_inline(ctx, g, root_box, node, style); + break; + } + + if (tag && (!strcmp(tag, "ol") || !strcmp(tag, "ul") || !strcmp(tag, "dl"))) + { + int save_list_counter = g->list_counter; + g->list_counter = 0; + gen2_children(ctx, g, this_box, node, match); + g->list_counter = save_list_counter; + } + else if (tag && !strcmp(tag, "section")) + { + int save_section_depth = g->section_depth; + g->section_depth++; + gen2_children(ctx, g, this_box, node, match); + g->section_depth = save_section_depth; + } + else + { + gen2_children(ctx, g, this_box, node, match); + } + + g->markup_dir = save_markup_dir; + g->markup_lang = save_markup_lang; + g->href = save_href; +} + +static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match) +{ + fz_xml *node; + const char *tag; + fz_css_match match; + fz_css_style style; + int display; + + for (node = fz_xml_down(root_node); node; node = fz_xml_next(node)) + { + tag = fz_xml_tag(node); + if (tag) + { + fz_match_css(ctx, &match, root_match, g->css, node); + fz_apply_css_style(ctx, g->set, &style, &match); + display = fz_get_css_match_display(&match); + if (tag[0]=='b' && tag[1]=='r' && tag[2]==0) + { + gen2_break(ctx, g, root_box, node); + } + else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0) + { + gen2_image_html(ctx, g, root_box, node, display, &style); + } + else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0) + { + gen2_image_fb2(ctx, g, root_box, node, display, &style); + } + else if 
(tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0) + { + gen2_image_svg(ctx, g, root_box, node, display, &style); + } + else + { + gen2_tag(ctx, g, root_box, node, &match, display, &style); + } + } + else + { + gen2_text(ctx, g, root_box, node); + } + } +} + +static char *concat_text(fz_context *ctx, fz_xml *root) +{ + fz_xml *node; + size_t i = 0, n = 1; + char *s; + for (node = fz_xml_down(root); node; node = fz_xml_next(node)) + { + const char *text = fz_xml_text(node); + n += text ? strlen(text) : 0; + } + s = Memento_label(fz_malloc(ctx, n), "concat_html"); + for (node = fz_xml_down(root); node; node = fz_xml_next(node)) + { + const char *text = fz_xml_text(node); + if (text) + { + n = strlen(text); + memcpy(s+i, text, n); + i += n; + } + } + s[i] = 0; + return s; +} + +static void +html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href) +{ + char path[2048]; + char css_base_uri[2048]; + fz_buffer *buf; + + fz_var(buf); + + fz_strlcpy(path, base_uri, sizeof path); + fz_strlcat(path, "/", sizeof path); + fz_strlcat(path, href, sizeof path); + fz_urldecode(path); + fz_cleanname(path); + + fz_dirname(css_base_uri, path, sizeof css_base_uri); + + buf = NULL; + fz_try(ctx) + { + buf = fz_read_archive_entry(ctx, zip, path); + fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path); + fz_add_css_font_faces(ctx, set, zip, css_base_uri, css); + } + fz_always(ctx) + fz_drop_buffer(ctx, buf); + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "ignoring stylesheet %s", path); + } +} + +static void +html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) +{ + fz_xml *html, *head, *node; + + html = fz_xml_find(root, "html"); + head = fz_xml_find_down(html, "head"); + for (node = fz_xml_down(head); node; node = fz_xml_next(node)) + { + if (fz_xml_is_tag(node, "link")) 
+ { + char *rel = fz_xml_att(node, "rel"); + if (rel && !fz_strcasecmp(rel, "stylesheet")) + { + char *type = fz_xml_att(node, "type"); + if ((type && !strcmp(type, "text/css")) || !type) + { + char *href = fz_xml_att(node, "href"); + if (href) + { + html_load_css_link(ctx, set, zip, base_uri, css, root, href); + } + } + } + } + else if (fz_xml_is_tag(node, "style")) + { + char *s = concat_text(ctx, node); + fz_try(ctx) + { + fz_parse_css(ctx, css, s, "<style>"); + fz_add_css_font_faces(ctx, set, zip, base_uri, css); + } + fz_always(ctx) + fz_free(ctx, s); + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "ignoring inline stylesheet"); + } + } + } +} + +static void +fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) +{ + fz_xml *fictionbook, *stylesheet; + + fictionbook = fz_xml_find(root, "FictionBook"); + stylesheet = fz_xml_find_down(fictionbook, "stylesheet"); + if (stylesheet) + { + char *s = concat_text(ctx, stylesheet); + fz_try(ctx) + { + fz_parse_css(ctx, css, s, "<stylesheet>"); + fz_add_css_font_faces(ctx, set, zip, base_uri, css); + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "ignoring inline stylesheet"); + } + fz_free(ctx, s); + } +} + +static fz_tree * +load_fb2_images(fz_context *ctx, fz_xml *root) +{ + fz_xml *fictionbook, *binary; + fz_tree *images = NULL; + + fictionbook = fz_xml_find(root, "FictionBook"); + for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary")) + { + const char *id = fz_xml_att(binary, "id"); + char *b64 = NULL; + fz_buffer *buf = NULL; + fz_image *img = NULL; + + fz_var(b64); + fz_var(buf); + + if (id == NULL) + { + fz_warn(ctx, "Skipping image with no id"); + continue; + } + + fz_try(ctx) + { + b64 = concat_text(ctx, binary); + buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64)); + img = 
fz_new_image_from_buffer(ctx, buf); + } + fz_always(ctx) + { + fz_drop_buffer(ctx, buf); + fz_free(ctx, b64); + } + fz_catch(ctx) + fz_rethrow(ctx); + + images = fz_tree_insert(ctx, images, id, img); + } + + return images; +} + +typedef struct +{ + uint32_t *data; + size_t cap; + size_t len; +} uni_buf; + +typedef struct +{ + fz_context *ctx; + fz_pool *pool; + fz_html_flow *flow; + uni_buf *buffer; +} bidi_data; + +static void fragment_cb(const uint32_t *fragment, + size_t fragment_len, + int bidi_level, + int script, + void *arg) +{ + bidi_data *data = (bidi_data *)arg; + + /* We are guaranteed that fragmentOffset will be at the beginning + * of flow. */ + while (fragment_len > 0) + { + size_t len; + + if (data->flow->type == FLOW_SPACE) + { + len = 1; + } + else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK || + data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR) + { + len = 0; + } + else + { + /* Must be text */ + len = fz_utflen(data->flow->content.text); + if (len > fragment_len) + { + /* We need to split this flow box */ + (void)fz_html_split_flow(data->ctx, data->pool, data->flow, fragment_len); + len = fz_utflen(data->flow->content.text); + } + } + + /* This flow box is entirely contained within this fragment. */ + data->flow->bidi_level = bidi_level; + data->flow->script = script; + data->flow = data->flow->next; + fragment_len -= len; + } +} + +static fz_bidi_direction +detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow) +{ + fz_html_flow *end = flow; + bidi_data data; + + while (end) + { + unsigned int level = end->bidi_level; + + /* Gather the text from the flow up into a single buffer (at + * least, as much of it as has the same direction markup). 
*/ + buffer->len = 0; + while (end && (level & 1) == (end->bidi_level & 1)) + { + size_t len = 0; + const char *text = ""; + int broken = 0; + + switch (end->type) + { + case FLOW_WORD: + len = fz_utflen(end->content.text); + text = end->content.text; + break; + case FLOW_SPACE: + len = 1; + text = " "; + break; + case FLOW_SHYPHEN: + case FLOW_SBREAK: + break; + case FLOW_BREAK: + case FLOW_IMAGE: + broken = 1; + break; + } + + end = end->next; + + if (broken) + break; + + /* Make sure the buffer is large enough */ + if (buffer->len + len > buffer->cap) + { + size_t newcap = buffer->cap; + if (newcap < 128) + newcap = 128; /* Sensible small default */ + + while (newcap < buffer->len + len) + newcap = (newcap * 3) / 2; + + buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t); + buffer->cap = newcap; + } + + /* Expand the utf8 text into Unicode and store it in the buffer */ + while (*text) + { + int rune; + text += fz_chartorune(&rune, text); + buffer->data[buffer->len++] = rune; + } + } + + /* Detect directionality for the buffer */ + data.ctx = ctx; + data.pool = pool; + data.flow = flow; + data.buffer = buffer; + fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */); + flow = end; + } + return bidi_dir; +} + +static void +detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box) +{ + while (box) + { + if (box->type == BOX_FLOW) + box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->u.flow.head); + detect_box_directionality(ctx, pool, buffer, box->down); + box = box->next; + } +} + +static void +detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box) +{ + uni_buf buffer = { NULL }; + + fz_try(ctx) + detect_box_directionality(ctx, pool, &buffer, box); + fz_always(ctx) + fz_free(ctx, buffer.data); + fz_catch(ctx) + fz_rethrow(ctx); +} + +static fz_xml_doc * +parse_to_xml(fz_context *ctx, fz_buffer *buf, int try_xml, int 
try_html5) +{ + fz_xml_doc *xml; + + if (try_xml && try_html5) + { + fz_try(ctx) + xml = fz_parse_xml(ctx, buf, 1); + fz_catch(ctx) + { + if (fz_caught(ctx) == FZ_ERROR_SYNTAX) + { + fz_report_error(ctx); + fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser"); + xml = fz_parse_xml_from_html5(ctx, buf); + } + else + fz_rethrow(ctx); + } + } + else if (try_xml) + xml = fz_parse_xml(ctx, buf, 1); + else + { + assert(try_html5); + xml = fz_parse_xml_from_html5(ctx, buf); + } + + return xml; +} + +static void move_background_color_style_up(fz_context *ctx, struct genstate *g, fz_html_box *root, fz_html_box *from) +{ + fz_css_color transparent = { 0, 0, 0, 0 }; + fz_css_style s1, s2; + memcpy(&s1, root->style, sizeof s1); + memcpy(&s2, from->style, sizeof s2); + s1.background_color = s2.background_color; + s2.background_color = transparent; + root->style = fz_css_enlist(ctx, &s1, &g->styles, g->pool); + from->style = fz_css_enlist(ctx, &s2, &g->styles, g->pool); +} + +static void move_background_color_up(fz_context *ctx, struct genstate *g, fz_html_box *root) +{ + fz_html_box *html, *body; + + if (root->style->background_color.a != 0) + { + return; + } + + html = root->down; + if (html && !strcmp(html->tag, "html")) + { + if (html->style->background_color.a != 0) + { + move_background_color_style_up(ctx, g, root, html); + return; + } + + body = html->down; + if (body && !strcmp(body->tag, "body")) + { + if (body->style->background_color.a != 0) + { + move_background_color_style_up(ctx, g, root, body); + return; + } + } + } +} + +static void +xml_to_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, const char *user_css, + fz_xml_doc *xml, fz_html_tree *tree, char **rtitle, int try_fictionbook, int is_mobi) +{ + fz_xml *root, *node; + char *title; + + fz_css_match root_match, match; + struct genstate g = {0}; + + g.pool = NULL; + g.set = set; + g.zip = zip; + g.images = NULL; + g.xml = xml; + g.is_fb2 = 0; + g.base_uri = 
base_uri; + g.css = NULL; + g.at_bol = 0; + g.emit_white = 0; + g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP; + g.list_counter = 0; + g.section_depth = 0; + g.markup_dir = FZ_BIDI_LTR; + g.markup_lang = FZ_LANG_UNSET; + g.href = NULL; + g.styles = NULL; + + if (rtitle) + *rtitle = NULL; + + root = fz_xml_root(g.xml); + g.css = fz_new_css(ctx); + +#ifndef NDEBUG + if (fz_atoi(getenv("FZ_DEBUG_XML"))) + fz_debug_xml(root, 0); +#endif + + fz_try(ctx) + { + if (try_fictionbook && fz_xml_find(root, "FictionBook")) + { + g.is_fb2 = 1; + fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>"); + if (fz_use_document_css(ctx)) + fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); + g.images = load_fb2_images(ctx, root); + } + else if (is_mobi) + { + g.is_fb2 = 0; + fz_parse_css(ctx, g.css, html_default_css, "<default:html>"); + fz_parse_css(ctx, g.css, mobi_default_css, "<default:mobi>"); + if (fz_use_document_css(ctx)) + html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); + } + else + { + g.is_fb2 = 0; + fz_parse_css(ctx, g.css, html_default_css, "<default:html>"); + if (fz_use_document_css(ctx)) + html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); + } + + if (user_css) + { + fz_parse_css(ctx, g.css, user_css, "<user>"); + fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css); + } + } + fz_catch(ctx) + { + fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); + fz_drop_css(ctx, g.css); + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "ignoring styles"); + g.css = fz_new_css(ctx); + g.images = NULL; + } + +#ifndef NDEBUG + if (fz_atoi(getenv("FZ_DEBUG_CSS"))) + fz_debug_css(ctx, g.css); +#endif + + fz_try(ctx) + { + fz_css_style style; + int display; + + fz_match_css_at_page(ctx, &root_match, g.css); + fz_apply_css_style(ctx, g.set, &style, &root_match); + + g.pool = tree->pool; + g.markup_dir = DEFAULT_DIR; + g.markup_lang = FZ_LANG_UNSET; + + // Create root node + 
tree->root = new_box(ctx, &g, NULL, BOX_BLOCK, &style); + // TODO: transfer page margins out of this hacky box + + tree->root->tag = ":root"; + tree->root->s.layout.em = 0; + tree->root->s.layout.x = 0; + tree->root->s.layout.y = 0; + tree->root->s.layout.w = 0; + tree->root->s.layout.b = 0; + + // Create document node (html). + fz_match_css(ctx, &match, &root_match, g.css, root); + fz_apply_css_style(ctx, g.set, &style, &match); + display = fz_get_css_match_display(&match); + gen2_tag(ctx, &g, tree->root, root, &match, display, &style); + + detect_directionality(ctx, g.pool, tree->root); + + if (g.is_fb2) + { + node = fz_xml_find(root, "FictionBook"); + node = fz_xml_find_down(node, "description"); + node = fz_xml_find_down(node, "title-info"); + node = fz_xml_find_down(node, "book-title"); + if (rtitle) + { + title = fz_xml_text(fz_xml_down(node)); + if (title) + *rtitle = fz_pool_strdup(ctx, g.pool, title); + } + } + else + { + node = fz_xml_find(root, "html"); + node = fz_xml_find_down(node, "head"); + node = fz_xml_find_down(node, "title"); + if (rtitle) + { + title = fz_xml_text(fz_xml_down(node)); + if (title) + *rtitle = fz_pool_strdup(ctx, g.pool, title); + } + + // Move html or body background-color to :root. 
+ move_background_color_up(ctx, &g, tree->root); + } + } + fz_always(ctx) + { + fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); + fz_drop_css(ctx, g.css); + } + fz_catch(ctx) + { + if (rtitle) + { + fz_free(ctx, *rtitle); + *rtitle = NULL; + } + fz_rethrow(ctx); + } +} + +static const char *mobi_font_size[7] = { + "0.67em", + "0.83em", + "1em", + "1.17em", + "1.33em", + "1.5em", + "1.67em", +}; + +static void +patch_mobi_html(fz_context *ctx, fz_pool *pool, fz_xml *node) +{ + fz_xml *down; + char buf[500]; + while (node) + { + char *tag = fz_xml_tag(node); + if (tag) + { + // Read MOBI attributes, convert to inline CSS style + if (!strcmp(tag, "font")) + { + const char *size = fz_xml_att(node, "size"); + if (size) + { + if (!strcmp(size, "1")) size = mobi_font_size[0]; + else if (!strcmp(size, "2")) size = mobi_font_size[1]; + else if (!strcmp(size, "3")) size = mobi_font_size[2]; + else if (!strcmp(size, "4")) size = mobi_font_size[3]; + else if (!strcmp(size, "5")) size = mobi_font_size[4]; + else if (!strcmp(size, "6")) size = mobi_font_size[5]; + else if (!strcmp(size, "7")) size = mobi_font_size[6]; + else if (!strcmp(size, "+1")) size = mobi_font_size[3]; + else if (!strcmp(size, "+2")) size = mobi_font_size[4]; + else if (!strcmp(size, "+3")) size = mobi_font_size[5]; + else if (!strcmp(size, "+4")) size = mobi_font_size[6]; + else if (!strcmp(size, "+5")) size = mobi_font_size[6]; + else if (!strcmp(size, "+6")) size = mobi_font_size[6]; + else if (!strcmp(size, "-1")) size = mobi_font_size[1]; + else if (!strcmp(size, "-2")) size = mobi_font_size[0]; + else if (!strcmp(size, "-3")) size = mobi_font_size[0]; + else if (!strcmp(size, "-4")) size = mobi_font_size[0]; + else if (!strcmp(size, "-5")) size = mobi_font_size[0]; + else if (!strcmp(size, "-6")) size = mobi_font_size[0]; + fz_snprintf(buf, sizeof buf, "font-size:%s", size); + fz_xml_add_att(ctx, pool, node, "style", buf); + } + } + else + { + char *height = fz_xml_att(node, 
"height"); + char *width = fz_xml_att(node, "width"); + char *align = fz_xml_att(node, "align"); + if (height || width || align) + { + buf[0] = 0; + if (height) + { + fz_strlcat(buf, "margin-top:", sizeof buf); + fz_strlcat(buf, height, sizeof buf); + fz_strlcat(buf, ";", sizeof buf); + } + if (width) + { + fz_strlcat(buf, "text-indent:", sizeof buf); + fz_strlcat(buf, width, sizeof buf); + fz_strlcat(buf, ";", sizeof buf); + } + if (align) + { + fz_strlcat(buf, "text-align:", sizeof buf); + fz_strlcat(buf, align, sizeof buf); + fz_strlcat(buf, ";", sizeof buf); + } + fz_xml_add_att(ctx, pool, node, "style", buf); + } + if (!strcmp(tag, "img")) + { + char *recindex = fz_xml_att(node, "recindex"); + if (recindex) + fz_xml_add_att(ctx, pool, node, "src", recindex); + } + } + } + + down = fz_xml_down(node); + if (down) + patch_mobi_html(ctx, pool, down); + + node = fz_xml_next(node); + } +} + +static void +fz_parse_html_tree(fz_context *ctx, + fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css, + int try_xml, int try_html5, fz_html_tree *tree, char **rtitle, int try_fictionbook, int patch_mobi) +{ + fz_xml_doc *xml; + + if (rtitle) + *rtitle = NULL; + + xml = parse_to_xml(ctx, buf, try_xml, try_html5); + + if (patch_mobi) + patch_mobi_html(ctx, xml->u.doc.pool, xml); + + fz_try(ctx) + xml_to_boxes(ctx, set, zip, base_uri, user_css, xml, tree, rtitle, try_fictionbook, patch_mobi); + fz_always(ctx) + fz_drop_xml(ctx, xml); + fz_catch(ctx) + fz_rethrow(ctx); +} + +#define fz_new_derived_html_tree(CTX, TYPE, DROP) \ + ((TYPE *)Memento_label(fz_new_html_tree_of_size(CTX, sizeof(TYPE), DROP), #TYPE)) + +static fz_html_tree * +fz_new_html_tree_of_size(fz_context *ctx, size_t size, fz_store_drop_fn *drop) +{ + fz_pool *pool = fz_new_pool(ctx); + fz_html_tree *tree; + + fz_try(ctx) + { + tree = fz_pool_alloc(ctx, pool, size); + FZ_INIT_STORABLE(tree, 1, drop); + tree->pool = pool; + } + fz_catch(ctx) + { + fz_drop_pool(ctx, 
pool); + fz_rethrow(ctx); + } + + return tree; +} + +fz_html * +fz_parse_html(fz_context *ctx, + fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css, + int try_xml, int try_html5, int patch_mobi) +{ + fz_html *html = fz_new_derived_html_tree(ctx, fz_html, fz_drop_html_imp); + + html->layout_w = 0; + html->layout_h = 0; + html->layout_em = 0; + + fz_try(ctx) + fz_parse_html_tree(ctx, set, zip, base_uri, buf, user_css, try_xml, try_html5, &html->tree, &html->title, 1, patch_mobi); + fz_catch(ctx) + { + fz_drop_html(ctx, html); + fz_rethrow(ctx); + } + + return html; +} + +typedef struct +{ + int saved; + fz_warning_cb *old; + void *arg; + fz_buffer *buffer; + fz_context *ctx; +} warning_save; + +static void +warn_to_buffer(void *user, const char *message) +{ + warning_save *save = (warning_save *)user; + fz_context *ctx = save->ctx; + + fz_try(ctx) + { + fz_append_string(ctx, save->buffer, message); + fz_append_byte(ctx, save->buffer, '\n'); + } + fz_catch(ctx) + { + /* Silently swallow the error. 
*/ + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + } +} + +static void +redirect_warnings_to_buffer(fz_context *ctx, fz_buffer *buf, warning_save *save) +{ + save->saved = 1; + save->old = fz_warning_callback(ctx, &save->arg); + save->buffer = buf; + save->ctx = ctx; + + fz_flush_warnings(ctx); + fz_set_warning_callback(ctx, warn_to_buffer, save); +} + +static void +restore_warnings(fz_context *ctx, warning_save *save) +{ + if (!save->saved) + return; + + fz_flush_warnings(ctx); + fz_set_warning_callback(ctx, save->old, save->arg); +} + +fz_story * +fz_new_story(fz_context *ctx, fz_buffer *buf, const char *user_css, float em, fz_archive *zip) +{ + fz_story *story = fz_new_derived_html_tree(ctx, fz_story, fz_drop_story_imp); + warning_save saved = { 0 }; + fz_buffer *local_buffer = NULL; + + if (buf == NULL) + { + local_buffer = fz_new_buffer(ctx, 0); + buf = local_buffer; + } + + fz_var(local_buffer); + fz_var(saved); + + fz_try(ctx) + { + story->zip = fz_keep_archive(ctx, zip); + story->font_set = fz_new_html_font_set(ctx); + story->em = em; + story->user_css = user_css ? 
fz_strdup(ctx, user_css) : NULL; + story->warnings = fz_new_buffer(ctx, 128); + redirect_warnings_to_buffer(ctx, story->warnings, &saved); + story->dom = parse_to_xml(ctx, buf, 0, 1); + } + fz_always(ctx) + { + restore_warnings(ctx, &saved); + fz_drop_buffer(ctx, local_buffer); + } + fz_catch(ctx) + { + fz_drop_html_tree(ctx, &story->tree); + fz_rethrow(ctx); + } + + return story; +} + +fz_html * +fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css) +{ + /* try as XML first, fall back to HTML5 */ + return fz_parse_html(ctx, set, zip, base_uri, buf, user_css, 1, 1, 0); +} + +static void indent(int level) +{ + while (level-- > 0) + putchar('\t'); +} + +static void +fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level) +{ + fz_html_box *sbox = NULL; + while (flow) + { + if (flow->box != sbox) { + sbox = flow->box; + indent(level); +#ifndef NDEBUG + printf("@style <%s> em=%g font='%s'", sbox->tag, sbox->s.layout.em, fz_font_name(ctx, sbox->style->font)); +#else + printf("@style em=%g font='%s'", sbox->s.layout.em, fz_font_name(ctx, sbox->style->font)); +#endif + if (fz_font_is_serif(ctx, sbox->style->font)) + printf(" serif"); + else + printf(" sans"); + if (fz_font_is_monospaced(ctx, sbox->style->font)) + printf(" monospaced"); + if (fz_font_is_bold(ctx, sbox->style->font)) + printf(" bold"); + if (fz_font_is_italic(ctx, sbox->style->font)) + printf(" italic"); + if (sbox->style->small_caps) + printf(" small-caps"); + printf("\n"); + } + + indent(level); + switch (flow->type) { + case FLOW_WORD: printf("word "); break; + case FLOW_SPACE: printf("space"); break; + case FLOW_SBREAK: printf("sbrk "); break; + case FLOW_SHYPHEN: printf("shy "); break; + case FLOW_BREAK: printf("break"); break; + case FLOW_IMAGE: printf("image"); break; + case FLOW_ANCHOR: printf("anchor"); break; + } + // printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w); + if (flow->type == FLOW_IMAGE) + 
printf(" h=%g", flow->h); + if (flow->type == FLOW_WORD) + printf(" text='%s'", flow->content.text); + printf("\n"); + if (flow->breaks_line) { + indent(level); + printf("*\n"); + } + + flow = flow->next; + } +} + +fz_structure fz_html_tag_to_structure(const char *tag) +{ + if (!strcmp(tag, "body")) return FZ_STRUCTURE_DOCUMENT; + if (!strcmp(tag, "div")) return FZ_STRUCTURE_DIV; + if (!strcmp(tag, "span")) return FZ_STRUCTURE_SPAN; + if (!strcmp(tag, "blockquote")) return FZ_STRUCTURE_BLOCKQUOTE; + if (!strcmp(tag, "p")) return FZ_STRUCTURE_P; + if (!strcmp(tag, "h1")) return FZ_STRUCTURE_H1; + if (!strcmp(tag, "h2")) return FZ_STRUCTURE_H2; + if (!strcmp(tag, "h3")) return FZ_STRUCTURE_H3; + if (!strcmp(tag, "h4")) return FZ_STRUCTURE_H4; + if (!strcmp(tag, "h5")) return FZ_STRUCTURE_H5; + if (!strcmp(tag, "h6")) return FZ_STRUCTURE_H6; + if (!strcmp(tag, "ol")) return FZ_STRUCTURE_LIST; + if (!strcmp(tag, "ul")) return FZ_STRUCTURE_LIST; + if (!strcmp(tag, "dl")) return FZ_STRUCTURE_LIST; + if (!strcmp(tag, "li")) return FZ_STRUCTURE_LISTITEM; + if (!strcmp(tag, "table")) return FZ_STRUCTURE_TABLE; + if (!strcmp(tag, "tr")) return FZ_STRUCTURE_TR; + if (!strcmp(tag, "th")) return FZ_STRUCTURE_TH; + if (!strcmp(tag, "td")) return FZ_STRUCTURE_TD; + if (!strcmp(tag, "thead")) return FZ_STRUCTURE_THEAD; + if (!strcmp(tag, "tbody")) return FZ_STRUCTURE_TBODY; + if (!strcmp(tag, "tfoot")) return FZ_STRUCTURE_TFOOT; + return FZ_STRUCTURE_INVALID; +} + +static void +fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level) +{ + while (box) + { + indent(level); + printf("box "); + switch (box->type) { + case BOX_BLOCK: printf("block"); break; + case BOX_FLOW: printf("flow"); break; + case BOX_INLINE: printf("inline"); break; + case BOX_TABLE: printf("table"); break; + case BOX_TABLE_ROW: printf("table-row"); break; + case BOX_TABLE_CELL: printf("table-cell"); break; + } + + printf(" <%s>", box->tag); + // printf(" em=%g", box->em); + // printf(" x=%g y=%g w=%g 
b=%g", box->x, box->y, box->w, box->b); + + if (box->is_first_flow) + printf(" is-first-flow"); + if (box->list_item) + printf(" list=%d", box->list_item); + if (box->id) + printf(" id=(%s)", box->id); + if (box->href) + printf(" href=(%s)", box->href); + printf("\n"); + + if (box->type == BOX_BLOCK || box->type == BOX_TABLE) { + indent(level+1); + printf(">margin=(%g %g %g %g)\n", box->u.block.margin[0], box->u.block.margin[1], box->u.block.margin[2], box->u.block.margin[3]); + //indent(level+1); + //printf(">padding=(%g %g %g %g)\n", box->u.block.padding[0], box->u.block.padding[1], box->u.block.padding[2], box->u.block.padding[3]); + //indent(level+1); + //printf(">border=(%g %g %g %g)\n", box->u.block.border[0], box->u.block.border[1], box->u.block.border[2], box->u.block.border[3]); + } + + if (box->down) + fz_debug_html_box(ctx, box->down, level + 1); + if (box->type == BOX_FLOW) { + indent(level+1); + printf("flow\n"); + fz_debug_html_flow(ctx, box->u.flow.head, level + 2); + } + + box = box->next; + } +} + +void +fz_debug_html(fz_context *ctx, fz_html_box *box) +{ + fz_debug_html_box(ctx, box, 0); +} + +static size_t +fz_html_size(fz_context *ctx, fz_html *html) +{ + return html ? fz_pool_size(ctx, html->tree.pool) : 0; +} + +/* Magic to make html storable. 
*/ +typedef struct { + int refs; + void *doc; + int chapter_num; +} fz_html_key; + +static int +fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_) +{ + fz_html_key *key = (fz_html_key *)key_; + hash->u.pi.ptr = key->doc; + hash->u.pi.i = key->chapter_num; + return 1; +} + +static void * +fz_keep_html_key(fz_context *ctx, void *key_) +{ + fz_html_key *key = (fz_html_key *)key_; + return fz_keep_imp(ctx, key, &key->refs); +} + +static void +fz_drop_html_key(fz_context *ctx, void *key_) +{ + fz_html_key *key = (fz_html_key *)key_; + if (fz_drop_imp(ctx, key, &key->refs)) + { + fz_free(ctx, key); + } +} + +static int +fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_) +{ + fz_html_key *k0 = (fz_html_key *)k0_; + fz_html_key *k1 = (fz_html_key *)k1_; + return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num; +} + +static void +fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_) +{ + fz_html_key *key = (fz_html_key *)key_; + fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num); +} + +static const fz_store_type fz_html_store_type = +{ + "fz_html", + fz_make_hash_html_key, + fz_keep_html_key, + fz_drop_html_key, + fz_cmp_html_key, + fz_format_html_key, + NULL +}; + +fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter) +{ + fz_html_key *key = NULL; + fz_html *other_html; + + /* Stick the parsed html in the store */ + fz_var(key); + + fz_try(ctx) + { + key = fz_malloc_struct(ctx, fz_html_key); + key->refs = 1; + key->doc = doc; + key->chapter_num = chapter; + other_html = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type); + if (other_html) + { + fz_drop_html(ctx, html); + html = other_html; + } + } + fz_always(ctx) + fz_drop_html_key(ctx, key); + fz_catch(ctx) + { + /* Do nothing */ + } + + return html; +} + +fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter) +{ + fz_html_key key; + + key.refs = 1; + key.doc = doc; + key.chapter_num = chapter; + 
return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type); +} + +static int +html_filter_store(fz_context *ctx, void *doc, void *key_) +{ + fz_html_key *key = (fz_html_key *)key_; + + return (doc == key->doc); +} + +void fz_purge_stored_html(fz_context *ctx, void *doc) +{ + fz_filter_store(ctx, html_filter_store, doc, &fz_html_store_type); +} + +static void +convert_to_boxes(fz_context *ctx, fz_story *story) +{ + warning_save saved = { 0 }; + + if (story->dom == NULL) + return; + + fz_var(saved); + + fz_try(ctx) + { + redirect_warnings_to_buffer(ctx, story->warnings, &saved); + xml_to_boxes(ctx, story->font_set, story->zip, ".", story->user_css, story->dom, &story->tree, NULL, 0, 0); + } + fz_always(ctx) + { + fz_drop_xml(ctx, story->dom); + story->dom = NULL; + restore_warnings(ctx, &saved); + } + fz_catch(ctx) + fz_rethrow(ctx); +} + +int fz_place_story(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled) +{ + return fz_place_story_flags(ctx, story, where, filled, 0); +} + +int fz_place_story_flags(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled, int flags) +{ + float w, h; + + if (filled) + *filled = fz_empty_rect; + + if (story == NULL || story->complete) + return 0; + + /* Convert from XML to box model on the first attempt to place. + * The DOM is unusable from here on in. */ + convert_to_boxes(ctx, story); + + w = where.x1 - where.x0; + h = where.y1 - where.y0; + /* Confusingly, we call the layout using restart_draw, not restart_place, + * because we don't want to destroy the current values in restart_place + * in case we have to retry later. This means the values are left in + * the correct struct though! 
*/ + story->restart_draw.start = story->restart_place.start; + story->restart_draw.start_flow = story->restart_place.start_flow; + story->restart_draw.end = NULL; + story->restart_draw.end_flow = NULL; + story->restart_draw.reason = FZ_HTML_RESTART_REASON_NONE; + story->restart_draw.flags = flags; + story->bbox = where; + fz_restartable_layout_html(ctx, &story->tree, where.x0, where.y0, w, h, story->em, &story->restart_draw); + story->restart_draw.start = story->restart_place.start; + story->restart_draw.start_flow = story->restart_place.start_flow; + + if (filled) + { + fz_html_box *b = story->tree.root; + filled->x0 = b->s.layout.x - b->u.block.margin[L] - b->u.block.border[L] - b->u.block.padding[L]; + filled->x1 = b->s.layout.w + b->u.block.margin[R] + b->u.block.border[R] + b->u.block.padding[R] + b->s.layout.x; + filled->y0 = b->s.layout.y - b->u.block.margin[T] - b->u.block.border[T] - b->u.block.padding[T]; + filled->y1 = b->s.layout.b + b->u.block.margin[B] + b->u.block.border[B] + b->u.block.padding[B]; + } + +#ifndef NDEBUG + if (fz_atoi(getenv("FZ_DEBUG_HTML"))) + fz_debug_html(ctx, story->tree.root); +#endif + + if (story->restart_draw.end == NULL) + return FZ_HTML_RESTART_REASON_NONE; + if (story->restart_draw.reason == FZ_HTML_RESTART_REASON_LINE_WIDTH) + return FZ_HTML_RESTART_REASON_LINE_WIDTH; + return FZ_HTML_RESTART_REASON_LINE_HEIGHT; +} + +const char * +fz_story_warnings(fz_context *ctx, fz_story *story) +{ + unsigned char *data; + + if (!story) + return NULL; + + convert_to_boxes(ctx, story); + + fz_terminate_buffer(ctx, story->warnings); + + if (fz_buffer_storage(ctx, story->warnings, &data) == 0) + return NULL; + + return (const char *)data; +}
