Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/source/html/html-parse.c @ 16:bd5bb0742cc3
A "check" target that checks for all venv and host prerequisites
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Wed, 17 Sep 2025 21:11:25 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
// Copyright (C) 2004-2025 Artifex Software, Inc. // // This file is part of MuPDF. // // MuPDF is free software: you can redistribute it and/or modify it under the // terms of the GNU Affero General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more // details. // // You should have received a copy of the GNU Affero General Public License // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> // // Alternative licensing terms are available from the licensor. // For commercial licensing, see <https://www.artifex.com/> or contact // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, // CA 94129, USA, for further information. #include "mupdf/fitz.h" #include "mupdf/ucdn.h" #include "html-imp.h" #include <string.h> #include <stdio.h> #include <assert.h> enum { T, R, B, L }; #define DEFAULT_DIR FZ_BIDI_LTR static const char *html_default_css = "@page{margin:3em 2em}" "a{color:#06C;text-decoration:underline}" "address{display:block;font-style:italic}" "b{font-weight:bold}" "bdo{direction:rtl;unicode-bidi:bidi-override}" "blockquote{display:block;margin:1em 40px}" "body{display:block;margin:1em}" "cite{font-style:italic}" "code{font-family:monospace}" "dd{display:block;margin:0 0 0 40px}" "del{text-decoration:line-through}" "div{display:block}" "dl{display:block;margin:1em 0}" "dt{display:block}" "em{font-style:italic}" "h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}" "h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}" "h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}" "h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}" "h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}" "h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}" "head{display:none}" "hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}" "html{display:block}" "i{font-style:italic}" "ins{text-decoration:underline}" "kbd{font-family:monospace}" "li{display:list-item}" "menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}" "ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}" "p{display:block;margin:1em 0}" "pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}" "samp{font-family:monospace}" "script{display:none}" "small{font-size:0.83em}" "strong{font-weight:bold}" "style{display:none}" "sub{font-size:0.83em;vertical-align:sub}" "sup{font-size:0.83em;vertical-align:super}" "table{display:table;border-spacing:2px}" "tbody{display:table-row-group}" "td{display:table-cell;padding:1px;background-color:inherit}" "tfoot{display:table-footer-group}" "th{display:table-cell;font-weight:bold;padding:1px;text-align:center;background-color:inherit}" "thead{display:table-header-group}" "tr{display:table-row}" "ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}" "ul ul{list-style-type:circle}" "ul ul ul{list-style-type:square}" "var{font-style:italic}" "colgroup{display:table-column-group}" "col{display:table-column}" "caption{display:block;text-align:center}" ; static const char *mobi_default_css = "pagebreak{display:block;page-break-before:always}" "dl,ol,ul{margin:0}" "p{margin:0}" "blockquote{margin:0 40px}" "center{display:block;text-align:center}" "big{font-size:1.17em}" "strike{text-decoration:line-through}" ; static const char *fb2_default_css = "@page{margin:3em 2em}" "FictionBook{display:block;margin:1em}" "stylesheet,binary{display:none}" "description>*{display:none}" "description>title-info{display:block}" "description>title-info>*{display:none}" "description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}" "body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}" "image{display:block}" "p>image{display:inline}" "table{display:table}" "tr{display:table-row}" "th,td{display:table-cell}" "a{color:#06C;text-decoration:underline}" "a[type=note]{font-size:small;vertical-align:super}" "code{white-space:pre;font-family:monospace}" "emphasis{font-style:italic}" "strikethrough{text-decoration:line-through}" "strong{font-weight:bold}" "sub{font-size:small;vertical-align:sub}" "sup{font-size:small;vertical-align:super}" "image{margin:1em 0;text-align:center}" "cite,poem{margin:1em 2em}" "subtitle,epigraph,stanza{margin:1em 0}" "title>p{text-align:center;font-size:x-large}" "subtitle{text-align:center;font-size:large}" "p{margin-top:1em;text-align:justify}" "empty-line{padding-top:1em}" "p+p{margin-top:0;text-indent:1.5em}" "empty-line+p{margin-top:0}" "section>title{page-break-before:always}" ; static const char *known_html_tags[] = { // TODO: add known FB2 tags? // Sorted list of all HTML tags. "a", "abbr", "acronym", "address", "annotation-xml", "applet", "area", "article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo", "bgsound", "big", "blink", "blockquote", "body", "br", "button", "canvas", "caption", "center", "cite", "code", "col", "colgroup", "data", "datalist", "dd", "del", "desc", "details", "dfn", "dir", "div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure", "font", "footer", "foreignobject", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i", "iframe", "image", "img", "input", "ins", "isindex", "kbd", "keygen", "label", "legend", "li", "link", "listing", "main", "malignmark", "map", "mark", "marquee", "math", "menu", "menuitem", "meta", "meter", "mglyph", "mi", "mn", "mo", "ms", "mtext", "multicol", "nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "plaintext", "pre", "progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp", "script", "section", "select", "small", "source", "spacer", "span", "strike", "strong", "style", "sub", "summary", "sup", "svg", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp", }; static const char *known_fb2_tags[] = { "FictionBook", "a", "binary", "body", "cite", "code", "coverpage", "date", "description", "emphasis", "empty-line", "epigraph", "image", "p", "poem", "section", "stanza", "strikethrough", "strong", "stylesheet", "sub", "subtitle", "sup", "table", "td", "text-author", "th", "title", "title-info", "tr", "v", }; static const char *find_known_html_tag(const char *tag) { int l = 0; int r = nelem(known_html_tags) / 2 - 1; while (l <= r) { int m = (l + r) >> 1; int c = strcmp(tag, known_html_tags[m]); if (c < 0) r = m - 1; else if (c > 0) l = m + 1; else return known_html_tags[m]; } return NULL; } static const char *find_known_fb2_tag(const char *tag) { int l = 0; int r = nelem(known_fb2_tags) / 2 - 1; while (l <= r) { int m = (l + r) >> 1; int c = strcmp(tag, known_fb2_tags[m]); if (c < 0) r = m - 1; else if (c > 0) l = m + 1; else return known_fb2_tags[m]; } return NULL; } struct genstate { fz_pool *pool; fz_html_font_set *set; fz_archive *zip; fz_tree *images; fz_xml_doc *xml; int is_fb2; const char *base_uri; fz_css *css; int at_bol; fz_html_box *emit_white; int last_brk_cls; int list_counter; int section_depth; fz_bidi_direction markup_dir; fz_text_language markup_lang; char *href; fz_css_style_splay *styles; }; static int iswhite(int c) { return c == ' ' || c == '\t' || c == '\r' || c == '\n'; } static int is_all_white(const char *s) { while (*s) { if (!iswhite(*s)) return 0; ++s; } return 1; } /* TODO: pool allocator for flow nodes */ /* TODO: store text by pointing to a giant buffer */ static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow) { while (flow) { fz_html_flow *next = flow->next; if (flow->type == FLOW_IMAGE) fz_drop_image(ctx, flow->content.image); flow = next; } } static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras) { size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras); fz_html_flow *flow; /* Shouldn't happen, but bug 705324. */ if (top == NULL || top->type != BOX_FLOW) return NULL; flow = fz_pool_alloc(ctx, pool, size); flow->type = type; flow->expand = 0; flow->bidi_level = 0; flow->markup_lang = 0; flow->breaks_line = 0; flow->box = inline_box; (*top->s.build.flow_tail) = flow; top->s.build.flow_tail = &flow->next; return flow; } static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0); if (flow) flow->expand = 1; } static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0); } static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0); } static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0); } static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang) { fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1); if (flow == NULL) return; memcpy(flow->content.text, a, b - a); flow->content.text[b - a] = 0; flow->markup_lang = lang; } static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img) { fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0); if (flow) flow->content.image = fz_keep_image(ctx, img); } static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0); } fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset) { fz_html_flow *new_flow; char *text; size_t len; assert(flow->type == FLOW_WORD); if (offset == 0) return flow; text = flow->content.text; while (*text && offset) { int rune; text += fz_chartorune(&rune, text); offset--; } len = strlen(text); new_flow = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + len+1); memcpy(new_flow, flow, offsetof(fz_html_flow, content)); new_flow->next = flow->next; flow->next = new_flow; strcpy(new_flow->content.text, text); *text = 0; return new_flow; } static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g) { static const char *space = " "; fz_pool *pool = g->pool; if (g->emit_white) { int bsp = g->emit_white->style->white_space & WS_ALLOW_BREAK_SPACE; if (!g->at_bol) { if (bsp) add_flow_space(ctx, pool, flow, g->emit_white); else add_flow_word(ctx, pool, flow, g->emit_white, space, space+1, lang); } g->emit_white = 0; } } /* pair-wise lookup table for UAX#14 linebreaks The linebreak table entries mean: ^ prohibited break never break before A and after B, even with one or more spaces in between % indirect break do not break before A, unless one or more spaces follow B _ direct break break allowed before A */ static const char *pairbrk[32] = { /* -OCCQGNESIPPNAHIIHBBBZCWHHJJJREEZ- */ /* -PLPULSXYSROULLDNYAB2WMJ23LVTIBMW- */ /* - J- */ "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */ "_^^%%^^^^%%____%%%__^^^________%", /* CL close punctuation */ "_^^%%^^^^%%%%%_%%%__^^^________%", /* CP close parenthesis */ "^^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* QU quotation */ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* GL non-breaking glue */ "_^^%%%^^^______%%%__^^^________%", /* NS nonstarters */ "_^^%%%^^^______%%%__^^^________%", /* EX exclamation/interrogation */ "_^^%%%^^^__%_%_%%%__^^^________%", /* SY symbols allowing break after */ "_^^%%%^^^__%%%_%%%__^^^________%", /* IS infix numeric separator */ "%^^%%%^^^__%%%%%%%__^^^%%%%%_%%%", /* PR prefix numeric */ "%^^%%%^^^__%%%_%%%__^^^________%", /* PO postfix numeric */ "%^^%%%^^^%%%%%_%%%__^^^________%", /* NU numeric */ "%^^%%%^^^%%%%%_%%%__^^^________%", /* AL ordinary alphabetic and symbol characters */ "%^^%%%^^^%%%%%_%%%__^^^________%", /* HL hebrew letter */ "_^^%%%^^^_%____%%%__^^^________%", /* ID ideographic */ "_^^%%%^^^______%%%__^^^________%", /* IN inseparable characters */ "_^^%_%^^^__%___%%%__^^^________%", /* HY hyphens */ "_^^%_%^^^______%%%__^^^________%", /* BA break after */ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* BB break before */ "_^^%%%^^^______%%%_^^^^________%", /* B2 break opportunity before and after */ "____________________^___________", /* ZW zero width space */ "%^^%%%^^^%_%%%_%%%__^^^________%", /* CM combining mark */ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* WJ word joiner */ "_^^%%%^^^_%____%%%__^^^___%%___%", /* H2 hangul leading/vowel syllable */ "_^^%%%^^^_%____%%%__^^^____%___%", /* H3 hangul leading/vowel/trailing syllable */ "_^^%%%^^^_%____%%%__^^^%%%%____%", /* JL hangul leading jamo */ "_^^%%%^^^_%____%%%__^^^___%%___%", /* JV hangul vowel jamo */ "_^^%%%^^^_%____%%%__^^^____%___%", /* JT hangul trailing jamo */ "_^^%%%^^^______%%%__^^^_____%__%", /* RI regional indicator */ "_^^%%%^^^_%____%%%__^^^_______%%", /* EB emoji base */ "_^^%%%^^^_%____%%%__^^^________%", /* EM emoji modifier */ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* ZWJ zero width joiner */ }; static fz_html_box * find_flow_encloser(fz_context *ctx, fz_html_box *flow) { /* This code was written to assume that there will always be a * flow box enclosing callers of this. Bug 705324 shows that * this isn't always the case. In the absence of a reproducer * file, all I can do is try to patch around the issue so that * we won't crash. */ while (flow->type != BOX_FLOW) { if (flow->up == NULL) { fz_warn(ctx, "Flow encloser not found. Please report this file!"); break; } flow = flow->up; } return flow; } static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g) { fz_html_box *flow; fz_pool *pool = g->pool; int collapse = box->style->white_space & WS_COLLAPSE; int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE; int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE; static const char *space = " "; flow = find_flow_encloser(ctx, box); if (flow == NULL) return; while (*text) { if (bnl && (*text == '\n' || *text == '\r')) { if (text[0] == '\r' && text[1] == '\n') text += 2; else text += 1; add_flow_break(ctx, pool, flow, box); g->at_bol = 1; } else if (iswhite(*text)) { if (collapse) { if (bnl) while (*text == ' ' || *text == '\t') ++text; else while (iswhite(*text)) ++text; g->emit_white = box; } else { // TODO: tabs if (bsp) add_flow_space(ctx, pool, flow, box); else add_flow_word(ctx, pool, flow, box, space, space+1, lang); ++text; } g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */ } else { const char *prev, *mark = text; int c; flush_space(ctx, flow, lang, g); if (g->at_bol) g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; while (*text && !iswhite(*text)) { prev = text; text += fz_chartorune(&c, text); if (c == 0xAD) /* soft hyphen */ { if (mark != prev) add_flow_word(ctx, pool, flow, box, mark, prev, lang); add_flow_shyphen(ctx, pool, flow, box); mark = text; g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */ } else if (bsp) /* allow soft breaks */ { int this_brk_cls = ucdn_get_resolved_linebreak_class(c); if (this_brk_cls <= UCDN_LINEBREAK_CLASS_ZWJ) { int brk = pairbrk[g->last_brk_cls][this_brk_cls]; /* we handle spaces elsewhere, so ignore these classes */ if (brk == '@') brk = '^'; if (brk == '#') brk = '^'; if (brk == '%') brk = '^'; if (brk == '_') { if (mark != prev) add_flow_word(ctx, pool, flow, box, mark, prev, lang); add_flow_sbreak(ctx, pool, flow, box); mark = prev; } g->last_brk_cls = this_brk_cls; } } } if (mark != text) add_flow_word(ctx, pool, flow, box, mark, text, lang); g->at_bol = 0; } } } static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src) { char path[2048]; fz_image *img = NULL; fz_buffer *buf = NULL; fz_var(img); fz_var(buf); fz_try(ctx) { if (!strncmp(src, "data:image/jpeg;base64,", 23)) buf = fz_new_buffer_from_base64(ctx, src+23, 0); else if (!strncmp(src, "data:image/png;base64,", 22)) buf = fz_new_buffer_from_base64(ctx, src+22, 0); else if (!strncmp(src, "data:image/gif;base64,", 22)) buf = fz_new_buffer_from_base64(ctx, src+22, 0); else { fz_strlcpy(path, base_uri, sizeof path); fz_strlcat(path, "/", sizeof path); fz_strlcat(path, src, sizeof path); fz_urldecode(path); fz_cleanname(path); buf = fz_read_archive_entry(ctx, zip, path); } #if FZ_ENABLE_SVG if (strstr(src, ".svg")) img = fz_new_image_from_svg(ctx, buf, base_uri, zip); else #endif img = fz_new_image_from_buffer(ctx, buf); } fz_always(ctx) fz_drop_buffer(ctx, buf); fz_catch(ctx) { fz_ignore_error(ctx); fz_warn(ctx, "html: cannot load image src='%s'", src); } return img; } static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_xml_doc *xmldoc, fz_xml *node) { fz_image *img = NULL; #if FZ_ENABLE_SVG fz_try(ctx) img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip); fz_catch(ctx) { fz_ignore_error(ctx); fz_warn(ctx, "html: cannot load embedded svg document"); } #endif return img; } static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g) { fz_html_box *flow; fz_pool *pool = g->pool; flow = find_flow_encloser(ctx, box); flush_space(ctx, flow, 0, g); if (!img) { const char *alt = "[image]"; add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0); } else { fz_try(ctx) { add_flow_sbreak(ctx, pool, flow, box); add_flow_image(ctx, pool, flow, box, img); add_flow_sbreak(ctx, pool, flow, box); } fz_always(ctx) { fz_drop_image(ctx, img); } fz_catch(ctx) fz_rethrow(ctx); } g->at_bol = 0; } static void fz_drop_html_box(fz_context *ctx, fz_html_box *box) { while (box) { fz_html_box *next = box->next; if (box->type == BOX_FLOW) fz_drop_html_flow(ctx, box->u.flow.head); fz_drop_html_box(ctx, box->down); box = next; } } static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor) { fz_html *html = (fz_html *)stor; fz_drop_html_box(ctx, html->tree.root); fz_drop_pool(ctx, html->tree.pool); } static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor) { fz_story *story = (fz_story *)stor; fz_free(ctx, story->user_css); fz_drop_html_font_set(ctx, story->font_set); fz_drop_xml(ctx, story->dom); fz_drop_html_box(ctx, story->tree.root); fz_drop_buffer(ctx, story->warnings); fz_drop_archive(ctx, story->zip); /* The pool must be the last thing dropped. */ fz_drop_pool(ctx, story->tree.pool); } /* Drop a structure derived from an html_tree. The exact things * freed here will depend upon the drop function with which it * was created. */ static void fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree) { fz_defer_reap_start(ctx); fz_drop_storable(ctx, &tree->storable); fz_defer_reap_end(ctx); } void fz_drop_html(fz_context *ctx, fz_html *html) { fz_drop_html_tree(ctx, &html->tree); } void fz_drop_story(fz_context *ctx, fz_story *story) { if (!story) return; fz_drop_html_tree(ctx, &story->tree); } fz_html *fz_keep_html(fz_context *ctx, fz_html *html) { return fz_keep_storable(ctx, &html->tree.storable); } static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style) { fz_html_box *box; const char *tag = fz_xml_tag(node); const char *id = fz_xml_att(node, "id"); const char *href; if (type == BOX_INLINE) box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u)); else if (type == BOX_FLOW) box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.flow)); else box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.block)); box->type = type; box->is_first_flow = 0; box->markup_dir = g->markup_dir; box->heading = 0; box->list_item = 0; box->style = fz_css_enlist(ctx, style, &g->styles, g->pool); if (tag) { box->tag = find_known_html_tag(tag); if (!box->tag && g->is_fb2) box->tag = find_known_fb2_tag(tag); if (!box->tag) box->tag = fz_pool_strdup(ctx, g->pool, tag); } else { box->tag = "#anon"; } if (id) box->id = fz_pool_strdup(ctx, g->pool, id); if (tag && tag[0]=='a' && tag[1]==0) { // Support deprecated anchor syntax with id in "name" instead of "id" attribute. if (!id) { const char *name = fz_xml_att(node, "name"); if (name) box->id = fz_pool_strdup(ctx, g->pool, name); } if (g->is_fb2) { href = fz_xml_att(node, "l:href"); if (!href) href = fz_xml_att(node, "xlink:href"); } else { href = fz_xml_att(node, "href"); } if (href) g->href = fz_pool_strdup(ctx, g->pool, href); } if (g->href) box->href = g->href; if (type == BOX_FLOW) { box->u.flow.head = NULL; box->s.build.flow_tail = &box->u.flow.head; } return box; } static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child) { child->up = parent; if (!parent->down) parent->down = child; if (parent->s.build.last_child) parent->s.build.last_child->next = child; parent->s.build.last_child = child; } static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box) { while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) box = box->up; return box; } static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box) { fz_html_box *look = box; while (look && look->type != BOX_TABLE) look = look->up; if (look) return look; fz_warn(ctx, "table-row not inside table element"); return NULL; } static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box) { fz_html_box *look = box; while (look && look->type != BOX_TABLE_ROW) look = look->up; if (look) return look; fz_warn(ctx, "table-cell not inside table-row element"); return NULL; } static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box) { fz_css_style style; fz_html_box *flow_box; if (box->type == BOX_FLOW || box->type == BOX_INLINE) return box; // We have an inline element that is not in an existing flow/inline context. // Find the closest block level box to insert content into. while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) box = box->up; // Concatenate onto the last open flow box if we have one. if (box->s.build.last_child && box->s.build.last_child->type == BOX_FLOW) return box->s.build.last_child; // No flow box found, create and insert one! // TODO: null style instead of default for flow box? fz_default_css_style(ctx, &style); flow_box = new_box(ctx, g, NULL, BOX_FLOW, &style); flow_box->is_first_flow = !box->down; g->at_bol = 1; append_box(ctx, box, flow_box); return flow_box; } static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match); static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) { fz_html_box *anon_box; fz_css_style style; const char *text; int collapse; text = fz_xml_text(node); collapse = root_box->style->white_space & WS_COLLAPSE; if (collapse && is_all_white(text)) { g->emit_white = root_box; } else { if (root_box->type != BOX_INLINE) { /* Create anonymous inline box, with the same style as the top block box. */ style = *root_box->style; // Make sure not to recursively multiply font sizes style.font_size.value = 1; style.font_size.unit = N_SCALE; root_box = find_inline_context(ctx, g, root_box); anon_box = new_box(ctx, g, NULL, BOX_INLINE, &style); append_box(ctx, root_box, anon_box); root_box = anon_box; } generate_text(ctx, root_box, text, g->markup_lang, g); } } static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box; fz_html_box *flow_box; root_box = find_inline_context(ctx, g, root_box); this_box = new_box(ctx, g, node, BOX_INLINE, style); append_box(ctx, root_box, this_box); if (this_box->id) { flow_box = find_flow_encloser(ctx, this_box); add_flow_anchor(ctx, g->pool, flow_box, this_box); } return this_box; } static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) { fz_html_box *this_box; fz_html_box *flow_box; if (root_box->type != BOX_INLINE) { /* Create inline box to hold the <br> tag, with the same style as containing block. */ /* Make sure not to recursively multiply font sizes. */ fz_css_style style = *root_box->style; style.font_size.value = 1; style.font_size.unit = N_SCALE; this_box = new_box(ctx, g, node, BOX_INLINE, &style); append_box(ctx, find_inline_context(ctx, g, root_box), this_box); } else { this_box = root_box; } flow_box = find_flow_encloser(ctx, this_box); add_flow_break(ctx, g->pool, flow_box, this_box); g->at_bol = 1; } static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box; root_box = find_block_context(ctx, root_box); this_box = new_box(ctx, g, node, BOX_BLOCK, style); append_box(ctx, root_box, this_box); return this_box; } static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box; root_box = find_block_context(ctx, root_box); this_box = new_box(ctx, g, node, BOX_TABLE, style); append_box(ctx, root_box, this_box); return this_box; } static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box, *table_box; table_box = find_table_row_context(ctx, root_box); if (!table_box) return gen2_block(ctx, g, root_box, node, style); this_box = new_box(ctx, g, node, BOX_TABLE_ROW, style); append_box(ctx, table_box, this_box); return this_box; } static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box, *row_box; row_box = find_table_cell_context(ctx, root_box); if (!row_box) return gen2_block(ctx, g, root_box, node, style); this_box = new_box(ctx, g, node, BOX_TABLE_CELL, style); append_box(ctx, row_box, this_box); return this_box; } static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int display, fz_css_style *style) { fz_html_box *img_block_box; fz_html_box *img_inline_box; if (display == DIS_INLINE || display == DIS_INLINE_BLOCK) { root_box = find_inline_context(ctx, g, root_box); img_inline_box = new_box(ctx, g, node, BOX_INLINE, style); append_box(ctx, root_box, img_inline_box); generate_image(ctx, img_inline_box, img, g); } else { root_box = find_block_context(ctx, root_box); img_block_box = new_box(ctx, g, node, BOX_BLOCK, style); append_box(ctx, root_box, img_block_box); root_box = find_inline_context(ctx, g, img_block_box); img_inline_box = new_box(ctx, g, NULL, BOX_INLINE, style); append_box(ctx, root_box, img_inline_box); generate_image(ctx, img_inline_box, img, g); } } static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) { const char *src = fz_xml_att(node, "src"); if (src) { fz_css_style local_style = *style; fz_image *img; int w, h; const char *w_att = fz_xml_att(node, "width"); const char *h_att = fz_xml_att(node, "height"); if (w_att && (w = fz_atoi(w_att)) > 0) { local_style.width.value = w; local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH; } if (h_att && (h = fz_atoi(h_att)) > 0) { local_style.height.value = h; local_style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH; } img = load_html_image(ctx, g->zip, g->base_uri, src); gen2_image_common(ctx, g, root_box, node, img, display, &local_style); } } static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) { const char *src = fz_xml_att(node, "l:href"); if (!src) src = fz_xml_att(node, "xlink:href"); if (src && src[0] == '#') { fz_image *img = fz_tree_lookup(ctx, g->images, src+1); gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style); } } static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) { fz_image *img = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node); gen2_image_common(ctx, g, root_box, node, img, display, style); } static int get_heading_from_tag(fz_context *ctx, struct genstate *g, const char *tag) { if (tag[0] == 'h' && tag[1] != 0 && tag[2] == 0) { switch (tag[1]) { case '1': return 1; case '2': return 2; case '3': return 3; case '4': return 4; case '5': return 5; case '6': return 6; } } if (g->is_fb2) { if (!strcmp(tag, "title") || !strcmp(tag, "subtitle")) return fz_mini(g->section_depth, 6); } return 0; } static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_match *match, int display, fz_css_style *style) { fz_html_box *this_box; const char *tag; const char *lang_att; const char *dir_att; int save_markup_dir = g->markup_dir; int save_markup_lang = g->markup_lang; char *save_href = g->href; if (display == DIS_NONE) return; tag = fz_xml_tag(node); dir_att = fz_xml_att(node, "dir"); if (dir_att) { if (!strcmp(dir_att, "auto")) g->markup_dir = FZ_BIDI_NEUTRAL; else if (!strcmp(dir_att, "rtl")) g->markup_dir = FZ_BIDI_RTL; else if (!strcmp(dir_att, "ltr")) g->markup_dir = FZ_BIDI_LTR; else g->markup_dir = DEFAULT_DIR; } lang_att = fz_xml_att(node, "lang"); if (lang_att) g->markup_lang = fz_text_language_from_string(lang_att); switch (display) { case DIS_INLINE_BLOCK: // TODO handle inline block as a flow node this_box = gen2_block(ctx, g, root_box, node, style); break; case DIS_BLOCK: this_box = gen2_block(ctx, g, root_box, node, style); this_box->heading = get_heading_from_tag(ctx, g, tag); break; case DIS_LIST_ITEM: this_box = gen2_block(ctx, g, root_box, node, style); this_box->list_item = ++g->list_counter; break; // TODO: https://www.w3.org/TR/CSS2/tables.html#anonymous-boxes // // The table generation code should insert and create anonymous boxes // for any missing child/parent elements. // // MISSING CHILDREN: // 1: Wrap consecutive BLOCK found in a TABLE in an anon TABLE_ROW. // 2: Wrap consecutive BLOCK found in a TABLE_ROW in an anon TABLE_CELL. // // MISSING PARENTS: // 1: Wrap consecutive TABLE_CELL found outside TABLE_ROW in an anon TABLE_ROW // 2: Wrap consecutive TABLE_ROW found outside TABLE in an anon TABLE // // For now we ignore this and treat any such elements that are out of // context as plain block elements. case DIS_TABLE: this_box = gen2_table(ctx, g, root_box, node, style); break; case DIS_TABLE_GROUP: // no box for table-row-group elements this_box = root_box; break; case DIS_TABLE_ROW: this_box = gen2_table_row(ctx, g, root_box, node, style); break; case DIS_TABLE_CELL: this_box = gen2_table_cell(ctx, g, root_box, node, style); break; case DIS_INLINE: default: this_box = gen2_inline(ctx, g, root_box, node, style); break; } if (tag && (!strcmp(tag, "ol") || !strcmp(tag, "ul") || !strcmp(tag, "dl"))) { int save_list_counter = g->list_counter; g->list_counter = 0; gen2_children(ctx, g, this_box, node, match); g->list_counter = save_list_counter; } else if (tag && !strcmp(tag, "section")) { int save_section_depth = g->section_depth; g->section_depth++; gen2_children(ctx, g, this_box, node, match); g->section_depth = save_section_depth; } else { gen2_children(ctx, g, this_box, node, match); } g->markup_dir = save_markup_dir; g->markup_lang = save_markup_lang; g->href = save_href; } static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match) { fz_xml *node; const char *tag; fz_css_match match; fz_css_style style; int display; for (node = fz_xml_down(root_node); node; node = fz_xml_next(node)) { tag = fz_xml_tag(node); if (tag) { fz_match_css(ctx, &match, root_match, g->css, node); fz_apply_css_style(ctx, g->set, &style, &match); display = fz_get_css_match_display(&match); if (tag[0]=='b' && tag[1]=='r' && tag[2]==0) { gen2_break(ctx, g, root_box, node); } else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0) { gen2_image_html(ctx, g, root_box, node, display, &style); } else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0) { gen2_image_fb2(ctx, g, root_box, node, display, &style); } else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0) { gen2_image_svg(ctx, g, root_box, node, display, &style); } else { gen2_tag(ctx, g, root_box, node, &match, display, &style); } } else { gen2_text(ctx, g, root_box, node); } } } static char *concat_text(fz_context *ctx, fz_xml *root) { fz_xml *node; size_t i = 0, n = 1; char *s; for (node = fz_xml_down(root); node; node = fz_xml_next(node)) { const char *text = fz_xml_text(node); n += text ? strlen(text) : 0; } s = Memento_label(fz_malloc(ctx, n), "concat_html"); for (node = fz_xml_down(root); node; node = fz_xml_next(node)) { const char *text = fz_xml_text(node); if (text) { n = strlen(text); memcpy(s+i, text, n); i += n; } } s[i] = 0; return s; } static void html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href) { char path[2048]; char css_base_uri[2048]; fz_buffer *buf; fz_var(buf); fz_strlcpy(path, base_uri, sizeof path); fz_strlcat(path, "/", sizeof path); fz_strlcat(path, href, sizeof path); fz_urldecode(path); fz_cleanname(path); fz_dirname(css_base_uri, path, sizeof css_base_uri); buf = NULL; fz_try(ctx) { buf = fz_read_archive_entry(ctx, zip, path); fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path); fz_add_css_font_faces(ctx, set, zip, css_base_uri, css); } fz_always(ctx) fz_drop_buffer(ctx, buf); fz_catch(ctx) { fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); fz_report_error(ctx); fz_warn(ctx, "ignoring stylesheet %s", path); } } static void html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) { fz_xml *html, *head, *node; html = fz_xml_find(root, "html"); head = fz_xml_find_down(html, "head"); for (node = fz_xml_down(head); node; node = fz_xml_next(node)) { if (fz_xml_is_tag(node, "link")) { char *rel = fz_xml_att(node, "rel"); if (rel && !fz_strcasecmp(rel, "stylesheet")) { char *type = fz_xml_att(node, "type"); if ((type && !strcmp(type, "text/css")) || !type) { char *href = fz_xml_att(node, "href"); if (href) { html_load_css_link(ctx, set, zip, base_uri, css, root, href); } } } } else if (fz_xml_is_tag(node, "style")) { char *s = concat_text(ctx, node); fz_try(ctx) { fz_parse_css(ctx, css, s, "<style>"); fz_add_css_font_faces(ctx, set, zip, base_uri, css); } fz_always(ctx) fz_free(ctx, s); fz_catch(ctx) { fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); fz_report_error(ctx); fz_warn(ctx, "ignoring inline stylesheet"); } } } } static void fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) { fz_xml *fictionbook, *stylesheet; fictionbook = fz_xml_find(root, "FictionBook"); stylesheet = fz_xml_find_down(fictionbook, "stylesheet"); if (stylesheet) { char *s = concat_text(ctx, stylesheet); fz_try(ctx) { fz_parse_css(ctx, css, s, "<stylesheet>"); fz_add_css_font_faces(ctx, set, zip, base_uri, css); } fz_catch(ctx) { fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); fz_report_error(ctx); fz_warn(ctx, "ignoring inline stylesheet"); } fz_free(ctx, s); } } static fz_tree * load_fb2_images(fz_context *ctx, fz_xml *root) { fz_xml *fictionbook, *binary; fz_tree *images = NULL; fictionbook = fz_xml_find(root, "FictionBook"); for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary")) { const char *id = fz_xml_att(binary, "id"); char *b64 = NULL; fz_buffer *buf = NULL; fz_image *img = NULL; fz_var(b64); fz_var(buf); if (id == NULL) { fz_warn(ctx, "Skipping image with no id"); continue; } fz_try(ctx) { b64 = concat_text(ctx, binary); buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64)); img = fz_new_image_from_buffer(ctx, buf); } fz_always(ctx) { fz_drop_buffer(ctx, buf); fz_free(ctx, b64); } fz_catch(ctx) fz_rethrow(ctx); images = fz_tree_insert(ctx, images, id, img); } return images; } typedef struct { uint32_t *data; size_t cap; size_t len; } uni_buf; typedef struct { fz_context *ctx; fz_pool *pool; fz_html_flow *flow; uni_buf *buffer; } bidi_data; static void fragment_cb(const uint32_t *fragment, size_t fragment_len, int bidi_level, int script, void *arg) { bidi_data *data = (bidi_data *)arg; /* We are guaranteed that fragmentOffset will be at the beginning * of flow. */ while (fragment_len > 0) { size_t len; if (data->flow->type == FLOW_SPACE) { len = 1; } else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK || data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR) { len = 0; } else { /* Must be text */ len = fz_utflen(data->flow->content.text); if (len > fragment_len) { /* We need to split this flow box */ (void)fz_html_split_flow(data->ctx, data->pool, data->flow, fragment_len); len = fz_utflen(data->flow->content.text); } } /* This flow box is entirely contained within this fragment. */ data->flow->bidi_level = bidi_level; data->flow->script = script; data->flow = data->flow->next; fragment_len -= len; } } static fz_bidi_direction detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow) { fz_html_flow *end = flow; bidi_data data; while (end) { unsigned int level = end->bidi_level; /* Gather the text from the flow up into a single buffer (at * least, as much of it as has the same direction markup). */ buffer->len = 0; while (end && (level & 1) == (end->bidi_level & 1)) { size_t len = 0; const char *text = ""; int broken = 0; switch (end->type) { case FLOW_WORD: len = fz_utflen(end->content.text); text = end->content.text; break; case FLOW_SPACE: len = 1; text = " "; break; case FLOW_SHYPHEN: case FLOW_SBREAK: break; case FLOW_BREAK: case FLOW_IMAGE: broken = 1; break; } end = end->next; if (broken) break; /* Make sure the buffer is large enough */ if (buffer->len + len > buffer->cap) { size_t newcap = buffer->cap; if (newcap < 128) newcap = 128; /* Sensible small default */ while (newcap < buffer->len + len) newcap = (newcap * 3) / 2; buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t); buffer->cap = newcap; } /* Expand the utf8 text into Unicode and store it in the buffer */ while (*text) { int rune; text += fz_chartorune(&rune, text); buffer->data[buffer->len++] = rune; } } /* Detect directionality for the buffer */ data.ctx = ctx; data.pool = pool; data.flow = flow; data.buffer = buffer; fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */); flow = end; } return bidi_dir; } static void detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box) { while (box) { if (box->type == BOX_FLOW) box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->u.flow.head); detect_box_directionality(ctx, pool, buffer, box->down); box = box->next; } } static void detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box) { uni_buf buffer = { NULL }; fz_try(ctx) detect_box_directionality(ctx, pool, &buffer, box); fz_always(ctx) fz_free(ctx, buffer.data); fz_catch(ctx) fz_rethrow(ctx); } static fz_xml_doc * parse_to_xml(fz_context *ctx, fz_buffer *buf, int try_xml, int try_html5) { fz_xml_doc *xml; if (try_xml && try_html5) { fz_try(ctx) xml = fz_parse_xml(ctx, buf, 1); fz_catch(ctx) { if (fz_caught(ctx) == FZ_ERROR_SYNTAX) { fz_report_error(ctx); fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser"); xml = fz_parse_xml_from_html5(ctx, buf); } else fz_rethrow(ctx); } } else if (try_xml) xml = fz_parse_xml(ctx, buf, 1); else { assert(try_html5); xml = fz_parse_xml_from_html5(ctx, buf); } return xml; } static void move_background_color_style_up(fz_context *ctx, struct genstate *g, fz_html_box *root, fz_html_box *from) { fz_css_color transparent = { 0, 0, 0, 0 }; fz_css_style s1, s2; memcpy(&s1, root->style, sizeof s1); memcpy(&s2, from->style, sizeof s2); s1.background_color = s2.background_color; s2.background_color = transparent; root->style = fz_css_enlist(ctx, &s1, &g->styles, g->pool); from->style = fz_css_enlist(ctx, &s2, &g->styles, g->pool); } static void move_background_color_up(fz_context *ctx, struct genstate *g, fz_html_box *root) { fz_html_box *html, *body; if (root->style->background_color.a != 0) { return; } html = root->down; if (html && !strcmp(html->tag, "html")) { if (html->style->background_color.a != 0) { move_background_color_style_up(ctx, g, root, html); return; } body = html->down; if (body && !strcmp(body->tag, "body")) { if (body->style->background_color.a != 0) { move_background_color_style_up(ctx, g, root, body); return; } } } } static void xml_to_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, const char *user_css, fz_xml_doc *xml, fz_html_tree *tree, char **rtitle, int try_fictionbook, int is_mobi) { fz_xml *root, *node; char *title; fz_css_match root_match, match; struct genstate g = {0}; g.pool = NULL; g.set = set; g.zip = zip; g.images = NULL; g.xml = xml; g.is_fb2 = 0; g.base_uri = base_uri; g.css = NULL; g.at_bol = 0; g.emit_white = 0; g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP; g.list_counter = 0; g.section_depth = 0; g.markup_dir = FZ_BIDI_LTR; g.markup_lang = FZ_LANG_UNSET; g.href = NULL; g.styles = NULL; if (rtitle) *rtitle = NULL; root = fz_xml_root(g.xml); g.css = fz_new_css(ctx); #ifndef NDEBUG if (fz_atoi(getenv("FZ_DEBUG_XML"))) fz_debug_xml(root, 0); #endif fz_try(ctx) { if (try_fictionbook && fz_xml_find(root, "FictionBook")) { g.is_fb2 = 1; fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>"); if (fz_use_document_css(ctx)) fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); g.images = load_fb2_images(ctx, root); } else if (is_mobi) { g.is_fb2 = 0; fz_parse_css(ctx, g.css, html_default_css, "<default:html>"); fz_parse_css(ctx, g.css, mobi_default_css, "<default:mobi>"); if (fz_use_document_css(ctx)) html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); } else { g.is_fb2 = 0; fz_parse_css(ctx, g.css, html_default_css, "<default:html>"); if (fz_use_document_css(ctx)) html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); } if (user_css) { fz_parse_css(ctx, g.css, user_css, "<user>"); fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css); } } fz_catch(ctx) { fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); fz_drop_css(ctx, g.css); fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); fz_report_error(ctx); fz_warn(ctx, "ignoring styles"); g.css = fz_new_css(ctx); g.images = NULL; } #ifndef NDEBUG if (fz_atoi(getenv("FZ_DEBUG_CSS"))) fz_debug_css(ctx, g.css); #endif fz_try(ctx) { fz_css_style style; int display; fz_match_css_at_page(ctx, &root_match, g.css); fz_apply_css_style(ctx, g.set, &style, &root_match); g.pool = tree->pool; g.markup_dir = DEFAULT_DIR; g.markup_lang = FZ_LANG_UNSET; // Create root node tree->root = new_box(ctx, &g, NULL, BOX_BLOCK, &style); // TODO: transfer page margins out of this hacky box tree->root->tag = ":root"; tree->root->s.layout.em = 0; tree->root->s.layout.x = 0; tree->root->s.layout.y = 0; tree->root->s.layout.w = 0; tree->root->s.layout.b = 0; // Create document node (html). fz_match_css(ctx, &match, &root_match, g.css, root); fz_apply_css_style(ctx, g.set, &style, &match); display = fz_get_css_match_display(&match); gen2_tag(ctx, &g, tree->root, root, &match, display, &style); detect_directionality(ctx, g.pool, tree->root); if (g.is_fb2) { node = fz_xml_find(root, "FictionBook"); node = fz_xml_find_down(node, "description"); node = fz_xml_find_down(node, "title-info"); node = fz_xml_find_down(node, "book-title"); if (rtitle) { title = fz_xml_text(fz_xml_down(node)); if (title) *rtitle = fz_pool_strdup(ctx, g.pool, title); } } else { node = fz_xml_find(root, "html"); node = fz_xml_find_down(node, "head"); node = fz_xml_find_down(node, "title"); if (rtitle) { title = fz_xml_text(fz_xml_down(node)); if (title) *rtitle = fz_pool_strdup(ctx, g.pool, title); } // Move html or body background-color to :root. move_background_color_up(ctx, &g, tree->root); } } fz_always(ctx) { fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); fz_drop_css(ctx, g.css); } fz_catch(ctx) { if (rtitle) { fz_free(ctx, *rtitle); *rtitle = NULL; } fz_rethrow(ctx); } } static const char *mobi_font_size[7] = { "0.67em", "0.83em", "1em", "1.17em", "1.33em", "1.5em", "1.67em", }; static void patch_mobi_html(fz_context *ctx, fz_pool *pool, fz_xml *node) { fz_xml *down; char buf[500]; while (node) { char *tag = fz_xml_tag(node); if (tag) { // Read MOBI attributes, convert to inline CSS style if (!strcmp(tag, "font")) { const char *size = fz_xml_att(node, "size"); if (size) { if (!strcmp(size, "1")) size = mobi_font_size[0]; else if (!strcmp(size, "2")) size = mobi_font_size[1]; else if (!strcmp(size, "3")) size = mobi_font_size[2]; else if (!strcmp(size, "4")) size = mobi_font_size[3]; else if (!strcmp(size, "5")) size = mobi_font_size[4]; else if (!strcmp(size, "6")) size = mobi_font_size[5]; else if (!strcmp(size, "7")) size = mobi_font_size[6]; else if (!strcmp(size, "+1")) size = mobi_font_size[3]; else if (!strcmp(size, "+2")) size = mobi_font_size[4]; else if (!strcmp(size, "+3")) size = mobi_font_size[5]; else if (!strcmp(size, "+4")) size = mobi_font_size[6]; else if (!strcmp(size, "+5")) size = mobi_font_size[6]; else if (!strcmp(size, "+6")) size = mobi_font_size[6]; else if (!strcmp(size, "-1")) size = mobi_font_size[1]; else if (!strcmp(size, "-2")) size = mobi_font_size[0]; else if (!strcmp(size, "-3")) size = mobi_font_size[0]; else if (!strcmp(size, "-4")) size = mobi_font_size[0]; else if (!strcmp(size, "-5")) size = mobi_font_size[0]; else if (!strcmp(size, "-6")) size = mobi_font_size[0]; fz_snprintf(buf, sizeof buf, "font-size:%s", size); fz_xml_add_att(ctx, pool, node, "style", buf); } } else { char *height = fz_xml_att(node, "height"); char *width = fz_xml_att(node, "width"); char *align = fz_xml_att(node, "align"); if (height || width || align) { buf[0] = 0; if (height) { fz_strlcat(buf, "margin-top:", sizeof buf); fz_strlcat(buf, height, sizeof buf); fz_strlcat(buf, ";", sizeof buf); } if (width) { fz_strlcat(buf, "text-indent:", sizeof buf); fz_strlcat(buf, width, sizeof buf); fz_strlcat(buf, ";", sizeof buf); } if (align) { fz_strlcat(buf, "text-align:", sizeof buf); fz_strlcat(buf, align, sizeof buf); fz_strlcat(buf, ";", sizeof buf); } fz_xml_add_att(ctx, pool, node, "style", buf); } if (!strcmp(tag, "img")) { char *recindex = fz_xml_att(node, "recindex"); if (recindex) fz_xml_add_att(ctx, pool, node, "src", recindex); } } } down = fz_xml_down(node); if (down) patch_mobi_html(ctx, pool, down); node = fz_xml_next(node); } } static void fz_parse_html_tree(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css, int try_xml, int try_html5, fz_html_tree *tree, char **rtitle, int try_fictionbook, int patch_mobi) { fz_xml_doc *xml; if (rtitle) *rtitle = NULL; xml = parse_to_xml(ctx, buf, try_xml, try_html5); if (patch_mobi) patch_mobi_html(ctx, xml->u.doc.pool, xml); fz_try(ctx) xml_to_boxes(ctx, set, zip, base_uri, user_css, xml, tree, rtitle, try_fictionbook, patch_mobi); fz_always(ctx) fz_drop_xml(ctx, xml); fz_catch(ctx) fz_rethrow(ctx); } #define fz_new_derived_html_tree(CTX, TYPE, DROP) \ ((TYPE *)Memento_label(fz_new_html_tree_of_size(CTX, sizeof(TYPE), DROP), #TYPE)) static fz_html_tree * fz_new_html_tree_of_size(fz_context *ctx, size_t size, fz_store_drop_fn *drop) { fz_pool *pool = fz_new_pool(ctx); fz_html_tree *tree; fz_try(ctx) { tree = fz_pool_alloc(ctx, pool, size); FZ_INIT_STORABLE(tree, 1, drop); tree->pool = pool; } fz_catch(ctx) { fz_drop_pool(ctx, pool); fz_rethrow(ctx); } return tree; } fz_html * fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css, int try_xml, int try_html5, int patch_mobi) { fz_html *html = fz_new_derived_html_tree(ctx, fz_html, fz_drop_html_imp); html->layout_w = 0; html->layout_h = 0; html->layout_em = 0; fz_try(ctx) fz_parse_html_tree(ctx, set, zip, base_uri, buf, user_css, try_xml, try_html5, &html->tree, &html->title, 1, patch_mobi); fz_catch(ctx) { fz_drop_html(ctx, html); fz_rethrow(ctx); } return html; } typedef struct { int saved; fz_warning_cb *old; void *arg; fz_buffer *buffer; fz_context *ctx; } warning_save; static void warn_to_buffer(void *user, const char *message) { warning_save *save = (warning_save *)user; fz_context *ctx = save->ctx; fz_try(ctx) { fz_append_string(ctx, save->buffer, message); fz_append_byte(ctx, save->buffer, '\n'); } fz_catch(ctx) { /* Silently swallow the error. */ fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); fz_report_error(ctx); } } static void redirect_warnings_to_buffer(fz_context *ctx, fz_buffer *buf, warning_save *save) { save->saved = 1; save->old = fz_warning_callback(ctx, &save->arg); save->buffer = buf; save->ctx = ctx; fz_flush_warnings(ctx); fz_set_warning_callback(ctx, warn_to_buffer, save); } static void restore_warnings(fz_context *ctx, warning_save *save) { if (!save->saved) return; fz_flush_warnings(ctx); fz_set_warning_callback(ctx, save->old, save->arg); } fz_story * fz_new_story(fz_context *ctx, fz_buffer *buf, const char *user_css, float em, fz_archive *zip) { fz_story *story = fz_new_derived_html_tree(ctx, fz_story, fz_drop_story_imp); warning_save saved = { 0 }; fz_buffer *local_buffer = NULL; if (buf == NULL) { local_buffer = fz_new_buffer(ctx, 0); buf = local_buffer; } fz_var(local_buffer); fz_var(saved); fz_try(ctx) { story->zip = fz_keep_archive(ctx, zip); story->font_set = fz_new_html_font_set(ctx); story->em = em; story->user_css = user_css ? fz_strdup(ctx, user_css) : NULL; story->warnings = fz_new_buffer(ctx, 128); redirect_warnings_to_buffer(ctx, story->warnings, &saved); story->dom = parse_to_xml(ctx, buf, 0, 1); } fz_always(ctx) { restore_warnings(ctx, &saved); fz_drop_buffer(ctx, local_buffer); } fz_catch(ctx) { fz_drop_html_tree(ctx, &story->tree); fz_rethrow(ctx); } return story; } fz_html * fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css) { /* try as XML first, fall back to HTML5 */ return fz_parse_html(ctx, set, zip, base_uri, buf, user_css, 1, 1, 0); } static void indent(int level) { while (level-- > 0) putchar('\t'); } static void fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level) { fz_html_box *sbox = NULL; while (flow) { if (flow->box != sbox) { sbox = flow->box; indent(level); #ifndef NDEBUG printf("@style <%s> em=%g font='%s'", sbox->tag, sbox->s.layout.em, fz_font_name(ctx, sbox->style->font)); #else printf("@style em=%g font='%s'", sbox->s.layout.em, fz_font_name(ctx, sbox->style->font)); #endif if (fz_font_is_serif(ctx, sbox->style->font)) printf(" serif"); else printf(" sans"); if (fz_font_is_monospaced(ctx, sbox->style->font)) printf(" monospaced"); if (fz_font_is_bold(ctx, sbox->style->font)) printf(" bold"); if (fz_font_is_italic(ctx, sbox->style->font)) printf(" italic"); if (sbox->style->small_caps) printf(" small-caps"); printf("\n"); } indent(level); switch (flow->type) { case FLOW_WORD: printf("word "); break; case FLOW_SPACE: printf("space"); break; case FLOW_SBREAK: printf("sbrk "); break; case FLOW_SHYPHEN: printf("shy "); break; case FLOW_BREAK: printf("break"); break; case FLOW_IMAGE: printf("image"); break; case FLOW_ANCHOR: printf("anchor"); break; } // printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w); if (flow->type == FLOW_IMAGE) printf(" h=%g", flow->h); if (flow->type == FLOW_WORD) printf(" text='%s'", flow->content.text); printf("\n"); if (flow->breaks_line) { indent(level); printf("*\n"); } flow = flow->next; } } fz_structure fz_html_tag_to_structure(const char *tag) { if (!strcmp(tag, "body")) return FZ_STRUCTURE_DOCUMENT; if (!strcmp(tag, "div")) return FZ_STRUCTURE_DIV; if (!strcmp(tag, "span")) return FZ_STRUCTURE_SPAN; if (!strcmp(tag, "blockquote")) return FZ_STRUCTURE_BLOCKQUOTE; if (!strcmp(tag, "p")) return FZ_STRUCTURE_P; if (!strcmp(tag, "h1")) return FZ_STRUCTURE_H1; if (!strcmp(tag, "h2")) return FZ_STRUCTURE_H2; if (!strcmp(tag, "h3")) return FZ_STRUCTURE_H3; if (!strcmp(tag, "h4")) return FZ_STRUCTURE_H4; if (!strcmp(tag, "h5")) return FZ_STRUCTURE_H5; if (!strcmp(tag, "h6")) return FZ_STRUCTURE_H6; if (!strcmp(tag, "ol")) return FZ_STRUCTURE_LIST; if (!strcmp(tag, "ul")) return FZ_STRUCTURE_LIST; if (!strcmp(tag, "dl")) return FZ_STRUCTURE_LIST; if (!strcmp(tag, "li")) return FZ_STRUCTURE_LISTITEM; if (!strcmp(tag, "table")) return FZ_STRUCTURE_TABLE; if (!strcmp(tag, "tr")) return FZ_STRUCTURE_TR; if (!strcmp(tag, "th")) return FZ_STRUCTURE_TH; if (!strcmp(tag, "td")) return FZ_STRUCTURE_TD; if (!strcmp(tag, "thead")) return FZ_STRUCTURE_THEAD; if (!strcmp(tag, "tbody")) return FZ_STRUCTURE_TBODY; if (!strcmp(tag, "tfoot")) return FZ_STRUCTURE_TFOOT; return FZ_STRUCTURE_INVALID; } static void fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level) { while (box) { indent(level); printf("box "); switch (box->type) { case BOX_BLOCK: printf("block"); break; case BOX_FLOW: printf("flow"); break; case BOX_INLINE: printf("inline"); break; case BOX_TABLE: printf("table"); break; case BOX_TABLE_ROW: printf("table-row"); break; case BOX_TABLE_CELL: printf("table-cell"); break; } printf(" <%s>", box->tag); // printf(" em=%g", box->em); // printf(" x=%g y=%g w=%g b=%g", box->x, box->y, box->w, box->b); if (box->is_first_flow) printf(" is-first-flow"); if (box->list_item) printf(" list=%d", box->list_item); if (box->id) printf(" id=(%s)", box->id); if (box->href) printf(" href=(%s)", box->href); printf("\n"); if (box->type == BOX_BLOCK || box->type == BOX_TABLE) { indent(level+1); printf(">margin=(%g %g %g %g)\n", box->u.block.margin[0], box->u.block.margin[1], box->u.block.margin[2], box->u.block.margin[3]); //indent(level+1); //printf(">padding=(%g %g %g %g)\n", box->u.block.padding[0], box->u.block.padding[1], box->u.block.padding[2], box->u.block.padding[3]); //indent(level+1); //printf(">border=(%g %g %g %g)\n", box->u.block.border[0], box->u.block.border[1], box->u.block.border[2], box->u.block.border[3]); } if (box->down) fz_debug_html_box(ctx, box->down, level + 1); if (box->type == BOX_FLOW) { indent(level+1); printf("flow\n"); fz_debug_html_flow(ctx, box->u.flow.head, level + 2); } box = box->next; } } void fz_debug_html(fz_context *ctx, fz_html_box *box) { fz_debug_html_box(ctx, box, 0); } static size_t fz_html_size(fz_context *ctx, fz_html *html) { return html ? fz_pool_size(ctx, html->tree.pool) : 0; } /* Magic to make html storable. */ typedef struct { int refs; void *doc; int chapter_num; } fz_html_key; static int fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_) { fz_html_key *key = (fz_html_key *)key_; hash->u.pi.ptr = key->doc; hash->u.pi.i = key->chapter_num; return 1; } static void * fz_keep_html_key(fz_context *ctx, void *key_) { fz_html_key *key = (fz_html_key *)key_; return fz_keep_imp(ctx, key, &key->refs); } static void fz_drop_html_key(fz_context *ctx, void *key_) { fz_html_key *key = (fz_html_key *)key_; if (fz_drop_imp(ctx, key, &key->refs)) { fz_free(ctx, key); } } static int fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_) { fz_html_key *k0 = (fz_html_key *)k0_; fz_html_key *k1 = (fz_html_key *)k1_; return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num; } static void fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_) { fz_html_key *key = (fz_html_key *)key_; fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num); } static const fz_store_type fz_html_store_type = { "fz_html", fz_make_hash_html_key, fz_keep_html_key, fz_drop_html_key, fz_cmp_html_key, fz_format_html_key, NULL }; fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter) { fz_html_key *key = NULL; fz_html *other_html; /* Stick the parsed html in the store */ fz_var(key); fz_try(ctx) { key = fz_malloc_struct(ctx, fz_html_key); key->refs = 1; key->doc = doc; key->chapter_num = chapter; other_html = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type); if (other_html) { fz_drop_html(ctx, html); html = other_html; } } fz_always(ctx) fz_drop_html_key(ctx, key); fz_catch(ctx) { /* Do nothing */ } return html; } fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter) { fz_html_key key; key.refs = 1; key.doc = doc; key.chapter_num = chapter; return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type); } static int html_filter_store(fz_context *ctx, void *doc, void *key_) { fz_html_key *key = (fz_html_key *)key_; return (doc == key->doc); } void fz_purge_stored_html(fz_context *ctx, void *doc) { fz_filter_store(ctx, html_filter_store, doc, &fz_html_store_type); } static void convert_to_boxes(fz_context *ctx, fz_story *story) { warning_save saved = { 0 }; if (story->dom == NULL) return; fz_var(saved); fz_try(ctx) { redirect_warnings_to_buffer(ctx, story->warnings, &saved); xml_to_boxes(ctx, story->font_set, story->zip, ".", story->user_css, story->dom, &story->tree, NULL, 0, 0); } fz_always(ctx) { fz_drop_xml(ctx, story->dom); story->dom = NULL; restore_warnings(ctx, &saved); } fz_catch(ctx) fz_rethrow(ctx); } int fz_place_story(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled) { return fz_place_story_flags(ctx, story, where, filled, 0); } int fz_place_story_flags(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled, int flags) { float w, h; if (filled) *filled = fz_empty_rect; if (story == NULL || story->complete) return 0; /* Convert from XML to box model on the first attempt to place. * The DOM is unusable from here on in. */ convert_to_boxes(ctx, story); w = where.x1 - where.x0; h = where.y1 - where.y0; /* Confusingly, we call the layout using restart_draw, not restart_place, * because we don't want to destroy the current values in restart_place * in case we have to retry later. This means the values are left in * the correct struct though! */ story->restart_draw.start = story->restart_place.start; story->restart_draw.start_flow = story->restart_place.start_flow; story->restart_draw.end = NULL; story->restart_draw.end_flow = NULL; story->restart_draw.reason = FZ_HTML_RESTART_REASON_NONE; story->restart_draw.flags = flags; story->bbox = where; fz_restartable_layout_html(ctx, &story->tree, where.x0, where.y0, w, h, story->em, &story->restart_draw); story->restart_draw.start = story->restart_place.start; story->restart_draw.start_flow = story->restart_place.start_flow; if (filled) { fz_html_box *b = story->tree.root; filled->x0 = b->s.layout.x - b->u.block.margin[L] - b->u.block.border[L] - b->u.block.padding[L]; filled->x1 = b->s.layout.w + b->u.block.margin[R] + b->u.block.border[R] + b->u.block.padding[R] + b->s.layout.x; filled->y0 = b->s.layout.y - b->u.block.margin[T] - b->u.block.border[T] - b->u.block.padding[T]; filled->y1 = b->s.layout.b + b->u.block.margin[B] + b->u.block.border[B] + b->u.block.padding[B]; } #ifndef NDEBUG if (fz_atoi(getenv("FZ_DEBUG_HTML"))) fz_debug_html(ctx, story->tree.root); #endif if (story->restart_draw.end == NULL) return FZ_HTML_RESTART_REASON_NONE; if (story->restart_draw.reason == FZ_HTML_RESTART_REASON_LINE_WIDTH) return FZ_HTML_RESTART_REASON_LINE_WIDTH; return FZ_HTML_RESTART_REASON_LINE_HEIGHT; } const char * fz_story_warnings(fz_context *ctx, fz_story *story) { unsigned char *data; if (!story) return NULL; convert_to_boxes(ctx, story); fz_terminate_buffer(ctx, story->warnings); if (fz_buffer_storage(ctx, story->warnings, &data) == 0) return NULL; return (const char *)data; }
