Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/html/html-parse.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 #include "mupdf/ucdn.h" | |
| 25 #include "html-imp.h" | |
| 26 | |
| 27 #include <string.h> | |
| 28 #include <stdio.h> | |
| 29 #include <assert.h> | |
| 30 | |
| 31 enum { T, R, B, L }; /* indices 0..3; presumably Top, Right, Bottom, Left of a box -- TODO confirm, no use is visible in this chunk */ | |
| 32 | |
| 33 #define DEFAULT_DIR FZ_BIDI_LTR /* default bidi direction is FZ_BIDI_LTR; the users of this macro are outside this chunk */ | |
| 34 | |
| 35 static const char *html_default_css = /* Built-in default CSS rules for HTML elements. The adjacent string literals below are concatenated by the compiler into one stylesheet string. Where it is applied is not visible in this chunk. */ | |
| 36 "@page{margin:3em 2em}" | |
| 37 "a{color:#06C;text-decoration:underline}" | |
| 38 "address{display:block;font-style:italic}" | |
| 39 "b{font-weight:bold}" | |
| 40 "bdo{direction:rtl;unicode-bidi:bidi-override}" | |
| 41 "blockquote{display:block;margin:1em 40px}" | |
| 42 "body{display:block;margin:1em}" | |
| 43 "cite{font-style:italic}" | |
| 44 "code{font-family:monospace}" | |
| 45 "dd{display:block;margin:0 0 0 40px}" | |
| 46 "del{text-decoration:line-through}" | |
| 47 "div{display:block}" | |
| 48 "dl{display:block;margin:1em 0}" | |
| 49 "dt{display:block}" | |
| 50 "em{font-style:italic}" | |
| 51 "h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}" | |
| 52 "h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}" | |
| 53 "h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}" | |
| 54 "h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}" | |
| 55 "h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}" | |
| 56 "h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}" | |
| 57 "head{display:none}" | |
| 58 "hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}" | |
| 59 "html{display:block}" | |
| 60 "i{font-style:italic}" | |
| 61 "ins{text-decoration:underline}" | |
| 62 "kbd{font-family:monospace}" | |
| 63 "li{display:list-item}" | |
| 64 "menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}" | |
| 65 "ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}" | |
| 66 "p{display:block;margin:1em 0}" | |
| 67 "pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}" | |
| 68 "samp{font-family:monospace}" | |
| 69 "script{display:none}" | |
| 70 "small{font-size:0.83em}" | |
| 71 "strong{font-weight:bold}" | |
| 72 "style{display:none}" | |
| 73 "sub{font-size:0.83em;vertical-align:sub}" | |
| 74 "sup{font-size:0.83em;vertical-align:super}" | |
| 75 "table{display:table;border-spacing:2px}" | |
| 76 "tbody{display:table-row-group}" | |
| 77 "td{display:table-cell;padding:1px;background-color:inherit}" | |
| 78 "tfoot{display:table-footer-group}" | |
| 79 "th{display:table-cell;font-weight:bold;padding:1px;text-align:center;background-color:inherit}" | |
| 80 "thead{display:table-header-group}" | |
| 81 "tr{display:table-row}" | |
| 82 "ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}" | |
| 83 "ul ul{list-style-type:circle}" | |
| 84 "ul ul ul{list-style-type:square}" | |
| 85 "var{font-style:italic}" | |
| 86 "colgroup{display:table-column-group}" | |
| 87 "col{display:table-column}" | |
| 88 "caption{display:block;text-align:center}" | |
| 89 ; | |
| 90 | |
| 91 static const char *mobi_default_css = /* Additional default CSS rules for MOBI content (pagebreak, center, big, strike, zeroed list/paragraph margins). NOTE(review): presumably layered on top of html_default_css -- the code that applies it is outside this chunk. */ | |
| 92 "pagebreak{display:block;page-break-before:always}" | |
| 93 "dl,ol,ul{margin:0}" | |
| 94 "p{margin:0}" | |
| 95 "blockquote{margin:0 40px}" | |
| 96 "center{display:block;text-align:center}" | |
| 97 "big{font-size:1.17em}" | |
| 98 "strike{text-decoration:line-through}" | |
| 99 ; | |
| 100 | |
| 101 static const char *fb2_default_css = /* Built-in default CSS rules for FictionBook (FB2) elements; one stylesheet string built from the adjacent literals below. Where it is applied is not visible in this chunk. */ | |
| 102 "@page{margin:3em 2em}" | |
| 103 "FictionBook{display:block;margin:1em}" | |
| 104 "stylesheet,binary{display:none}" | |
| 105 "description>*{display:none}" | |
| 106 "description>title-info{display:block}" | |
| 107 "description>title-info>*{display:none}" | |
| 108 "description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}" | |
| 109 "body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}" | |
| 110 "image{display:block}" | |
| 111 "p>image{display:inline}" | |
| 112 "table{display:table}" | |
| 113 "tr{display:table-row}" | |
| 114 "th,td{display:table-cell}" | |
| 115 "a{color:#06C;text-decoration:underline}" | |
| 116 "a[type=note]{font-size:small;vertical-align:super}" | |
| 117 "code{white-space:pre;font-family:monospace}" | |
| 118 "emphasis{font-style:italic}" | |
| 119 "strikethrough{text-decoration:line-through}" | |
| 120 "strong{font-weight:bold}" | |
| 121 "sub{font-size:small;vertical-align:sub}" | |
| 122 "sup{font-size:small;vertical-align:super}" | |
| 123 "image{margin:1em 0;text-align:center}" | |
| 124 "cite,poem{margin:1em 2em}" | |
| 125 "subtitle,epigraph,stanza{margin:1em 0}" | |
| 126 "title>p{text-align:center;font-size:x-large}" | |
| 127 "subtitle{text-align:center;font-size:large}" | |
| 128 "p{margin-top:1em;text-align:justify}" | |
| 129 "empty-line{padding-top:1em}" | |
| 130 "p+p{margin-top:0;text-indent:1.5em}" | |
| 131 "empty-line+p{margin-top:0}" | |
| 132 "section>title{page-break-before:always}" | |
| 133 ; | |
| 134 | |
/*
	All HTML tag names, sorted in strcmp() order.
	The order is load-bearing: find_known_html_tag() binary-searches this table.
	TODO: add known FB2 tags?
*/
static const char *known_html_tags[] = {
	"a", "abbr", "acronym", "address", "annotation-xml", "applet", "area",
	"article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo",
	"bgsound", "big", "blink", "blockquote", "body", "br", "button",
	"canvas", "caption", "center", "cite", "code", "col", "colgroup",
	"data", "datalist", "dd", "del", "desc", "details", "dfn", "dir",
	"div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure",
	"font", "footer", "foreignobject", "form", "frame", "frameset", "h1",
	"h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
	"i", "iframe", "image", "img", "input", "ins", "isindex", "kbd",
	"keygen", "label", "legend", "li", "link", "listing", "main",
	"malignmark", "map", "mark", "marquee", "math", "menu", "menuitem",
	"meta", "meter", "mglyph", "mi", "mn", "mo", "ms", "mtext", "multicol",
	"nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object",
	"ol", "optgroup", "option", "output", "p", "param", "plaintext", "pre",
	"progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp",
	"script", "section", "select", "small", "source", "spacer", "span",
	"strike", "strong", "style", "sub", "summary", "sup", "svg", "table",
	"tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time",
	"title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp",
};

/*
	FictionBook (FB2) tag names, sorted in strcmp() order (upper case sorts
	before lower case). find_known_fb2_tag() binary-searches this table.
*/
static const char *known_fb2_tags[] = {
	"FictionBook", "a", "binary", "body", "cite", "code", "coverpage",
	"date", "description", "emphasis", "empty-line", "epigraph", "image",
	"p", "poem", "section", "stanza", "strikethrough", "strong",
	"stylesheet", "sub", "subtitle", "sup", "table", "td", "text-author",
	"th", "title", "title-info", "tr", "v",
};

/*
	Binary search for 'tag' in a strcmp()-sorted table of 'count' strings.

	Returns the table's own (static) copy of the string, or NULL if the tag
	is not present. The search covers the whole table, indices 0..count-1.
*/
static const char *find_known_tag(const char * const *tags, int count, const char *tag)
{
	int lo = 0;
	int hi = count - 1;
	while (lo <= hi)
	{
		int mid = lo + ((hi - lo) >> 1);
		int c = strcmp(tag, tags[mid]);
		if (c < 0)
			hi = mid - 1;
		else if (c > 0)
			lo = mid + 1;
		else
			return tags[mid];
	}
	return NULL;
}

/*
	Look up an HTML tag name (case sensitive).
	Returns a pointer to a static string equal to 'tag', or NULL if unknown.
*/
static const char *find_known_html_tag(const char *tag)
{
	/* The upper bound used to be nelem/2 - 1, which hid the second half of the table. */
	return find_known_tag(known_html_tags,
		(int)(sizeof known_html_tags / sizeof known_html_tags[0]), tag);
}

/*
	Look up an FB2 tag name (case sensitive).
	Returns a pointer to a static string equal to 'tag', or NULL if unknown.
*/
static const char *find_known_fb2_tag(const char *tag)
{
	return find_known_tag(known_fb2_tags,
		(int)(sizeof known_fb2_tags / sizeof known_fb2_tags[0]), tag);
}
| 202 | |
| 203 struct genstate /* State threaded through the box/flow generation functions in this file. */ | |
| 204 { | |
| 205 fz_pool *pool; /* pool that boxes, flow nodes and duplicated strings are allocated from */ | |
| 206 fz_html_font_set *set; | |
| 207 fz_archive *zip; | |
| 208 fz_tree *images; | |
| 209 fz_xml_doc *xml; | |
| 210 int is_fb2; /* non-zero: new_box() also consults the FB2 tag table and reads l:href / xlink:href instead of href */ | |
| 211 const char *base_uri; | |
| 212 fz_css *css; | |
| 213 int at_bol; /* set after a forced break or when a new flow box is created; cleared once a word or image is emitted */ | |
| 214 fz_html_box *emit_white; /* box owning a pending collapsed space; flush_space() emits it (unless at_bol) and clears it */ | |
| 215 int last_brk_cls; /* line break class of the previous character; row index into pairbrk */ | |
| 216 | |
| 217 int list_counter; | |
| 218 int section_depth; | |
| 219 fz_bidi_direction markup_dir; /* copied into every box made by new_box() */ | |
| 220 fz_text_language markup_lang; /* passed to generate_text() as the language of new words */ | |
| 221 char *href; /* most recent anchor href (pool copy); new_box() stores it on each box while it is non-NULL */ | |
| 222 | |
| 223 fz_css_style_splay *styles; /* handed to fz_css_enlist() when a box style is registered */ | |
| 224 }; | |
| 225 | |
/* Return 1 if c is a space, tab, carriage return or line feed; 0 otherwise. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
| 230 | |
/* Return 1 if every character of s is whitespace per iswhite() (also for the empty string), else 0. */
static int is_all_white(const char *s)
{
	const char *p;

	for (p = s; *p != 0; ++p)
	{
		if (!iswhite(*p))
			return 0;
	}
	return 1;
}
| 241 | |
| 242 /* TODO: pool allocator for flow nodes */ | |
| 243 /* TODO: store text by pointing to a giant buffer */ | |
| 244 | |
| 245 static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow) | |
| 246 { | |
| 247 while (flow) | |
| 248 { | |
| 249 fz_html_flow *next = flow->next; | |
| 250 if (flow->type == FLOW_IMAGE) | |
| 251 fz_drop_image(ctx, flow->content.image); | |
| 252 flow = next; | |
| 253 } | |
| 254 } | |
| 255 | |
| 256 static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras) | |
| 257 { | |
| 258 size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras); | |
| 259 fz_html_flow *flow; | |
| 260 | |
| 261 /* Shouldn't happen, but bug 705324. */ | |
| 262 if (top == NULL || top->type != BOX_FLOW) | |
| 263 return NULL; | |
| 264 | |
| 265 flow = fz_pool_alloc(ctx, pool, size); | |
| 266 flow->type = type; | |
| 267 flow->expand = 0; | |
| 268 flow->bidi_level = 0; | |
| 269 flow->markup_lang = 0; | |
| 270 flow->breaks_line = 0; | |
| 271 flow->box = inline_box; | |
| 272 (*top->s.build.flow_tail) = flow; | |
| 273 top->s.build.flow_tail = &flow->next; | |
| 274 return flow; | |
| 275 } | |
| 276 | |
| 277 static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) | |
| 278 { | |
| 279 fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0); | |
| 280 if (flow) | |
| 281 flow->expand = 1; | |
| 282 } | |
| 283 | |
| 284 static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) | |
| 285 { | |
| 286 (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0); | |
| 287 } | |
| 288 | |
| 289 static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) | |
| 290 { | |
| 291 (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0); | |
| 292 } | |
| 293 | |
| 294 static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) | |
| 295 { | |
| 296 (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0); | |
| 297 } | |
| 298 | |
| 299 static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang) | |
| 300 { | |
| 301 fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1); | |
| 302 if (flow == NULL) | |
| 303 return; | |
| 304 memcpy(flow->content.text, a, b - a); | |
| 305 flow->content.text[b - a] = 0; | |
| 306 flow->markup_lang = lang; | |
| 307 } | |
| 308 | |
| 309 static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img) | |
| 310 { | |
| 311 fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0); | |
| 312 if (flow) | |
| 313 flow->content.image = fz_keep_image(ctx, img); | |
| 314 } | |
| 315 | |
| 316 static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) | |
| 317 { | |
| 318 (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0); | |
| 319 } | |
| 320 | |
| 321 fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset) | |
| 322 { | |
| 323 fz_html_flow *new_flow; | |
| 324 char *text; | |
| 325 size_t len; | |
| 326 | |
| 327 assert(flow->type == FLOW_WORD); | |
| 328 | |
| 329 if (offset == 0) | |
| 330 return flow; | |
| 331 text = flow->content.text; | |
| 332 while (*text && offset) | |
| 333 { | |
| 334 int rune; | |
| 335 text += fz_chartorune(&rune, text); | |
| 336 offset--; | |
| 337 } | |
| 338 len = strlen(text); | |
| 339 new_flow = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + len+1); | |
| 340 memcpy(new_flow, flow, offsetof(fz_html_flow, content)); | |
| 341 new_flow->next = flow->next; | |
| 342 flow->next = new_flow; | |
| 343 strcpy(new_flow->content.text, text); | |
| 344 *text = 0; | |
| 345 return new_flow; | |
| 346 } | |
| 347 | |
| 348 static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g) | |
| 349 { | |
| 350 static const char *space = " "; | |
| 351 fz_pool *pool = g->pool; | |
| 352 if (g->emit_white) | |
| 353 { | |
| 354 int bsp = g->emit_white->style->white_space & WS_ALLOW_BREAK_SPACE; | |
| 355 if (!g->at_bol) | |
| 356 { | |
| 357 if (bsp) | |
| 358 add_flow_space(ctx, pool, flow, g->emit_white); | |
| 359 else | |
| 360 add_flow_word(ctx, pool, flow, g->emit_white, space, space+1, lang); | |
| 361 } | |
| 362 g->emit_white = 0; | |
| 363 } | |
| 364 } | |
| 365 | |
| 366 /* pair-wise lookup table for UAX#14 linebreaks | |
| 367 The linebreak table entries mean: | |
| 368 ^ prohibited break | |
| 369 never break before A and after B, even with one or more spaces in between | |
| 370 % indirect break | |
| 371 do not break before A, unless one or more spaces follow B | |
| 372 _ direct break | |
| 373 break allowed before A | |
| 374 */ | |
| 375 static const char *pairbrk[32] = /* used in generate_text() as pairbrk[class of previous character][class of current character]; each row is one string with one column per class, in the same order as the rows */ | |
| 376 { | |
| 377 /* -OCCQGNESIPPNAHIIHBBBZCWHHJJJREEZ- */ | |
| 378 /* -PLPULSXYSROULLDNYAB2WMJ23LVTIBMW- */ | |
| 379 /* - J- */ | |
| 380 "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */ | |
| 381 "_^^%%^^^^%%____%%%__^^^________%", /* CL close punctuation */ | |
| 382 "_^^%%^^^^%%%%%_%%%__^^^________%", /* CP close parenthesis */ | |
| 383 "^^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* QU quotation */ | |
| 384 "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* GL non-breaking glue */ | |
| 385 "_^^%%%^^^______%%%__^^^________%", /* NS nonstarters */ | |
| 386 "_^^%%%^^^______%%%__^^^________%", /* EX exclamation/interrogation */ | |
| 387 "_^^%%%^^^__%_%_%%%__^^^________%", /* SY symbols allowing break after */ | |
| 388 "_^^%%%^^^__%%%_%%%__^^^________%", /* IS infix numeric separator */ | |
| 389 "%^^%%%^^^__%%%%%%%__^^^%%%%%_%%%", /* PR prefix numeric */ | |
| 390 "%^^%%%^^^__%%%_%%%__^^^________%", /* PO postfix numeric */ | |
| 391 "%^^%%%^^^%%%%%_%%%__^^^________%", /* NU numeric */ | |
| 392 "%^^%%%^^^%%%%%_%%%__^^^________%", /* AL ordinary alphabetic and symbol characters */ | |
| 393 "%^^%%%^^^%%%%%_%%%__^^^________%", /* HL hebrew letter */ | |
| 394 "_^^%%%^^^_%____%%%__^^^________%", /* ID ideographic */ | |
| 395 "_^^%%%^^^______%%%__^^^________%", /* IN inseparable characters */ | |
| 396 "_^^%_%^^^__%___%%%__^^^________%", /* HY hyphens */ | |
| 397 "_^^%_%^^^______%%%__^^^________%", /* BA break after */ | |
| 398 "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* BB break before */ | |
| 399 "_^^%%%^^^______%%%_^^^^________%", /* B2 break opportunity before and after */ | |
| 400 "____________________^___________", /* ZW zero width space */ | |
| 401 "%^^%%%^^^%_%%%_%%%__^^^________%", /* CM combining mark */ | |
| 402 "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* WJ word joiner */ | |
| 403 "_^^%%%^^^_%____%%%__^^^___%%___%", /* H2 hangul leading/vowel syllable */ | |
| 404 "_^^%%%^^^_%____%%%__^^^____%___%", /* H3 hangul leading/vowel/trailing syllable */ | |
| 405 "_^^%%%^^^_%____%%%__^^^%%%%____%", /* JL hangul leading jamo */ | |
| 406 "_^^%%%^^^_%____%%%__^^^___%%___%", /* JV hangul vowel jamo */ | |
| 407 "_^^%%%^^^_%____%%%__^^^____%___%", /* JT hangul trailing jamo */ | |
| 408 "_^^%%%^^^______%%%__^^^_____%__%", /* RI regional indicator */ | |
| 409 "_^^%%%^^^_%____%%%__^^^_______%%", /* EB emoji base */ | |
| 410 "_^^%%%^^^_%____%%%__^^^________%", /* EM emoji modifier */ | |
| 411 "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* ZWJ zero width joiner */ | |
| 412 }; | |
| 413 | |
| 414 static fz_html_box * | |
| 415 find_flow_encloser(fz_context *ctx, fz_html_box *flow) | |
| 416 { | |
| 417 /* This code was written to assume that there will always be a | |
| 418 * flow box enclosing callers of this. Bug 705324 shows that | |
| 419 * this isn't always the case. In the absence of a reproducer | |
| 420 * file, all I can do is try to patch around the issue so that | |
| 421 * we won't crash. */ | |
| 422 while (flow->type != BOX_FLOW) | |
| 423 { | |
| 424 if (flow->up == NULL) | |
| 425 { | |
| 426 fz_warn(ctx, "Flow encloser not found. Please report this file!"); | |
| 427 break; | |
| 428 } | |
| 429 flow = flow->up; | |
| 430 } | |
| 431 return flow; | |
| 432 } | |
| 433 | |
| 434 static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g) /* Split 'text' into flow nodes (words, spaces, forced breaks, soft breaks, soft hyphens) appended to the flow box enclosing 'box', following the white-space flags of box->style. Updates g->at_bol, g->emit_white and g->last_brk_cls as it goes. */ | |
| 435 { | |
| 436 fz_html_box *flow; | |
| 437 fz_pool *pool = g->pool; | |
| 438 int collapse = box->style->white_space & WS_COLLAPSE; | |
| 439 int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE; | |
| 440 int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE; | |
| 441 | |
| 442 static const char *space = " "; | |
| 443 | |
| 444 flow = find_flow_encloser(ctx, box); | |
| 445 if (flow == NULL) /* defensive only: find_flow_encloser() as written in this file never returns NULL for a non-NULL box */ | |
| 446 return; | |
| 447 | |
| 448 while (*text) | |
| 449 { | |
| 450 if (bnl && (*text == '\n' || *text == '\r')) /* newline forces a line break in this white-space mode */ | |
| 451 { | |
| 452 if (text[0] == '\r' && text[1] == '\n') /* CR LF counts as a single break */ | |
| 453 text += 2; | |
| 454 else | |
| 455 text += 1; | |
| 456 add_flow_break(ctx, pool, flow, box); | |
| 457 g->at_bol = 1; | |
| 458 } | |
| 459 else if (iswhite(*text)) | |
| 460 { | |
| 461 if (collapse) /* swallow the whole run; the single space is emitted lazily by flush_space() before the next word */ | |
| 462 { | |
| 463 if (bnl) /* stop at CR/LF so the branch above still sees them */ | |
| 464 while (*text == ' ' || *text == '\t') | |
| 465 ++text; | |
| 466 else | |
| 467 while (iswhite(*text)) | |
| 468 ++text; | |
| 469 g->emit_white = box; | |
| 470 } | |
| 471 else /* preserved whitespace: one node per whitespace character */ | |
| 472 { | |
| 473 // TODO: tabs | |
| 474 if (bsp) | |
| 475 add_flow_space(ctx, pool, flow, box); | |
| 476 else | |
| 477 add_flow_word(ctx, pool, flow, box, space, space+1, lang); | |
| 478 ++text; | |
| 479 } | |
| 480 g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */ | |
| 481 } | |
| 482 else /* a run of non-whitespace characters */ | |
| 483 { | |
| 484 const char *prev, *mark = text; /* mark: start of the word being accumulated; prev: start of the character just decoded */ | |
| 485 int c; | |
| 486 | |
| 487 flush_space(ctx, flow, lang, g); /* emit any pending collapsed space first */ | |
| 488 | |
| 489 if (g->at_bol) | |
| 490 g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; | |
| 491 | |
| 492 while (*text && !iswhite(*text)) | |
| 493 { | |
| 494 prev = text; | |
| 495 text += fz_chartorune(&c, text); | |
| 496 if (c == 0xAD) /* soft hyphen */ | |
| 497 { | |
| 498 if (mark != prev) | |
| 499 add_flow_word(ctx, pool, flow, box, mark, prev, lang); | |
| 500 add_flow_shyphen(ctx, pool, flow, box); | |
| 501 mark = text; /* the soft hyphen itself is not copied into any word */ | |
| 502 g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */ | |
| 503 } | |
| 504 else if (bsp) /* allow soft breaks */ | |
| 505 { | |
| 506 int this_brk_cls = ucdn_get_resolved_linebreak_class(c); | |
| 507 if (this_brk_cls <= UCDN_LINEBREAK_CLASS_ZWJ) /* NOTE(review): assumes UCDN_LINEBREAK_CLASS_ZWJ is the last row/column (index 31) of pairbrk -- confirm against ucdn.h */ | |
| 508 { | |
| 509 int brk = pairbrk[g->last_brk_cls][this_brk_cls]; | |
| 510 | |
| 511 /* we handle spaces elsewhere, so ignore these classes */ | |
| 512 if (brk == '@') brk = '^'; /* '@' and '#' do not occur in pairbrk as defined in this file */ | |
| 513 if (brk == '#') brk = '^'; | |
| 514 if (brk == '%') brk = '^'; | |
| 515 | |
| 516 if (brk == '_') /* direct break allowed before the current character */ | |
| 517 { | |
| 518 if (mark != prev) | |
| 519 add_flow_word(ctx, pool, flow, box, mark, prev, lang); | |
| 520 add_flow_sbreak(ctx, pool, flow, box); | |
| 521 mark = prev; /* the current character starts the next word */ | |
| 522 } | |
| 523 | |
| 524 g->last_brk_cls = this_brk_cls; | |
| 525 } | |
| 526 } | |
| 527 } | |
| 528 if (mark != text) /* flush the trailing word of the run */ | |
| 529 add_flow_word(ctx, pool, flow, box, mark, text, lang); | |
| 530 | |
| 531 g->at_bol = 0; | |
| 532 } | |
| 533 } | |
| 534 } | |
| 535 | |
| 536 static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src) | |
| 537 { | |
| 538 char path[2048]; | |
| 539 fz_image *img = NULL; | |
| 540 fz_buffer *buf = NULL; | |
| 541 | |
| 542 fz_var(img); | |
| 543 fz_var(buf); | |
| 544 | |
| 545 fz_try(ctx) | |
| 546 { | |
| 547 if (!strncmp(src, "data:image/jpeg;base64,", 23)) | |
| 548 buf = fz_new_buffer_from_base64(ctx, src+23, 0); | |
| 549 else if (!strncmp(src, "data:image/png;base64,", 22)) | |
| 550 buf = fz_new_buffer_from_base64(ctx, src+22, 0); | |
| 551 else if (!strncmp(src, "data:image/gif;base64,", 22)) | |
| 552 buf = fz_new_buffer_from_base64(ctx, src+22, 0); | |
| 553 else | |
| 554 { | |
| 555 fz_strlcpy(path, base_uri, sizeof path); | |
| 556 fz_strlcat(path, "/", sizeof path); | |
| 557 fz_strlcat(path, src, sizeof path); | |
| 558 fz_urldecode(path); | |
| 559 fz_cleanname(path); | |
| 560 buf = fz_read_archive_entry(ctx, zip, path); | |
| 561 } | |
| 562 #if FZ_ENABLE_SVG | |
| 563 if (strstr(src, ".svg")) | |
| 564 img = fz_new_image_from_svg(ctx, buf, base_uri, zip); | |
| 565 else | |
| 566 #endif | |
| 567 img = fz_new_image_from_buffer(ctx, buf); | |
| 568 } | |
| 569 fz_always(ctx) | |
| 570 fz_drop_buffer(ctx, buf); | |
| 571 fz_catch(ctx) | |
| 572 { | |
| 573 fz_ignore_error(ctx); | |
| 574 fz_warn(ctx, "html: cannot load image src='%s'", src); | |
| 575 } | |
| 576 | |
| 577 return img; | |
| 578 } | |
| 579 | |
| 580 static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, | |
| 581 fz_xml_doc *xmldoc, fz_xml *node) | |
| 582 { | |
| 583 fz_image *img = NULL; | |
| 584 #if FZ_ENABLE_SVG | |
| 585 fz_try(ctx) | |
| 586 img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip); | |
| 587 fz_catch(ctx) | |
| 588 { | |
| 589 fz_ignore_error(ctx); | |
| 590 fz_warn(ctx, "html: cannot load embedded svg document"); | |
| 591 } | |
| 592 #endif | |
| 593 return img; | |
| 594 } | |
| 595 | |
| 596 static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g) | |
| 597 { | |
| 598 fz_html_box *flow; | |
| 599 fz_pool *pool = g->pool; | |
| 600 | |
| 601 flow = find_flow_encloser(ctx, box); | |
| 602 | |
| 603 flush_space(ctx, flow, 0, g); | |
| 604 | |
| 605 if (!img) | |
| 606 { | |
| 607 const char *alt = "[image]"; | |
| 608 add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0); | |
| 609 } | |
| 610 else | |
| 611 { | |
| 612 fz_try(ctx) | |
| 613 { | |
| 614 add_flow_sbreak(ctx, pool, flow, box); | |
| 615 add_flow_image(ctx, pool, flow, box, img); | |
| 616 add_flow_sbreak(ctx, pool, flow, box); | |
| 617 } | |
| 618 fz_always(ctx) | |
| 619 { | |
| 620 fz_drop_image(ctx, img); | |
| 621 } | |
| 622 fz_catch(ctx) | |
| 623 fz_rethrow(ctx); | |
| 624 } | |
| 625 | |
| 626 g->at_bol = 0; | |
| 627 } | |
| 628 | |
| 629 static void fz_drop_html_box(fz_context *ctx, fz_html_box *box) | |
| 630 { | |
| 631 while (box) | |
| 632 { | |
| 633 fz_html_box *next = box->next; | |
| 634 if (box->type == BOX_FLOW) | |
| 635 fz_drop_html_flow(ctx, box->u.flow.head); | |
| 636 fz_drop_html_box(ctx, box->down); | |
| 637 box = next; | |
| 638 } | |
| 639 } | |
| 640 | |
| 641 static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor) | |
| 642 { | |
| 643 fz_html *html = (fz_html *)stor; | |
| 644 fz_drop_html_box(ctx, html->tree.root); | |
| 645 fz_drop_pool(ctx, html->tree.pool); | |
| 646 } | |
| 647 | |
| 648 static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor) | |
| 649 { | |
| 650 fz_story *story = (fz_story *)stor; | |
| 651 fz_free(ctx, story->user_css); | |
| 652 fz_drop_html_font_set(ctx, story->font_set); | |
| 653 fz_drop_xml(ctx, story->dom); | |
| 654 fz_drop_html_box(ctx, story->tree.root); | |
| 655 fz_drop_buffer(ctx, story->warnings); | |
| 656 fz_drop_archive(ctx, story->zip); | |
| 657 /* The pool must be the last thing dropped. */ | |
| 658 fz_drop_pool(ctx, story->tree.pool); | |
| 659 } | |
| 660 | |
| 661 /* Drop a structure derived from an html_tree. The exact things | |
| 662 * freed here will depend upon the drop function with which it | |
| 663 * was created. */ | |
| 664 static void | |
| 665 fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree) | |
| 666 { | |
| 667 fz_defer_reap_start(ctx); | |
| 668 fz_drop_storable(ctx, &tree->storable); | |
| 669 fz_defer_reap_end(ctx); | |
| 670 } | |
| 671 | |
| 672 void fz_drop_html(fz_context *ctx, fz_html *html) | |
| 673 { | |
| 674 fz_drop_html_tree(ctx, &html->tree); | |
| 675 } | |
| 676 | |
| 677 void fz_drop_story(fz_context *ctx, fz_story *story) | |
| 678 { | |
| 679 if (!story) | |
| 680 return; | |
| 681 | |
| 682 fz_drop_html_tree(ctx, &story->tree); | |
| 683 } | |
| 684 | |
| 685 fz_html *fz_keep_html(fz_context *ctx, fz_html *html) | |
| 686 { | |
| 687 return fz_keep_storable(ctx, &html->tree.storable); | |
| 688 } | |
| 689 | |
| 690 static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style) | |
| 691 { | |
| 692 fz_html_box *box; | |
| 693 const char *tag = fz_xml_tag(node); | |
| 694 const char *id = fz_xml_att(node, "id"); | |
| 695 const char *href; | |
| 696 | |
| 697 if (type == BOX_INLINE) | |
| 698 box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u)); | |
| 699 else if (type == BOX_FLOW) | |
| 700 box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.flow)); | |
| 701 else | |
| 702 box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.block)); | |
| 703 | |
| 704 box->type = type; | |
| 705 box->is_first_flow = 0; | |
| 706 box->markup_dir = g->markup_dir; | |
| 707 box->heading = 0; | |
| 708 box->list_item = 0; | |
| 709 | |
| 710 box->style = fz_css_enlist(ctx, style, &g->styles, g->pool); | |
| 711 | |
| 712 if (tag) | |
| 713 { | |
| 714 box->tag = find_known_html_tag(tag); | |
| 715 if (!box->tag && g->is_fb2) | |
| 716 box->tag = find_known_fb2_tag(tag); | |
| 717 if (!box->tag) | |
| 718 box->tag = fz_pool_strdup(ctx, g->pool, tag); | |
| 719 } | |
| 720 else | |
| 721 { | |
| 722 box->tag = "#anon"; | |
| 723 } | |
| 724 | |
| 725 if (id) | |
| 726 box->id = fz_pool_strdup(ctx, g->pool, id); | |
| 727 | |
| 728 if (tag && tag[0]=='a' && tag[1]==0) | |
| 729 { | |
| 730 // Support deprecated anchor syntax with id in "name" instead of "id" attribute. | |
| 731 if (!id) | |
| 732 { | |
| 733 const char *name = fz_xml_att(node, "name"); | |
| 734 if (name) | |
| 735 box->id = fz_pool_strdup(ctx, g->pool, name); | |
| 736 } | |
| 737 | |
| 738 if (g->is_fb2) | |
| 739 { | |
| 740 href = fz_xml_att(node, "l:href"); | |
| 741 if (!href) | |
| 742 href = fz_xml_att(node, "xlink:href"); | |
| 743 } | |
| 744 else | |
| 745 { | |
| 746 href = fz_xml_att(node, "href"); | |
| 747 } | |
| 748 if (href) | |
| 749 g->href = fz_pool_strdup(ctx, g->pool, href); | |
| 750 } | |
| 751 | |
| 752 if (g->href) | |
| 753 box->href = g->href; | |
| 754 | |
| 755 if (type == BOX_FLOW) | |
| 756 { | |
| 757 box->u.flow.head = NULL; | |
| 758 box->s.build.flow_tail = &box->u.flow.head; | |
| 759 } | |
| 760 | |
| 761 return box; | |
| 762 } | |
| 763 | |
| 764 static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child) | |
| 765 { | |
| 766 child->up = parent; | |
| 767 if (!parent->down) | |
| 768 parent->down = child; | |
| 769 if (parent->s.build.last_child) | |
| 770 parent->s.build.last_child->next = child; | |
| 771 parent->s.build.last_child = child; | |
| 772 } | |
| 773 | |
| 774 static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box) | |
| 775 { | |
| 776 while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) | |
| 777 box = box->up; | |
| 778 return box; | |
| 779 } | |
| 780 | |
| 781 static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box) | |
| 782 { | |
| 783 fz_html_box *look = box; | |
| 784 while (look && look->type != BOX_TABLE) | |
| 785 look = look->up; | |
| 786 if (look) | |
| 787 return look; | |
| 788 fz_warn(ctx, "table-row not inside table element"); | |
| 789 return NULL; | |
| 790 } | |
| 791 | |
| 792 static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box) | |
| 793 { | |
| 794 fz_html_box *look = box; | |
| 795 while (look && look->type != BOX_TABLE_ROW) | |
| 796 look = look->up; | |
| 797 if (look) | |
| 798 return look; | |
| 799 fz_warn(ctx, "table-cell not inside table-row element"); | |
| 800 return NULL; | |
| 801 } | |
| 802 | |
| 803 static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box) | |
| 804 { | |
| 805 fz_css_style style; | |
| 806 fz_html_box *flow_box; | |
| 807 | |
| 808 if (box->type == BOX_FLOW || box->type == BOX_INLINE) | |
| 809 return box; | |
| 810 | |
| 811 // We have an inline element that is not in an existing flow/inline context. | |
| 812 | |
| 813 // Find the closest block level box to insert content into. | |
| 814 while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) | |
| 815 box = box->up; | |
| 816 | |
| 817 // Concatenate onto the last open flow box if we have one. | |
| 818 if (box->s.build.last_child && box->s.build.last_child->type == BOX_FLOW) | |
| 819 return box->s.build.last_child; | |
| 820 | |
| 821 // No flow box found, create and insert one! | |
| 822 | |
| 823 // TODO: null style instead of default for flow box? | |
| 824 fz_default_css_style(ctx, &style); | |
| 825 flow_box = new_box(ctx, g, NULL, BOX_FLOW, &style); | |
| 826 flow_box->is_first_flow = !box->down; | |
| 827 g->at_bol = 1; | |
| 828 | |
| 829 append_box(ctx, box, flow_box); | |
| 830 | |
| 831 return flow_box; | |
| 832 } | |
| 833 | |
| 834 static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match); /* forward declaration; the definition is not part of this chunk */ | |
| 835 | |
| 836 static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) | |
| 837 { | |
| 838 fz_html_box *anon_box; | |
| 839 fz_css_style style; | |
| 840 const char *text; | |
| 841 int collapse; | |
| 842 | |
| 843 text = fz_xml_text(node); | |
| 844 collapse = root_box->style->white_space & WS_COLLAPSE; | |
| 845 if (collapse && is_all_white(text)) | |
| 846 { | |
| 847 g->emit_white = root_box; | |
| 848 } | |
| 849 else | |
| 850 { | |
| 851 if (root_box->type != BOX_INLINE) | |
| 852 { | |
| 853 /* Create anonymous inline box, with the same style as the top block box. */ | |
| 854 style = *root_box->style; | |
| 855 | |
| 856 // Make sure not to recursively multiply font sizes | |
| 857 style.font_size.value = 1; | |
| 858 style.font_size.unit = N_SCALE; | |
| 859 | |
| 860 root_box = find_inline_context(ctx, g, root_box); | |
| 861 anon_box = new_box(ctx, g, NULL, BOX_INLINE, &style); | |
| 862 append_box(ctx, root_box, anon_box); | |
| 863 root_box = anon_box; | |
| 864 } | |
| 865 | |
| 866 generate_text(ctx, root_box, text, g->markup_lang, g); | |
| 867 } | |
| 868 } | |
| 869 | |
| 870 static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) | |
| 871 { | |
| 872 fz_html_box *this_box; | |
| 873 fz_html_box *flow_box; | |
| 874 root_box = find_inline_context(ctx, g, root_box); | |
| 875 this_box = new_box(ctx, g, node, BOX_INLINE, style); | |
| 876 append_box(ctx, root_box, this_box); | |
| 877 if (this_box->id) | |
| 878 { | |
| 879 flow_box = find_flow_encloser(ctx, this_box); | |
| 880 add_flow_anchor(ctx, g->pool, flow_box, this_box); | |
| 881 } | |
| 882 return this_box; | |
| 883 } | |
| 884 | |
| 885 static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) | |
| 886 { | |
| 887 fz_html_box *this_box; | |
| 888 fz_html_box *flow_box; | |
| 889 | |
| 890 if (root_box->type != BOX_INLINE) | |
| 891 { | |
| 892 /* Create inline box to hold the <br> tag, with the same style as containing block. */ | |
| 893 /* Make sure not to recursively multiply font sizes. */ | |
| 894 fz_css_style style = *root_box->style; | |
| 895 style.font_size.value = 1; | |
| 896 style.font_size.unit = N_SCALE; | |
| 897 this_box = new_box(ctx, g, node, BOX_INLINE, &style); | |
| 898 append_box(ctx, find_inline_context(ctx, g, root_box), this_box); | |
| 899 } | |
| 900 else | |
| 901 { | |
| 902 this_box = root_box; | |
| 903 } | |
| 904 | |
| 905 flow_box = find_flow_encloser(ctx, this_box); | |
| 906 add_flow_break(ctx, g->pool, flow_box, this_box); | |
| 907 g->at_bol = 1; | |
| 908 } | |
| 909 | |
| 910 static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) | |
| 911 { | |
| 912 fz_html_box *this_box; | |
| 913 root_box = find_block_context(ctx, root_box); | |
| 914 this_box = new_box(ctx, g, node, BOX_BLOCK, style); | |
| 915 append_box(ctx, root_box, this_box); | |
| 916 return this_box; | |
| 917 } | |
| 918 | |
| 919 static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) | |
| 920 { | |
| 921 fz_html_box *this_box; | |
| 922 root_box = find_block_context(ctx, root_box); | |
| 923 this_box = new_box(ctx, g, node, BOX_TABLE, style); | |
| 924 append_box(ctx, root_box, this_box); | |
| 925 return this_box; | |
| 926 } | |
| 927 | |
| 928 static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) | |
| 929 { | |
| 930 fz_html_box *this_box, *table_box; | |
| 931 | |
| 932 table_box = find_table_row_context(ctx, root_box); | |
| 933 if (!table_box) | |
| 934 return gen2_block(ctx, g, root_box, node, style); | |
| 935 | |
| 936 this_box = new_box(ctx, g, node, BOX_TABLE_ROW, style); | |
| 937 append_box(ctx, table_box, this_box); | |
| 938 return this_box; | |
| 939 } | |
| 940 | |
| 941 static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) | |
| 942 { | |
| 943 fz_html_box *this_box, *row_box; | |
| 944 | |
| 945 row_box = find_table_cell_context(ctx, root_box); | |
| 946 if (!row_box) | |
| 947 return gen2_block(ctx, g, root_box, node, style); | |
| 948 | |
| 949 this_box = new_box(ctx, g, node, BOX_TABLE_CELL, style); | |
| 950 append_box(ctx, row_box, this_box); | |
| 951 return this_box; | |
| 952 } | |
| 953 | |
| 954 static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int display, fz_css_style *style) | |
| 955 { | |
| 956 fz_html_box *img_block_box; | |
| 957 fz_html_box *img_inline_box; | |
| 958 | |
| 959 if (display == DIS_INLINE || display == DIS_INLINE_BLOCK) | |
| 960 { | |
| 961 root_box = find_inline_context(ctx, g, root_box); | |
| 962 img_inline_box = new_box(ctx, g, node, BOX_INLINE, style); | |
| 963 append_box(ctx, root_box, img_inline_box); | |
| 964 generate_image(ctx, img_inline_box, img, g); | |
| 965 } | |
| 966 else | |
| 967 { | |
| 968 root_box = find_block_context(ctx, root_box); | |
| 969 img_block_box = new_box(ctx, g, node, BOX_BLOCK, style); | |
| 970 append_box(ctx, root_box, img_block_box); | |
| 971 | |
| 972 root_box = find_inline_context(ctx, g, img_block_box); | |
| 973 img_inline_box = new_box(ctx, g, NULL, BOX_INLINE, style); | |
| 974 append_box(ctx, root_box, img_inline_box); | |
| 975 generate_image(ctx, img_inline_box, img, g); | |
| 976 } | |
| 977 } | |
| 978 | |
| 979 static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) | |
| 980 { | |
| 981 const char *src = fz_xml_att(node, "src"); | |
| 982 if (src) | |
| 983 { | |
| 984 fz_css_style local_style = *style; | |
| 985 fz_image *img; | |
| 986 int w, h; | |
| 987 const char *w_att = fz_xml_att(node, "width"); | |
| 988 const char *h_att = fz_xml_att(node, "height"); | |
| 989 | |
| 990 if (w_att && (w = fz_atoi(w_att)) > 0) | |
| 991 { | |
| 992 local_style.width.value = w; | |
| 993 local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH; | |
| 994 } | |
| 995 if (h_att && (h = fz_atoi(h_att)) > 0) | |
| 996 { | |
| 997 local_style.height.value = h; | |
| 998 local_style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH; | |
| 999 } | |
| 1000 | |
| 1001 img = load_html_image(ctx, g->zip, g->base_uri, src); | |
| 1002 gen2_image_common(ctx, g, root_box, node, img, display, &local_style); | |
| 1003 } | |
| 1004 } | |
| 1005 | |
| 1006 static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) | |
| 1007 { | |
| 1008 const char *src = fz_xml_att(node, "l:href"); | |
| 1009 if (!src) | |
| 1010 src = fz_xml_att(node, "xlink:href"); | |
| 1011 if (src && src[0] == '#') | |
| 1012 { | |
| 1013 fz_image *img = fz_tree_lookup(ctx, g->images, src+1); | |
| 1014 gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style); | |
| 1015 } | |
| 1016 } | |
| 1017 | |
| 1018 static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) | |
| 1019 { | |
| 1020 fz_image *img = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node); | |
| 1021 gen2_image_common(ctx, g, root_box, node, img, display, style); | |
| 1022 } | |
| 1023 | |
| 1024 static int get_heading_from_tag(fz_context *ctx, struct genstate *g, const char *tag) | |
| 1025 { | |
| 1026 if (tag[0] == 'h' && tag[1] != 0 && tag[2] == 0) | |
| 1027 { | |
| 1028 switch (tag[1]) | |
| 1029 { | |
| 1030 case '1': return 1; | |
| 1031 case '2': return 2; | |
| 1032 case '3': return 3; | |
| 1033 case '4': return 4; | |
| 1034 case '5': return 5; | |
| 1035 case '6': return 6; | |
| 1036 } | |
| 1037 } | |
| 1038 if (g->is_fb2) | |
| 1039 { | |
| 1040 if (!strcmp(tag, "title") || !strcmp(tag, "subtitle")) | |
| 1041 return fz_mini(g->section_depth, 6); | |
| 1042 } | |
| 1043 return 0; | |
| 1044 } | |
| 1045 | |
| 1046 static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, | |
| 1047 fz_css_match *match, int display, fz_css_style *style) | |
| 1048 { | |
| 1049 fz_html_box *this_box; | |
| 1050 const char *tag; | |
| 1051 const char *lang_att; | |
| 1052 const char *dir_att; | |
| 1053 | |
| 1054 int save_markup_dir = g->markup_dir; | |
| 1055 int save_markup_lang = g->markup_lang; | |
| 1056 char *save_href = g->href; | |
| 1057 | |
| 1058 if (display == DIS_NONE) | |
| 1059 return; | |
| 1060 | |
| 1061 tag = fz_xml_tag(node); | |
| 1062 | |
| 1063 dir_att = fz_xml_att(node, "dir"); | |
| 1064 if (dir_att) | |
| 1065 { | |
| 1066 if (!strcmp(dir_att, "auto")) | |
| 1067 g->markup_dir = FZ_BIDI_NEUTRAL; | |
| 1068 else if (!strcmp(dir_att, "rtl")) | |
| 1069 g->markup_dir = FZ_BIDI_RTL; | |
| 1070 else if (!strcmp(dir_att, "ltr")) | |
| 1071 g->markup_dir = FZ_BIDI_LTR; | |
| 1072 else | |
| 1073 g->markup_dir = DEFAULT_DIR; | |
| 1074 } | |
| 1075 | |
| 1076 lang_att = fz_xml_att(node, "lang"); | |
| 1077 if (lang_att) | |
| 1078 g->markup_lang = fz_text_language_from_string(lang_att); | |
| 1079 | |
| 1080 switch (display) | |
| 1081 { | |
| 1082 case DIS_INLINE_BLOCK: | |
| 1083 // TODO handle inline block as a flow node | |
| 1084 this_box = gen2_block(ctx, g, root_box, node, style); | |
| 1085 break; | |
| 1086 | |
| 1087 case DIS_BLOCK: | |
| 1088 this_box = gen2_block(ctx, g, root_box, node, style); | |
| 1089 this_box->heading = get_heading_from_tag(ctx, g, tag); | |
| 1090 break; | |
| 1091 | |
| 1092 case DIS_LIST_ITEM: | |
| 1093 this_box = gen2_block(ctx, g, root_box, node, style); | |
| 1094 this_box->list_item = ++g->list_counter; | |
| 1095 break; | |
| 1096 | |
| 1097 // TODO: https://www.w3.org/TR/CSS2/tables.html#anonymous-boxes | |
| 1098 // | |
| 1099 // The table generation code should insert and create anonymous boxes | |
| 1100 // for any missing child/parent elements. | |
| 1101 // | |
| 1102 // MISSING CHILDREN: | |
| 1103 // 1: Wrap consecutive BLOCK found in a TABLE in an anon TABLE_ROW. | |
| 1104 // 2: Wrap consecutive BLOCK found in a TABLE_ROW in an anon TABLE_CELL. | |
| 1105 // | |
| 1106 // MISSING PARENTS: | |
| 1107 // 1: Wrap consecutive TABLE_CELL found outside TABLE_ROW in an anon TABLE_ROW | |
| 1108 // 2: Wrap consecutive TABLE_ROW found outside TABLE in an anon TABLE | |
| 1109 // | |
| 1110 // For now we ignore this and treat any such elements that are out of | |
| 1111 // context as plain block elements. | |
| 1112 | |
| 1113 case DIS_TABLE: | |
| 1114 this_box = gen2_table(ctx, g, root_box, node, style); | |
| 1115 break; | |
| 1116 case DIS_TABLE_GROUP: | |
| 1117 // no box for table-row-group elements | |
| 1118 this_box = root_box; | |
| 1119 break; | |
| 1120 case DIS_TABLE_ROW: | |
| 1121 this_box = gen2_table_row(ctx, g, root_box, node, style); | |
| 1122 break; | |
| 1123 case DIS_TABLE_CELL: | |
| 1124 this_box = gen2_table_cell(ctx, g, root_box, node, style); | |
| 1125 break; | |
| 1126 | |
| 1127 case DIS_INLINE: | |
| 1128 default: | |
| 1129 this_box = gen2_inline(ctx, g, root_box, node, style); | |
| 1130 break; | |
| 1131 } | |
| 1132 | |
| 1133 if (tag && (!strcmp(tag, "ol") || !strcmp(tag, "ul") || !strcmp(tag, "dl"))) | |
| 1134 { | |
| 1135 int save_list_counter = g->list_counter; | |
| 1136 g->list_counter = 0; | |
| 1137 gen2_children(ctx, g, this_box, node, match); | |
| 1138 g->list_counter = save_list_counter; | |
| 1139 } | |
| 1140 else if (tag && !strcmp(tag, "section")) | |
| 1141 { | |
| 1142 int save_section_depth = g->section_depth; | |
| 1143 g->section_depth++; | |
| 1144 gen2_children(ctx, g, this_box, node, match); | |
| 1145 g->section_depth = save_section_depth; | |
| 1146 } | |
| 1147 else | |
| 1148 { | |
| 1149 gen2_children(ctx, g, this_box, node, match); | |
| 1150 } | |
| 1151 | |
| 1152 g->markup_dir = save_markup_dir; | |
| 1153 g->markup_lang = save_markup_lang; | |
| 1154 g->href = save_href; | |
| 1155 } | |
| 1156 | |
| 1157 static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match) | |
| 1158 { | |
| 1159 fz_xml *node; | |
| 1160 const char *tag; | |
| 1161 fz_css_match match; | |
| 1162 fz_css_style style; | |
| 1163 int display; | |
| 1164 | |
| 1165 for (node = fz_xml_down(root_node); node; node = fz_xml_next(node)) | |
| 1166 { | |
| 1167 tag = fz_xml_tag(node); | |
| 1168 if (tag) | |
| 1169 { | |
| 1170 fz_match_css(ctx, &match, root_match, g->css, node); | |
| 1171 fz_apply_css_style(ctx, g->set, &style, &match); | |
| 1172 display = fz_get_css_match_display(&match); | |
| 1173 if (tag[0]=='b' && tag[1]=='r' && tag[2]==0) | |
| 1174 { | |
| 1175 gen2_break(ctx, g, root_box, node); | |
| 1176 } | |
| 1177 else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0) | |
| 1178 { | |
| 1179 gen2_image_html(ctx, g, root_box, node, display, &style); | |
| 1180 } | |
| 1181 else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0) | |
| 1182 { | |
| 1183 gen2_image_fb2(ctx, g, root_box, node, display, &style); | |
| 1184 } | |
| 1185 else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0) | |
| 1186 { | |
| 1187 gen2_image_svg(ctx, g, root_box, node, display, &style); | |
| 1188 } | |
| 1189 else | |
| 1190 { | |
| 1191 gen2_tag(ctx, g, root_box, node, &match, display, &style); | |
| 1192 } | |
| 1193 } | |
| 1194 else | |
| 1195 { | |
| 1196 gen2_text(ctx, g, root_box, node); | |
| 1197 } | |
| 1198 } | |
| 1199 } | |
| 1200 | |
| 1201 static char *concat_text(fz_context *ctx, fz_xml *root) | |
| 1202 { | |
| 1203 fz_xml *node; | |
| 1204 size_t i = 0, n = 1; | |
| 1205 char *s; | |
| 1206 for (node = fz_xml_down(root); node; node = fz_xml_next(node)) | |
| 1207 { | |
| 1208 const char *text = fz_xml_text(node); | |
| 1209 n += text ? strlen(text) : 0; | |
| 1210 } | |
| 1211 s = Memento_label(fz_malloc(ctx, n), "concat_html"); | |
| 1212 for (node = fz_xml_down(root); node; node = fz_xml_next(node)) | |
| 1213 { | |
| 1214 const char *text = fz_xml_text(node); | |
| 1215 if (text) | |
| 1216 { | |
| 1217 n = strlen(text); | |
| 1218 memcpy(s+i, text, n); | |
| 1219 i += n; | |
| 1220 } | |
| 1221 } | |
| 1222 s[i] = 0; | |
| 1223 return s; | |
| 1224 } | |
| 1225 | |
| 1226 static void | |
| 1227 html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href) | |
| 1228 { | |
| 1229 char path[2048]; | |
| 1230 char css_base_uri[2048]; | |
| 1231 fz_buffer *buf; | |
| 1232 | |
| 1233 fz_var(buf); | |
| 1234 | |
| 1235 fz_strlcpy(path, base_uri, sizeof path); | |
| 1236 fz_strlcat(path, "/", sizeof path); | |
| 1237 fz_strlcat(path, href, sizeof path); | |
| 1238 fz_urldecode(path); | |
| 1239 fz_cleanname(path); | |
| 1240 | |
| 1241 fz_dirname(css_base_uri, path, sizeof css_base_uri); | |
| 1242 | |
| 1243 buf = NULL; | |
| 1244 fz_try(ctx) | |
| 1245 { | |
| 1246 buf = fz_read_archive_entry(ctx, zip, path); | |
| 1247 fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path); | |
| 1248 fz_add_css_font_faces(ctx, set, zip, css_base_uri, css); | |
| 1249 } | |
| 1250 fz_always(ctx) | |
| 1251 fz_drop_buffer(ctx, buf); | |
| 1252 fz_catch(ctx) | |
| 1253 { | |
| 1254 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 1255 fz_report_error(ctx); | |
| 1256 fz_warn(ctx, "ignoring stylesheet %s", path); | |
| 1257 } | |
| 1258 } | |
| 1259 | |
| 1260 static void | |
| 1261 html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) | |
| 1262 { | |
| 1263 fz_xml *html, *head, *node; | |
| 1264 | |
| 1265 html = fz_xml_find(root, "html"); | |
| 1266 head = fz_xml_find_down(html, "head"); | |
| 1267 for (node = fz_xml_down(head); node; node = fz_xml_next(node)) | |
| 1268 { | |
| 1269 if (fz_xml_is_tag(node, "link")) | |
| 1270 { | |
| 1271 char *rel = fz_xml_att(node, "rel"); | |
| 1272 if (rel && !fz_strcasecmp(rel, "stylesheet")) | |
| 1273 { | |
| 1274 char *type = fz_xml_att(node, "type"); | |
| 1275 if ((type && !strcmp(type, "text/css")) || !type) | |
| 1276 { | |
| 1277 char *href = fz_xml_att(node, "href"); | |
| 1278 if (href) | |
| 1279 { | |
| 1280 html_load_css_link(ctx, set, zip, base_uri, css, root, href); | |
| 1281 } | |
| 1282 } | |
| 1283 } | |
| 1284 } | |
| 1285 else if (fz_xml_is_tag(node, "style")) | |
| 1286 { | |
| 1287 char *s = concat_text(ctx, node); | |
| 1288 fz_try(ctx) | |
| 1289 { | |
| 1290 fz_parse_css(ctx, css, s, "<style>"); | |
| 1291 fz_add_css_font_faces(ctx, set, zip, base_uri, css); | |
| 1292 } | |
| 1293 fz_always(ctx) | |
| 1294 fz_free(ctx, s); | |
| 1295 fz_catch(ctx) | |
| 1296 { | |
| 1297 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 1298 fz_report_error(ctx); | |
| 1299 fz_warn(ctx, "ignoring inline stylesheet"); | |
| 1300 } | |
| 1301 } | |
| 1302 } | |
| 1303 } | |
| 1304 | |
| 1305 static void | |
| 1306 fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) | |
| 1307 { | |
| 1308 fz_xml *fictionbook, *stylesheet; | |
| 1309 | |
| 1310 fictionbook = fz_xml_find(root, "FictionBook"); | |
| 1311 stylesheet = fz_xml_find_down(fictionbook, "stylesheet"); | |
| 1312 if (stylesheet) | |
| 1313 { | |
| 1314 char *s = concat_text(ctx, stylesheet); | |
| 1315 fz_try(ctx) | |
| 1316 { | |
| 1317 fz_parse_css(ctx, css, s, "<stylesheet>"); | |
| 1318 fz_add_css_font_faces(ctx, set, zip, base_uri, css); | |
| 1319 } | |
| 1320 fz_catch(ctx) | |
| 1321 { | |
| 1322 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 1323 fz_report_error(ctx); | |
| 1324 fz_warn(ctx, "ignoring inline stylesheet"); | |
| 1325 } | |
| 1326 fz_free(ctx, s); | |
| 1327 } | |
| 1328 } | |
| 1329 | |
| 1330 static fz_tree * | |
| 1331 load_fb2_images(fz_context *ctx, fz_xml *root) | |
| 1332 { | |
| 1333 fz_xml *fictionbook, *binary; | |
| 1334 fz_tree *images = NULL; | |
| 1335 | |
| 1336 fictionbook = fz_xml_find(root, "FictionBook"); | |
| 1337 for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary")) | |
| 1338 { | |
| 1339 const char *id = fz_xml_att(binary, "id"); | |
| 1340 char *b64 = NULL; | |
| 1341 fz_buffer *buf = NULL; | |
| 1342 fz_image *img = NULL; | |
| 1343 | |
| 1344 fz_var(b64); | |
| 1345 fz_var(buf); | |
| 1346 | |
| 1347 if (id == NULL) | |
| 1348 { | |
| 1349 fz_warn(ctx, "Skipping image with no id"); | |
| 1350 continue; | |
| 1351 } | |
| 1352 | |
| 1353 fz_try(ctx) | |
| 1354 { | |
| 1355 b64 = concat_text(ctx, binary); | |
| 1356 buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64)); | |
| 1357 img = fz_new_image_from_buffer(ctx, buf); | |
| 1358 } | |
| 1359 fz_always(ctx) | |
| 1360 { | |
| 1361 fz_drop_buffer(ctx, buf); | |
| 1362 fz_free(ctx, b64); | |
| 1363 } | |
| 1364 fz_catch(ctx) | |
| 1365 fz_rethrow(ctx); | |
| 1366 | |
| 1367 images = fz_tree_insert(ctx, images, id, img); | |
| 1368 } | |
| 1369 | |
| 1370 return images; | |
| 1371 } | |
| 1372 | |
/* Growable array of 32-bit code points, reused across flows. */
typedef struct
{
	uint32_t *data;	/* storage; NULL until first grown */
	size_t cap;	/* number of elements allocated in data */
	size_t len;	/* number of elements currently in use */
} uni_buf;
| 1379 | |
| 1380 typedef struct | |
| 1381 { | |
| 1382 fz_context *ctx; | |
| 1383 fz_pool *pool; | |
| 1384 fz_html_flow *flow; | |
| 1385 uni_buf *buffer; | |
| 1386 } bidi_data; | |
| 1387 | |
| 1388 static void fragment_cb(const uint32_t *fragment, | |
| 1389 size_t fragment_len, | |
| 1390 int bidi_level, | |
| 1391 int script, | |
| 1392 void *arg) | |
| 1393 { | |
| 1394 bidi_data *data = (bidi_data *)arg; | |
| 1395 | |
| 1396 /* We are guaranteed that fragmentOffset will be at the beginning | |
| 1397 * of flow. */ | |
| 1398 while (fragment_len > 0) | |
| 1399 { | |
| 1400 size_t len; | |
| 1401 | |
| 1402 if (data->flow->type == FLOW_SPACE) | |
| 1403 { | |
| 1404 len = 1; | |
| 1405 } | |
| 1406 else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK || | |
| 1407 data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR) | |
| 1408 { | |
| 1409 len = 0; | |
| 1410 } | |
| 1411 else | |
| 1412 { | |
| 1413 /* Must be text */ | |
| 1414 len = fz_utflen(data->flow->content.text); | |
| 1415 if (len > fragment_len) | |
| 1416 { | |
| 1417 /* We need to split this flow box */ | |
| 1418 (void)fz_html_split_flow(data->ctx, data->pool, data->flow, fragment_len); | |
| 1419 len = fz_utflen(data->flow->content.text); | |
| 1420 } | |
| 1421 } | |
| 1422 | |
| 1423 /* This flow box is entirely contained within this fragment. */ | |
| 1424 data->flow->bidi_level = bidi_level; | |
| 1425 data->flow->script = script; | |
| 1426 data->flow = data->flow->next; | |
| 1427 fragment_len -= len; | |
| 1428 } | |
| 1429 } | |
| 1430 | |
| 1431 static fz_bidi_direction | |
| 1432 detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow) | |
| 1433 { | |
| 1434 fz_html_flow *end = flow; | |
| 1435 bidi_data data; | |
| 1436 | |
| 1437 while (end) | |
| 1438 { | |
| 1439 unsigned int level = end->bidi_level; | |
| 1440 | |
| 1441 /* Gather the text from the flow up into a single buffer (at | |
| 1442 * least, as much of it as has the same direction markup). */ | |
| 1443 buffer->len = 0; | |
| 1444 while (end && (level & 1) == (end->bidi_level & 1)) | |
| 1445 { | |
| 1446 size_t len = 0; | |
| 1447 const char *text = ""; | |
| 1448 int broken = 0; | |
| 1449 | |
| 1450 switch (end->type) | |
| 1451 { | |
| 1452 case FLOW_WORD: | |
| 1453 len = fz_utflen(end->content.text); | |
| 1454 text = end->content.text; | |
| 1455 break; | |
| 1456 case FLOW_SPACE: | |
| 1457 len = 1; | |
| 1458 text = " "; | |
| 1459 break; | |
| 1460 case FLOW_SHYPHEN: | |
| 1461 case FLOW_SBREAK: | |
| 1462 break; | |
| 1463 case FLOW_BREAK: | |
| 1464 case FLOW_IMAGE: | |
| 1465 broken = 1; | |
| 1466 break; | |
| 1467 } | |
| 1468 | |
| 1469 end = end->next; | |
| 1470 | |
| 1471 if (broken) | |
| 1472 break; | |
| 1473 | |
| 1474 /* Make sure the buffer is large enough */ | |
| 1475 if (buffer->len + len > buffer->cap) | |
| 1476 { | |
| 1477 size_t newcap = buffer->cap; | |
| 1478 if (newcap < 128) | |
| 1479 newcap = 128; /* Sensible small default */ | |
| 1480 | |
| 1481 while (newcap < buffer->len + len) | |
| 1482 newcap = (newcap * 3) / 2; | |
| 1483 | |
| 1484 buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t); | |
| 1485 buffer->cap = newcap; | |
| 1486 } | |
| 1487 | |
| 1488 /* Expand the utf8 text into Unicode and store it in the buffer */ | |
| 1489 while (*text) | |
| 1490 { | |
| 1491 int rune; | |
| 1492 text += fz_chartorune(&rune, text); | |
| 1493 buffer->data[buffer->len++] = rune; | |
| 1494 } | |
| 1495 } | |
| 1496 | |
| 1497 /* Detect directionality for the buffer */ | |
| 1498 data.ctx = ctx; | |
| 1499 data.pool = pool; | |
| 1500 data.flow = flow; | |
| 1501 data.buffer = buffer; | |
| 1502 fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */); | |
| 1503 flow = end; | |
| 1504 } | |
| 1505 return bidi_dir; | |
| 1506 } | |
| 1507 | |
| 1508 static void | |
| 1509 detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box) | |
| 1510 { | |
| 1511 while (box) | |
| 1512 { | |
| 1513 if (box->type == BOX_FLOW) | |
| 1514 box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->u.flow.head); | |
| 1515 detect_box_directionality(ctx, pool, buffer, box->down); | |
| 1516 box = box->next; | |
| 1517 } | |
| 1518 } | |
| 1519 | |
| 1520 static void | |
| 1521 detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box) | |
| 1522 { | |
| 1523 uni_buf buffer = { NULL }; | |
| 1524 | |
| 1525 fz_try(ctx) | |
| 1526 detect_box_directionality(ctx, pool, &buffer, box); | |
| 1527 fz_always(ctx) | |
| 1528 fz_free(ctx, buffer.data); | |
| 1529 fz_catch(ctx) | |
| 1530 fz_rethrow(ctx); | |
| 1531 } | |
| 1532 | |
| 1533 static fz_xml_doc * | |
| 1534 parse_to_xml(fz_context *ctx, fz_buffer *buf, int try_xml, int try_html5) | |
| 1535 { | |
| 1536 fz_xml_doc *xml; | |
| 1537 | |
| 1538 if (try_xml && try_html5) | |
| 1539 { | |
| 1540 fz_try(ctx) | |
| 1541 xml = fz_parse_xml(ctx, buf, 1); | |
| 1542 fz_catch(ctx) | |
| 1543 { | |
| 1544 if (fz_caught(ctx) == FZ_ERROR_SYNTAX) | |
| 1545 { | |
| 1546 fz_report_error(ctx); | |
| 1547 fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser"); | |
| 1548 xml = fz_parse_xml_from_html5(ctx, buf); | |
| 1549 } | |
| 1550 else | |
| 1551 fz_rethrow(ctx); | |
| 1552 } | |
| 1553 } | |
| 1554 else if (try_xml) | |
| 1555 xml = fz_parse_xml(ctx, buf, 1); | |
| 1556 else | |
| 1557 { | |
| 1558 assert(try_html5); | |
| 1559 xml = fz_parse_xml_from_html5(ctx, buf); | |
| 1560 } | |
| 1561 | |
| 1562 return xml; | |
| 1563 } | |
| 1564 | |
| 1565 static void move_background_color_style_up(fz_context *ctx, struct genstate *g, fz_html_box *root, fz_html_box *from) | |
| 1566 { | |
| 1567 fz_css_color transparent = { 0, 0, 0, 0 }; | |
| 1568 fz_css_style s1, s2; | |
| 1569 memcpy(&s1, root->style, sizeof s1); | |
| 1570 memcpy(&s2, from->style, sizeof s2); | |
| 1571 s1.background_color = s2.background_color; | |
| 1572 s2.background_color = transparent; | |
| 1573 root->style = fz_css_enlist(ctx, &s1, &g->styles, g->pool); | |
| 1574 from->style = fz_css_enlist(ctx, &s2, &g->styles, g->pool); | |
| 1575 } | |
| 1576 | |
| 1577 static void move_background_color_up(fz_context *ctx, struct genstate *g, fz_html_box *root) | |
| 1578 { | |
| 1579 fz_html_box *html, *body; | |
| 1580 | |
| 1581 if (root->style->background_color.a != 0) | |
| 1582 { | |
| 1583 return; | |
| 1584 } | |
| 1585 | |
| 1586 html = root->down; | |
| 1587 if (html && !strcmp(html->tag, "html")) | |
| 1588 { | |
| 1589 if (html->style->background_color.a != 0) | |
| 1590 { | |
| 1591 move_background_color_style_up(ctx, g, root, html); | |
| 1592 return; | |
| 1593 } | |
| 1594 | |
| 1595 body = html->down; | |
| 1596 if (body && !strcmp(body->tag, "body")) | |
| 1597 { | |
| 1598 if (body->style->background_color.a != 0) | |
| 1599 { | |
| 1600 move_background_color_style_up(ctx, g, root, body); | |
| 1601 return; | |
| 1602 } | |
| 1603 } | |
| 1604 } | |
| 1605 } | |
| 1606 | |
| 1607 static void | |
| 1608 xml_to_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, const char *user_css, | |
| 1609 fz_xml_doc *xml, fz_html_tree *tree, char **rtitle, int try_fictionbook, int is_mobi) | |
| 1610 { | |
| 1611 fz_xml *root, *node; | |
| 1612 char *title; | |
| 1613 | |
| 1614 fz_css_match root_match, match; | |
| 1615 struct genstate g = {0}; | |
| 1616 | |
| 1617 g.pool = NULL; | |
| 1618 g.set = set; | |
| 1619 g.zip = zip; | |
| 1620 g.images = NULL; | |
| 1621 g.xml = xml; | |
| 1622 g.is_fb2 = 0; | |
| 1623 g.base_uri = base_uri; | |
| 1624 g.css = NULL; | |
| 1625 g.at_bol = 0; | |
| 1626 g.emit_white = 0; | |
| 1627 g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP; | |
| 1628 g.list_counter = 0; | |
| 1629 g.section_depth = 0; | |
| 1630 g.markup_dir = FZ_BIDI_LTR; | |
| 1631 g.markup_lang = FZ_LANG_UNSET; | |
| 1632 g.href = NULL; | |
| 1633 g.styles = NULL; | |
| 1634 | |
| 1635 if (rtitle) | |
| 1636 *rtitle = NULL; | |
| 1637 | |
| 1638 root = fz_xml_root(g.xml); | |
| 1639 g.css = fz_new_css(ctx); | |
| 1640 | |
| 1641 #ifndef NDEBUG | |
| 1642 if (fz_atoi(getenv("FZ_DEBUG_XML"))) | |
| 1643 fz_debug_xml(root, 0); | |
| 1644 #endif | |
| 1645 | |
| 1646 fz_try(ctx) | |
| 1647 { | |
| 1648 if (try_fictionbook && fz_xml_find(root, "FictionBook")) | |
| 1649 { | |
| 1650 g.is_fb2 = 1; | |
| 1651 fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>"); | |
| 1652 if (fz_use_document_css(ctx)) | |
| 1653 fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); | |
| 1654 g.images = load_fb2_images(ctx, root); | |
| 1655 } | |
| 1656 else if (is_mobi) | |
| 1657 { | |
| 1658 g.is_fb2 = 0; | |
| 1659 fz_parse_css(ctx, g.css, html_default_css, "<default:html>"); | |
| 1660 fz_parse_css(ctx, g.css, mobi_default_css, "<default:mobi>"); | |
| 1661 if (fz_use_document_css(ctx)) | |
| 1662 html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); | |
| 1663 } | |
| 1664 else | |
| 1665 { | |
| 1666 g.is_fb2 = 0; | |
| 1667 fz_parse_css(ctx, g.css, html_default_css, "<default:html>"); | |
| 1668 if (fz_use_document_css(ctx)) | |
| 1669 html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); | |
| 1670 } | |
| 1671 | |
| 1672 if (user_css) | |
| 1673 { | |
| 1674 fz_parse_css(ctx, g.css, user_css, "<user>"); | |
| 1675 fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css); | |
| 1676 } | |
| 1677 } | |
| 1678 fz_catch(ctx) | |
| 1679 { | |
| 1680 fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); | |
| 1681 fz_drop_css(ctx, g.css); | |
| 1682 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); | |
| 1683 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 1684 fz_report_error(ctx); | |
| 1685 fz_warn(ctx, "ignoring styles"); | |
| 1686 g.css = fz_new_css(ctx); | |
| 1687 g.images = NULL; | |
| 1688 } | |
| 1689 | |
| 1690 #ifndef NDEBUG | |
| 1691 if (fz_atoi(getenv("FZ_DEBUG_CSS"))) | |
| 1692 fz_debug_css(ctx, g.css); | |
| 1693 #endif | |
| 1694 | |
| 1695 fz_try(ctx) | |
| 1696 { | |
| 1697 fz_css_style style; | |
| 1698 int display; | |
| 1699 | |
| 1700 fz_match_css_at_page(ctx, &root_match, g.css); | |
| 1701 fz_apply_css_style(ctx, g.set, &style, &root_match); | |
| 1702 | |
| 1703 g.pool = tree->pool; | |
| 1704 g.markup_dir = DEFAULT_DIR; | |
| 1705 g.markup_lang = FZ_LANG_UNSET; | |
| 1706 | |
| 1707 // Create root node | |
| 1708 tree->root = new_box(ctx, &g, NULL, BOX_BLOCK, &style); | |
| 1709 // TODO: transfer page margins out of this hacky box | |
| 1710 | |
| 1711 tree->root->tag = ":root"; | |
| 1712 tree->root->s.layout.em = 0; | |
| 1713 tree->root->s.layout.x = 0; | |
| 1714 tree->root->s.layout.y = 0; | |
| 1715 tree->root->s.layout.w = 0; | |
| 1716 tree->root->s.layout.b = 0; | |
| 1717 | |
| 1718 // Create document node (html). | |
| 1719 fz_match_css(ctx, &match, &root_match, g.css, root); | |
| 1720 fz_apply_css_style(ctx, g.set, &style, &match); | |
| 1721 display = fz_get_css_match_display(&match); | |
| 1722 gen2_tag(ctx, &g, tree->root, root, &match, display, &style); | |
| 1723 | |
| 1724 detect_directionality(ctx, g.pool, tree->root); | |
| 1725 | |
| 1726 if (g.is_fb2) | |
| 1727 { | |
| 1728 node = fz_xml_find(root, "FictionBook"); | |
| 1729 node = fz_xml_find_down(node, "description"); | |
| 1730 node = fz_xml_find_down(node, "title-info"); | |
| 1731 node = fz_xml_find_down(node, "book-title"); | |
| 1732 if (rtitle) | |
| 1733 { | |
| 1734 title = fz_xml_text(fz_xml_down(node)); | |
| 1735 if (title) | |
| 1736 *rtitle = fz_pool_strdup(ctx, g.pool, title); | |
| 1737 } | |
| 1738 } | |
| 1739 else | |
| 1740 { | |
| 1741 node = fz_xml_find(root, "html"); | |
| 1742 node = fz_xml_find_down(node, "head"); | |
| 1743 node = fz_xml_find_down(node, "title"); | |
| 1744 if (rtitle) | |
| 1745 { | |
| 1746 title = fz_xml_text(fz_xml_down(node)); | |
| 1747 if (title) | |
| 1748 *rtitle = fz_pool_strdup(ctx, g.pool, title); | |
| 1749 } | |
| 1750 | |
| 1751 // Move html or body background-color to :root. | |
| 1752 move_background_color_up(ctx, &g, tree->root); | |
| 1753 } | |
| 1754 } | |
| 1755 fz_always(ctx) | |
| 1756 { | |
| 1757 fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); | |
| 1758 fz_drop_css(ctx, g.css); | |
| 1759 } | |
| 1760 fz_catch(ctx) | |
| 1761 { | |
| 1762 if (rtitle) | |
| 1763 { | |
| 1764 fz_free(ctx, *rtitle); | |
| 1765 *rtitle = NULL; | |
| 1766 } | |
| 1767 fz_rethrow(ctx); | |
| 1768 } | |
| 1769 } | |
| 1770 | |
/* CSS font-size values, in ascending order; seven entries at indices 0..6. */
static const char *mobi_font_size[7] = {
	"0.67em", "0.83em",
	"1em",
	"1.17em", "1.33em", "1.5em", "1.67em",
};
| 1780 | |
| 1781 static void | |
| 1782 patch_mobi_html(fz_context *ctx, fz_pool *pool, fz_xml *node) | |
| 1783 { | |
| 1784 fz_xml *down; | |
| 1785 char buf[500]; | |
| 1786 while (node) | |
| 1787 { | |
| 1788 char *tag = fz_xml_tag(node); | |
| 1789 if (tag) | |
| 1790 { | |
| 1791 // Read MOBI attributes, convert to inline CSS style | |
| 1792 if (!strcmp(tag, "font")) | |
| 1793 { | |
| 1794 const char *size = fz_xml_att(node, "size"); | |
| 1795 if (size) | |
| 1796 { | |
| 1797 if (!strcmp(size, "1")) size = mobi_font_size[0]; | |
| 1798 else if (!strcmp(size, "2")) size = mobi_font_size[1]; | |
| 1799 else if (!strcmp(size, "3")) size = mobi_font_size[2]; | |
| 1800 else if (!strcmp(size, "4")) size = mobi_font_size[3]; | |
| 1801 else if (!strcmp(size, "5")) size = mobi_font_size[4]; | |
| 1802 else if (!strcmp(size, "6")) size = mobi_font_size[5]; | |
| 1803 else if (!strcmp(size, "7")) size = mobi_font_size[6]; | |
| 1804 else if (!strcmp(size, "+1")) size = mobi_font_size[3]; | |
| 1805 else if (!strcmp(size, "+2")) size = mobi_font_size[4]; | |
| 1806 else if (!strcmp(size, "+3")) size = mobi_font_size[5]; | |
| 1807 else if (!strcmp(size, "+4")) size = mobi_font_size[6]; | |
| 1808 else if (!strcmp(size, "+5")) size = mobi_font_size[6]; | |
| 1809 else if (!strcmp(size, "+6")) size = mobi_font_size[6]; | |
| 1810 else if (!strcmp(size, "-1")) size = mobi_font_size[1]; | |
| 1811 else if (!strcmp(size, "-2")) size = mobi_font_size[0]; | |
| 1812 else if (!strcmp(size, "-3")) size = mobi_font_size[0]; | |
| 1813 else if (!strcmp(size, "-4")) size = mobi_font_size[0]; | |
| 1814 else if (!strcmp(size, "-5")) size = mobi_font_size[0]; | |
| 1815 else if (!strcmp(size, "-6")) size = mobi_font_size[0]; | |
| 1816 fz_snprintf(buf, sizeof buf, "font-size:%s", size); | |
| 1817 fz_xml_add_att(ctx, pool, node, "style", buf); | |
| 1818 } | |
| 1819 } | |
| 1820 else | |
| 1821 { | |
| 1822 char *height = fz_xml_att(node, "height"); | |
| 1823 char *width = fz_xml_att(node, "width"); | |
| 1824 char *align = fz_xml_att(node, "align"); | |
| 1825 if (height || width || align) | |
| 1826 { | |
| 1827 buf[0] = 0; | |
| 1828 if (height) | |
| 1829 { | |
| 1830 fz_strlcat(buf, "margin-top:", sizeof buf); | |
| 1831 fz_strlcat(buf, height, sizeof buf); | |
| 1832 fz_strlcat(buf, ";", sizeof buf); | |
| 1833 } | |
| 1834 if (width) | |
| 1835 { | |
| 1836 fz_strlcat(buf, "text-indent:", sizeof buf); | |
| 1837 fz_strlcat(buf, width, sizeof buf); | |
| 1838 fz_strlcat(buf, ";", sizeof buf); | |
| 1839 } | |
| 1840 if (align) | |
| 1841 { | |
| 1842 fz_strlcat(buf, "text-align:", sizeof buf); | |
| 1843 fz_strlcat(buf, align, sizeof buf); | |
| 1844 fz_strlcat(buf, ";", sizeof buf); | |
| 1845 } | |
| 1846 fz_xml_add_att(ctx, pool, node, "style", buf); | |
| 1847 } | |
| 1848 if (!strcmp(tag, "img")) | |
| 1849 { | |
| 1850 char *recindex = fz_xml_att(node, "recindex"); | |
| 1851 if (recindex) | |
| 1852 fz_xml_add_att(ctx, pool, node, "src", recindex); | |
| 1853 } | |
| 1854 } | |
| 1855 } | |
| 1856 | |
| 1857 down = fz_xml_down(node); | |
| 1858 if (down) | |
| 1859 patch_mobi_html(ctx, pool, down); | |
| 1860 | |
| 1861 node = fz_xml_next(node); | |
| 1862 } | |
| 1863 } | |
| 1864 | |
| 1865 static void | |
| 1866 fz_parse_html_tree(fz_context *ctx, | |
| 1867 fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css, | |
| 1868 int try_xml, int try_html5, fz_html_tree *tree, char **rtitle, int try_fictionbook, int patch_mobi) | |
| 1869 { | |
| 1870 fz_xml_doc *xml; | |
| 1871 | |
| 1872 if (rtitle) | |
| 1873 *rtitle = NULL; | |
| 1874 | |
| 1875 xml = parse_to_xml(ctx, buf, try_xml, try_html5); | |
| 1876 | |
| 1877 if (patch_mobi) | |
| 1878 patch_mobi_html(ctx, xml->u.doc.pool, xml); | |
| 1879 | |
| 1880 fz_try(ctx) | |
| 1881 xml_to_boxes(ctx, set, zip, base_uri, user_css, xml, tree, rtitle, try_fictionbook, patch_mobi); | |
| 1882 fz_always(ctx) | |
| 1883 fz_drop_xml(ctx, xml); | |
| 1884 fz_catch(ctx) | |
| 1885 fz_rethrow(ctx); | |
| 1886 } | |
| 1887 | |
| 1888 #define fz_new_derived_html_tree(CTX, TYPE, DROP) \ | |
| 1889 ((TYPE *)Memento_label(fz_new_html_tree_of_size(CTX, sizeof(TYPE), DROP), #TYPE)) | |
| 1890 | |
| 1891 static fz_html_tree * | |
| 1892 fz_new_html_tree_of_size(fz_context *ctx, size_t size, fz_store_drop_fn *drop) | |
| 1893 { | |
| 1894 fz_pool *pool = fz_new_pool(ctx); | |
| 1895 fz_html_tree *tree; | |
| 1896 | |
| 1897 fz_try(ctx) | |
| 1898 { | |
| 1899 tree = fz_pool_alloc(ctx, pool, size); | |
| 1900 FZ_INIT_STORABLE(tree, 1, drop); | |
| 1901 tree->pool = pool; | |
| 1902 } | |
| 1903 fz_catch(ctx) | |
| 1904 { | |
| 1905 fz_drop_pool(ctx, pool); | |
| 1906 fz_rethrow(ctx); | |
| 1907 } | |
| 1908 | |
| 1909 return tree; | |
| 1910 } | |
| 1911 | |
| 1912 fz_html * | |
| 1913 fz_parse_html(fz_context *ctx, | |
| 1914 fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css, | |
| 1915 int try_xml, int try_html5, int patch_mobi) | |
| 1916 { | |
| 1917 fz_html *html = fz_new_derived_html_tree(ctx, fz_html, fz_drop_html_imp); | |
| 1918 | |
| 1919 html->layout_w = 0; | |
| 1920 html->layout_h = 0; | |
| 1921 html->layout_em = 0; | |
| 1922 | |
| 1923 fz_try(ctx) | |
| 1924 fz_parse_html_tree(ctx, set, zip, base_uri, buf, user_css, try_xml, try_html5, &html->tree, &html->title, 1, patch_mobi); | |
| 1925 fz_catch(ctx) | |
| 1926 { | |
| 1927 fz_drop_html(ctx, html); | |
| 1928 fz_rethrow(ctx); | |
| 1929 } | |
| 1930 | |
| 1931 return html; | |
| 1932 } | |
| 1933 | |
| 1934 typedef struct | |
| 1935 { | |
| 1936 int saved; | |
| 1937 fz_warning_cb *old; | |
| 1938 void *arg; | |
| 1939 fz_buffer *buffer; | |
| 1940 fz_context *ctx; | |
| 1941 } warning_save; | |
| 1942 | |
| 1943 static void | |
| 1944 warn_to_buffer(void *user, const char *message) | |
| 1945 { | |
| 1946 warning_save *save = (warning_save *)user; | |
| 1947 fz_context *ctx = save->ctx; | |
| 1948 | |
| 1949 fz_try(ctx) | |
| 1950 { | |
| 1951 fz_append_string(ctx, save->buffer, message); | |
| 1952 fz_append_byte(ctx, save->buffer, '\n'); | |
| 1953 } | |
| 1954 fz_catch(ctx) | |
| 1955 { | |
| 1956 /* Silently swallow the error. */ | |
| 1957 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 1958 fz_report_error(ctx); | |
| 1959 } | |
| 1960 } | |
| 1961 | |
| 1962 static void | |
| 1963 redirect_warnings_to_buffer(fz_context *ctx, fz_buffer *buf, warning_save *save) | |
| 1964 { | |
| 1965 save->saved = 1; | |
| 1966 save->old = fz_warning_callback(ctx, &save->arg); | |
| 1967 save->buffer = buf; | |
| 1968 save->ctx = ctx; | |
| 1969 | |
| 1970 fz_flush_warnings(ctx); | |
| 1971 fz_set_warning_callback(ctx, warn_to_buffer, save); | |
| 1972 } | |
| 1973 | |
| 1974 static void | |
| 1975 restore_warnings(fz_context *ctx, warning_save *save) | |
| 1976 { | |
| 1977 if (!save->saved) | |
| 1978 return; | |
| 1979 | |
| 1980 fz_flush_warnings(ctx); | |
| 1981 fz_set_warning_callback(ctx, save->old, save->arg); | |
| 1982 } | |
| 1983 | |
| 1984 fz_story * | |
| 1985 fz_new_story(fz_context *ctx, fz_buffer *buf, const char *user_css, float em, fz_archive *zip) | |
| 1986 { | |
| 1987 fz_story *story = fz_new_derived_html_tree(ctx, fz_story, fz_drop_story_imp); | |
| 1988 warning_save saved = { 0 }; | |
| 1989 fz_buffer *local_buffer = NULL; | |
| 1990 | |
| 1991 if (buf == NULL) | |
| 1992 { | |
| 1993 local_buffer = fz_new_buffer(ctx, 0); | |
| 1994 buf = local_buffer; | |
| 1995 } | |
| 1996 | |
| 1997 fz_var(local_buffer); | |
| 1998 fz_var(saved); | |
| 1999 | |
| 2000 fz_try(ctx) | |
| 2001 { | |
| 2002 story->zip = fz_keep_archive(ctx, zip); | |
| 2003 story->font_set = fz_new_html_font_set(ctx); | |
| 2004 story->em = em; | |
| 2005 story->user_css = user_css ? fz_strdup(ctx, user_css) : NULL; | |
| 2006 story->warnings = fz_new_buffer(ctx, 128); | |
| 2007 redirect_warnings_to_buffer(ctx, story->warnings, &saved); | |
| 2008 story->dom = parse_to_xml(ctx, buf, 0, 1); | |
| 2009 } | |
| 2010 fz_always(ctx) | |
| 2011 { | |
| 2012 restore_warnings(ctx, &saved); | |
| 2013 fz_drop_buffer(ctx, local_buffer); | |
| 2014 } | |
| 2015 fz_catch(ctx) | |
| 2016 { | |
| 2017 fz_drop_html_tree(ctx, &story->tree); | |
| 2018 fz_rethrow(ctx); | |
| 2019 } | |
| 2020 | |
| 2021 return story; | |
| 2022 } | |
| 2023 | |
| 2024 fz_html * | |
| 2025 fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css) | |
| 2026 { | |
| 2027 /* try as XML first, fall back to HTML5 */ | |
| 2028 return fz_parse_html(ctx, set, zip, base_uri, buf, user_css, 1, 1, 0); | |
| 2029 } | |
| 2030 | |
/* Write level tab characters to stdout; nothing is written if level <= 0. */
static void indent(int level)
{
	int i;

	for (i = 0; i < level; i++)
		putchar('\t');
}
| 2036 | |
| 2037 static void | |
| 2038 fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level) | |
| 2039 { | |
| 2040 fz_html_box *sbox = NULL; | |
| 2041 while (flow) | |
| 2042 { | |
| 2043 if (flow->box != sbox) { | |
| 2044 sbox = flow->box; | |
| 2045 indent(level); | |
| 2046 #ifndef NDEBUG | |
| 2047 printf("@style <%s> em=%g font='%s'", sbox->tag, sbox->s.layout.em, fz_font_name(ctx, sbox->style->font)); | |
| 2048 #else | |
| 2049 printf("@style em=%g font='%s'", sbox->s.layout.em, fz_font_name(ctx, sbox->style->font)); | |
| 2050 #endif | |
| 2051 if (fz_font_is_serif(ctx, sbox->style->font)) | |
| 2052 printf(" serif"); | |
| 2053 else | |
| 2054 printf(" sans"); | |
| 2055 if (fz_font_is_monospaced(ctx, sbox->style->font)) | |
| 2056 printf(" monospaced"); | |
| 2057 if (fz_font_is_bold(ctx, sbox->style->font)) | |
| 2058 printf(" bold"); | |
| 2059 if (fz_font_is_italic(ctx, sbox->style->font)) | |
| 2060 printf(" italic"); | |
| 2061 if (sbox->style->small_caps) | |
| 2062 printf(" small-caps"); | |
| 2063 printf("\n"); | |
| 2064 } | |
| 2065 | |
| 2066 indent(level); | |
| 2067 switch (flow->type) { | |
| 2068 case FLOW_WORD: printf("word "); break; | |
| 2069 case FLOW_SPACE: printf("space"); break; | |
| 2070 case FLOW_SBREAK: printf("sbrk "); break; | |
| 2071 case FLOW_SHYPHEN: printf("shy "); break; | |
| 2072 case FLOW_BREAK: printf("break"); break; | |
| 2073 case FLOW_IMAGE: printf("image"); break; | |
| 2074 case FLOW_ANCHOR: printf("anchor"); break; | |
| 2075 } | |
| 2076 // printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w); | |
| 2077 if (flow->type == FLOW_IMAGE) | |
| 2078 printf(" h=%g", flow->h); | |
| 2079 if (flow->type == FLOW_WORD) | |
| 2080 printf(" text='%s'", flow->content.text); | |
| 2081 printf("\n"); | |
| 2082 if (flow->breaks_line) { | |
| 2083 indent(level); | |
| 2084 printf("*\n"); | |
| 2085 } | |
| 2086 | |
| 2087 flow = flow->next; | |
| 2088 } | |
| 2089 } | |
| 2090 | |
| 2091 fz_structure fz_html_tag_to_structure(const char *tag) | |
| 2092 { | |
| 2093 if (!strcmp(tag, "body")) return FZ_STRUCTURE_DOCUMENT; | |
| 2094 if (!strcmp(tag, "div")) return FZ_STRUCTURE_DIV; | |
| 2095 if (!strcmp(tag, "span")) return FZ_STRUCTURE_SPAN; | |
| 2096 if (!strcmp(tag, "blockquote")) return FZ_STRUCTURE_BLOCKQUOTE; | |
| 2097 if (!strcmp(tag, "p")) return FZ_STRUCTURE_P; | |
| 2098 if (!strcmp(tag, "h1")) return FZ_STRUCTURE_H1; | |
| 2099 if (!strcmp(tag, "h2")) return FZ_STRUCTURE_H2; | |
| 2100 if (!strcmp(tag, "h3")) return FZ_STRUCTURE_H3; | |
| 2101 if (!strcmp(tag, "h4")) return FZ_STRUCTURE_H4; | |
| 2102 if (!strcmp(tag, "h5")) return FZ_STRUCTURE_H5; | |
| 2103 if (!strcmp(tag, "h6")) return FZ_STRUCTURE_H6; | |
| 2104 if (!strcmp(tag, "ol")) return FZ_STRUCTURE_LIST; | |
| 2105 if (!strcmp(tag, "ul")) return FZ_STRUCTURE_LIST; | |
| 2106 if (!strcmp(tag, "dl")) return FZ_STRUCTURE_LIST; | |
| 2107 if (!strcmp(tag, "li")) return FZ_STRUCTURE_LISTITEM; | |
| 2108 if (!strcmp(tag, "table")) return FZ_STRUCTURE_TABLE; | |
| 2109 if (!strcmp(tag, "tr")) return FZ_STRUCTURE_TR; | |
| 2110 if (!strcmp(tag, "th")) return FZ_STRUCTURE_TH; | |
| 2111 if (!strcmp(tag, "td")) return FZ_STRUCTURE_TD; | |
| 2112 if (!strcmp(tag, "thead")) return FZ_STRUCTURE_THEAD; | |
| 2113 if (!strcmp(tag, "tbody")) return FZ_STRUCTURE_TBODY; | |
| 2114 if (!strcmp(tag, "tfoot")) return FZ_STRUCTURE_TFOOT; | |
| 2115 return FZ_STRUCTURE_INVALID; | |
| 2116 } | |
| 2117 | |
| 2118 static void | |
| 2119 fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level) | |
| 2120 { | |
| 2121 while (box) | |
| 2122 { | |
| 2123 indent(level); | |
| 2124 printf("box "); | |
| 2125 switch (box->type) { | |
| 2126 case BOX_BLOCK: printf("block"); break; | |
| 2127 case BOX_FLOW: printf("flow"); break; | |
| 2128 case BOX_INLINE: printf("inline"); break; | |
| 2129 case BOX_TABLE: printf("table"); break; | |
| 2130 case BOX_TABLE_ROW: printf("table-row"); break; | |
| 2131 case BOX_TABLE_CELL: printf("table-cell"); break; | |
| 2132 } | |
| 2133 | |
| 2134 printf(" <%s>", box->tag); | |
| 2135 // printf(" em=%g", box->em); | |
| 2136 // printf(" x=%g y=%g w=%g b=%g", box->x, box->y, box->w, box->b); | |
| 2137 | |
| 2138 if (box->is_first_flow) | |
| 2139 printf(" is-first-flow"); | |
| 2140 if (box->list_item) | |
| 2141 printf(" list=%d", box->list_item); | |
| 2142 if (box->id) | |
| 2143 printf(" id=(%s)", box->id); | |
| 2144 if (box->href) | |
| 2145 printf(" href=(%s)", box->href); | |
| 2146 printf("\n"); | |
| 2147 | |
| 2148 if (box->type == BOX_BLOCK || box->type == BOX_TABLE) { | |
| 2149 indent(level+1); | |
| 2150 printf(">margin=(%g %g %g %g)\n", box->u.block.margin[0], box->u.block.margin[1], box->u.block.margin[2], box->u.block.margin[3]); | |
| 2151 //indent(level+1); | |
| 2152 //printf(">padding=(%g %g %g %g)\n", box->u.block.padding[0], box->u.block.padding[1], box->u.block.padding[2], box->u.block.padding[3]); | |
| 2153 //indent(level+1); | |
| 2154 //printf(">border=(%g %g %g %g)\n", box->u.block.border[0], box->u.block.border[1], box->u.block.border[2], box->u.block.border[3]); | |
| 2155 } | |
| 2156 | |
| 2157 if (box->down) | |
| 2158 fz_debug_html_box(ctx, box->down, level + 1); | |
| 2159 if (box->type == BOX_FLOW) { | |
| 2160 indent(level+1); | |
| 2161 printf("flow\n"); | |
| 2162 fz_debug_html_flow(ctx, box->u.flow.head, level + 2); | |
| 2163 } | |
| 2164 | |
| 2165 box = box->next; | |
| 2166 } | |
| 2167 } | |
| 2168 | |
| 2169 void | |
| 2170 fz_debug_html(fz_context *ctx, fz_html_box *box) | |
| 2171 { | |
| 2172 fz_debug_html_box(ctx, box, 0); | |
| 2173 } | |
| 2174 | |
| 2175 static size_t | |
| 2176 fz_html_size(fz_context *ctx, fz_html *html) | |
| 2177 { | |
| 2178 return html ? fz_pool_size(ctx, html->tree.pool) : 0; | |
| 2179 } | |
| 2180 | |
/* Store key for parsed html: identifies an entry by the owning document
 * pointer and a chapter number. */
typedef struct
{
	int refs;        /* reference count of the key itself */
	void *doc;       /* document the html belongs to */
	int chapter_num; /* chapter within that document */
} fz_html_key;
| 2187 | |
| 2188 static int | |
| 2189 fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_) | |
| 2190 { | |
| 2191 fz_html_key *key = (fz_html_key *)key_; | |
| 2192 hash->u.pi.ptr = key->doc; | |
| 2193 hash->u.pi.i = key->chapter_num; | |
| 2194 return 1; | |
| 2195 } | |
| 2196 | |
| 2197 static void * | |
| 2198 fz_keep_html_key(fz_context *ctx, void *key_) | |
| 2199 { | |
| 2200 fz_html_key *key = (fz_html_key *)key_; | |
| 2201 return fz_keep_imp(ctx, key, &key->refs); | |
| 2202 } | |
| 2203 | |
| 2204 static void | |
| 2205 fz_drop_html_key(fz_context *ctx, void *key_) | |
| 2206 { | |
| 2207 fz_html_key *key = (fz_html_key *)key_; | |
| 2208 if (fz_drop_imp(ctx, key, &key->refs)) | |
| 2209 { | |
| 2210 fz_free(ctx, key); | |
| 2211 } | |
| 2212 } | |
| 2213 | |
| 2214 static int | |
| 2215 fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_) | |
| 2216 { | |
| 2217 fz_html_key *k0 = (fz_html_key *)k0_; | |
| 2218 fz_html_key *k1 = (fz_html_key *)k1_; | |
| 2219 return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num; | |
| 2220 } | |
| 2221 | |
| 2222 static void | |
| 2223 fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_) | |
| 2224 { | |
| 2225 fz_html_key *key = (fz_html_key *)key_; | |
| 2226 fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num); | |
| 2227 } | |
| 2228 | |
| 2229 static const fz_store_type fz_html_store_type = | |
| 2230 { | |
| 2231 "fz_html", | |
| 2232 fz_make_hash_html_key, | |
| 2233 fz_keep_html_key, | |
| 2234 fz_drop_html_key, | |
| 2235 fz_cmp_html_key, | |
| 2236 fz_format_html_key, | |
| 2237 NULL | |
| 2238 }; | |
| 2239 | |
| 2240 fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter) | |
| 2241 { | |
| 2242 fz_html_key *key = NULL; | |
| 2243 fz_html *other_html; | |
| 2244 | |
| 2245 /* Stick the parsed html in the store */ | |
| 2246 fz_var(key); | |
| 2247 | |
| 2248 fz_try(ctx) | |
| 2249 { | |
| 2250 key = fz_malloc_struct(ctx, fz_html_key); | |
| 2251 key->refs = 1; | |
| 2252 key->doc = doc; | |
| 2253 key->chapter_num = chapter; | |
| 2254 other_html = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type); | |
| 2255 if (other_html) | |
| 2256 { | |
| 2257 fz_drop_html(ctx, html); | |
| 2258 html = other_html; | |
| 2259 } | |
| 2260 } | |
| 2261 fz_always(ctx) | |
| 2262 fz_drop_html_key(ctx, key); | |
| 2263 fz_catch(ctx) | |
| 2264 { | |
| 2265 /* Do nothing */ | |
| 2266 } | |
| 2267 | |
| 2268 return html; | |
| 2269 } | |
| 2270 | |
| 2271 fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter) | |
| 2272 { | |
| 2273 fz_html_key key; | |
| 2274 | |
| 2275 key.refs = 1; | |
| 2276 key.doc = doc; | |
| 2277 key.chapter_num = chapter; | |
| 2278 return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type); | |
| 2279 } | |
| 2280 | |
| 2281 static int | |
| 2282 html_filter_store(fz_context *ctx, void *doc, void *key_) | |
| 2283 { | |
| 2284 fz_html_key *key = (fz_html_key *)key_; | |
| 2285 | |
| 2286 return (doc == key->doc); | |
| 2287 } | |
| 2288 | |
| 2289 void fz_purge_stored_html(fz_context *ctx, void *doc) | |
| 2290 { | |
| 2291 fz_filter_store(ctx, html_filter_store, doc, &fz_html_store_type); | |
| 2292 } | |
| 2293 | |
| 2294 static void | |
| 2295 convert_to_boxes(fz_context *ctx, fz_story *story) | |
| 2296 { | |
| 2297 warning_save saved = { 0 }; | |
| 2298 | |
| 2299 if (story->dom == NULL) | |
| 2300 return; | |
| 2301 | |
| 2302 fz_var(saved); | |
| 2303 | |
| 2304 fz_try(ctx) | |
| 2305 { | |
| 2306 redirect_warnings_to_buffer(ctx, story->warnings, &saved); | |
| 2307 xml_to_boxes(ctx, story->font_set, story->zip, ".", story->user_css, story->dom, &story->tree, NULL, 0, 0); | |
| 2308 } | |
| 2309 fz_always(ctx) | |
| 2310 { | |
| 2311 fz_drop_xml(ctx, story->dom); | |
| 2312 story->dom = NULL; | |
| 2313 restore_warnings(ctx, &saved); | |
| 2314 } | |
| 2315 fz_catch(ctx) | |
| 2316 fz_rethrow(ctx); | |
| 2317 } | |
| 2318 | |
| 2319 int fz_place_story(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled) | |
| 2320 { | |
| 2321 return fz_place_story_flags(ctx, story, where, filled, 0); | |
| 2322 } | |
| 2323 | |
| 2324 int fz_place_story_flags(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled, int flags) | |
| 2325 { | |
| 2326 float w, h; | |
| 2327 | |
| 2328 if (filled) | |
| 2329 *filled = fz_empty_rect; | |
| 2330 | |
| 2331 if (story == NULL || story->complete) | |
| 2332 return 0; | |
| 2333 | |
| 2334 /* Convert from XML to box model on the first attempt to place. | |
| 2335 * The DOM is unusable from here on in. */ | |
| 2336 convert_to_boxes(ctx, story); | |
| 2337 | |
| 2338 w = where.x1 - where.x0; | |
| 2339 h = where.y1 - where.y0; | |
| 2340 /* Confusingly, we call the layout using restart_draw, not restart_place, | |
| 2341 * because we don't want to destroy the current values in restart_place | |
| 2342 * in case we have to retry later. This means the values are left in | |
| 2343 * the correct struct though! */ | |
| 2344 story->restart_draw.start = story->restart_place.start; | |
| 2345 story->restart_draw.start_flow = story->restart_place.start_flow; | |
| 2346 story->restart_draw.end = NULL; | |
| 2347 story->restart_draw.end_flow = NULL; | |
| 2348 story->restart_draw.reason = FZ_HTML_RESTART_REASON_NONE; | |
| 2349 story->restart_draw.flags = flags; | |
| 2350 story->bbox = where; | |
| 2351 fz_restartable_layout_html(ctx, &story->tree, where.x0, where.y0, w, h, story->em, &story->restart_draw); | |
| 2352 story->restart_draw.start = story->restart_place.start; | |
| 2353 story->restart_draw.start_flow = story->restart_place.start_flow; | |
| 2354 | |
| 2355 if (filled) | |
| 2356 { | |
| 2357 fz_html_box *b = story->tree.root; | |
| 2358 filled->x0 = b->s.layout.x - b->u.block.margin[L] - b->u.block.border[L] - b->u.block.padding[L]; | |
| 2359 filled->x1 = b->s.layout.w + b->u.block.margin[R] + b->u.block.border[R] + b->u.block.padding[R] + b->s.layout.x; | |
| 2360 filled->y0 = b->s.layout.y - b->u.block.margin[T] - b->u.block.border[T] - b->u.block.padding[T]; | |
| 2361 filled->y1 = b->s.layout.b + b->u.block.margin[B] + b->u.block.border[B] + b->u.block.padding[B]; | |
| 2362 } | |
| 2363 | |
| 2364 #ifndef NDEBUG | |
| 2365 if (fz_atoi(getenv("FZ_DEBUG_HTML"))) | |
| 2366 fz_debug_html(ctx, story->tree.root); | |
| 2367 #endif | |
| 2368 | |
| 2369 if (story->restart_draw.end == NULL) | |
| 2370 return FZ_HTML_RESTART_REASON_NONE; | |
| 2371 if (story->restart_draw.reason == FZ_HTML_RESTART_REASON_LINE_WIDTH) | |
| 2372 return FZ_HTML_RESTART_REASON_LINE_WIDTH; | |
| 2373 return FZ_HTML_RESTART_REASON_LINE_HEIGHT; | |
| 2374 } | |
| 2375 | |
| 2376 const char * | |
| 2377 fz_story_warnings(fz_context *ctx, fz_story *story) | |
| 2378 { | |
| 2379 unsigned char *data; | |
| 2380 | |
| 2381 if (!story) | |
| 2382 return NULL; | |
| 2383 | |
| 2384 convert_to_boxes(ctx, story); | |
| 2385 | |
| 2386 fz_terminate_buffer(ctx, story->warnings); | |
| 2387 | |
| 2388 if (fz_buffer_storage(ctx, story->warnings, &data) == 0) | |
| 2389 return NULL; | |
| 2390 | |
| 2391 return (const char *)data; | |
| 2392 } |
