comparison mupdf-source/source/html/html-parse.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "mupdf/ucdn.h"
25 #include "html-imp.h"
26
27 #include <string.h>
28 #include <stdio.h>
29 #include <assert.h>
30
/* Four small index constants (0..3). Presumably the top/right/bottom/left
 * sides in CSS shorthand order -- TODO confirm against the code that uses them. */
enum { T, R, B, L };

/* Default bidirectional text direction: left-to-right. */
#define DEFAULT_DIR FZ_BIDI_LTR
34
/*
	Built-in stylesheet giving HTML elements their default presentation,
	written as CSS source text. The adjacent string literals are
	concatenated by the compiler into one string, one rule per literal.
*/
static const char *html_default_css =
"@page{margin:3em 2em}"
"a{color:#06C;text-decoration:underline}"
"address{display:block;font-style:italic}"
"b{font-weight:bold}"
"bdo{direction:rtl;unicode-bidi:bidi-override}"
"blockquote{display:block;margin:1em 40px}"
"body{display:block;margin:1em}"
"cite{font-style:italic}"
"code{font-family:monospace}"
"dd{display:block;margin:0 0 0 40px}"
"del{text-decoration:line-through}"
"div{display:block}"
"dl{display:block;margin:1em 0}"
"dt{display:block}"
"em{font-style:italic}"
"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
"head{display:none}"
"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
"html{display:block}"
"i{font-style:italic}"
"ins{text-decoration:underline}"
"kbd{font-family:monospace}"
"li{display:list-item}"
"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
"p{display:block;margin:1em 0}"
"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
"samp{font-family:monospace}"
"script{display:none}"
"small{font-size:0.83em}"
"strong{font-weight:bold}"
"style{display:none}"
"sub{font-size:0.83em;vertical-align:sub}"
"sup{font-size:0.83em;vertical-align:super}"
"table{display:table;border-spacing:2px}"
"tbody{display:table-row-group}"
"td{display:table-cell;padding:1px;background-color:inherit}"
"tfoot{display:table-footer-group}"
"th{display:table-cell;font-weight:bold;padding:1px;text-align:center;background-color:inherit}"
"thead{display:table-header-group}"
"tr{display:table-row}"
"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ul ul{list-style-type:circle}"
"ul ul ul{list-style-type:square}"
"var{font-style:italic}"
"colgroup{display:table-column-group}"
"col{display:table-column}"
"caption{display:block;text-align:center}"
;
90
/*
	Built-in CSS rules for MOBI content: a 'pagebreak' element, zeroed
	list/paragraph margins, and the legacy 'center', 'big' and 'strike'
	elements. Presumably applied together with the HTML defaults --
	the code that loads it is not in this section.
*/
static const char *mobi_default_css =
"pagebreak{display:block;page-break-before:always}"
"dl,ol,ul{margin:0}"
"p{margin:0}"
"blockquote{margin:0 40px}"
"center{display:block;text-align:center}"
"big{font-size:1.17em}"
"strike{text-decoration:line-through}"
;
100
/*
	Built-in stylesheet for FictionBook (FB2) documents, written as CSS
	source text. It hides the stylesheet/binary/description metadata
	(except the cover page) and styles the FB2 body elements.
*/
static const char *fb2_default_css =
"@page{margin:3em 2em}"
"FictionBook{display:block;margin:1em}"
"stylesheet,binary{display:none}"
"description>*{display:none}"
"description>title-info{display:block}"
"description>title-info>*{display:none}"
"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
"image{display:block}"
"p>image{display:inline}"
"table{display:table}"
"tr{display:table-row}"
"th,td{display:table-cell}"
"a{color:#06C;text-decoration:underline}"
"a[type=note]{font-size:small;vertical-align:super}"
"code{white-space:pre;font-family:monospace}"
"emphasis{font-style:italic}"
"strikethrough{text-decoration:line-through}"
"strong{font-weight:bold}"
"sub{font-size:small;vertical-align:sub}"
"sup{font-size:small;vertical-align:super}"
"image{margin:1em 0;text-align:center}"
"cite,poem{margin:1em 2em}"
"subtitle,epigraph,stanza{margin:1em 0}"
"title>p{text-align:center;font-size:x-large}"
"subtitle{text-align:center;font-size:large}"
"p{margin-top:1em;text-align:justify}"
"empty-line{padding-top:1em}"
"p+p{margin-top:0;text-indent:1.5em}"
"empty-line+p{margin-top:0}"
"section>title{page-break-before:always}"
;
134
/*
	Table of HTML tag names, used by find_known_html_tag() to hand out a
	shared static string for a tag.

	The table MUST stay sorted in strcmp() order: the lookup is a binary
	search. FB2 tag names live in the separate known_fb2_tags table.
*/
static const char *known_html_tags[] = {
	"a", "abbr", "acronym", "address", "annotation-xml", "applet", "area",
	"article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo",
	"bgsound", "big", "blink", "blockquote", "body", "br", "button",
	"canvas", "caption", "center", "cite", "code", "col", "colgroup",
	"data", "datalist", "dd", "del", "desc", "details", "dfn", "dir",
	"div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure",
	"font", "footer", "foreignobject", "form", "frame", "frameset", "h1",
	"h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
	"i", "iframe", "image", "img", "input", "ins", "isindex", "kbd",
	"keygen", "label", "legend", "li", "link", "listing", "main",
	"malignmark", "map", "mark", "marquee", "math", "menu", "menuitem",
	"meta", "meter", "mglyph", "mi", "mn", "mo", "ms", "mtext", "multicol",
	"nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object",
	"ol", "optgroup", "option", "output", "p", "param", "plaintext", "pre",
	"progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp",
	"script", "section", "select", "small", "source", "spacer", "span",
	"strike", "strong", "style", "sub", "summary", "sup", "svg", "table",
	"tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time",
	"title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp",
};
158
/*
	Table of FictionBook (FB2) tag names, used by find_known_fb2_tag().

	The table MUST stay sorted in strcmp() (byte value) order because the
	lookup is a binary search; note that the capitalised "FictionBook"
	therefore sorts before all lower-case names.
*/
static const char *known_fb2_tags[] = {
	"FictionBook", "a", "binary", "body", "cite", "code", "coverpage",
	"date", "description", "emphasis", "empty-line", "epigraph", "image",
	"p", "poem", "section", "stanza", "strikethrough", "strong",
	"stylesheet", "sub", "subtitle", "sup", "table", "td", "text-author",
	"th", "title", "title-info", "tr", "v",
};
166
167 static const char *find_known_html_tag(const char *tag)
168 {
169 int l = 0;
170 int r = nelem(known_html_tags) / 2 - 1;
171 while (l <= r)
172 {
173 int m = (l + r) >> 1;
174 int c = strcmp(tag, known_html_tags[m]);
175 if (c < 0)
176 r = m - 1;
177 else if (c > 0)
178 l = m + 1;
179 else
180 return known_html_tags[m];
181 }
182 return NULL;
183 }
184
185 static const char *find_known_fb2_tag(const char *tag)
186 {
187 int l = 0;
188 int r = nelem(known_fb2_tags) / 2 - 1;
189 while (l <= r)
190 {
191 int m = (l + r) >> 1;
192 int c = strcmp(tag, known_fb2_tags[m]);
193 if (c < 0)
194 r = m - 1;
195 else if (c > 0)
196 l = m + 1;
197 else
198 return known_fb2_tags[m];
199 }
200 return NULL;
201 }
202
/*
	Mutable state threaded through box/flow generation.
*/
struct genstate
{
	fz_pool *pool; /* pool from which boxes, flow nodes and copied strings are allocated */
	fz_html_font_set *set; /* font set; not used in this part of the file */
	fz_archive *zip; /* archive handed to the image loaders */
	fz_tree *images; /* images looked up by id for FB2 '#id' references */
	fz_xml_doc *xml; /* XML document handed to the embedded SVG loader */
	int is_fb2; /* non-zero: use FB2 tag names and l:href/xlink:href attributes */
	const char *base_uri; /* prefix prepended to relative image paths */
	fz_css *css; /* not used in this part of the file */
	int at_bol; /* non-zero right after a break or a newly created flow box; pending space is then discarded */
	fz_html_box *emit_white; /* box owning collapsed whitespace not yet emitted, or NULL */
	int last_brk_cls; /* line break class of the previous character (row index into pairbrk) */

	int list_counter; /* not used in this part of the file */
	int section_depth; /* caps at 6 to give the heading level of FB2 title/subtitle */
	fz_bidi_direction markup_dir; /* copied into every new box */
	fz_text_language markup_lang; /* language passed along with generated text */
	char *href; /* most recent link target seen on an 'a' element; copied into new boxes while set */

	fz_css_style_splay *styles; /* store that fz_css_enlist() uses when enlisting box styles */
};
225
/* Return 1 if 'c' is a space, tab, carriage return or newline; 0 otherwise. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
230
/* Return 1 if the NUL-terminated string 's' contains only whitespace
 * (as defined by iswhite), which includes the empty string; 0 otherwise. */
static int is_all_white(const char *s)
{
	const char *p;

	for (p = s; *p != 0; ++p)
	{
		if (!iswhite(*p))
			return 0;
	}
	return 1;
}
241
242 /* TODO: pool allocator for flow nodes */
243 /* TODO: store text by pointing to a giant buffer */
244
245 static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow)
246 {
247 while (flow)
248 {
249 fz_html_flow *next = flow->next;
250 if (flow->type == FLOW_IMAGE)
251 fz_drop_image(ctx, flow->content.image);
252 flow = next;
253 }
254 }
255
256 static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras)
257 {
258 size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras);
259 fz_html_flow *flow;
260
261 /* Shouldn't happen, but bug 705324. */
262 if (top == NULL || top->type != BOX_FLOW)
263 return NULL;
264
265 flow = fz_pool_alloc(ctx, pool, size);
266 flow->type = type;
267 flow->expand = 0;
268 flow->bidi_level = 0;
269 flow->markup_lang = 0;
270 flow->breaks_line = 0;
271 flow->box = inline_box;
272 (*top->s.build.flow_tail) = flow;
273 top->s.build.flow_tail = &flow->next;
274 return flow;
275 }
276
277 static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
278 {
279 fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0);
280 if (flow)
281 flow->expand = 1;
282 }
283
284 static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
285 {
286 (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0);
287 }
288
289 static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
290 {
291 (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0);
292 }
293
294 static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
295 {
296 (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0);
297 }
298
299 static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang)
300 {
301 fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1);
302 if (flow == NULL)
303 return;
304 memcpy(flow->content.text, a, b - a);
305 flow->content.text[b - a] = 0;
306 flow->markup_lang = lang;
307 }
308
309 static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img)
310 {
311 fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0);
312 if (flow)
313 flow->content.image = fz_keep_image(ctx, img);
314 }
315
316 static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
317 {
318 (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0);
319 }
320
321 fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
322 {
323 fz_html_flow *new_flow;
324 char *text;
325 size_t len;
326
327 assert(flow->type == FLOW_WORD);
328
329 if (offset == 0)
330 return flow;
331 text = flow->content.text;
332 while (*text && offset)
333 {
334 int rune;
335 text += fz_chartorune(&rune, text);
336 offset--;
337 }
338 len = strlen(text);
339 new_flow = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + len+1);
340 memcpy(new_flow, flow, offsetof(fz_html_flow, content));
341 new_flow->next = flow->next;
342 flow->next = new_flow;
343 strcpy(new_flow->content.text, text);
344 *text = 0;
345 return new_flow;
346 }
347
348 static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g)
349 {
350 static const char *space = " ";
351 fz_pool *pool = g->pool;
352 if (g->emit_white)
353 {
354 int bsp = g->emit_white->style->white_space & WS_ALLOW_BREAK_SPACE;
355 if (!g->at_bol)
356 {
357 if (bsp)
358 add_flow_space(ctx, pool, flow, g->emit_white);
359 else
360 add_flow_word(ctx, pool, flow, g->emit_white, space, space+1, lang);
361 }
362 g->emit_white = 0;
363 }
364 }
365
/* pair-wise lookup table for UAX#14 linebreaks
	The linebreak table entries mean:
	^ prohibited break
		never break before A and after B, even with one or more spaces in between
	% indirect break
		do not break before A, unless one or more spaces follow B
	_ direct break
		break allowed before A

	The table is indexed as pairbrk[class of previous char][class of this char];
	each row is labelled with the class of the previous character and the
	column header below (read top to bottom) names the class of this one.
*/
static const char *pairbrk[32] =
{
/*	-OCCQGNESIPPNAHIIHBBBZCWHHJJJREEZ- */
/*	-PLPULSXYSROULLDNYAB2WMJ23LVTIBMW- */
/*	-                               J- */
	"^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
	"_^^%%^^^^%%____%%%__^^^________%", /* CL close punctuation */
	"_^^%%^^^^%%%%%_%%%__^^^________%", /* CP close parenthesis */
	"^^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* QU quotation */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* GL non-breaking glue */
	"_^^%%%^^^______%%%__^^^________%", /* NS nonstarters */
	"_^^%%%^^^______%%%__^^^________%", /* EX exclamation/interrogation */
	"_^^%%%^^^__%_%_%%%__^^^________%", /* SY symbols allowing break after */
	"_^^%%%^^^__%%%_%%%__^^^________%", /* IS infix numeric separator */
	"%^^%%%^^^__%%%%%%%__^^^%%%%%_%%%", /* PR prefix numeric */
	"%^^%%%^^^__%%%_%%%__^^^________%", /* PO postfix numeric */
	"%^^%%%^^^%%%%%_%%%__^^^________%", /* NU numeric */
	"%^^%%%^^^%%%%%_%%%__^^^________%", /* AL ordinary alphabetic and symbol characters */
	"%^^%%%^^^%%%%%_%%%__^^^________%", /* HL hebrew letter */
	"_^^%%%^^^_%____%%%__^^^________%", /* ID ideographic */
	"_^^%%%^^^______%%%__^^^________%", /* IN inseparable characters */
	"_^^%_%^^^__%___%%%__^^^________%", /* HY hyphens */
	"_^^%_%^^^______%%%__^^^________%", /* BA break after */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* BB break before */
	"_^^%%%^^^______%%%_^^^^________%", /* B2 break opportunity before and after */
	"____________________^___________", /* ZW zero width space */
	"%^^%%%^^^%_%%%_%%%__^^^________%", /* CM combining mark */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* WJ word joiner */
	"_^^%%%^^^_%____%%%__^^^___%%___%", /* H2 hangul leading/vowel syllable */
	"_^^%%%^^^_%____%%%__^^^____%___%", /* H3 hangul leading/vowel/trailing syllable */
	"_^^%%%^^^_%____%%%__^^^%%%%____%", /* JL hangul leading jamo */
	"_^^%%%^^^_%____%%%__^^^___%%___%", /* JV hangul vowel jamo */
	"_^^%%%^^^_%____%%%__^^^____%___%", /* JT hangul trailing jamo */
	"_^^%%%^^^______%%%__^^^_____%__%", /* RI regional indicator */
	"_^^%%%^^^_%____%%%__^^^_______%%", /* EB emoji base */
	"_^^%%%^^^_%____%%%__^^^________%", /* EM emoji modifier */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* ZWJ zero width joiner */
};
413
414 static fz_html_box *
415 find_flow_encloser(fz_context *ctx, fz_html_box *flow)
416 {
417 /* This code was written to assume that there will always be a
418 * flow box enclosing callers of this. Bug 705324 shows that
419 * this isn't always the case. In the absence of a reproducer
420 * file, all I can do is try to patch around the issue so that
421 * we won't crash. */
422 while (flow->type != BOX_FLOW)
423 {
424 if (flow->up == NULL)
425 {
426 fz_warn(ctx, "Flow encloser not found. Please report this file!");
427 break;
428 }
429 flow = flow->up;
430 }
431 return flow;
432 }
433
/*
	Turn a run of text belonging to 'box' into flow nodes (words, spaces,
	hard breaks, soft breaks and soft hyphens) appended to the enclosing
	flow box.

	How whitespace and newlines are treated is driven by three bits of the
	box's white_space style: WS_COLLAPSE, WS_ALLOW_BREAK_SPACE and
	WS_FORCE_BREAK_NEWLINE. Updates g->at_bol, g->emit_white and
	g->last_brk_cls as it goes.
*/
static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g)
{
	fz_html_box *flow;
	fz_pool *pool = g->pool;
	int collapse = box->style->white_space & WS_COLLAPSE;
	int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE;
	int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE;

	static const char *space = " ";

	flow = find_flow_encloser(ctx, box);
	if (flow == NULL)
		return;

	while (*text)
	{
		/* Case 1: a newline that the style turns into a hard line break.
		 * A CR LF pair counts as a single newline. */
		if (bnl && (*text == '\n' || *text == '\r'))
		{
			if (text[0] == '\r' && text[1] == '\n')
				text += 2;
			else
				text += 1;
			add_flow_break(ctx, pool, flow, box);
			g->at_bol = 1;
		}
		/* Case 2: other whitespace. */
		else if (iswhite(*text))
		{
			if (collapse)
			{
				/* Swallow the whole run and only remember that a space
				 * is pending; it is emitted by flush_space() before the
				 * next word. When newlines force breaks they must not
				 * be swallowed here. */
				if (bnl)
					while (*text == ' ' || *text == '\t')
						++text;
				else
					while (iswhite(*text))
						++text;
				g->emit_white = box;
			}
			else
			{
				/* Preserved whitespace: emit one node per character. */
				// TODO: tabs
				if (bsp)
					add_flow_space(ctx, pool, flow, box);
				else
					add_flow_word(ctx, pool, flow, box, space, space+1, lang);
				++text;
			}
			g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
		}
		/* Case 3: a run of non-whitespace characters. */
		else
		{
			/* 'mark' is the start of the word not yet emitted; 'prev' is
			 * the start of the character decoded most recently. */
			const char *prev, *mark = text;
			int c;

			flush_space(ctx, flow, lang, g);

			if (g->at_bol)
				g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;

			while (*text && !iswhite(*text))
			{
				prev = text;
				text += fz_chartorune(&c, text);
				if (c == 0xAD) /* soft hyphen */
				{
					/* Emit the text before the hyphen, then the hyphen
					 * node; the hyphen character itself is dropped. */
					if (mark != prev)
						add_flow_word(ctx, pool, flow, box, mark, prev, lang);
					add_flow_shyphen(ctx, pool, flow, box);
					mark = text;
					g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
				}
				else if (bsp) /* allow soft breaks */
				{
					int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
					/* Classes beyond ZWJ have no column in pairbrk. */
					if (this_brk_cls <= UCDN_LINEBREAK_CLASS_ZWJ)
					{
						int brk = pairbrk[g->last_brk_cls][this_brk_cls];

						/* we handle spaces elsewhere, so ignore these classes */
						if (brk == '@') brk = '^';
						if (brk == '#') brk = '^';
						if (brk == '%') brk = '^';

						if (brk == '_')
						{
							/* Direct break allowed before this character:
							 * close the word so far and start a new one
							 * at this character. */
							if (mark != prev)
								add_flow_word(ctx, pool, flow, box, mark, prev, lang);
							add_flow_sbreak(ctx, pool, flow, box);
							mark = prev;
						}

						g->last_brk_cls = this_brk_cls;
					}
				}
			}
			/* Emit whatever is left of the run. */
			if (mark != text)
				add_flow_word(ctx, pool, flow, box, mark, text, lang);

			g->at_bol = 0;
		}
	}
}
535
536 static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src)
537 {
538 char path[2048];
539 fz_image *img = NULL;
540 fz_buffer *buf = NULL;
541
542 fz_var(img);
543 fz_var(buf);
544
545 fz_try(ctx)
546 {
547 if (!strncmp(src, "data:image/jpeg;base64,", 23))
548 buf = fz_new_buffer_from_base64(ctx, src+23, 0);
549 else if (!strncmp(src, "data:image/png;base64,", 22))
550 buf = fz_new_buffer_from_base64(ctx, src+22, 0);
551 else if (!strncmp(src, "data:image/gif;base64,", 22))
552 buf = fz_new_buffer_from_base64(ctx, src+22, 0);
553 else
554 {
555 fz_strlcpy(path, base_uri, sizeof path);
556 fz_strlcat(path, "/", sizeof path);
557 fz_strlcat(path, src, sizeof path);
558 fz_urldecode(path);
559 fz_cleanname(path);
560 buf = fz_read_archive_entry(ctx, zip, path);
561 }
562 #if FZ_ENABLE_SVG
563 if (strstr(src, ".svg"))
564 img = fz_new_image_from_svg(ctx, buf, base_uri, zip);
565 else
566 #endif
567 img = fz_new_image_from_buffer(ctx, buf);
568 }
569 fz_always(ctx)
570 fz_drop_buffer(ctx, buf);
571 fz_catch(ctx)
572 {
573 fz_ignore_error(ctx);
574 fz_warn(ctx, "html: cannot load image src='%s'", src);
575 }
576
577 return img;
578 }
579
580 static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri,
581 fz_xml_doc *xmldoc, fz_xml *node)
582 {
583 fz_image *img = NULL;
584 #if FZ_ENABLE_SVG
585 fz_try(ctx)
586 img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip);
587 fz_catch(ctx)
588 {
589 fz_ignore_error(ctx);
590 fz_warn(ctx, "html: cannot load embedded svg document");
591 }
592 #endif
593 return img;
594 }
595
596 static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g)
597 {
598 fz_html_box *flow;
599 fz_pool *pool = g->pool;
600
601 flow = find_flow_encloser(ctx, box);
602
603 flush_space(ctx, flow, 0, g);
604
605 if (!img)
606 {
607 const char *alt = "[image]";
608 add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0);
609 }
610 else
611 {
612 fz_try(ctx)
613 {
614 add_flow_sbreak(ctx, pool, flow, box);
615 add_flow_image(ctx, pool, flow, box, img);
616 add_flow_sbreak(ctx, pool, flow, box);
617 }
618 fz_always(ctx)
619 {
620 fz_drop_image(ctx, img);
621 }
622 fz_catch(ctx)
623 fz_rethrow(ctx);
624 }
625
626 g->at_bol = 0;
627 }
628
629 static void fz_drop_html_box(fz_context *ctx, fz_html_box *box)
630 {
631 while (box)
632 {
633 fz_html_box *next = box->next;
634 if (box->type == BOX_FLOW)
635 fz_drop_html_flow(ctx, box->u.flow.head);
636 fz_drop_html_box(ctx, box->down);
637 box = next;
638 }
639 }
640
641 static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor)
642 {
643 fz_html *html = (fz_html *)stor;
644 fz_drop_html_box(ctx, html->tree.root);
645 fz_drop_pool(ctx, html->tree.pool);
646 }
647
/*
	Storable drop callback for fz_story: releases everything the story
	owns (user CSS, font set, DOM, box tree resources, warnings buffer,
	archive) and finally the pool.
*/
static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor)
{
	fz_story *story = (fz_story *)stor;
	fz_free(ctx, story->user_css);
	fz_drop_html_font_set(ctx, story->font_set);
	fz_drop_xml(ctx, story->dom);
	fz_drop_html_box(ctx, story->tree.root);
	fz_drop_buffer(ctx, story->warnings);
	fz_drop_archive(ctx, story->zip);
	/* The pool must be the last thing dropped. */
	fz_drop_pool(ctx, story->tree.pool);
}
660
/* Drop a structure derived from an html_tree. The exact things
 * freed here will depend upon the drop function with which it
 * was created.
 *
 * The drop of the storable is bracketed by fz_defer_reap_start()
 * and fz_defer_reap_end(). */
static void
fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree)
{
	fz_defer_reap_start(ctx);
	fz_drop_storable(ctx, &tree->storable);
	fz_defer_reap_end(ctx);
}
671
672 void fz_drop_html(fz_context *ctx, fz_html *html)
673 {
674 fz_drop_html_tree(ctx, &html->tree);
675 }
676
677 void fz_drop_story(fz_context *ctx, fz_story *story)
678 {
679 if (!story)
680 return;
681
682 fz_drop_html_tree(ctx, &story->tree);
683 }
684
685 fz_html *fz_keep_html(fz_context *ctx, fz_html *html)
686 {
687 return fz_keep_storable(ctx, &html->tree.storable);
688 }
689
690 static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style)
691 {
692 fz_html_box *box;
693 const char *tag = fz_xml_tag(node);
694 const char *id = fz_xml_att(node, "id");
695 const char *href;
696
697 if (type == BOX_INLINE)
698 box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u));
699 else if (type == BOX_FLOW)
700 box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.flow));
701 else
702 box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.block));
703
704 box->type = type;
705 box->is_first_flow = 0;
706 box->markup_dir = g->markup_dir;
707 box->heading = 0;
708 box->list_item = 0;
709
710 box->style = fz_css_enlist(ctx, style, &g->styles, g->pool);
711
712 if (tag)
713 {
714 box->tag = find_known_html_tag(tag);
715 if (!box->tag && g->is_fb2)
716 box->tag = find_known_fb2_tag(tag);
717 if (!box->tag)
718 box->tag = fz_pool_strdup(ctx, g->pool, tag);
719 }
720 else
721 {
722 box->tag = "#anon";
723 }
724
725 if (id)
726 box->id = fz_pool_strdup(ctx, g->pool, id);
727
728 if (tag && tag[0]=='a' && tag[1]==0)
729 {
730 // Support deprecated anchor syntax with id in "name" instead of "id" attribute.
731 if (!id)
732 {
733 const char *name = fz_xml_att(node, "name");
734 if (name)
735 box->id = fz_pool_strdup(ctx, g->pool, name);
736 }
737
738 if (g->is_fb2)
739 {
740 href = fz_xml_att(node, "l:href");
741 if (!href)
742 href = fz_xml_att(node, "xlink:href");
743 }
744 else
745 {
746 href = fz_xml_att(node, "href");
747 }
748 if (href)
749 g->href = fz_pool_strdup(ctx, g->pool, href);
750 }
751
752 if (g->href)
753 box->href = g->href;
754
755 if (type == BOX_FLOW)
756 {
757 box->u.flow.head = NULL;
758 box->s.build.flow_tail = &box->u.flow.head;
759 }
760
761 return box;
762 }
763
764 static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child)
765 {
766 child->up = parent;
767 if (!parent->down)
768 parent->down = child;
769 if (parent->s.build.last_child)
770 parent->s.build.last_child->next = child;
771 parent->s.build.last_child = child;
772 }
773
774 static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box)
775 {
776 while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL)
777 box = box->up;
778 return box;
779 }
780
781 static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box)
782 {
783 fz_html_box *look = box;
784 while (look && look->type != BOX_TABLE)
785 look = look->up;
786 if (look)
787 return look;
788 fz_warn(ctx, "table-row not inside table element");
789 return NULL;
790 }
791
792 static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box)
793 {
794 fz_html_box *look = box;
795 while (look && look->type != BOX_TABLE_ROW)
796 look = look->up;
797 if (look)
798 return look;
799 fz_warn(ctx, "table-cell not inside table-row element");
800 return NULL;
801 }
802
803 static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box)
804 {
805 fz_css_style style;
806 fz_html_box *flow_box;
807
808 if (box->type == BOX_FLOW || box->type == BOX_INLINE)
809 return box;
810
811 // We have an inline element that is not in an existing flow/inline context.
812
813 // Find the closest block level box to insert content into.
814 while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL)
815 box = box->up;
816
817 // Concatenate onto the last open flow box if we have one.
818 if (box->s.build.last_child && box->s.build.last_child->type == BOX_FLOW)
819 return box->s.build.last_child;
820
821 // No flow box found, create and insert one!
822
823 // TODO: null style instead of default for flow box?
824 fz_default_css_style(ctx, &style);
825 flow_box = new_box(ctx, g, NULL, BOX_FLOW, &style);
826 flow_box->is_first_flow = !box->down;
827 g->at_bol = 1;
828
829 append_box(ctx, box, flow_box);
830
831 return flow_box;
832 }
833
834 static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match);
835
836 static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node)
837 {
838 fz_html_box *anon_box;
839 fz_css_style style;
840 const char *text;
841 int collapse;
842
843 text = fz_xml_text(node);
844 collapse = root_box->style->white_space & WS_COLLAPSE;
845 if (collapse && is_all_white(text))
846 {
847 g->emit_white = root_box;
848 }
849 else
850 {
851 if (root_box->type != BOX_INLINE)
852 {
853 /* Create anonymous inline box, with the same style as the top block box. */
854 style = *root_box->style;
855
856 // Make sure not to recursively multiply font sizes
857 style.font_size.value = 1;
858 style.font_size.unit = N_SCALE;
859
860 root_box = find_inline_context(ctx, g, root_box);
861 anon_box = new_box(ctx, g, NULL, BOX_INLINE, &style);
862 append_box(ctx, root_box, anon_box);
863 root_box = anon_box;
864 }
865
866 generate_text(ctx, root_box, text, g->markup_lang, g);
867 }
868 }
869
870 static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
871 {
872 fz_html_box *this_box;
873 fz_html_box *flow_box;
874 root_box = find_inline_context(ctx, g, root_box);
875 this_box = new_box(ctx, g, node, BOX_INLINE, style);
876 append_box(ctx, root_box, this_box);
877 if (this_box->id)
878 {
879 flow_box = find_flow_encloser(ctx, this_box);
880 add_flow_anchor(ctx, g->pool, flow_box, this_box);
881 }
882 return this_box;
883 }
884
885 static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node)
886 {
887 fz_html_box *this_box;
888 fz_html_box *flow_box;
889
890 if (root_box->type != BOX_INLINE)
891 {
892 /* Create inline box to hold the <br> tag, with the same style as containing block. */
893 /* Make sure not to recursively multiply font sizes. */
894 fz_css_style style = *root_box->style;
895 style.font_size.value = 1;
896 style.font_size.unit = N_SCALE;
897 this_box = new_box(ctx, g, node, BOX_INLINE, &style);
898 append_box(ctx, find_inline_context(ctx, g, root_box), this_box);
899 }
900 else
901 {
902 this_box = root_box;
903 }
904
905 flow_box = find_flow_encloser(ctx, this_box);
906 add_flow_break(ctx, g->pool, flow_box, this_box);
907 g->at_bol = 1;
908 }
909
910 static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
911 {
912 fz_html_box *this_box;
913 root_box = find_block_context(ctx, root_box);
914 this_box = new_box(ctx, g, node, BOX_BLOCK, style);
915 append_box(ctx, root_box, this_box);
916 return this_box;
917 }
918
919 static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
920 {
921 fz_html_box *this_box;
922 root_box = find_block_context(ctx, root_box);
923 this_box = new_box(ctx, g, node, BOX_TABLE, style);
924 append_box(ctx, root_box, this_box);
925 return this_box;
926 }
927
928 static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
929 {
930 fz_html_box *this_box, *table_box;
931
932 table_box = find_table_row_context(ctx, root_box);
933 if (!table_box)
934 return gen2_block(ctx, g, root_box, node, style);
935
936 this_box = new_box(ctx, g, node, BOX_TABLE_ROW, style);
937 append_box(ctx, table_box, this_box);
938 return this_box;
939 }
940
941 static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
942 {
943 fz_html_box *this_box, *row_box;
944
945 row_box = find_table_cell_context(ctx, root_box);
946 if (!row_box)
947 return gen2_block(ctx, g, root_box, node, style);
948
949 this_box = new_box(ctx, g, node, BOX_TABLE_CELL, style);
950 append_box(ctx, row_box, this_box);
951 return this_box;
952 }
953
954 static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int display, fz_css_style *style)
955 {
956 fz_html_box *img_block_box;
957 fz_html_box *img_inline_box;
958
959 if (display == DIS_INLINE || display == DIS_INLINE_BLOCK)
960 {
961 root_box = find_inline_context(ctx, g, root_box);
962 img_inline_box = new_box(ctx, g, node, BOX_INLINE, style);
963 append_box(ctx, root_box, img_inline_box);
964 generate_image(ctx, img_inline_box, img, g);
965 }
966 else
967 {
968 root_box = find_block_context(ctx, root_box);
969 img_block_box = new_box(ctx, g, node, BOX_BLOCK, style);
970 append_box(ctx, root_box, img_block_box);
971
972 root_box = find_inline_context(ctx, g, img_block_box);
973 img_inline_box = new_box(ctx, g, NULL, BOX_INLINE, style);
974 append_box(ctx, root_box, img_inline_box);
975 generate_image(ctx, img_inline_box, img, g);
976 }
977 }
978
979 static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
980 {
981 const char *src = fz_xml_att(node, "src");
982 if (src)
983 {
984 fz_css_style local_style = *style;
985 fz_image *img;
986 int w, h;
987 const char *w_att = fz_xml_att(node, "width");
988 const char *h_att = fz_xml_att(node, "height");
989
990 if (w_att && (w = fz_atoi(w_att)) > 0)
991 {
992 local_style.width.value = w;
993 local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH;
994 }
995 if (h_att && (h = fz_atoi(h_att)) > 0)
996 {
997 local_style.height.value = h;
998 local_style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH;
999 }
1000
1001 img = load_html_image(ctx, g->zip, g->base_uri, src);
1002 gen2_image_common(ctx, g, root_box, node, img, display, &local_style);
1003 }
1004 }
1005
1006 static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
1007 {
1008 const char *src = fz_xml_att(node, "l:href");
1009 if (!src)
1010 src = fz_xml_att(node, "xlink:href");
1011 if (src && src[0] == '#')
1012 {
1013 fz_image *img = fz_tree_lookup(ctx, g->images, src+1);
1014 gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style);
1015 }
1016 }
1017
1018 static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
1019 {
1020 fz_image *img = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node);
1021 gen2_image_common(ctx, g, root_box, node, img, display, style);
1022 }
1023
1024 static int get_heading_from_tag(fz_context *ctx, struct genstate *g, const char *tag)
1025 {
1026 if (tag[0] == 'h' && tag[1] != 0 && tag[2] == 0)
1027 {
1028 switch (tag[1])
1029 {
1030 case '1': return 1;
1031 case '2': return 2;
1032 case '3': return 3;
1033 case '4': return 4;
1034 case '5': return 5;
1035 case '6': return 6;
1036 }
1037 }
1038 if (g->is_fb2)
1039 {
1040 if (!strcmp(tag, "title") || !strcmp(tag, "subtitle"))
1041 return fz_mini(g->section_depth, 6);
1042 }
1043 return 0;
1044 }
1045
1046 static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node,
1047 fz_css_match *match, int display, fz_css_style *style)
1048 {
1049 fz_html_box *this_box;
1050 const char *tag;
1051 const char *lang_att;
1052 const char *dir_att;
1053
1054 int save_markup_dir = g->markup_dir;
1055 int save_markup_lang = g->markup_lang;
1056 char *save_href = g->href;
1057
1058 if (display == DIS_NONE)
1059 return;
1060
1061 tag = fz_xml_tag(node);
1062
1063 dir_att = fz_xml_att(node, "dir");
1064 if (dir_att)
1065 {
1066 if (!strcmp(dir_att, "auto"))
1067 g->markup_dir = FZ_BIDI_NEUTRAL;
1068 else if (!strcmp(dir_att, "rtl"))
1069 g->markup_dir = FZ_BIDI_RTL;
1070 else if (!strcmp(dir_att, "ltr"))
1071 g->markup_dir = FZ_BIDI_LTR;
1072 else
1073 g->markup_dir = DEFAULT_DIR;
1074 }
1075
1076 lang_att = fz_xml_att(node, "lang");
1077 if (lang_att)
1078 g->markup_lang = fz_text_language_from_string(lang_att);
1079
1080 switch (display)
1081 {
1082 case DIS_INLINE_BLOCK:
1083 // TODO handle inline block as a flow node
1084 this_box = gen2_block(ctx, g, root_box, node, style);
1085 break;
1086
1087 case DIS_BLOCK:
1088 this_box = gen2_block(ctx, g, root_box, node, style);
1089 this_box->heading = get_heading_from_tag(ctx, g, tag);
1090 break;
1091
1092 case DIS_LIST_ITEM:
1093 this_box = gen2_block(ctx, g, root_box, node, style);
1094 this_box->list_item = ++g->list_counter;
1095 break;
1096
1097 // TODO: https://www.w3.org/TR/CSS2/tables.html#anonymous-boxes
1098 //
1099 // The table generation code should insert and create anonymous boxes
1100 // for any missing child/parent elements.
1101 //
1102 // MISSING CHILDREN:
1103 // 1: Wrap consecutive BLOCK found in a TABLE in an anon TABLE_ROW.
1104 // 2: Wrap consecutive BLOCK found in a TABLE_ROW in an anon TABLE_CELL.
1105 //
1106 // MISSING PARENTS:
1107 // 1: Wrap consecutive TABLE_CELL found outside TABLE_ROW in an anon TABLE_ROW
1108 // 2: Wrap consecutive TABLE_ROW found outside TABLE in an anon TABLE
1109 //
1110 // For now we ignore this and treat any such elements that are out of
1111 // context as plain block elements.
1112
1113 case DIS_TABLE:
1114 this_box = gen2_table(ctx, g, root_box, node, style);
1115 break;
1116 case DIS_TABLE_GROUP:
1117 // no box for table-row-group elements
1118 this_box = root_box;
1119 break;
1120 case DIS_TABLE_ROW:
1121 this_box = gen2_table_row(ctx, g, root_box, node, style);
1122 break;
1123 case DIS_TABLE_CELL:
1124 this_box = gen2_table_cell(ctx, g, root_box, node, style);
1125 break;
1126
1127 case DIS_INLINE:
1128 default:
1129 this_box = gen2_inline(ctx, g, root_box, node, style);
1130 break;
1131 }
1132
1133 if (tag && (!strcmp(tag, "ol") || !strcmp(tag, "ul") || !strcmp(tag, "dl")))
1134 {
1135 int save_list_counter = g->list_counter;
1136 g->list_counter = 0;
1137 gen2_children(ctx, g, this_box, node, match);
1138 g->list_counter = save_list_counter;
1139 }
1140 else if (tag && !strcmp(tag, "section"))
1141 {
1142 int save_section_depth = g->section_depth;
1143 g->section_depth++;
1144 gen2_children(ctx, g, this_box, node, match);
1145 g->section_depth = save_section_depth;
1146 }
1147 else
1148 {
1149 gen2_children(ctx, g, this_box, node, match);
1150 }
1151
1152 g->markup_dir = save_markup_dir;
1153 g->markup_lang = save_markup_lang;
1154 g->href = save_href;
1155 }
1156
1157 static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match)
1158 {
1159 fz_xml *node;
1160 const char *tag;
1161 fz_css_match match;
1162 fz_css_style style;
1163 int display;
1164
1165 for (node = fz_xml_down(root_node); node; node = fz_xml_next(node))
1166 {
1167 tag = fz_xml_tag(node);
1168 if (tag)
1169 {
1170 fz_match_css(ctx, &match, root_match, g->css, node);
1171 fz_apply_css_style(ctx, g->set, &style, &match);
1172 display = fz_get_css_match_display(&match);
1173 if (tag[0]=='b' && tag[1]=='r' && tag[2]==0)
1174 {
1175 gen2_break(ctx, g, root_box, node);
1176 }
1177 else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0)
1178 {
1179 gen2_image_html(ctx, g, root_box, node, display, &style);
1180 }
1181 else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0)
1182 {
1183 gen2_image_fb2(ctx, g, root_box, node, display, &style);
1184 }
1185 else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0)
1186 {
1187 gen2_image_svg(ctx, g, root_box, node, display, &style);
1188 }
1189 else
1190 {
1191 gen2_tag(ctx, g, root_box, node, &match, display, &style);
1192 }
1193 }
1194 else
1195 {
1196 gen2_text(ctx, g, root_box, node);
1197 }
1198 }
1199 }
1200
1201 static char *concat_text(fz_context *ctx, fz_xml *root)
1202 {
1203 fz_xml *node;
1204 size_t i = 0, n = 1;
1205 char *s;
1206 for (node = fz_xml_down(root); node; node = fz_xml_next(node))
1207 {
1208 const char *text = fz_xml_text(node);
1209 n += text ? strlen(text) : 0;
1210 }
1211 s = Memento_label(fz_malloc(ctx, n), "concat_html");
1212 for (node = fz_xml_down(root); node; node = fz_xml_next(node))
1213 {
1214 const char *text = fz_xml_text(node);
1215 if (text)
1216 {
1217 n = strlen(text);
1218 memcpy(s+i, text, n);
1219 i += n;
1220 }
1221 }
1222 s[i] = 0;
1223 return s;
1224 }
1225
1226 static void
1227 html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href)
1228 {
1229 char path[2048];
1230 char css_base_uri[2048];
1231 fz_buffer *buf;
1232
1233 fz_var(buf);
1234
1235 fz_strlcpy(path, base_uri, sizeof path);
1236 fz_strlcat(path, "/", sizeof path);
1237 fz_strlcat(path, href, sizeof path);
1238 fz_urldecode(path);
1239 fz_cleanname(path);
1240
1241 fz_dirname(css_base_uri, path, sizeof css_base_uri);
1242
1243 buf = NULL;
1244 fz_try(ctx)
1245 {
1246 buf = fz_read_archive_entry(ctx, zip, path);
1247 fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path);
1248 fz_add_css_font_faces(ctx, set, zip, css_base_uri, css);
1249 }
1250 fz_always(ctx)
1251 fz_drop_buffer(ctx, buf);
1252 fz_catch(ctx)
1253 {
1254 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1255 fz_report_error(ctx);
1256 fz_warn(ctx, "ignoring stylesheet %s", path);
1257 }
1258 }
1259
1260 static void
1261 html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
1262 {
1263 fz_xml *html, *head, *node;
1264
1265 html = fz_xml_find(root, "html");
1266 head = fz_xml_find_down(html, "head");
1267 for (node = fz_xml_down(head); node; node = fz_xml_next(node))
1268 {
1269 if (fz_xml_is_tag(node, "link"))
1270 {
1271 char *rel = fz_xml_att(node, "rel");
1272 if (rel && !fz_strcasecmp(rel, "stylesheet"))
1273 {
1274 char *type = fz_xml_att(node, "type");
1275 if ((type && !strcmp(type, "text/css")) || !type)
1276 {
1277 char *href = fz_xml_att(node, "href");
1278 if (href)
1279 {
1280 html_load_css_link(ctx, set, zip, base_uri, css, root, href);
1281 }
1282 }
1283 }
1284 }
1285 else if (fz_xml_is_tag(node, "style"))
1286 {
1287 char *s = concat_text(ctx, node);
1288 fz_try(ctx)
1289 {
1290 fz_parse_css(ctx, css, s, "<style>");
1291 fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1292 }
1293 fz_always(ctx)
1294 fz_free(ctx, s);
1295 fz_catch(ctx)
1296 {
1297 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1298 fz_report_error(ctx);
1299 fz_warn(ctx, "ignoring inline stylesheet");
1300 }
1301 }
1302 }
1303 }
1304
1305 static void
1306 fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
1307 {
1308 fz_xml *fictionbook, *stylesheet;
1309
1310 fictionbook = fz_xml_find(root, "FictionBook");
1311 stylesheet = fz_xml_find_down(fictionbook, "stylesheet");
1312 if (stylesheet)
1313 {
1314 char *s = concat_text(ctx, stylesheet);
1315 fz_try(ctx)
1316 {
1317 fz_parse_css(ctx, css, s, "<stylesheet>");
1318 fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1319 }
1320 fz_catch(ctx)
1321 {
1322 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1323 fz_report_error(ctx);
1324 fz_warn(ctx, "ignoring inline stylesheet");
1325 }
1326 fz_free(ctx, s);
1327 }
1328 }
1329
1330 static fz_tree *
1331 load_fb2_images(fz_context *ctx, fz_xml *root)
1332 {
1333 fz_xml *fictionbook, *binary;
1334 fz_tree *images = NULL;
1335
1336 fictionbook = fz_xml_find(root, "FictionBook");
1337 for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary"))
1338 {
1339 const char *id = fz_xml_att(binary, "id");
1340 char *b64 = NULL;
1341 fz_buffer *buf = NULL;
1342 fz_image *img = NULL;
1343
1344 fz_var(b64);
1345 fz_var(buf);
1346
1347 if (id == NULL)
1348 {
1349 fz_warn(ctx, "Skipping image with no id");
1350 continue;
1351 }
1352
1353 fz_try(ctx)
1354 {
1355 b64 = concat_text(ctx, binary);
1356 buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64));
1357 img = fz_new_image_from_buffer(ctx, buf);
1358 }
1359 fz_always(ctx)
1360 {
1361 fz_drop_buffer(ctx, buf);
1362 fz_free(ctx, b64);
1363 }
1364 fz_catch(ctx)
1365 fz_rethrow(ctx);
1366
1367 images = fz_tree_insert(ctx, images, id, img);
1368 }
1369
1370 return images;
1371 }
1372
/*
	Growable buffer of Unicode code points, used as scratch space while
	detecting text direction.
*/
typedef struct
{
	uint32_t *data;	/* storage, (re)allocated on demand */
	size_t cap;	/* number of elements allocated in data */
	size_t len;	/* number of elements currently in use */
} uni_buf;
1379
1380 typedef struct
1381 {
1382 fz_context *ctx;
1383 fz_pool *pool;
1384 fz_html_flow *flow;
1385 uni_buf *buffer;
1386 } bidi_data;
1387
1388 static void fragment_cb(const uint32_t *fragment,
1389 size_t fragment_len,
1390 int bidi_level,
1391 int script,
1392 void *arg)
1393 {
1394 bidi_data *data = (bidi_data *)arg;
1395
1396 /* We are guaranteed that fragmentOffset will be at the beginning
1397 * of flow. */
1398 while (fragment_len > 0)
1399 {
1400 size_t len;
1401
1402 if (data->flow->type == FLOW_SPACE)
1403 {
1404 len = 1;
1405 }
1406 else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK ||
1407 data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR)
1408 {
1409 len = 0;
1410 }
1411 else
1412 {
1413 /* Must be text */
1414 len = fz_utflen(data->flow->content.text);
1415 if (len > fragment_len)
1416 {
1417 /* We need to split this flow box */
1418 (void)fz_html_split_flow(data->ctx, data->pool, data->flow, fragment_len);
1419 len = fz_utflen(data->flow->content.text);
1420 }
1421 }
1422
1423 /* This flow box is entirely contained within this fragment. */
1424 data->flow->bidi_level = bidi_level;
1425 data->flow->script = script;
1426 data->flow = data->flow->next;
1427 fragment_len -= len;
1428 }
1429 }
1430
1431 static fz_bidi_direction
1432 detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow)
1433 {
1434 fz_html_flow *end = flow;
1435 bidi_data data;
1436
1437 while (end)
1438 {
1439 unsigned int level = end->bidi_level;
1440
1441 /* Gather the text from the flow up into a single buffer (at
1442 * least, as much of it as has the same direction markup). */
1443 buffer->len = 0;
1444 while (end && (level & 1) == (end->bidi_level & 1))
1445 {
1446 size_t len = 0;
1447 const char *text = "";
1448 int broken = 0;
1449
1450 switch (end->type)
1451 {
1452 case FLOW_WORD:
1453 len = fz_utflen(end->content.text);
1454 text = end->content.text;
1455 break;
1456 case FLOW_SPACE:
1457 len = 1;
1458 text = " ";
1459 break;
1460 case FLOW_SHYPHEN:
1461 case FLOW_SBREAK:
1462 break;
1463 case FLOW_BREAK:
1464 case FLOW_IMAGE:
1465 broken = 1;
1466 break;
1467 }
1468
1469 end = end->next;
1470
1471 if (broken)
1472 break;
1473
1474 /* Make sure the buffer is large enough */
1475 if (buffer->len + len > buffer->cap)
1476 {
1477 size_t newcap = buffer->cap;
1478 if (newcap < 128)
1479 newcap = 128; /* Sensible small default */
1480
1481 while (newcap < buffer->len + len)
1482 newcap = (newcap * 3) / 2;
1483
1484 buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t);
1485 buffer->cap = newcap;
1486 }
1487
1488 /* Expand the utf8 text into Unicode and store it in the buffer */
1489 while (*text)
1490 {
1491 int rune;
1492 text += fz_chartorune(&rune, text);
1493 buffer->data[buffer->len++] = rune;
1494 }
1495 }
1496
1497 /* Detect directionality for the buffer */
1498 data.ctx = ctx;
1499 data.pool = pool;
1500 data.flow = flow;
1501 data.buffer = buffer;
1502 fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */);
1503 flow = end;
1504 }
1505 return bidi_dir;
1506 }
1507
1508 static void
1509 detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box)
1510 {
1511 while (box)
1512 {
1513 if (box->type == BOX_FLOW)
1514 box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->u.flow.head);
1515 detect_box_directionality(ctx, pool, buffer, box->down);
1516 box = box->next;
1517 }
1518 }
1519
1520 static void
1521 detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box)
1522 {
1523 uni_buf buffer = { NULL };
1524
1525 fz_try(ctx)
1526 detect_box_directionality(ctx, pool, &buffer, box);
1527 fz_always(ctx)
1528 fz_free(ctx, buffer.data);
1529 fz_catch(ctx)
1530 fz_rethrow(ctx);
1531 }
1532
1533 static fz_xml_doc *
1534 parse_to_xml(fz_context *ctx, fz_buffer *buf, int try_xml, int try_html5)
1535 {
1536 fz_xml_doc *xml;
1537
1538 if (try_xml && try_html5)
1539 {
1540 fz_try(ctx)
1541 xml = fz_parse_xml(ctx, buf, 1);
1542 fz_catch(ctx)
1543 {
1544 if (fz_caught(ctx) == FZ_ERROR_SYNTAX)
1545 {
1546 fz_report_error(ctx);
1547 fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser");
1548 xml = fz_parse_xml_from_html5(ctx, buf);
1549 }
1550 else
1551 fz_rethrow(ctx);
1552 }
1553 }
1554 else if (try_xml)
1555 xml = fz_parse_xml(ctx, buf, 1);
1556 else
1557 {
1558 assert(try_html5);
1559 xml = fz_parse_xml_from_html5(ctx, buf);
1560 }
1561
1562 return xml;
1563 }
1564
1565 static void move_background_color_style_up(fz_context *ctx, struct genstate *g, fz_html_box *root, fz_html_box *from)
1566 {
1567 fz_css_color transparent = { 0, 0, 0, 0 };
1568 fz_css_style s1, s2;
1569 memcpy(&s1, root->style, sizeof s1);
1570 memcpy(&s2, from->style, sizeof s2);
1571 s1.background_color = s2.background_color;
1572 s2.background_color = transparent;
1573 root->style = fz_css_enlist(ctx, &s1, &g->styles, g->pool);
1574 from->style = fz_css_enlist(ctx, &s2, &g->styles, g->pool);
1575 }
1576
1577 static void move_background_color_up(fz_context *ctx, struct genstate *g, fz_html_box *root)
1578 {
1579 fz_html_box *html, *body;
1580
1581 if (root->style->background_color.a != 0)
1582 {
1583 return;
1584 }
1585
1586 html = root->down;
1587 if (html && !strcmp(html->tag, "html"))
1588 {
1589 if (html->style->background_color.a != 0)
1590 {
1591 move_background_color_style_up(ctx, g, root, html);
1592 return;
1593 }
1594
1595 body = html->down;
1596 if (body && !strcmp(body->tag, "body"))
1597 {
1598 if (body->style->background_color.a != 0)
1599 {
1600 move_background_color_style_up(ctx, g, root, body);
1601 return;
1602 }
1603 }
1604 }
1605 }
1606
1607 static void
1608 xml_to_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, const char *user_css,
1609 fz_xml_doc *xml, fz_html_tree *tree, char **rtitle, int try_fictionbook, int is_mobi)
1610 {
1611 fz_xml *root, *node;
1612 char *title;
1613
1614 fz_css_match root_match, match;
1615 struct genstate g = {0};
1616
1617 g.pool = NULL;
1618 g.set = set;
1619 g.zip = zip;
1620 g.images = NULL;
1621 g.xml = xml;
1622 g.is_fb2 = 0;
1623 g.base_uri = base_uri;
1624 g.css = NULL;
1625 g.at_bol = 0;
1626 g.emit_white = 0;
1627 g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP;
1628 g.list_counter = 0;
1629 g.section_depth = 0;
1630 g.markup_dir = FZ_BIDI_LTR;
1631 g.markup_lang = FZ_LANG_UNSET;
1632 g.href = NULL;
1633 g.styles = NULL;
1634
1635 if (rtitle)
1636 *rtitle = NULL;
1637
1638 root = fz_xml_root(g.xml);
1639 g.css = fz_new_css(ctx);
1640
1641 #ifndef NDEBUG
1642 if (fz_atoi(getenv("FZ_DEBUG_XML")))
1643 fz_debug_xml(root, 0);
1644 #endif
1645
1646 fz_try(ctx)
1647 {
1648 if (try_fictionbook && fz_xml_find(root, "FictionBook"))
1649 {
1650 g.is_fb2 = 1;
1651 fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>");
1652 if (fz_use_document_css(ctx))
1653 fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1654 g.images = load_fb2_images(ctx, root);
1655 }
1656 else if (is_mobi)
1657 {
1658 g.is_fb2 = 0;
1659 fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
1660 fz_parse_css(ctx, g.css, mobi_default_css, "<default:mobi>");
1661 if (fz_use_document_css(ctx))
1662 html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1663 }
1664 else
1665 {
1666 g.is_fb2 = 0;
1667 fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
1668 if (fz_use_document_css(ctx))
1669 html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1670 }
1671
1672 if (user_css)
1673 {
1674 fz_parse_css(ctx, g.css, user_css, "<user>");
1675 fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css);
1676 }
1677 }
1678 fz_catch(ctx)
1679 {
1680 fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
1681 fz_drop_css(ctx, g.css);
1682 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1683 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1684 fz_report_error(ctx);
1685 fz_warn(ctx, "ignoring styles");
1686 g.css = fz_new_css(ctx);
1687 g.images = NULL;
1688 }
1689
1690 #ifndef NDEBUG
1691 if (fz_atoi(getenv("FZ_DEBUG_CSS")))
1692 fz_debug_css(ctx, g.css);
1693 #endif
1694
1695 fz_try(ctx)
1696 {
1697 fz_css_style style;
1698 int display;
1699
1700 fz_match_css_at_page(ctx, &root_match, g.css);
1701 fz_apply_css_style(ctx, g.set, &style, &root_match);
1702
1703 g.pool = tree->pool;
1704 g.markup_dir = DEFAULT_DIR;
1705 g.markup_lang = FZ_LANG_UNSET;
1706
1707 // Create root node
1708 tree->root = new_box(ctx, &g, NULL, BOX_BLOCK, &style);
1709 // TODO: transfer page margins out of this hacky box
1710
1711 tree->root->tag = ":root";
1712 tree->root->s.layout.em = 0;
1713 tree->root->s.layout.x = 0;
1714 tree->root->s.layout.y = 0;
1715 tree->root->s.layout.w = 0;
1716 tree->root->s.layout.b = 0;
1717
1718 // Create document node (html).
1719 fz_match_css(ctx, &match, &root_match, g.css, root);
1720 fz_apply_css_style(ctx, g.set, &style, &match);
1721 display = fz_get_css_match_display(&match);
1722 gen2_tag(ctx, &g, tree->root, root, &match, display, &style);
1723
1724 detect_directionality(ctx, g.pool, tree->root);
1725
1726 if (g.is_fb2)
1727 {
1728 node = fz_xml_find(root, "FictionBook");
1729 node = fz_xml_find_down(node, "description");
1730 node = fz_xml_find_down(node, "title-info");
1731 node = fz_xml_find_down(node, "book-title");
1732 if (rtitle)
1733 {
1734 title = fz_xml_text(fz_xml_down(node));
1735 if (title)
1736 *rtitle = fz_pool_strdup(ctx, g.pool, title);
1737 }
1738 }
1739 else
1740 {
1741 node = fz_xml_find(root, "html");
1742 node = fz_xml_find_down(node, "head");
1743 node = fz_xml_find_down(node, "title");
1744 if (rtitle)
1745 {
1746 title = fz_xml_text(fz_xml_down(node));
1747 if (title)
1748 *rtitle = fz_pool_strdup(ctx, g.pool, title);
1749 }
1750
1751 // Move html or body background-color to :root.
1752 move_background_color_up(ctx, &g, tree->root);
1753 }
1754 }
1755 fz_always(ctx)
1756 {
1757 fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
1758 fz_drop_css(ctx, g.css);
1759 }
1760 fz_catch(ctx)
1761 {
1762 if (rtitle)
1763 {
1764 fz_free(ctx, *rtitle);
1765 *rtitle = NULL;
1766 }
1767 fz_rethrow(ctx);
1768 }
1769 }
1770
/*
	CSS font sizes for the legacy <font size="N"> values 1 to 7,
	indexed by N - 1. Index 2 ("1em") is the unscaled size.
*/
static const char *mobi_font_size[] = {
	"0.67em",	/* size 1 */
	"0.83em",	/* size 2 */
	"1em",		/* size 3 */
	"1.17em",	/* size 4 */
	"1.33em",	/* size 5 */
	"1.5em",	/* size 6 */
	"1.67em",	/* size 7 */
};
1780
1781 static void
1782 patch_mobi_html(fz_context *ctx, fz_pool *pool, fz_xml *node)
1783 {
1784 fz_xml *down;
1785 char buf[500];
1786 while (node)
1787 {
1788 char *tag = fz_xml_tag(node);
1789 if (tag)
1790 {
1791 // Read MOBI attributes, convert to inline CSS style
1792 if (!strcmp(tag, "font"))
1793 {
1794 const char *size = fz_xml_att(node, "size");
1795 if (size)
1796 {
1797 if (!strcmp(size, "1")) size = mobi_font_size[0];
1798 else if (!strcmp(size, "2")) size = mobi_font_size[1];
1799 else if (!strcmp(size, "3")) size = mobi_font_size[2];
1800 else if (!strcmp(size, "4")) size = mobi_font_size[3];
1801 else if (!strcmp(size, "5")) size = mobi_font_size[4];
1802 else if (!strcmp(size, "6")) size = mobi_font_size[5];
1803 else if (!strcmp(size, "7")) size = mobi_font_size[6];
1804 else if (!strcmp(size, "+1")) size = mobi_font_size[3];
1805 else if (!strcmp(size, "+2")) size = mobi_font_size[4];
1806 else if (!strcmp(size, "+3")) size = mobi_font_size[5];
1807 else if (!strcmp(size, "+4")) size = mobi_font_size[6];
1808 else if (!strcmp(size, "+5")) size = mobi_font_size[6];
1809 else if (!strcmp(size, "+6")) size = mobi_font_size[6];
1810 else if (!strcmp(size, "-1")) size = mobi_font_size[1];
1811 else if (!strcmp(size, "-2")) size = mobi_font_size[0];
1812 else if (!strcmp(size, "-3")) size = mobi_font_size[0];
1813 else if (!strcmp(size, "-4")) size = mobi_font_size[0];
1814 else if (!strcmp(size, "-5")) size = mobi_font_size[0];
1815 else if (!strcmp(size, "-6")) size = mobi_font_size[0];
1816 fz_snprintf(buf, sizeof buf, "font-size:%s", size);
1817 fz_xml_add_att(ctx, pool, node, "style", buf);
1818 }
1819 }
1820 else
1821 {
1822 char *height = fz_xml_att(node, "height");
1823 char *width = fz_xml_att(node, "width");
1824 char *align = fz_xml_att(node, "align");
1825 if (height || width || align)
1826 {
1827 buf[0] = 0;
1828 if (height)
1829 {
1830 fz_strlcat(buf, "margin-top:", sizeof buf);
1831 fz_strlcat(buf, height, sizeof buf);
1832 fz_strlcat(buf, ";", sizeof buf);
1833 }
1834 if (width)
1835 {
1836 fz_strlcat(buf, "text-indent:", sizeof buf);
1837 fz_strlcat(buf, width, sizeof buf);
1838 fz_strlcat(buf, ";", sizeof buf);
1839 }
1840 if (align)
1841 {
1842 fz_strlcat(buf, "text-align:", sizeof buf);
1843 fz_strlcat(buf, align, sizeof buf);
1844 fz_strlcat(buf, ";", sizeof buf);
1845 }
1846 fz_xml_add_att(ctx, pool, node, "style", buf);
1847 }
1848 if (!strcmp(tag, "img"))
1849 {
1850 char *recindex = fz_xml_att(node, "recindex");
1851 if (recindex)
1852 fz_xml_add_att(ctx, pool, node, "src", recindex);
1853 }
1854 }
1855 }
1856
1857 down = fz_xml_down(node);
1858 if (down)
1859 patch_mobi_html(ctx, pool, down);
1860
1861 node = fz_xml_next(node);
1862 }
1863 }
1864
1865 static void
1866 fz_parse_html_tree(fz_context *ctx,
1867 fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
1868 int try_xml, int try_html5, fz_html_tree *tree, char **rtitle, int try_fictionbook, int patch_mobi)
1869 {
1870 fz_xml_doc *xml;
1871
1872 if (rtitle)
1873 *rtitle = NULL;
1874
1875 xml = parse_to_xml(ctx, buf, try_xml, try_html5);
1876
1877 if (patch_mobi)
1878 patch_mobi_html(ctx, xml->u.doc.pool, xml);
1879
1880 fz_try(ctx)
1881 xml_to_boxes(ctx, set, zip, base_uri, user_css, xml, tree, rtitle, try_fictionbook, patch_mobi);
1882 fz_always(ctx)
1883 fz_drop_xml(ctx, xml);
1884 fz_catch(ctx)
1885 fz_rethrow(ctx);
1886 }
1887
/*
	Allocate a new object of type TYPE through fz_new_html_tree_of_size()
	and label the allocation with the type name for Memento.
	The allocation is initialised through an fz_html_tree pointer, so
	TYPE is expected to start with an fz_html_tree.
*/
#define fz_new_derived_html_tree(CTX, TYPE, DROP) \
	((TYPE *)Memento_label( \
		fz_new_html_tree_of_size((CTX), sizeof(TYPE), (DROP)), \
		#TYPE))
1890
1891 static fz_html_tree *
1892 fz_new_html_tree_of_size(fz_context *ctx, size_t size, fz_store_drop_fn *drop)
1893 {
1894 fz_pool *pool = fz_new_pool(ctx);
1895 fz_html_tree *tree;
1896
1897 fz_try(ctx)
1898 {
1899 tree = fz_pool_alloc(ctx, pool, size);
1900 FZ_INIT_STORABLE(tree, 1, drop);
1901 tree->pool = pool;
1902 }
1903 fz_catch(ctx)
1904 {
1905 fz_drop_pool(ctx, pool);
1906 fz_rethrow(ctx);
1907 }
1908
1909 return tree;
1910 }
1911
1912 fz_html *
1913 fz_parse_html(fz_context *ctx,
1914 fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
1915 int try_xml, int try_html5, int patch_mobi)
1916 {
1917 fz_html *html = fz_new_derived_html_tree(ctx, fz_html, fz_drop_html_imp);
1918
1919 html->layout_w = 0;
1920 html->layout_h = 0;
1921 html->layout_em = 0;
1922
1923 fz_try(ctx)
1924 fz_parse_html_tree(ctx, set, zip, base_uri, buf, user_css, try_xml, try_html5, &html->tree, &html->title, 1, patch_mobi);
1925 fz_catch(ctx)
1926 {
1927 fz_drop_html(ctx, html);
1928 fz_rethrow(ctx);
1929 }
1930
1931 return html;
1932 }
1933
1934 typedef struct
1935 {
1936 int saved;
1937 fz_warning_cb *old;
1938 void *arg;
1939 fz_buffer *buffer;
1940 fz_context *ctx;
1941 } warning_save;
1942
1943 static void
1944 warn_to_buffer(void *user, const char *message)
1945 {
1946 warning_save *save = (warning_save *)user;
1947 fz_context *ctx = save->ctx;
1948
1949 fz_try(ctx)
1950 {
1951 fz_append_string(ctx, save->buffer, message);
1952 fz_append_byte(ctx, save->buffer, '\n');
1953 }
1954 fz_catch(ctx)
1955 {
1956 /* Silently swallow the error. */
1957 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1958 fz_report_error(ctx);
1959 }
1960 }
1961
1962 static void
1963 redirect_warnings_to_buffer(fz_context *ctx, fz_buffer *buf, warning_save *save)
1964 {
1965 save->saved = 1;
1966 save->old = fz_warning_callback(ctx, &save->arg);
1967 save->buffer = buf;
1968 save->ctx = ctx;
1969
1970 fz_flush_warnings(ctx);
1971 fz_set_warning_callback(ctx, warn_to_buffer, save);
1972 }
1973
1974 static void
1975 restore_warnings(fz_context *ctx, warning_save *save)
1976 {
1977 if (!save->saved)
1978 return;
1979
1980 fz_flush_warnings(ctx);
1981 fz_set_warning_callback(ctx, save->old, save->arg);
1982 }
1983
1984 fz_story *
1985 fz_new_story(fz_context *ctx, fz_buffer *buf, const char *user_css, float em, fz_archive *zip)
1986 {
1987 fz_story *story = fz_new_derived_html_tree(ctx, fz_story, fz_drop_story_imp);
1988 warning_save saved = { 0 };
1989 fz_buffer *local_buffer = NULL;
1990
1991 if (buf == NULL)
1992 {
1993 local_buffer = fz_new_buffer(ctx, 0);
1994 buf = local_buffer;
1995 }
1996
1997 fz_var(local_buffer);
1998 fz_var(saved);
1999
2000 fz_try(ctx)
2001 {
2002 story->zip = fz_keep_archive(ctx, zip);
2003 story->font_set = fz_new_html_font_set(ctx);
2004 story->em = em;
2005 story->user_css = user_css ? fz_strdup(ctx, user_css) : NULL;
2006 story->warnings = fz_new_buffer(ctx, 128);
2007 redirect_warnings_to_buffer(ctx, story->warnings, &saved);
2008 story->dom = parse_to_xml(ctx, buf, 0, 1);
2009 }
2010 fz_always(ctx)
2011 {
2012 restore_warnings(ctx, &saved);
2013 fz_drop_buffer(ctx, local_buffer);
2014 }
2015 fz_catch(ctx)
2016 {
2017 fz_drop_html_tree(ctx, &story->tree);
2018 fz_rethrow(ctx);
2019 }
2020
2021 return story;
2022 }
2023
2024 fz_html *
2025 fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
2026 {
2027 /* try as XML first, fall back to HTML5 */
2028 return fz_parse_html(ctx, set, zip, base_uri, buf, user_css, 1, 1, 0);
2029 }
2030
/* Print 'level' tab characters to stdout (nothing if level <= 0). */
static void indent(int level)
{
	int i;

	for (i = 0; i < level; i++)
		putchar('\t');
}
2036
2037 static void
2038 fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level)
2039 {
2040 fz_html_box *sbox = NULL;
2041 while (flow)
2042 {
2043 if (flow->box != sbox) {
2044 sbox = flow->box;
2045 indent(level);
2046 #ifndef NDEBUG
2047 printf("@style <%s> em=%g font='%s'", sbox->tag, sbox->s.layout.em, fz_font_name(ctx, sbox->style->font));
2048 #else
2049 printf("@style em=%g font='%s'", sbox->s.layout.em, fz_font_name(ctx, sbox->style->font));
2050 #endif
2051 if (fz_font_is_serif(ctx, sbox->style->font))
2052 printf(" serif");
2053 else
2054 printf(" sans");
2055 if (fz_font_is_monospaced(ctx, sbox->style->font))
2056 printf(" monospaced");
2057 if (fz_font_is_bold(ctx, sbox->style->font))
2058 printf(" bold");
2059 if (fz_font_is_italic(ctx, sbox->style->font))
2060 printf(" italic");
2061 if (sbox->style->small_caps)
2062 printf(" small-caps");
2063 printf("\n");
2064 }
2065
2066 indent(level);
2067 switch (flow->type) {
2068 case FLOW_WORD: printf("word "); break;
2069 case FLOW_SPACE: printf("space"); break;
2070 case FLOW_SBREAK: printf("sbrk "); break;
2071 case FLOW_SHYPHEN: printf("shy "); break;
2072 case FLOW_BREAK: printf("break"); break;
2073 case FLOW_IMAGE: printf("image"); break;
2074 case FLOW_ANCHOR: printf("anchor"); break;
2075 }
2076 // printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w);
2077 if (flow->type == FLOW_IMAGE)
2078 printf(" h=%g", flow->h);
2079 if (flow->type == FLOW_WORD)
2080 printf(" text='%s'", flow->content.text);
2081 printf("\n");
2082 if (flow->breaks_line) {
2083 indent(level);
2084 printf("*\n");
2085 }
2086
2087 flow = flow->next;
2088 }
2089 }
2090
2091 fz_structure fz_html_tag_to_structure(const char *tag)
2092 {
2093 if (!strcmp(tag, "body")) return FZ_STRUCTURE_DOCUMENT;
2094 if (!strcmp(tag, "div")) return FZ_STRUCTURE_DIV;
2095 if (!strcmp(tag, "span")) return FZ_STRUCTURE_SPAN;
2096 if (!strcmp(tag, "blockquote")) return FZ_STRUCTURE_BLOCKQUOTE;
2097 if (!strcmp(tag, "p")) return FZ_STRUCTURE_P;
2098 if (!strcmp(tag, "h1")) return FZ_STRUCTURE_H1;
2099 if (!strcmp(tag, "h2")) return FZ_STRUCTURE_H2;
2100 if (!strcmp(tag, "h3")) return FZ_STRUCTURE_H3;
2101 if (!strcmp(tag, "h4")) return FZ_STRUCTURE_H4;
2102 if (!strcmp(tag, "h5")) return FZ_STRUCTURE_H5;
2103 if (!strcmp(tag, "h6")) return FZ_STRUCTURE_H6;
2104 if (!strcmp(tag, "ol")) return FZ_STRUCTURE_LIST;
2105 if (!strcmp(tag, "ul")) return FZ_STRUCTURE_LIST;
2106 if (!strcmp(tag, "dl")) return FZ_STRUCTURE_LIST;
2107 if (!strcmp(tag, "li")) return FZ_STRUCTURE_LISTITEM;
2108 if (!strcmp(tag, "table")) return FZ_STRUCTURE_TABLE;
2109 if (!strcmp(tag, "tr")) return FZ_STRUCTURE_TR;
2110 if (!strcmp(tag, "th")) return FZ_STRUCTURE_TH;
2111 if (!strcmp(tag, "td")) return FZ_STRUCTURE_TD;
2112 if (!strcmp(tag, "thead")) return FZ_STRUCTURE_THEAD;
2113 if (!strcmp(tag, "tbody")) return FZ_STRUCTURE_TBODY;
2114 if (!strcmp(tag, "tfoot")) return FZ_STRUCTURE_TFOOT;
2115 return FZ_STRUCTURE_INVALID;
2116 }
2117
2118 static void
2119 fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level)
2120 {
2121 while (box)
2122 {
2123 indent(level);
2124 printf("box ");
2125 switch (box->type) {
2126 case BOX_BLOCK: printf("block"); break;
2127 case BOX_FLOW: printf("flow"); break;
2128 case BOX_INLINE: printf("inline"); break;
2129 case BOX_TABLE: printf("table"); break;
2130 case BOX_TABLE_ROW: printf("table-row"); break;
2131 case BOX_TABLE_CELL: printf("table-cell"); break;
2132 }
2133
2134 printf(" <%s>", box->tag);
2135 // printf(" em=%g", box->em);
2136 // printf(" x=%g y=%g w=%g b=%g", box->x, box->y, box->w, box->b);
2137
2138 if (box->is_first_flow)
2139 printf(" is-first-flow");
2140 if (box->list_item)
2141 printf(" list=%d", box->list_item);
2142 if (box->id)
2143 printf(" id=(%s)", box->id);
2144 if (box->href)
2145 printf(" href=(%s)", box->href);
2146 printf("\n");
2147
2148 if (box->type == BOX_BLOCK || box->type == BOX_TABLE) {
2149 indent(level+1);
2150 printf(">margin=(%g %g %g %g)\n", box->u.block.margin[0], box->u.block.margin[1], box->u.block.margin[2], box->u.block.margin[3]);
2151 //indent(level+1);
2152 //printf(">padding=(%g %g %g %g)\n", box->u.block.padding[0], box->u.block.padding[1], box->u.block.padding[2], box->u.block.padding[3]);
2153 //indent(level+1);
2154 //printf(">border=(%g %g %g %g)\n", box->u.block.border[0], box->u.block.border[1], box->u.block.border[2], box->u.block.border[3]);
2155 }
2156
2157 if (box->down)
2158 fz_debug_html_box(ctx, box->down, level + 1);
2159 if (box->type == BOX_FLOW) {
2160 indent(level+1);
2161 printf("flow\n");
2162 fz_debug_html_flow(ctx, box->u.flow.head, level + 2);
2163 }
2164
2165 box = box->next;
2166 }
2167 }
2168
2169 void
2170 fz_debug_html(fz_context *ctx, fz_html_box *box)
2171 {
2172 fz_debug_html_box(ctx, box, 0);
2173 }
2174
2175 static size_t
2176 fz_html_size(fz_context *ctx, fz_html *html)
2177 {
2178 return html ? fz_pool_size(ctx, html->tree.pool) : 0;
2179 }
2180
/*
	Magic to make html storable.

	Key under which a parsed fz_html is kept in the store: the pair
	(owning document pointer, chapter number).
*/
typedef struct {
	/* Reference count, managed through fz_keep_imp / fz_drop_imp. */
	int refs;
	/* Owning document; only ever compared by pointer identity. */
	void *doc;
	/* Chapter number the html belongs to. */
	int chapter_num;
} fz_html_key;
2187
2188 static int
2189 fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_)
2190 {
2191 fz_html_key *key = (fz_html_key *)key_;
2192 hash->u.pi.ptr = key->doc;
2193 hash->u.pi.i = key->chapter_num;
2194 return 1;
2195 }
2196
2197 static void *
2198 fz_keep_html_key(fz_context *ctx, void *key_)
2199 {
2200 fz_html_key *key = (fz_html_key *)key_;
2201 return fz_keep_imp(ctx, key, &key->refs);
2202 }
2203
2204 static void
2205 fz_drop_html_key(fz_context *ctx, void *key_)
2206 {
2207 fz_html_key *key = (fz_html_key *)key_;
2208 if (fz_drop_imp(ctx, key, &key->refs))
2209 {
2210 fz_free(ctx, key);
2211 }
2212 }
2213
2214 static int
2215 fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_)
2216 {
2217 fz_html_key *k0 = (fz_html_key *)k0_;
2218 fz_html_key *k1 = (fz_html_key *)k1_;
2219 return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num;
2220 }
2221
2222 static void
2223 fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_)
2224 {
2225 fz_html_key *key = (fz_html_key *)key_;
2226 fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num);
2227 }
2228
/*
	Store type descriptor for fz_html items keyed by fz_html_key.
	It bundles the type name with the key callbacks defined above and is
	the descriptor passed to fz_store_item, fz_find_item and
	fz_filter_store in this file.
*/
static const fz_store_type fz_html_store_type =
{
	"fz_html",
	fz_make_hash_html_key,
	fz_keep_html_key,
	fz_drop_html_key,
	fz_cmp_html_key,
	fz_format_html_key,
	/* NOTE(review): last slot left unset; presumably an optional
	 * callback of fz_store_type -- confirm against its declaration. */
	NULL
};
2239
2240 fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter)
2241 {
2242 fz_html_key *key = NULL;
2243 fz_html *other_html;
2244
2245 /* Stick the parsed html in the store */
2246 fz_var(key);
2247
2248 fz_try(ctx)
2249 {
2250 key = fz_malloc_struct(ctx, fz_html_key);
2251 key->refs = 1;
2252 key->doc = doc;
2253 key->chapter_num = chapter;
2254 other_html = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type);
2255 if (other_html)
2256 {
2257 fz_drop_html(ctx, html);
2258 html = other_html;
2259 }
2260 }
2261 fz_always(ctx)
2262 fz_drop_html_key(ctx, key);
2263 fz_catch(ctx)
2264 {
2265 /* Do nothing */
2266 }
2267
2268 return html;
2269 }
2270
2271 fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter)
2272 {
2273 fz_html_key key;
2274
2275 key.refs = 1;
2276 key.doc = doc;
2277 key.chapter_num = chapter;
2278 return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type);
2279 }
2280
2281 static int
2282 html_filter_store(fz_context *ctx, void *doc, void *key_)
2283 {
2284 fz_html_key *key = (fz_html_key *)key_;
2285
2286 return (doc == key->doc);
2287 }
2288
2289 void fz_purge_stored_html(fz_context *ctx, void *doc)
2290 {
2291 fz_filter_store(ctx, html_filter_store, doc, &fz_html_store_type);
2292 }
2293
2294 static void
2295 convert_to_boxes(fz_context *ctx, fz_story *story)
2296 {
2297 warning_save saved = { 0 };
2298
2299 if (story->dom == NULL)
2300 return;
2301
2302 fz_var(saved);
2303
2304 fz_try(ctx)
2305 {
2306 redirect_warnings_to_buffer(ctx, story->warnings, &saved);
2307 xml_to_boxes(ctx, story->font_set, story->zip, ".", story->user_css, story->dom, &story->tree, NULL, 0, 0);
2308 }
2309 fz_always(ctx)
2310 {
2311 fz_drop_xml(ctx, story->dom);
2312 story->dom = NULL;
2313 restore_warnings(ctx, &saved);
2314 }
2315 fz_catch(ctx)
2316 fz_rethrow(ctx);
2317 }
2318
2319 int fz_place_story(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled)
2320 {
2321 return fz_place_story_flags(ctx, story, where, filled, 0);
2322 }
2323
2324 int fz_place_story_flags(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled, int flags)
2325 {
2326 float w, h;
2327
2328 if (filled)
2329 *filled = fz_empty_rect;
2330
2331 if (story == NULL || story->complete)
2332 return 0;
2333
2334 /* Convert from XML to box model on the first attempt to place.
2335 * The DOM is unusable from here on in. */
2336 convert_to_boxes(ctx, story);
2337
2338 w = where.x1 - where.x0;
2339 h = where.y1 - where.y0;
2340 /* Confusingly, we call the layout using restart_draw, not restart_place,
2341 * because we don't want to destroy the current values in restart_place
2342 * in case we have to retry later. This means the values are left in
2343 * the correct struct though! */
2344 story->restart_draw.start = story->restart_place.start;
2345 story->restart_draw.start_flow = story->restart_place.start_flow;
2346 story->restart_draw.end = NULL;
2347 story->restart_draw.end_flow = NULL;
2348 story->restart_draw.reason = FZ_HTML_RESTART_REASON_NONE;
2349 story->restart_draw.flags = flags;
2350 story->bbox = where;
2351 fz_restartable_layout_html(ctx, &story->tree, where.x0, where.y0, w, h, story->em, &story->restart_draw);
2352 story->restart_draw.start = story->restart_place.start;
2353 story->restart_draw.start_flow = story->restart_place.start_flow;
2354
2355 if (filled)
2356 {
2357 fz_html_box *b = story->tree.root;
2358 filled->x0 = b->s.layout.x - b->u.block.margin[L] - b->u.block.border[L] - b->u.block.padding[L];
2359 filled->x1 = b->s.layout.w + b->u.block.margin[R] + b->u.block.border[R] + b->u.block.padding[R] + b->s.layout.x;
2360 filled->y0 = b->s.layout.y - b->u.block.margin[T] - b->u.block.border[T] - b->u.block.padding[T];
2361 filled->y1 = b->s.layout.b + b->u.block.margin[B] + b->u.block.border[B] + b->u.block.padding[B];
2362 }
2363
2364 #ifndef NDEBUG
2365 if (fz_atoi(getenv("FZ_DEBUG_HTML")))
2366 fz_debug_html(ctx, story->tree.root);
2367 #endif
2368
2369 if (story->restart_draw.end == NULL)
2370 return FZ_HTML_RESTART_REASON_NONE;
2371 if (story->restart_draw.reason == FZ_HTML_RESTART_REASON_LINE_WIDTH)
2372 return FZ_HTML_RESTART_REASON_LINE_WIDTH;
2373 return FZ_HTML_RESTART_REASON_LINE_HEIGHT;
2374 }
2375
2376 const char *
2377 fz_story_warnings(fz_context *ctx, fz_story *story)
2378 {
2379 unsigned char *data;
2380
2381 if (!story)
2382 return NULL;
2383
2384 convert_to_boxes(ctx, story);
2385
2386 fz_terminate_buffer(ctx, story->warnings);
2387
2388 if (fz_buffer_storage(ctx, story->warnings, &data) == 0)
2389 return NULL;
2390
2391 return (const char *)data;
2392 }