comparison mupdf-source/source/html/html-imp.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #ifndef SOURCE_HTML_IMP_H
24 #define SOURCE_HTML_IMP_H
25
26 #include "mupdf/fitz.h"
27 #include "mupdf/html.h"
28
29 #include "../fitz/xml-imp.h"
30
/* Forward declarations: font faces, the box tree, flow content and the
 * nodes of the style splay tree. */
typedef struct fz_html_font_face_s	fz_html_font_face;
typedef struct fz_html_box_s		fz_html_box;
typedef struct fz_html_flow_s		fz_html_flow;
typedef struct fz_css_style_splay_s	fz_css_style_splay;

/* Forward declarations: stylesheet, its rules, match state and style record. */
typedef struct fz_css_s			fz_css;
typedef struct fz_css_rule_s		fz_css_rule;
typedef struct fz_css_match_s		fz_css_match;
typedef struct fz_css_style_s		fz_css_style;

/* Forward declarations: the pieces a parsed rule is built from. */
typedef struct fz_css_selector_s	fz_css_selector;
typedef struct fz_css_condition_s	fz_css_condition;
typedef struct fz_css_property_s	fz_css_property;
typedef struct fz_css_value_s		fz_css_value;
typedef struct fz_css_number_s		fz_css_number;
typedef struct fz_css_color_s		fz_css_color;
47
48 struct fz_html_font_face_s
49 {
50 char *family;
51 int is_bold;
52 int is_italic;
53 int is_small_caps;
54 fz_font *font;
55 char *src;
56 fz_html_font_face *next;
57 };
58
59 struct fz_html_font_set_s
60 {
61 fz_font *fonts[12]; /* Times, Helvetica, Courier in R,I,B,BI */
62 fz_html_font_face *custom;
63 };
64
/* Largest Unicode code point. */
#define UCS_MAX 0x10ffff

/*
	CSS token/value type codes. They are numbered from just above
	UCS_MAX, so none of them can collide with a Unicode code point
	value stored in the same integer.
*/
enum
{
	CSS_KEYWORD = UCS_MAX+1,
	CSS_HASH,
	CSS_STRING,
	CSS_NUMBER,
	CSS_LENGTH,
	CSS_PERCENT,
	CSS_URI
};
77
78 struct fz_css_s
79 {
80 fz_pool *pool;
81 fz_css_rule *rule;
82 };
83
84 struct fz_css_rule_s
85 {
86 fz_css_selector *selector;
87 fz_css_property *declaration;
88 fz_css_rule *next;
89 int loaded;
90 };
91
92 struct fz_css_selector_s
93 {
94 char *name;
95 int combine;
96 fz_css_condition *cond;
97 fz_css_selector *left;
98 fz_css_selector *right;
99 fz_css_selector *next;
100 };
101
102 struct fz_css_condition_s
103 {
104 int type;
105 char *key;
106 char *val;
107 fz_css_condition *next;
108 };
109
110 struct fz_css_property_s
111 {
112 int name;
113 fz_css_value *value;
114 short spec;
115 short important;
116 fz_css_property *next;
117 };
118
119 struct fz_css_value_s
120 {
121 int type;
122 char *data;
123 fz_css_value *args; /* function arguments */
124 fz_css_value *next;
125 };
126
/*
	Property identifiers.

	Entries before NUM_PROPERTIES are the real properties; they are
	dense from zero so they can index arrays of NUM_PROPERTIES
	elements. Entries after it are short-hands.
*/
enum
{
	/* Background and borders. */
	PRO_BACKGROUND_COLOR,
	PRO_BORDER_BOTTOM_COLOR,
	PRO_BORDER_BOTTOM_STYLE,
	PRO_BORDER_BOTTOM_WIDTH,
	PRO_BORDER_LEFT_COLOR,
	PRO_BORDER_LEFT_STYLE,
	PRO_BORDER_LEFT_WIDTH,
	PRO_BORDER_RIGHT_COLOR,
	PRO_BORDER_RIGHT_STYLE,
	PRO_BORDER_RIGHT_WIDTH,
	PRO_BORDER_TOP_COLOR,
	PRO_BORDER_TOP_STYLE,
	PRO_BORDER_TOP_WIDTH,
	PRO_BORDER_SPACING,

	PRO_COLOR,
	PRO_DIRECTION,
	PRO_DISPLAY,

	/* Fonts. */
	PRO_FONT,
	PRO_FONT_FAMILY,
	PRO_FONT_SIZE,
	PRO_FONT_STYLE,
	PRO_FONT_VARIANT,
	PRO_FONT_WEIGHT,

	PRO_HEIGHT,
	PRO_LEADING,
	PRO_LETTER_SPACING,
	PRO_LINE_HEIGHT,

	/* Lists. */
	PRO_LIST_STYLE_IMAGE,
	PRO_LIST_STYLE_POSITION,
	PRO_LIST_STYLE_TYPE,

	/* Margins. */
	PRO_MARGIN_BOTTOM,
	PRO_MARGIN_LEFT,
	PRO_MARGIN_RIGHT,
	PRO_MARGIN_TOP,

	PRO_ORPHANS,
	PRO_OVERFLOW_WRAP,

	/* Padding. */
	PRO_PADDING_BOTTOM,
	PRO_PADDING_LEFT,
	PRO_PADDING_RIGHT,
	PRO_PADDING_TOP,

	PRO_PAGE_BREAK_AFTER,
	PRO_PAGE_BREAK_BEFORE,
	PRO_QUOTES,
	PRO_SRC,

	/* Text. */
	PRO_TEXT_ALIGN,
	PRO_TEXT_DECORATION,
	PRO_TEXT_FILL_COLOR,
	PRO_TEXT_INDENT,
	PRO_TEXT_TRANSFORM,
	PRO_TEXT_STROKE_WIDTH,
	PRO_TEXT_STROKE_COLOR,

	PRO_VERTICAL_ALIGN,
	PRO_VISIBILITY,
	PRO_WHITE_SPACE,
	PRO_WIDOWS,
	PRO_WIDTH,
	PRO_WORD_SPACING,

	/* Number of real properties. */
	NUM_PROPERTIES,

	/* Short-hand properties (always expanded when applied, never used as is): */
	PRO_BORDER,
	PRO_BORDER_BOTTOM,
	PRO_BORDER_COLOR,
	PRO_BORDER_LEFT,
	PRO_BORDER_RIGHT,
	PRO_BORDER_STYLE,
	PRO_BORDER_TOP,
	PRO_BORDER_WIDTH,
	PRO_LIST_STYLE,
	PRO_MARGIN,
	PRO_PADDING
};
203
204 struct fz_css_match_s
205 {
206 fz_css_match *up;
207 short spec[NUM_PROPERTIES];
208 fz_css_value *value[NUM_PROPERTIES];
209 };
210
/* Display kinds. */
enum
{
	DIS_NONE,
	DIS_BLOCK,
	DIS_INLINE,
	DIS_LIST_ITEM,
	DIS_INLINE_BLOCK,
	DIS_TABLE,
	DIS_TABLE_GROUP,
	DIS_TABLE_ROW,
	DIS_TABLE_CELL
};

/* Positioning schemes. */
enum
{
	POS_STATIC,
	POS_RELATIVE,
	POS_ABSOLUTE,
	POS_FIXED
};

/* Text alignment. */
enum
{
	TA_LEFT,
	TA_RIGHT,
	TA_CENTER,
	TA_JUSTIFY
};

/* Vertical alignment. */
enum
{
	VA_BASELINE,
	VA_SUB,
	VA_SUPER,
	VA_TOP,
	VA_BOTTOM,
	VA_TEXT_TOP,
	VA_TEXT_BOTTOM
};

/* Border styles. */
enum
{
	BS_NONE,
	BS_SOLID
};

/* Visibility. */
enum
{
	V_VISIBLE,
	V_HIDDEN,
	V_COLLAPSE
};

/* Page break behaviour. */
enum
{
	PB_AUTO,
	PB_ALWAYS,
	PB_AVOID,
	PB_LEFT,
	PB_RIGHT
};

/* Text decoration. */
enum
{
	TD_NONE,
	TD_UNDERLINE,
	TD_LINE_THROUGH
};
219
/*
	White-space handling. Three independent flag bits, followed by
	the named modes expressed as combinations of those bits.
*/
enum
{
	/* Flag bits. */
	WS_COLLAPSE = 1,
	WS_ALLOW_BREAK_SPACE = 2,
	WS_FORCE_BREAK_NEWLINE = 4,

	/* Named modes. */
	WS_NORMAL = WS_COLLAPSE | WS_ALLOW_BREAK_SPACE,
	WS_PRE = WS_FORCE_BREAK_NEWLINE,
	WS_NOWRAP = WS_COLLAPSE,
	WS_PRE_WRAP = WS_ALLOW_BREAK_SPACE | WS_FORCE_BREAK_NEWLINE,
	WS_PRE_LINE = WS_COLLAPSE | WS_ALLOW_BREAK_SPACE | WS_FORCE_BREAK_NEWLINE
};
230
/* List marker styles. There are 16 values (0..15). */
enum
{
	LST_NONE,

	/* Bullets. */
	LST_DISC,
	LST_CIRCLE,
	LST_SQUARE,

	/* Numeric. */
	LST_DECIMAL,
	LST_DECIMAL_ZERO,

	/* Alphabetic and additive systems, lower case (LC) / upper case (UC). */
	LST_LC_ROMAN,
	LST_UC_ROMAN,
	LST_LC_GREEK,
	LST_UC_GREEK,
	LST_LC_LATIN,
	LST_UC_LATIN,
	LST_LC_ALPHA,
	LST_UC_ALPHA,
	LST_ARMENIAN,
	LST_GEORGIAN
};
241
/* Values for overflow-wrap. We do not support 'anywhere'. */
enum
{
	OVERFLOW_WRAP_NORMAL = 0,
	OVERFLOW_WRAP_BREAK_WORD = 1
};
247
/* Unit tags for fz_css_number_s.unit; each tag is a character code. */
enum
{
	N_NUMBER = 'u',
	N_LENGTH = 'p',
	N_SCALE = 'm',
	N_PERCENT = '%',
	N_AUTO = 'a',
	N_UNDEFINED = 'x'
};
249
/* A numeric CSS quantity: a magnitude together with its unit tag. */
struct fz_css_number_s
{
	float value;	/* magnitude */
	int unit;	/* presumably one of the N_* tags -- TODO confirm */
};
255
/* A colour with four 8-bit components, stored in r, g, b, a order. */
struct fz_css_color_s
{
	unsigned char r;
	unsigned char g;
	unsigned char b;
	unsigned char a;
};
260
261 struct fz_css_style_s
262 {
263 fz_css_number font_size;
264 fz_css_number width, height;
265 fz_css_number margin[4];
266 fz_css_number padding[4];
267 fz_css_number border_width[4];
268 fz_css_number border_spacing;
269 fz_css_number text_indent;
270 fz_css_number text_stroke_width;
271 unsigned int visibility : 2;
272 unsigned int white_space : 3;
273 unsigned int text_align : 2;
274 unsigned int vertical_align : 3;
275 unsigned int list_style_type : 4;
276 unsigned int page_break_before : 3;
277 unsigned int page_break_after : 3;
278 unsigned int border_style_0 : 1;
279 unsigned int border_style_1 : 1;
280 unsigned int border_style_2 : 1;
281 unsigned int border_style_3 : 1;
282 unsigned int small_caps : 1;
283 unsigned int text_decoration: 2;
284 unsigned int overflow_wrap : 1;
285 /* Ensure the extra bits in the bitfield are copied
286 * on structure copies. */
287 unsigned int blank : 3;
288 fz_css_number line_height;
289 fz_css_number leading;
290 fz_css_color background_color;
291 fz_css_color border_color[4];
292 fz_css_color color;
293 fz_css_color text_fill_color;
294 fz_css_color text_stroke_color;
295 fz_font *font;
296 };
297
298 struct fz_css_style_splay_s {
299 fz_css_style style;
300 fz_css_style_splay *lt;
301 fz_css_style_splay *gt;
302 fz_css_style_splay *up;
303 };
304
/* Box kinds (stored in fz_html_box_s.type). */
enum
{
	/* block-level: contains block, break, flow, and table boxes */
	BOX_BLOCK,
	/* block-level: contains only inline boxes */
	BOX_FLOW,
	/* inline-level: contains only inline boxes */
	BOX_INLINE,
	/* table: contains table-row */
	BOX_TABLE,
	/* table-row: contains table-cell */
	BOX_TABLE_ROW,
	/* table-cell: contains block */
	BOX_TABLE_CELL
};
314
315 typedef struct
316 {
317 fz_storable storable;
318 fz_pool *pool; /* pool allocator for this html tree */
319 fz_html_box *root;
320 } fz_html_tree;
321
322 struct fz_html_s
323 {
324 /* fz_html is derived from fz_html_tree, so must start with that. */
325 /* Arguably 'tree' should be called 'super'. */
326 fz_html_tree tree;
327
328 float page_w, page_h;
329 float layout_w, layout_h, layout_em;
330 float page_margin[4];
331 char *title;
332 };
333
/* Reason codes stored in fz_html_restarter.reason. */
typedef enum
{
	FZ_HTML_RESTART_REASON_NONE = 0,
	FZ_HTML_RESTART_REASON_LINE_HEIGHT = 1,
	FZ_HTML_RESTART_REASON_LINE_WIDTH = 2
}
fz_html_restart_reason;
340
/* Bit values for fz_html_restarter.flags. */
enum
{
	FZ_HTML_RESTARTER_FLAGS_NO_OVERFLOW = 1
};
345
346 typedef struct {
347 /* start will be filled in on entry with the first node to start
348 * operation on. NULL means start 'immediately'. As we traverse
349 * the tree, once we reach the node to start on, we set this to
350 * NULL, hence if 'start != NULL' then we are still skipping to
351 * find the starting node. */
352 fz_html_box *start;
353
354 /* If start is a BOX_FLOW, then start_flow will be the flow entry
355 * at which we should start. */
356 fz_html_flow *start_flow;
357
358
359 /* end should be NULL on entry. On exit, if it's NULL, then we
360 * finished. Otherwise, this is where we should restart the
361 * process the next time. */
362 fz_html_box *end;
363
364 /* If end is a BOX_FLOW, then end_flow will be the flow entry at which
365 * we should restart next time. */
366 fz_html_flow *end_flow;
367
368
369 /* Workspace used on the traversal of the tree to store a good place
370 * to restart. Typically this will be set to an enclosing box with
371 * a border, so that if we then fail to put any content into the box
372 * we'll elide the entire box/border, not output an empty one. */
373 fz_html_box *potential;
374
375 fz_html_restart_reason reason;
376
377 int flags;
378 } fz_html_restarter;
379
380 struct fz_story
381 {
382 /* fz_story is derived from fz_html_tree, so must start with */
383 /* that. Arguably 'tree' should be called 'super'. */
384 fz_html_tree tree;
385
386 /* The user_css (or NULL) */
387 char *user_css;
388
389 /* The HTML story as XML nodes with a DOM */
390 fz_xml *dom;
391
392 /* The fontset for the content. */
393 fz_html_font_set *font_set;
394
395 /* restart_place holds the start position for the next place.
396 * This is updated by draw. */
397 fz_html_restarter restart_place;
398
399 /* restart_draw holds the start position for the next draw.
400 * This is updated by place. */
401 fz_html_restarter restart_draw;
402
403 /* complete is set true when all the story has been placed and
404 * drawn. */
405 int complete;
406
407 /* The last bbox we laid out for. Used for making a clipping
408 * rectangle. */
409 fz_rect bbox;
410
411 /* The default 'em' size. */
412 float em;
413
414 /* Collected parsing warnings. */
415 fz_buffer *warnings;
416
417 /* Rectangle layout count. */
418 int rect_count;
419
420 /* Archive from which to load any resources. */
421 fz_archive *zip;
422 };
423
424 struct fz_html_box_s
425 {
426 unsigned int type : 3;
427 unsigned int is_first_flow : 1; /* for text-indent */
428 unsigned int markup_dir : 2;
429 unsigned int heading : 3;
430 unsigned int list_item : 21;
431
432 fz_html_box *up, *down, *next;
433
434 const char *tag, *id, *href;
435 const fz_css_style *style;
436
437 union {
438 /* Only needed during build stage */
439 struct {
440 fz_html_box *last_child;
441 fz_html_flow **flow_tail;
442 } build;
443
444 /* Only needed during layout */
445 struct {
446 float x, y, w, b; /* content */
447 float em, baseline;
448 } layout;
449 } s;
450
451 union {
452 /* Only BOX_FLOW use the following */
453 struct {
454 fz_html_flow *head;
455 } flow;
456
457 /* Only BOX_{BLOCK,TABLE,TABLE_ROW,TABLE_CELL} use the following */
458 struct {
459 float margin[4]; // TODO: is margin needed post layout?
460 float border[4];
461 float padding[4];
462 } block;
463 } u;
464 };
465
466 static inline int
467 fz_html_box_has_boxes(fz_html_box *box)
468 {
469 return (box->type == BOX_BLOCK || box->type == BOX_TABLE || box->type == BOX_TABLE_ROW || box->type == BOX_TABLE_CELL);
470 }
471
/* Flow node kinds (stored in fz_html_flow_s.type, which is 3 bits wide). */
enum
{
	FLOW_WORD = 0,
	FLOW_SPACE = 1,
	FLOW_BREAK = 2,
	FLOW_IMAGE = 3,
	FLOW_SBREAK = 4,
	FLOW_SHYPHEN = 5,
	FLOW_ANCHOR = 6
};
482
483 struct fz_html_flow_s
484 {
485 /* What type of node */
486 unsigned int type : 3;
487
488 /* Whether this should expand during justification */
489 unsigned int expand : 1;
490
491 /* Whether this node is currently taken as a line break */
492 unsigned int breaks_line : 1;
493
494 /* Whether this word node can be split or consists of a single glyph cluster */
495 unsigned int atomic : 1;
496
497 /* Whether lines may be broken before this word for overflow-wrap: word-break */
498 unsigned int overflow_wrap : 1;
499
500 /* Direction setting for text - UAX#9 says 125 is the max */
501 unsigned int bidi_level : 7;
502
503 /* The script detected by the bidi code. */
504 unsigned int script : 8;
505
506 /* Whether the markup specifies a given language. */
507 unsigned short markup_lang;
508
509 float x, y, w, h;
510 fz_html_box *box; /* for style and em */
511 fz_html_flow *next;
512 union {
513 char text[1];
514 fz_image *image;
515 } content;
516 };
517
518
519 fz_css *fz_new_css(fz_context *ctx);
520 void fz_parse_css(fz_context *ctx, fz_css *css, const char *source, const char *file);
521 fz_css_property *fz_parse_css_properties(fz_context *ctx, fz_pool *pool, const char *source);
522 void fz_drop_css(fz_context *ctx, fz_css *css);
523 void fz_debug_css(fz_context *ctx, fz_css *css);
524 const char *fz_css_property_name(int name);
525
526 void fz_match_css(fz_context *ctx, fz_css_match *match, fz_css_match *up, fz_css *css, fz_xml *node);
527 void fz_match_css_at_page(fz_context *ctx, fz_css_match *match, fz_css *css);
528
529 int fz_get_css_match_display(fz_css_match *node);
530 void fz_default_css_style(fz_context *ctx, fz_css_style *style);
531 void fz_apply_css_style(fz_context *ctx, fz_html_font_set *set, fz_css_style *style, fz_css_match *match);
532
533 /*
534 Lookup style in the splay tree, returning a pointer
535 to the found instance if there is one, creating and
536 inserting (and moving to root) one if there is not.
537 */
538 const fz_css_style *fz_css_enlist(fz_context *ctx, const fz_css_style *style, fz_css_style_splay **tree, fz_pool *pool);
539
540 float fz_from_css_number(fz_css_number number, float em, float percent_value, float auto_value);
541 float fz_from_css_number_scale(fz_css_number number, float scale);
542 int fz_css_number_defined(fz_css_number number);
543
544 fz_html_font_set *fz_new_html_font_set(fz_context *ctx);
545 void fz_add_html_font_face(fz_context *ctx, fz_html_font_set *set,
546 const char *family, int is_bold, int is_italic, int is_small_caps, const char *src, fz_font *font);
547 fz_font *fz_load_html_font(fz_context *ctx, fz_html_font_set *set, const char *family, int is_bold, int is_italic, int is_small_caps);
548 void fz_drop_html_font_set(fz_context *ctx, fz_html_font_set *htx);
549
550 void fz_add_css_font_faces(fz_context *ctx, fz_html_font_set *set, fz_archive *dir, const char *base_uri, fz_css *css);
551
552 void fz_layout_html(fz_context *ctx, fz_html *html, float w, float h, float em);
553 void fz_draw_html(fz_context *ctx, fz_device *dev, fz_matrix ctm, fz_html *html, int page);
554 fz_outline *fz_load_html_outline(fz_context *ctx, fz_html *node);
555
556 float fz_find_html_target(fz_context *ctx, fz_html *html, const char *id);
557 fz_link *fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *base_uri);
558 fz_html *fz_keep_html(fz_context *ctx, fz_html *html);
559 void fz_drop_html(fz_context *ctx, fz_html *html);
560 fz_bookmark fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page);
561 int fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark);
562 void fz_debug_html(fz_context *ctx, fz_html_box *box);
563
564 fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter);
565 fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter);
566 void fz_purge_stored_html(fz_context *ctx, void *doc);
567
568 void fz_restartable_layout_html(fz_context *ctx, fz_html_tree *tree, float start_x, float start_y, float page_w, float page_h, float em, fz_html_restarter *restart);
569
570 fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset);
571
572 fz_archive *fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi);
573
574 fz_structure fz_html_tag_to_structure(const char *tag);
575
576 fz_html *fz_parse_html(fz_context *ctx,
577 fz_html_font_set *set, fz_archive *dir, const char *base_uri, fz_buffer *buf, const char *user_css,
578 int try_xml, int try_html5, int patch_mobi);
579
580 fz_buffer *fz_txt_buffer_to_html(fz_context *ctx, fz_buffer *in);
581
582
583 #endif