comparison mupdf-source/source/html/html-outline.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 // Copyright (C) 2004-2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "html-imp.h"
25
26 #include <string.h>
27
/*
 * Side indices. In this file T and L index html->page_margin[] for the
 * vertical and horizontal link offsets; R and B are not used here
 * (presumably top/right/bottom/left -- confirm against html-imp.h).
 */
enum {
	T = 0,
	R = 1,
	B = 2,
	L = 3
};
29
/*
 * Classify a link target.
 *
 * Returns 0 when uri starts with "<scheme>://", i.e. it is an absolute
 * (external) URI, and 1 otherwise (fragment, relative path, empty string).
 *
 * The scheme is matched using the RFC 3986 grammar:
 *     ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
 * so upper-case schemes ("HTTP://") and schemes such as "svn+ssh://" are
 * recognised as external instead of being treated as relative paths.
 * A string with an empty scheme ("://x") is not a valid absolute URI and
 * is reported as internal.
 *
 * Plain ASCII range checks are used on purpose: they are independent of
 * the locale and of the signedness of char.
 */
static int is_internal_uri(const char *uri)
{
	const char *p = uri;

	/* A scheme must begin with a letter. */
	if (!((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z')))
		return 1;
	++p;

	/* Skip the remaining scheme characters. */
	while ((*p >= 'a' && *p <= 'z') ||
		(*p >= 'A' && *p <= 'Z') ||
		(*p >= '0' && *p <= '9') ||
		*p == '+' || *p == '-' || *p == '.')
		++p;

	/* Only "scheme://" counts as external; the && chain stops at the NUL. */
	if (p[0] == ':' && p[1] == '/' && p[2] == '/')
		return 0;
	return 1;
}
38
39 static fz_link *load_link_flow(fz_context *ctx, fz_html_flow *flow, fz_link *head, int page, float page_h, const char *dir, const char *file)
40 {
41 fz_link *link;
42 fz_html_flow *next;
43 char path[2048];
44 fz_rect bbox;
45 const char *dest;
46 const char *href;
47 float end;
48
49 float page_y0 = page * page_h;
50 float page_y1 = (page + 1) * page_h;
51
52 while (flow)
53 {
54 next = flow->next;
55 if (flow->y >= page_y0 && flow->y <= page_y1)
56 {
57 href = flow->box->href;
58 if (href)
59 {
60 /* Coalesce contiguous flow boxes into one link node */
61 end = flow->x + flow->w;
62 while (next &&
63 next->y == flow->y &&
64 next->h == flow->h &&
65 next->box->href == href)
66 {
67 end = next->x + next->w;
68 next = next->next;
69 }
70
71 bbox.x0 = flow->x;
72 bbox.y0 = flow->y - page * page_h;
73 bbox.x1 = end;
74 bbox.y1 = bbox.y0 + flow->h;
75 if (flow->type != FLOW_IMAGE)
76 {
77 /* flow->y is the baseline, adjust bbox appropriately */
78 bbox.y0 -= 0.8f * flow->h;
79 bbox.y1 -= 0.8f * flow->h;
80 }
81
82 if (is_internal_uri(href))
83 {
84 if (href[0] == '#')
85 {
86 fz_strlcpy(path, file, sizeof path);
87 fz_strlcat(path, href, sizeof path);
88 }
89 else
90 {
91 fz_strlcpy(path, dir, sizeof path);
92 fz_strlcat(path, "/", sizeof path);
93 fz_strlcat(path, href, sizeof path);
94 }
95 fz_urldecode(path);
96 fz_cleanname(path);
97
98 dest = path;
99 }
100 else
101 {
102 dest = href;
103 }
104
105 link = fz_new_derived_link(ctx, fz_link, bbox, dest);
106 link->next = head;
107 head = link;
108 }
109 }
110 flow = next;
111 }
112 return head;
113 }
114
115 static fz_link *load_link_box(fz_context *ctx, fz_html_box *box, fz_link *head, int page, float page_h, const char *dir, const char *file)
116 {
117 while (box)
118 {
119 if (box->type == BOX_FLOW)
120 head = load_link_flow(ctx, box->u.flow.head, head, page, page_h, dir, file);
121 if (box->down)
122 head = load_link_box(ctx, box->down, head, page, page_h, dir, file);
123 box = box->next;
124 }
125 return head;
126 }
127
128 fz_link *
129 fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *file)
130 {
131 fz_link *link, *head;
132 char dir[2048];
133 fz_dirname(dir, file, sizeof dir);
134
135 head = load_link_box(ctx, html->tree.root, NULL, page, html->page_h, dir, file);
136
137 for (link = head; link; link = link->next)
138 {
139 /* Adjust for page margins */
140 link->rect.x0 += html->page_margin[L];
141 link->rect.x1 += html->page_margin[L];
142 link->rect.y0 += html->page_margin[T];
143 link->rect.y1 += html->page_margin[T];
144 }
145
146 return head;
147 }
148
149 static fz_html_flow *
150 find_first_content(fz_html_box *box)
151 {
152 while (box)
153 {
154 if (box->type == BOX_FLOW)
155 return box->u.flow.head;
156 box = box->down;
157 }
158 return NULL;
159 }
160
161 static float
162 find_flow_target(fz_html_flow *flow, const char *id)
163 {
164 while (flow)
165 {
166 if (flow->box->id && !strcmp(id, flow->box->id))
167 return flow->y;
168 flow = flow->next;
169 }
170 return -1;
171 }
172
173 static float
174 find_box_target(fz_html_box *box, const char *id)
175 {
176 float y;
177 while (box)
178 {
179 if (box->id && !strcmp(id, box->id))
180 {
181 fz_html_flow *flow = find_first_content(box);
182 if (flow)
183 return flow->y;
184 return box->s.layout.y;
185 }
186 if (box->type == BOX_FLOW)
187 {
188 y = find_flow_target(box->u.flow.head, id);
189 if (y >= 0)
190 return y;
191 }
192 else
193 {
194 y = find_box_target(box->down, id);
195 if (y >= 0)
196 return y;
197 }
198 box = box->next;
199 }
200 return -1;
201 }
202
203 float
204 fz_find_html_target(fz_context *ctx, fz_html *html, const char *id)
205 {
206 return find_box_target(html->tree.root, id);
207 }
208
209 static fz_html_flow *
210 make_flow_bookmark(fz_context *ctx, fz_html_flow *flow, float y, fz_html_flow **candidate)
211 {
212 while (flow)
213 {
214 *candidate = flow;
215 if (flow->y >= y)
216 return flow;
217 flow = flow->next;
218 }
219 return NULL;
220 }
221
222 static fz_html_flow *
223 make_box_bookmark(fz_context *ctx, fz_html_box *box, float y, fz_html_flow **candidate)
224 {
225 fz_html_flow *mark;
226 fz_html_flow *dummy = NULL;
227 if (candidate == NULL)
228 candidate = &dummy;
229 while (box)
230 {
231 if (box->type == BOX_FLOW)
232 {
233 if (box->s.layout.y >= y)
234 {
235 mark = make_flow_bookmark(ctx, box->u.flow.head, y, candidate);
236 if (mark)
237 return mark;
238 }
239 else
240 *candidate = make_flow_bookmark(ctx, box->u.flow.head, y, candidate);
241 }
242 else
243 {
244 mark = make_box_bookmark(ctx, box->down, y, candidate);
245 if (mark)
246 return mark;
247 }
248 box = box->next;
249 }
250 return *candidate;
251 }
252
253 fz_bookmark
254 fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page)
255 {
256 return (fz_bookmark)make_box_bookmark(ctx, html->tree.root, page * html->page_h, NULL);
257 }
258
259 static int
260 lookup_flow_bookmark(fz_context *ctx, fz_html_flow *flow, fz_html_flow *mark)
261 {
262 while (flow)
263 {
264 if (flow == mark)
265 return 1;
266 flow = flow->next;
267 }
268 return 0;
269 }
270
271 static int
272 lookup_box_bookmark(fz_context *ctx, fz_html_box *box, fz_html_flow *mark)
273 {
274 while (box)
275 {
276 if (box->type == BOX_FLOW)
277 {
278 if (lookup_flow_bookmark(ctx, box->u.flow.head, mark))
279 return 1;
280 }
281 else
282 {
283 if (lookup_box_bookmark(ctx, box->down, mark))
284 return 1;
285 }
286 box = box->next;
287 }
288 return 0;
289 }
290
291 int
292 fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark)
293 {
294 fz_html_flow *flow = (fz_html_flow*)mark;
295 if (flow && lookup_box_bookmark(ctx, html->tree.root, flow))
296 return (int)(flow->y / html->page_h);
297 return -1;
298 }
299
300 struct outline_parser
301 {
302 fz_html *html;
303 fz_buffer *cat;
304 fz_outline *head;
305 fz_outline **tail[6];
306 fz_outline **down[6];
307 int level[6];
308 int current;
309 int id;
310 };
311
312 static void
313 cat_html_flow(fz_context *ctx, fz_buffer *cat, fz_html_flow *flow)
314 {
315 while (flow)
316 {
317 switch (flow->type)
318 {
319 case FLOW_WORD:
320 fz_append_string(ctx, cat, flow->content.text);
321 break;
322 case FLOW_SPACE:
323 case FLOW_BREAK:
324 fz_append_byte(ctx, cat, ' ');
325 break;
326 default:
327 break;
328 }
329 flow = flow->next;
330 }
331 }
332
333 static void
334 cat_html_box(fz_context *ctx, fz_buffer *cat, fz_html_box *box)
335 {
336 while (box)
337 {
338 if (box->type == BOX_FLOW)
339 cat_html_flow(ctx, cat, box->u.flow.head);
340 cat_html_box(ctx, cat, box->down);
341 box = box->next;
342 }
343 }
344
345 static const char *
346 cat_html_text(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
347 {
348 if (!x->cat)
349 x->cat = fz_new_buffer(ctx, 1024);
350 else
351 fz_clear_buffer(ctx, x->cat);
352
353 cat_html_flow(ctx, x->cat, box->u.flow.head);
354 cat_html_box(ctx, x->cat, box->down);
355
356 return fz_string_from_buffer(ctx, x->cat);
357 }
358
359 static void
360 add_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
361 {
362 fz_outline *node;
363 char buf[100];
364 int heading;
365
366 node = fz_new_outline(ctx);
367 fz_try(ctx)
368 {
369 node->title = Memento_label(fz_strdup(ctx, cat_html_text(ctx, x, box)), "outline_title");
370 if (!box->id)
371 {
372 fz_snprintf(buf, sizeof buf, "'%d", x->id++);
373 box->id = Memento_label(fz_pool_strdup(ctx, x->html->tree.pool, buf), "box_id");
374 }
375 node->uri = Memento_label(fz_asprintf(ctx, "#%s", box->id), "outline_uri");
376 node->is_open = 1;
377 }
378 fz_catch(ctx)
379 {
380 fz_free(ctx, node);
381 fz_rethrow(ctx);
382 }
383
384 heading = box->heading;
385 if (x->level[x->current] < heading && x->current < 5)
386 {
387 x->tail[x->current+1] = x->down[x->current];
388 x->current += 1;
389 }
390 else
391 {
392 while (x->current > 0 && x->level[x->current] > heading)
393 {
394 x->current -= 1;
395 }
396 }
397 x->level[x->current] = heading;
398
399 *(x->tail[x->current]) = node;
400 x->tail[x->current] = &node->next;
401 x->down[x->current] = &node->down;
402 }
403
404 static void
405 load_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
406 {
407 while (box)
408 {
409 int heading = box->heading;
410 if (heading)
411 add_html_outline(ctx, x, box);
412 if (box->down)
413 load_html_outline(ctx, x, box->down);
414 box = box->next;
415 }
416 }
417
418 fz_outline *
419 fz_load_html_outline(fz_context *ctx, fz_html *html)
420 {
421 struct outline_parser state;
422 state.html = html;
423 state.cat = NULL;
424 state.head = NULL;
425 state.tail[0] = &state.head;
426 state.down[0] = NULL;
427 state.level[0] = 99;
428 state.current = 0;
429 state.id = 1;
430 fz_try(ctx)
431 load_html_outline(ctx, &state, html->tree.root);
432 fz_always(ctx)
433 fz_drop_buffer(ctx, state.cat);
434 fz_catch(ctx)
435 {
436 fz_drop_outline(ctx, state.head);
437 state.head = NULL;
438 }
439 return state.head;
440 }