Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/html/html-outline.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 // Copyright (C) 2004-2024 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 #include "html-imp.h" | |
| 25 | |
| 26 #include <string.h> | |
| 27 | |
/*
	Side indices. In this file T and L select the top and left entries
	of html->page_margin; R and B are presumably right and bottom.
*/
enum
{
	T,
	R,
	B,
	L
};
| 29 | |
/*
	Classify a link target.

	Returns 0 when the target starts with "scheme://" (an external URI
	that must be passed through untouched) and 1 otherwise (a target to
	be resolved relative to the current document).

	The scheme is matched using the RFC 3986 alphabet: a letter followed
	by letters, digits, '+', '-' or '.'. Matching is case-insensitive,
	so "HTTP://..." and "git+ssh://..." are recognised as external too.
	Targets without "//" after the colon (e.g. "mailto:") are still
	reported as internal, as before.
*/
static int is_internal_uri(const char *uri)
{
	const char *p = uri;

	/* Only start consuming a scheme if it begins with a letter. */
	if ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z'))
	{
		++p;
		while ((*p >= 'a' && *p <= 'z') ||
			(*p >= 'A' && *p <= 'Z') ||
			(*p >= '0' && *p <= '9') ||
			*p == '+' || *p == '-' || *p == '.')
			++p;
	}

	if (p[0] == ':' && p[1] == '/' && p[2] == '/')
		return 0;
	return 1;
}
| 38 | |
| 39 static fz_link *load_link_flow(fz_context *ctx, fz_html_flow *flow, fz_link *head, int page, float page_h, const char *dir, const char *file) | |
| 40 { | |
| 41 fz_link *link; | |
| 42 fz_html_flow *next; | |
| 43 char path[2048]; | |
| 44 fz_rect bbox; | |
| 45 const char *dest; | |
| 46 const char *href; | |
| 47 float end; | |
| 48 | |
| 49 float page_y0 = page * page_h; | |
| 50 float page_y1 = (page + 1) * page_h; | |
| 51 | |
| 52 while (flow) | |
| 53 { | |
| 54 next = flow->next; | |
| 55 if (flow->y >= page_y0 && flow->y <= page_y1) | |
| 56 { | |
| 57 href = flow->box->href; | |
| 58 if (href) | |
| 59 { | |
| 60 /* Coalesce contiguous flow boxes into one link node */ | |
| 61 end = flow->x + flow->w; | |
| 62 while (next && | |
| 63 next->y == flow->y && | |
| 64 next->h == flow->h && | |
| 65 next->box->href == href) | |
| 66 { | |
| 67 end = next->x + next->w; | |
| 68 next = next->next; | |
| 69 } | |
| 70 | |
| 71 bbox.x0 = flow->x; | |
| 72 bbox.y0 = flow->y - page * page_h; | |
| 73 bbox.x1 = end; | |
| 74 bbox.y1 = bbox.y0 + flow->h; | |
| 75 if (flow->type != FLOW_IMAGE) | |
| 76 { | |
| 77 /* flow->y is the baseline, adjust bbox appropriately */ | |
| 78 bbox.y0 -= 0.8f * flow->h; | |
| 79 bbox.y1 -= 0.8f * flow->h; | |
| 80 } | |
| 81 | |
| 82 if (is_internal_uri(href)) | |
| 83 { | |
| 84 if (href[0] == '#') | |
| 85 { | |
| 86 fz_strlcpy(path, file, sizeof path); | |
| 87 fz_strlcat(path, href, sizeof path); | |
| 88 } | |
| 89 else | |
| 90 { | |
| 91 fz_strlcpy(path, dir, sizeof path); | |
| 92 fz_strlcat(path, "/", sizeof path); | |
| 93 fz_strlcat(path, href, sizeof path); | |
| 94 } | |
| 95 fz_urldecode(path); | |
| 96 fz_cleanname(path); | |
| 97 | |
| 98 dest = path; | |
| 99 } | |
| 100 else | |
| 101 { | |
| 102 dest = href; | |
| 103 } | |
| 104 | |
| 105 link = fz_new_derived_link(ctx, fz_link, bbox, dest); | |
| 106 link->next = head; | |
| 107 head = link; | |
| 108 } | |
| 109 } | |
| 110 flow = next; | |
| 111 } | |
| 112 return head; | |
| 113 } | |
| 114 | |
| 115 static fz_link *load_link_box(fz_context *ctx, fz_html_box *box, fz_link *head, int page, float page_h, const char *dir, const char *file) | |
| 116 { | |
| 117 while (box) | |
| 118 { | |
| 119 if (box->type == BOX_FLOW) | |
| 120 head = load_link_flow(ctx, box->u.flow.head, head, page, page_h, dir, file); | |
| 121 if (box->down) | |
| 122 head = load_link_box(ctx, box->down, head, page, page_h, dir, file); | |
| 123 box = box->next; | |
| 124 } | |
| 125 return head; | |
| 126 } | |
| 127 | |
| 128 fz_link * | |
| 129 fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *file) | |
| 130 { | |
| 131 fz_link *link, *head; | |
| 132 char dir[2048]; | |
| 133 fz_dirname(dir, file, sizeof dir); | |
| 134 | |
| 135 head = load_link_box(ctx, html->tree.root, NULL, page, html->page_h, dir, file); | |
| 136 | |
| 137 for (link = head; link; link = link->next) | |
| 138 { | |
| 139 /* Adjust for page margins */ | |
| 140 link->rect.x0 += html->page_margin[L]; | |
| 141 link->rect.x1 += html->page_margin[L]; | |
| 142 link->rect.y0 += html->page_margin[T]; | |
| 143 link->rect.y1 += html->page_margin[T]; | |
| 144 } | |
| 145 | |
| 146 return head; | |
| 147 } | |
| 148 | |
| 149 static fz_html_flow * | |
| 150 find_first_content(fz_html_box *box) | |
| 151 { | |
| 152 while (box) | |
| 153 { | |
| 154 if (box->type == BOX_FLOW) | |
| 155 return box->u.flow.head; | |
| 156 box = box->down; | |
| 157 } | |
| 158 return NULL; | |
| 159 } | |
| 160 | |
| 161 static float | |
| 162 find_flow_target(fz_html_flow *flow, const char *id) | |
| 163 { | |
| 164 while (flow) | |
| 165 { | |
| 166 if (flow->box->id && !strcmp(id, flow->box->id)) | |
| 167 return flow->y; | |
| 168 flow = flow->next; | |
| 169 } | |
| 170 return -1; | |
| 171 } | |
| 172 | |
| 173 static float | |
| 174 find_box_target(fz_html_box *box, const char *id) | |
| 175 { | |
| 176 float y; | |
| 177 while (box) | |
| 178 { | |
| 179 if (box->id && !strcmp(id, box->id)) | |
| 180 { | |
| 181 fz_html_flow *flow = find_first_content(box); | |
| 182 if (flow) | |
| 183 return flow->y; | |
| 184 return box->s.layout.y; | |
| 185 } | |
| 186 if (box->type == BOX_FLOW) | |
| 187 { | |
| 188 y = find_flow_target(box->u.flow.head, id); | |
| 189 if (y >= 0) | |
| 190 return y; | |
| 191 } | |
| 192 else | |
| 193 { | |
| 194 y = find_box_target(box->down, id); | |
| 195 if (y >= 0) | |
| 196 return y; | |
| 197 } | |
| 198 box = box->next; | |
| 199 } | |
| 200 return -1; | |
| 201 } | |
| 202 | |
| 203 float | |
| 204 fz_find_html_target(fz_context *ctx, fz_html *html, const char *id) | |
| 205 { | |
| 206 return find_box_target(html->tree.root, id); | |
| 207 } | |
| 208 | |
| 209 static fz_html_flow * | |
| 210 make_flow_bookmark(fz_context *ctx, fz_html_flow *flow, float y, fz_html_flow **candidate) | |
| 211 { | |
| 212 while (flow) | |
| 213 { | |
| 214 *candidate = flow; | |
| 215 if (flow->y >= y) | |
| 216 return flow; | |
| 217 flow = flow->next; | |
| 218 } | |
| 219 return NULL; | |
| 220 } | |
| 221 | |
| 222 static fz_html_flow * | |
| 223 make_box_bookmark(fz_context *ctx, fz_html_box *box, float y, fz_html_flow **candidate) | |
| 224 { | |
| 225 fz_html_flow *mark; | |
| 226 fz_html_flow *dummy = NULL; | |
| 227 if (candidate == NULL) | |
| 228 candidate = &dummy; | |
| 229 while (box) | |
| 230 { | |
| 231 if (box->type == BOX_FLOW) | |
| 232 { | |
| 233 if (box->s.layout.y >= y) | |
| 234 { | |
| 235 mark = make_flow_bookmark(ctx, box->u.flow.head, y, candidate); | |
| 236 if (mark) | |
| 237 return mark; | |
| 238 } | |
| 239 else | |
| 240 *candidate = make_flow_bookmark(ctx, box->u.flow.head, y, candidate); | |
| 241 } | |
| 242 else | |
| 243 { | |
| 244 mark = make_box_bookmark(ctx, box->down, y, candidate); | |
| 245 if (mark) | |
| 246 return mark; | |
| 247 } | |
| 248 box = box->next; | |
| 249 } | |
| 250 return *candidate; | |
| 251 } | |
| 252 | |
| 253 fz_bookmark | |
| 254 fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page) | |
| 255 { | |
| 256 return (fz_bookmark)make_box_bookmark(ctx, html->tree.root, page * html->page_h, NULL); | |
| 257 } | |
| 258 | |
| 259 static int | |
| 260 lookup_flow_bookmark(fz_context *ctx, fz_html_flow *flow, fz_html_flow *mark) | |
| 261 { | |
| 262 while (flow) | |
| 263 { | |
| 264 if (flow == mark) | |
| 265 return 1; | |
| 266 flow = flow->next; | |
| 267 } | |
| 268 return 0; | |
| 269 } | |
| 270 | |
| 271 static int | |
| 272 lookup_box_bookmark(fz_context *ctx, fz_html_box *box, fz_html_flow *mark) | |
| 273 { | |
| 274 while (box) | |
| 275 { | |
| 276 if (box->type == BOX_FLOW) | |
| 277 { | |
| 278 if (lookup_flow_bookmark(ctx, box->u.flow.head, mark)) | |
| 279 return 1; | |
| 280 } | |
| 281 else | |
| 282 { | |
| 283 if (lookup_box_bookmark(ctx, box->down, mark)) | |
| 284 return 1; | |
| 285 } | |
| 286 box = box->next; | |
| 287 } | |
| 288 return 0; | |
| 289 } | |
| 290 | |
| 291 int | |
| 292 fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark) | |
| 293 { | |
| 294 fz_html_flow *flow = (fz_html_flow*)mark; | |
| 295 if (flow && lookup_box_bookmark(ctx, html->tree.root, flow)) | |
| 296 return (int)(flow->y / html->page_h); | |
| 297 return -1; | |
| 298 } | |
| 299 | |
| 300 struct outline_parser | |
| 301 { | |
| 302 fz_html *html; | |
| 303 fz_buffer *cat; | |
| 304 fz_outline *head; | |
| 305 fz_outline **tail[6]; | |
| 306 fz_outline **down[6]; | |
| 307 int level[6]; | |
| 308 int current; | |
| 309 int id; | |
| 310 }; | |
| 311 | |
| 312 static void | |
| 313 cat_html_flow(fz_context *ctx, fz_buffer *cat, fz_html_flow *flow) | |
| 314 { | |
| 315 while (flow) | |
| 316 { | |
| 317 switch (flow->type) | |
| 318 { | |
| 319 case FLOW_WORD: | |
| 320 fz_append_string(ctx, cat, flow->content.text); | |
| 321 break; | |
| 322 case FLOW_SPACE: | |
| 323 case FLOW_BREAK: | |
| 324 fz_append_byte(ctx, cat, ' '); | |
| 325 break; | |
| 326 default: | |
| 327 break; | |
| 328 } | |
| 329 flow = flow->next; | |
| 330 } | |
| 331 } | |
| 332 | |
| 333 static void | |
| 334 cat_html_box(fz_context *ctx, fz_buffer *cat, fz_html_box *box) | |
| 335 { | |
| 336 while (box) | |
| 337 { | |
| 338 if (box->type == BOX_FLOW) | |
| 339 cat_html_flow(ctx, cat, box->u.flow.head); | |
| 340 cat_html_box(ctx, cat, box->down); | |
| 341 box = box->next; | |
| 342 } | |
| 343 } | |
| 344 | |
| 345 static const char * | |
| 346 cat_html_text(fz_context *ctx, struct outline_parser *x, fz_html_box *box) | |
| 347 { | |
| 348 if (!x->cat) | |
| 349 x->cat = fz_new_buffer(ctx, 1024); | |
| 350 else | |
| 351 fz_clear_buffer(ctx, x->cat); | |
| 352 | |
| 353 cat_html_flow(ctx, x->cat, box->u.flow.head); | |
| 354 cat_html_box(ctx, x->cat, box->down); | |
| 355 | |
| 356 return fz_string_from_buffer(ctx, x->cat); | |
| 357 } | |
| 358 | |
| 359 static void | |
| 360 add_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box) | |
| 361 { | |
| 362 fz_outline *node; | |
| 363 char buf[100]; | |
| 364 int heading; | |
| 365 | |
| 366 node = fz_new_outline(ctx); | |
| 367 fz_try(ctx) | |
| 368 { | |
| 369 node->title = Memento_label(fz_strdup(ctx, cat_html_text(ctx, x, box)), "outline_title"); | |
| 370 if (!box->id) | |
| 371 { | |
| 372 fz_snprintf(buf, sizeof buf, "'%d", x->id++); | |
| 373 box->id = Memento_label(fz_pool_strdup(ctx, x->html->tree.pool, buf), "box_id"); | |
| 374 } | |
| 375 node->uri = Memento_label(fz_asprintf(ctx, "#%s", box->id), "outline_uri"); | |
| 376 node->is_open = 1; | |
| 377 } | |
| 378 fz_catch(ctx) | |
| 379 { | |
| 380 fz_free(ctx, node); | |
| 381 fz_rethrow(ctx); | |
| 382 } | |
| 383 | |
| 384 heading = box->heading; | |
| 385 if (x->level[x->current] < heading && x->current < 5) | |
| 386 { | |
| 387 x->tail[x->current+1] = x->down[x->current]; | |
| 388 x->current += 1; | |
| 389 } | |
| 390 else | |
| 391 { | |
| 392 while (x->current > 0 && x->level[x->current] > heading) | |
| 393 { | |
| 394 x->current -= 1; | |
| 395 } | |
| 396 } | |
| 397 x->level[x->current] = heading; | |
| 398 | |
| 399 *(x->tail[x->current]) = node; | |
| 400 x->tail[x->current] = &node->next; | |
| 401 x->down[x->current] = &node->down; | |
| 402 } | |
| 403 | |
| 404 static void | |
| 405 load_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box) | |
| 406 { | |
| 407 while (box) | |
| 408 { | |
| 409 int heading = box->heading; | |
| 410 if (heading) | |
| 411 add_html_outline(ctx, x, box); | |
| 412 if (box->down) | |
| 413 load_html_outline(ctx, x, box->down); | |
| 414 box = box->next; | |
| 415 } | |
| 416 } | |
| 417 | |
| 418 fz_outline * | |
| 419 fz_load_html_outline(fz_context *ctx, fz_html *html) | |
| 420 { | |
| 421 struct outline_parser state; | |
| 422 state.html = html; | |
| 423 state.cat = NULL; | |
| 424 state.head = NULL; | |
| 425 state.tail[0] = &state.head; | |
| 426 state.down[0] = NULL; | |
| 427 state.level[0] = 99; | |
| 428 state.current = 0; | |
| 429 state.id = 1; | |
| 430 fz_try(ctx) | |
| 431 load_html_outline(ctx, &state, html->tree.root); | |
| 432 fz_always(ctx) | |
| 433 fz_drop_buffer(ctx, state.cat); | |
| 434 fz_catch(ctx) | |
| 435 { | |
| 436 fz_drop_outline(ctx, state.head); | |
| 437 state.head = NULL; | |
| 438 } | |
| 439 return state.head; | |
| 440 } |
