Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/html/html-outline.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/html/html-outline.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,440 @@ +// Copyright (C) 2004-2024 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" +#include "html-imp.h" + +#include <string.h> + +enum { T, R, B, L }; + +static int is_internal_uri(const char *uri) +{ + while (*uri >= 'a' && *uri <= 'z') + ++uri; + if (uri[0] == ':' && uri[1] == '/' && uri[2] == '/') + return 0; + return 1; +} + +static fz_link *load_link_flow(fz_context *ctx, fz_html_flow *flow, fz_link *head, int page, float page_h, const char *dir, const char *file) +{ + fz_link *link; + fz_html_flow *next; + char path[2048]; + fz_rect bbox; + const char *dest; + const char *href; + float end; + + float page_y0 = page * page_h; + float page_y1 = (page + 1) * page_h; + + while (flow) + { + next = flow->next; + if (flow->y >= page_y0 && flow->y <= page_y1) + { + href = flow->box->href; + if (href) + { + /* Coalesce contiguous flow boxes into one link node */ + end = flow->x + flow->w; + while (next && + next->y == flow->y && + next->h == flow->h && + next->box->href == href) + { + end = next->x + next->w; + next = next->next; + } + + bbox.x0 = flow->x; + bbox.y0 = flow->y - page * page_h; + bbox.x1 = end; + bbox.y1 = bbox.y0 + flow->h; + if (flow->type != FLOW_IMAGE) + { + /* flow->y is the baseline, adjust bbox appropriately */ + bbox.y0 -= 0.8f * flow->h; + bbox.y1 -= 0.8f * flow->h; + } + + if (is_internal_uri(href)) + { + if (href[0] == '#') + { + fz_strlcpy(path, file, sizeof path); + fz_strlcat(path, href, sizeof path); + } + else + { + fz_strlcpy(path, dir, sizeof path); + fz_strlcat(path, "/", sizeof path); + fz_strlcat(path, href, sizeof path); + } + fz_urldecode(path); + fz_cleanname(path); + + dest = path; + } + else + { + dest = href; + } + + link = fz_new_derived_link(ctx, fz_link, bbox, dest); + link->next = head; + head = link; + } + } + flow = next; + } + return head; +} + +static fz_link *load_link_box(fz_context *ctx, fz_html_box *box, fz_link *head, int page, float page_h, const char *dir, const char *file) +{ + while (box) + { + if (box->type == BOX_FLOW) + head = load_link_flow(ctx, box->u.flow.head, head, page, page_h, dir, file); + if (box->down) + head = load_link_box(ctx, box->down, head, page, page_h, dir, file); + box = box->next; + } + return head; +} + +fz_link * +fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *file) +{ + fz_link *link, *head; + char dir[2048]; + fz_dirname(dir, file, sizeof dir); + + head = load_link_box(ctx, html->tree.root, NULL, page, html->page_h, dir, file); + + for (link = head; link; link = link->next) + { + /* Adjust for page margins */ + link->rect.x0 += html->page_margin[L]; + link->rect.x1 += html->page_margin[L]; + link->rect.y0 += html->page_margin[T]; + link->rect.y1 += html->page_margin[T]; + } + + return head; +} + +static fz_html_flow * +find_first_content(fz_html_box *box) +{ + while (box) + { + if (box->type == BOX_FLOW) + return box->u.flow.head; + box = box->down; + } + return NULL; +} + +static float +find_flow_target(fz_html_flow *flow, const char *id) +{ + while (flow) + { + if (flow->box->id && !strcmp(id, flow->box->id)) + return flow->y; + flow = flow->next; + } + return -1; +} + +static float +find_box_target(fz_html_box *box, const char *id) +{ + float y; + while (box) + { + if (box->id && !strcmp(id, box->id)) + { + fz_html_flow *flow = find_first_content(box); + if (flow) + return flow->y; + return box->s.layout.y; + } + if (box->type == BOX_FLOW) + { + y = find_flow_target(box->u.flow.head, id); + if (y >= 0) + return y; + } + else + { + y = find_box_target(box->down, id); + if (y >= 0) + return y; + } + box = box->next; + } + return -1; +} + +float +fz_find_html_target(fz_context *ctx, fz_html *html, const char *id) +{ + return find_box_target(html->tree.root, id); +} + +static fz_html_flow * +make_flow_bookmark(fz_context *ctx, fz_html_flow *flow, float y, fz_html_flow **candidate) +{ + while (flow) + { + *candidate = flow; + if (flow->y >= y) + return flow; + flow = flow->next; + } + return NULL; +} + +static fz_html_flow * +make_box_bookmark(fz_context *ctx, fz_html_box *box, float y, fz_html_flow **candidate) +{ + fz_html_flow *mark; + fz_html_flow *dummy = NULL; + if (candidate == NULL) + candidate = &dummy; + while (box) + { + if (box->type == BOX_FLOW) + { + if (box->s.layout.y >= y) + { + mark = make_flow_bookmark(ctx, box->u.flow.head, y, candidate); + if (mark) + return mark; + } + else + *candidate = make_flow_bookmark(ctx, box->u.flow.head, y, candidate); + } + else + { + mark = make_box_bookmark(ctx, box->down, y, candidate); + if (mark) + return mark; + } + box = box->next; + } + return *candidate; +} + +fz_bookmark +fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page) +{ + return (fz_bookmark)make_box_bookmark(ctx, html->tree.root, page * html->page_h, NULL); +} + +static int +lookup_flow_bookmark(fz_context *ctx, fz_html_flow *flow, fz_html_flow *mark) +{ + while (flow) + { + if (flow == mark) + return 1; + flow = flow->next; + } + return 0; +} + +static int +lookup_box_bookmark(fz_context *ctx, fz_html_box *box, fz_html_flow *mark) +{ + while (box) + { + if (box->type == BOX_FLOW) + { + if (lookup_flow_bookmark(ctx, box->u.flow.head, mark)) + return 1; + } + else + { + if (lookup_box_bookmark(ctx, box->down, mark)) + return 1; + } + box = box->next; + } + return 0; +} + +int +fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark) +{ + fz_html_flow *flow = (fz_html_flow*)mark; + if (flow && lookup_box_bookmark(ctx, html->tree.root, flow)) + return (int)(flow->y / html->page_h); + return -1; +} + +struct outline_parser +{ + fz_html *html; + fz_buffer *cat; + fz_outline *head; + fz_outline **tail[6]; + fz_outline **down[6]; + int level[6]; + int current; + int id; +}; + +static void +cat_html_flow(fz_context *ctx, fz_buffer *cat, fz_html_flow *flow) +{ + while (flow) + { + switch (flow->type) + { + case FLOW_WORD: + fz_append_string(ctx, cat, flow->content.text); + break; + case FLOW_SPACE: + case FLOW_BREAK: + fz_append_byte(ctx, cat, ' '); + break; + default: + break; + } + flow = flow->next; + } +} + +static void +cat_html_box(fz_context *ctx, fz_buffer *cat, fz_html_box *box) +{ + while (box) + { + if (box->type == BOX_FLOW) + cat_html_flow(ctx, cat, box->u.flow.head); + cat_html_box(ctx, cat, box->down); + box = box->next; + } +} + +static const char * +cat_html_text(fz_context *ctx, struct outline_parser *x, fz_html_box *box) +{ + if (!x->cat) + x->cat = fz_new_buffer(ctx, 1024); + else + fz_clear_buffer(ctx, x->cat); + + cat_html_flow(ctx, x->cat, box->u.flow.head); + cat_html_box(ctx, x->cat, box->down); + + return fz_string_from_buffer(ctx, x->cat); +} + +static void +add_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box) +{ + fz_outline *node; + char buf[100]; + int heading; + + node = fz_new_outline(ctx); + fz_try(ctx) + { + node->title = Memento_label(fz_strdup(ctx, cat_html_text(ctx, x, box)), "outline_title"); + if (!box->id) + { + fz_snprintf(buf, sizeof buf, "'%d", x->id++); + box->id = Memento_label(fz_pool_strdup(ctx, x->html->tree.pool, buf), "box_id"); + } + node->uri = Memento_label(fz_asprintf(ctx, "#%s", box->id), "outline_uri"); + node->is_open = 1; + } + fz_catch(ctx) + { + fz_free(ctx, node); + fz_rethrow(ctx); + } + + heading = box->heading; + if (x->level[x->current] < heading && x->current < 5) + { + x->tail[x->current+1] = x->down[x->current]; + x->current += 1; + } + else + { + while (x->current > 0 && x->level[x->current] > heading) + { + x->current -= 1; + } + } + x->level[x->current] = heading; + + *(x->tail[x->current]) = node; + x->tail[x->current] = &node->next; + x->down[x->current] = &node->down; +} + +static void +load_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box) +{ + while (box) + { + int heading = box->heading; + if (heading) + add_html_outline(ctx, x, box); + if (box->down) + load_html_outline(ctx, x, box->down); + box = box->next; + } +} + +fz_outline * +fz_load_html_outline(fz_context *ctx, fz_html *html) +{ + struct outline_parser state; + state.html = html; + state.cat = NULL; + state.head = NULL; + state.tail[0] = &state.head; + state.down[0] = NULL; + state.level[0] = 99; + state.current = 0; + state.id = 1; + fz_try(ctx) + load_html_outline(ctx, &state, html->tree.root); + fz_always(ctx) + fz_drop_buffer(ctx, state.cat); + fz_catch(ctx) + { + fz_drop_outline(ctx, state.head); + state.head = NULL; + } + return state.head; +}
