Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/pdf/pdf-label.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/pdf/pdf-label.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,262 @@ +// Copyright (C) 2004-2025 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" + +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +typedef struct pdf_object_labels pdf_object_labels; +typedef struct pdf_object_label_node pdf_object_label_node; + +struct pdf_object_label_node +{ + int num; + char *path; + pdf_object_label_node *next; +}; + +struct pdf_object_labels +{ + fz_pool *pool; + int object_count; + int root, info, encrypt; + unsigned short *pages; + char *seen; + pdf_object_label_node **nodes; +}; + +static void +add_object_label(fz_context *ctx, pdf_object_labels *g, char *path, int a, int b) +{ + pdf_object_label_node *node, **root; + + node = fz_pool_alloc(ctx, g->pool, sizeof(pdf_object_label_node)); + node->path = fz_pool_strdup(ctx, g->pool, path); + node->num = b; + + root = &g->nodes[a]; + node->next = *root; + *root = node; +} + +static void +scan_object_label_rec(fz_context *ctx, pdf_object_labels *g, char *root_path, pdf_obj *obj, int top) +{ + char path[100]; + int i, n; + if (pdf_is_indirect(ctx, obj)) + ; + else if (pdf_is_dict(ctx, obj)) + { + n = pdf_dict_len(ctx, obj); + for (i = 0; i < n; ++i) + { + pdf_obj *key = pdf_dict_get_key(ctx, obj, i); + pdf_obj *val = pdf_dict_get_val(ctx, obj, i); + if (val && key != PDF_NAME(Parent) && key != PDF_NAME(P) && key != PDF_NAME(Prev) && key != PDF_NAME(Last)) + { + if (pdf_is_indirect(ctx, val)) + { + fz_snprintf(path, sizeof path, "%s/%s", root_path, pdf_to_name(ctx, key)); + add_object_label(ctx, g, path, pdf_to_num(ctx, val), top); + } + else if (pdf_is_dict(ctx, val) || pdf_is_array(ctx, val)) + { + fz_snprintf(path, sizeof path, "%s/%s", root_path, pdf_to_name(ctx, key)); + scan_object_label_rec(ctx, g, path, val, top); + } + } + } + } + else if (pdf_is_array(ctx, obj)) + { + n = pdf_array_len(ctx, obj); + for (i = 0; i < n; ++i) + { + pdf_obj *val = pdf_array_get(ctx, obj, i); + if (val) + { + if (pdf_is_indirect(ctx, val)) + { + fz_snprintf(path, sizeof path, "%s/%d", root_path, i+1); + add_object_label(ctx, g, path, pdf_to_num(ctx, val), top); + } + else if (pdf_is_dict(ctx, val) || pdf_is_array(ctx, val)) + { + fz_snprintf(path, sizeof path, "%s/%d", root_path, i+1); + scan_object_label_rec(ctx, g, path, val, top); + } + } + } + } +} + +static void +scan_object_label(fz_context *ctx, pdf_document *doc, pdf_object_labels *g, int num) +{ + pdf_obj *obj = pdf_load_object(ctx, doc, num); + fz_try(ctx) + scan_object_label_rec(ctx, g, "", obj, num); + fz_always(ctx) + pdf_drop_obj(ctx, obj); + fz_catch(ctx) + fz_rethrow(ctx); +} + +pdf_object_labels * +pdf_load_object_labels(fz_context *ctx, pdf_document *doc) +{ + pdf_object_labels *g = NULL; + fz_pool *pool; + int i, n, page_count; + + n = pdf_count_objects(ctx, doc); + + pool = fz_new_pool(ctx); + fz_try(ctx) + { + g = fz_pool_alloc(ctx, pool, sizeof(pdf_object_labels)); + g->pool = pool; + g->object_count = n; + g->root = pdf_to_num(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root))); + g->info = pdf_to_num(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info))); + g->encrypt = pdf_to_num(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt))); + g->seen = fz_pool_alloc(ctx, pool, n); + g->nodes = fz_pool_alloc(ctx, pool, g->object_count * sizeof(pdf_object_label_node*)); + g->pages = fz_pool_alloc(ctx, pool, g->object_count * sizeof(unsigned short)); + + page_count = pdf_count_pages(ctx, doc); + for (i = 0; i < page_count; ++i) + g->pages[pdf_to_num(ctx, pdf_lookup_page_obj(ctx, doc, i))] = i+1; + + for (i = 1; i < n; ++i) + scan_object_label(ctx, doc, g, i); + } + fz_catch(ctx) + { + fz_drop_pool(ctx, pool); + } + return g; +} + +void +pdf_drop_object_labels(fz_context *ctx, pdf_object_labels *g) +{ + if (g) + fz_drop_pool(ctx, g->pool); +} + +static char * +prepend(char *path_buffer, char *path, const char *fmt, ...) +{ + char buf[256]; + size_t z; + va_list args; + + va_start(args, fmt); + z = fz_vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + /* We always want to leave ourselves at least 3 chars for + * a future "..." */ + if (path_buffer + z + 3 <= path) + { + path -= z; + memcpy(path, buf, z); + return path; + } + + /* Just put ... in now. */ + path -= 3; + path[0] = '.'; + path[1] = '.'; + path[2] = '.'; + + return path; +} + +static void +find_paths(fz_context *ctx, pdf_object_labels *g, int here, char *path_buffer, char *leaf_path, pdf_label_object_fn *callback, void *arg) +{ + pdf_object_label_node *node; + int next; + if (here == g->root) + { + prepend(path_buffer, leaf_path, "trailer/Root"); + callback(ctx, arg, prepend(path_buffer, leaf_path, "trailer/Root")); + return; + } + if (here == g->info) + { + callback(ctx, arg, prepend(path_buffer, leaf_path, "trailer/Info")); + return; + } + if (here == g->encrypt) + { + callback(ctx, arg, prepend(path_buffer, leaf_path, "trailer/Encrypt")); + return; + } + if (g->pages[here]) + { + callback(ctx, arg, prepend(path_buffer, leaf_path, "pages/%d", g->pages[here])); + } + for (node = g->nodes[here]; node; node = node->next) + { + next = node->num; + if (next < 1 || next >= g->object_count) + continue; + if (g->seen[next]) + continue; + if (g->pages[next]) + { + callback(ctx, arg, prepend(path_buffer, leaf_path, "pages/%d%s", g->pages[next], node->path)); + } + else + { + char *p = prepend(path_buffer, leaf_path, "%s", node->path); + g->seen[next] = 1; + // if we've run out of room in the path buffer, send this and stop. + if (p[0] == '.' && p[1] == '.' && p[2] == '.') + callback(ctx, arg, p); + else + find_paths(ctx, g, next, path_buffer, p, callback, arg); + g->seen[next] = 0; + } + } +} + +void +pdf_label_object(fz_context *ctx, pdf_object_labels *g, int num, pdf_label_object_fn *callback, void *arg) +{ + int i; + char path[4096]; + + if (num < 1 || num >= g->object_count) + return; + for (i = 1; i < g->object_count; ++i) + g->seen[i] = 0; + path[sizeof(path)-1] = 0; + find_paths(ctx, g, num, path, &path[sizeof(path)-1], callback, arg); +}
