Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/source/pdf/pdf-clean-file.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/pdf/pdf-clean-file.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,551 @@
+// Copyright (C) 2004-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+#include "mupdf/pdf.h"
+
+#include <string.h>
+
+static int
+string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list)
+{
+	int n = pdf_array_len(ctx, names_list);
+	int i;
+	char *str = pdf_to_str_buf(ctx, p);
+
+	for (i = 0; i < n ; i += 2)
+	{
+		if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Recreate page tree to only retain specified pages.
+ */
+
+static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parentobj, pdf_obj *kids, int page, pdf_obj *structparents, pdf_obj *ostructparents)
+{
+	pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page);
+
+	pdf_flatten_inheritable_page_items(ctx, pageref);
+
+	pdf_dict_put(ctx, pageref, PDF_NAME(Parent), parentobj);
+
+	/* Store page object in new kids array */
+	pdf_array_push(ctx, kids, pageref);
+
+	if (structparents)
+	{
+		int parentnum = pdf_dict_get_int(ctx, pageref, PDF_NAME(StructParents));
+		pdf_obj *parent = pdf_lookup_number(ctx, ostructparents, parentnum);
+		pdf_obj *nums = pdf_dict_get(ctx, structparents, PDF_NAME(Nums));
+		pdf_obj *limits = pdf_dict_get(ctx, structparents, PDF_NAME(Limits));
+		int min, max;
+		pdf_array_push_int(ctx, nums, parentnum);
+		pdf_array_push(ctx, nums, parent);
+		if (limits == NULL)
+		{
+			min = max = parentnum;
+			limits = pdf_new_array(ctx, doc, 2);
+			pdf_dict_put_drop(ctx, structparents, PDF_NAME(Limits), limits);
+		}
+		else
+		{
+			min = pdf_array_get_int(ctx, limits, 0);
+			max = pdf_array_get_int(ctx, limits, 1);
+			if (min > parentnum)
+				min = parentnum;
+			if (max < parentnum)
+				max = parentnum;
+		}
+		pdf_array_put_int(ctx, limits, 0, min);
+		pdf_array_put_int(ctx, limits, 1, max);
+	}
+}
+
+static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount)
+{
+	int i;
+	int num = pdf_to_num(ctx, obj);
+
+	if (num == 0)
+		return 0;
+	for (i = 0; i < pagecount; i++)
+	{
+		if (page_object_nums[i] == num)
+			return 1;
+	}
+	return 0;
+}
+
+static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list)
+{
+	pdf_obj *p;
+
+	p = pdf_dict_get(ctx, o, PDF_NAME(A));
+	if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME(S)), PDF_NAME(GoTo)))
+	{
+		pdf_obj *d = pdf_dict_get(ctx, p, PDF_NAME(D));
+		if (pdf_is_array(ctx, d) && !dest_is_valid_page(ctx, pdf_array_get(ctx, d, 0), page_object_nums, page_count))
+			return 0;
+		else if (pdf_is_string(ctx, d) && !string_in_names_list(ctx, d, names_list))
+			return 0;
+	}
+
+	p = pdf_dict_get(ctx, o, PDF_NAME(Dest));
+	if (p == NULL)
+		return 1; /* A name with no dest counts as valid. */
+	else if (pdf_is_string(ctx, p))
+		return string_in_names_list(ctx, p, names_list);
+	else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count))
+		return 0;
+
+	return 1;
+}
+
+static int strip_stale_annot_refs(fz_context *ctx, pdf_obj *field, int page_count, int *page_object_nums)
+{
+	pdf_obj *kids = pdf_dict_get(ctx, field, PDF_NAME(Kids));
+	int len = pdf_array_len(ctx, kids);
+	int j;
+
+	if (kids)
+	{
+		for (j = 0; j < len; j++)
+		{
+			if (strip_stale_annot_refs(ctx, pdf_array_get(ctx, kids, j), page_count, page_object_nums))
+			{
+				pdf_array_delete(ctx, kids, j);
+				len--;
+				j--;
+			}
+		}
+
+		return pdf_array_len(ctx, kids) == 0;
+	}
+	else
+	{
+		pdf_obj *page = pdf_dict_get(ctx, field, PDF_NAME(P));
+		int page_num = pdf_to_num(ctx, page);
+
+		for (j = 0; j < page_count; j++)
+			if (page_num == page_object_nums[j])
+				return 0;
+
+		return 1;
+	}
+}
+
+static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_mark_bits *marks);
+
+static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast, pdf_mark_bits *marks)
+{
+	pdf_obj *prev = NULL;
+	pdf_obj *first = NULL;
+	pdf_obj *current;
+	int count = 0;
+
+	for (current = outlines; current != NULL; )
+	{
+		int nc;
+
+		/* Strip any children to start with. This takes care of
+		 * First/Last/Count for us. */
+		nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list, marks);
+
+		if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list))
+		{
+			if (nc == 0)
+			{
+				/* Outline with invalid dest and no children. Drop it by
+				 * pulling the next one in here. */
+				pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME(Next));
+				if (!pdf_is_dict(ctx, next))
+				{
+					/* There is no next one to pull in */
+					if (prev != NULL)
+						pdf_dict_del(ctx, prev, PDF_NAME(Next));
+				}
+				else if (prev != NULL)
+				{
+					pdf_dict_put(ctx, prev, PDF_NAME(Next), next);
+					pdf_dict_put(ctx, next, PDF_NAME(Prev), prev);
+				}
+				else
+				{
+					pdf_dict_del(ctx, next, PDF_NAME(Prev));
+				}
+				current = next;
+			}
+			else
+			{
+				/* Outline with invalid dest, but children. Just drop the dest. */
+				pdf_dict_del(ctx, current, PDF_NAME(Dest));
+				pdf_dict_del(ctx, current, PDF_NAME(A));
+				current = pdf_dict_get(ctx, current, PDF_NAME(Next));
+			}
+		}
+		else
+		{
+			/* Keep this one */
+			if (first == NULL)
+				first = current;
+			prev = current;
+			current = pdf_dict_get(ctx, current, PDF_NAME(Next));
+			count++;
+		}
+	}
+
+	*pfirst = first;
+	*plast = prev;
+
+	return count;
+}
+
+static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_mark_bits *marks)
+{
+	int nc;
+	pdf_obj *first;
+	pdf_obj *last;
+
+	if (!pdf_is_dict(ctx, outlines))
+		return 0;
+
+	if (pdf_mark_bits_set(ctx, marks, outlines))
+		fz_throw(ctx, FZ_ERROR_FORMAT, "Cycle detected in outlines");
+
+	first = pdf_dict_get(ctx, outlines, PDF_NAME(First));
+	if (!pdf_is_dict(ctx, first))
+		nc = 0;
+	else
+		nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last, marks);
+
+	if (nc == 0)
+	{
+		pdf_dict_del(ctx, outlines, PDF_NAME(First));
+		pdf_dict_del(ctx, outlines, PDF_NAME(Last));
+		pdf_dict_del(ctx, outlines, PDF_NAME(Count));
+	}
+	else
+	{
+		int old_count = pdf_dict_get_int(ctx, outlines, PDF_NAME(Count));
+		pdf_dict_put(ctx, outlines, PDF_NAME(First), first);
+		pdf_dict_put(ctx, outlines, PDF_NAME(Last), last);
+		pdf_dict_put_int(ctx, outlines, PDF_NAME(Count), old_count > 0 ? nc : -nc);
+	}
+
+	return nc;
+}
+
+static void pdf_rearrange_pages_imp(fz_context *ctx, pdf_document *doc, int count, const int *new_page_list, pdf_clean_options_structure structure)
+{
+	pdf_obj *oldroot, *pages, *kids, *olddests;
+	pdf_obj *root = NULL;
+	pdf_obj *names_list = NULL;
+	pdf_obj *outlines;
+	pdf_obj *ocproperties;
+	pdf_obj *allfields = NULL;
+	int pagecount, i;
+	int *page_object_nums = NULL;
+	pdf_obj *structtreeroot = NULL;
+	pdf_obj *ostructparents = NULL;
+	pdf_obj *structparents = NULL;
+	pdf_mark_bits *marks = NULL;
+
+	/* Keep only pages/type and (reduced) dest entries to avoid
+	 * references to unretained pages */
+	oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
+	pages = pdf_dict_get(ctx, oldroot, PDF_NAME(Pages));
+	olddests = pdf_load_name_tree(ctx, doc, PDF_NAME(Dests));
+	outlines = pdf_dict_get(ctx, oldroot, PDF_NAME(Outlines));
+	ocproperties = pdf_dict_get(ctx, oldroot, PDF_NAME(OCProperties));
+	if (structure == PDF_CLEAN_STRUCTURE_KEEP)
+	{
+		structtreeroot = pdf_dict_get(ctx, oldroot, PDF_NAME(StructTreeRoot));
+		ostructparents = pdf_dict_get(ctx, structtreeroot, PDF_NAME(ParentTree));
+		if (structtreeroot)
+			structparents = pdf_new_dict(ctx, doc, 3);
+	}
+
+	fz_var(root);
+	fz_var(names_list);
+	fz_var(allfields);
+	fz_var(page_object_nums);
+	fz_var(kids);
+	fz_var(marks);
+
+	fz_try(ctx)
+	{
+		root = pdf_new_dict(ctx, doc, 3);
+		pdf_dict_put(ctx, root, PDF_NAME(Type), pdf_dict_get(ctx, oldroot, PDF_NAME(Type)));
+		pdf_dict_put(ctx, root, PDF_NAME(Pages), pdf_dict_get(ctx, oldroot, PDF_NAME(Pages)));
+		if (structtreeroot)
+		{
+			pdf_dict_put(ctx, root, PDF_NAME(StructTreeRoot), structtreeroot);
+			pdf_dict_put(ctx, structtreeroot, PDF_NAME(ParentTree), structparents);
+			pdf_dict_put_array(ctx, structparents, PDF_NAME(Nums), 2);
+		}
+		if (outlines)
+			pdf_dict_put(ctx, root, PDF_NAME(Outlines), outlines);
+		if (ocproperties)
+			pdf_dict_put(ctx, root, PDF_NAME(OCProperties), ocproperties);
+
+		pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root);
+
+		/* Create a new kids array with only the pages we want to keep */
+		kids = pdf_new_array(ctx, doc, 1);
+
+		/* Retain pages specified */
+		for (i = 0; i < count; ++i)
+			retainpage(ctx, doc, pages, kids, new_page_list[i], structparents, ostructparents);
+
+		/* Update page count */
+		pdf_dict_put_int(ctx, pages, PDF_NAME(Count), pdf_array_len(ctx, kids));
+		pdf_dict_put(ctx, pages, PDF_NAME(Kids), kids);
+
+		pagecount = pdf_count_pages(ctx, doc);
+		page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums));
+		for (i = 0; i < pagecount; i++)
+		{
+			pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
+			page_object_nums[i] = pdf_to_num(ctx, pageref);
+		}
+
+		/* If we had an old Dests tree (now reformed as an olddests
+		 * dictionary), keep any entries in there that point to
+		 * valid pages. This may mean we keep more than we need, but
+		 * it's safe at least. */
+		if (olddests)
+		{
+			pdf_obj *names, *dests;
+			int len = pdf_dict_len(ctx, olddests);
+
+			names = pdf_dict_put_dict(ctx, root, PDF_NAME(Names), 1);
+			dests = pdf_dict_put_dict(ctx, names, PDF_NAME(Dests), 1);
+			names_list = pdf_dict_put_array(ctx, dests, PDF_NAME(Names), 32);
+
+			for (i = 0; i < len; i++)
+			{
+				pdf_obj *key = pdf_dict_get_key(ctx, olddests, i);
+				pdf_obj *val = pdf_dict_get_val(ctx, olddests, i);
+				pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME(D));
+
+				dest = pdf_array_get(ctx, dest ? dest : val, 0);
+				if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount))
+				{
+					pdf_array_push_string(ctx, names_list, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key)));
+					pdf_array_push(ctx, names_list, val);
+				}
+			}
+
+			pdf_drop_obj(ctx, olddests);
+		}
+
+		/* Edit each pages /Annot list to remove any links that point to nowhere. */
+		for (i = 0; i < pagecount; i++)
+		{
+			pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
+
+			pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots));
+
+			int len = pdf_array_len(ctx, annots);
+			int j;
+
+			for (j = 0; j < len; j++)
+			{
+				pdf_obj *o = pdf_array_get(ctx, annots, j);
+
+				if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME(Subtype)), PDF_NAME(Link)))
+					continue;
+
+				if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list))
+				{
+					/* Remove this annotation */
+					pdf_array_delete(ctx, annots, j);
+					len--;
+					j--;
+				}
+			}
+		}
+
+		/* Locate all fields on retained pages */
+		allfields = pdf_new_array(ctx, doc, 1);
+		for (i = 0; i < pagecount; i++)
+		{
+			pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
+
+			pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots));
+
+			int len = pdf_array_len(ctx, annots);
+			int j;
+
+			for (j = 0; j < len; j++)
+			{
+				pdf_obj *f = pdf_array_get(ctx, annots, j);
+
+				if (pdf_dict_get(ctx, f, PDF_NAME(Subtype)) == PDF_NAME(Widget))
+					pdf_array_push(ctx, allfields, f);
+			}
+		}
+
+		/* From non-terminal widget fields, strip out annot references not
+		 * belonging to any retained page. */
+		for (i = 0; i < pdf_array_len(ctx, allfields); i++)
+		{
+			pdf_obj *f = pdf_array_get(ctx, allfields, i);
+
+			while (pdf_dict_get(ctx, f, PDF_NAME(Parent)))
+				f = pdf_dict_get(ctx, f, PDF_NAME(Parent));
+
+			strip_stale_annot_refs(ctx, f, pagecount, page_object_nums);
+		}
+
+		/* For terminal fields, if action destination is not valid,
+		 * remove the action */
+		for (i = 0; i < pdf_array_len(ctx, allfields); i++)
+		{
+			pdf_obj *f = pdf_array_get(ctx, allfields, i);
+
+			if (!dest_is_valid(ctx, f, pagecount, page_object_nums, names_list))
+				pdf_dict_del(ctx, f, PDF_NAME(A));
+		}
+
+		marks = pdf_new_mark_bits(ctx, doc);
+		if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list, marks) == 0)
+		{
+			pdf_dict_del(ctx, root, PDF_NAME(Outlines));
+		}
+	}
+	fz_always(ctx)
+	{
+		pdf_drop_mark_bits(ctx, marks);
+		fz_free(ctx, page_object_nums);
+		pdf_drop_obj(ctx, allfields);
+		pdf_drop_obj(ctx, root);
+		pdf_drop_obj(ctx, kids);
+		pdf_drop_obj(ctx, structparents);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
+
+void pdf_rearrange_pages(fz_context *ctx, pdf_document *doc, int count, const int *new_page_list, pdf_clean_options_structure structure)
+{
+	if (structure < PDF_CLEAN_STRUCTURE_DROP || structure > PDF_CLEAN_STRUCTURE_KEEP)
+		fz_throw(ctx, FZ_ERROR_ARGUMENT, "Invalid structure argument");
+
+	pdf_begin_operation(ctx, doc, "Rearrange pages");
+	fz_try(ctx)
+	{
+		pdf_rearrange_pages_imp(ctx, doc, count, new_page_list, structure);
+		pdf_end_operation(ctx, doc);
+	}
+	fz_catch(ctx)
+	{
+		pdf_abandon_operation(ctx, doc);
+		pdf_sync_open_pages(ctx, doc);
+		fz_rethrow(ctx);
+	}
+	pdf_sync_open_pages(ctx, doc);
+}
+
+void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_clean_options *opts, int argc, char *argv[])
+{
+	pdf_clean_options default_opts = { 0 };
+	pdf_document *pdf = NULL;
+	int *pages = NULL;
+	int cap, len, page;
+
+	fz_var(pdf);
+	fz_var(pages);
+
+	if (opts == NULL)
+		opts = &default_opts;
+	if (argc > 0 && argv == NULL)
+		fz_throw(ctx, FZ_ERROR_ARGUMENT, "arguments array must be set if arguments exist");
+
+	fz_try(ctx)
+	{
+		pdf = pdf_open_document(ctx, infile);
+		if (pdf_needs_password(ctx, pdf))
+			if (!pdf_authenticate_password(ctx, pdf, password))
+				fz_throw(ctx, FZ_ERROR_ARGUMENT, "cannot authenticate password: %s", infile);
+
+		len = cap = 0;
+
+		/* Only retain the specified subset of the pages */
+		if (argc)
+		{
+			int pagecount = pdf_count_pages(ctx, pdf);
+			int argidx = 0;
+
+			while (argc - argidx)
+			{
+				int spage, epage;
+				const char *pagelist = argv[argidx];
+
+				while ((pagelist = fz_parse_page_range(ctx, pagelist, &spage, &epage, pagecount)))
+				{
+					if (len + (epage - spage + 1) >= cap)
+					{
+						int n = cap ? cap * 2 : 8;
+						while (len + (epage - spage + 1) >= n)
+							n *= 2;
+						pages = fz_realloc_array(ctx, pages, n, int);
+						cap = n;
+					}
+
+					if (spage < epage)
+						for (page = spage; page <= epage; ++page)
+							pages[len++] = page - 1;
+					else
+						for (page = spage; page >= epage; --page)
+							pages[len++] = page - 1;
+				}
+
+				argidx++;
+			}
+
+			pdf_rearrange_pages(ctx, pdf, len, pages, opts->structure);
+		}
+
+		pdf_rewrite_images(ctx, pdf, &opts->image);
+
+		if (opts->subset_fonts)
+			pdf_subset_fonts(ctx, pdf, len, pages);
+
+		pdf_save_document(ctx, pdf, outfile, &opts->write);
+	}
+	fz_always(ctx)
+	{
+		fz_free(ctx, pages);
+		pdf_drop_document(ctx, pdf);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children