Python2/PyMuPDF: mupdf-source/source/pdf/pdf-clean.c comparison

comparison mupdf-source/source/pdf/pdf-clean.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+// Copyright (C) 2004-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+#include "mupdf/fitz.h"
+#include "pdf-annot-imp.h"
+#include <string.h>
+#include <assert.h>
+/* Forward declarations: pdf_filter_resources calls both of these before
+ * their definitions appear further down. */
+static void pdf_filter_xobject(fz_context *ctx, pdf_document *doc, pdf_obj *xobj, pdf_obj *page_res, pdf_filter_options *options, pdf_cycle_list *cycle_up);
+static void pdf_filter_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *page_res, pdf_filter_options *options, pdf_cycle_list *cycle_up);
+/*
+	Filter the content streams reachable from the resource dictionary
+	'res': soft mask groups referenced from ExtGState entries, patterns
+	with PatternType 1, form XObjects (skipped when forms are being
+	instanced) and Type 3 fonts.
+	'in_res' is passed down as the fallback resource dictionary for
+	objects that have none of their own. Does nothing unless
+	options->recurse is set.
+*/
+static void
+pdf_filter_resources(fz_context *ctx, pdf_document *doc, pdf_obj *in_res, pdf_obj *res, pdf_filter_options *options, pdf_cycle_list *cycle_up)
+{
+	pdf_obj *dict;
+	int idx, count;
+	if (!options->recurse)
+		return;
+	/* ExtGState: a soft mask keeps its transparency group XObject in /G. */
+	dict = pdf_dict_get(ctx, res, PDF_NAME(ExtGState));
+	count = dict ? pdf_dict_len(ctx, dict) : 0;
+	for (idx = 0; idx < count; idx++)
+	{
+		pdf_obj *gstate = pdf_dict_get_val(ctx, dict, idx);
+		pdf_obj *smask = pdf_dict_get(ctx, gstate, PDF_NAME(SMask));
+		pdf_obj *group = smask ? pdf_dict_get(ctx, smask, PDF_NAME(G)) : NULL;
+		if (group)
+			pdf_filter_xobject(ctx, doc, group, in_res, options, cycle_up);
+	}
+	/* Pattern: only PatternType 1 entries are filtered. */
+	dict = pdf_dict_get(ctx, res, PDF_NAME(Pattern));
+	count = dict ? pdf_dict_len(ctx, dict) : 0;
+	for (idx = 0; idx < count; idx++)
+	{
+		pdf_obj *pattern = pdf_dict_get_val(ctx, dict, idx);
+		if (!pattern)
+			continue;
+		if (pdf_dict_get_int(ctx, pattern, PDF_NAME(PatternType)) == 1)
+			pdf_filter_xobject(ctx, doc, pattern, in_res, options, cycle_up);
+	}
+	/* XObject: forms are skipped here when they are instanced instead. */
+	dict = options->instance_forms ? NULL : pdf_dict_get(ctx, res, PDF_NAME(XObject));
+	count = dict ? pdf_dict_len(ctx, dict) : 0;
+	for (idx = 0; idx < count; idx++)
+	{
+		pdf_obj *form = pdf_dict_get_val(ctx, dict, idx);
+		if (!form)
+			continue;
+		if (pdf_dict_get(ctx, form, PDF_NAME(Subtype)) == PDF_NAME(Form))
+			pdf_filter_xobject(ctx, doc, form, in_res, options, cycle_up);
+	}
+	/* Font: only Type 3 fonts carry content streams. */
+	dict = pdf_dict_get(ctx, res, PDF_NAME(Font));
+	count = dict ? pdf_dict_len(ctx, dict) : 0;
+	for (idx = 0; idx < count; idx++)
+	{
+		pdf_obj *font = pdf_dict_get_val(ctx, dict, idx);
+		if (!font)
+			continue;
+		if (pdf_dict_get(ctx, font, PDF_NAME(Subtype)) == PDF_NAME(Type3))
+			pdf_filter_type3(ctx, doc, font, in_res, options, cycle_up);
+	}
+}
+/*
+	Run one content stream through the configured filter chain and collect
+	the result.
+	The stream 'in_stm' is processed with 'in_res' as its resources. The
+	filtered operators are written to a newly created buffer returned in
+	*out_buf, and the resource dictionary produced by processing is
+	returned in *out_res. The caller owns both.
+	Afterwards the collected resources are themselves filtered through
+	pdf_filter_resources, which only acts when the 'recurse' option is set.
+	On error both outputs are released, reset to NULL, and the error is
+	rethrown.
+*/
+static void
+pdf_filter_content_stream(
+	fz_context *ctx,
+	pdf_document *doc,
+	pdf_obj *in_stm,
+	pdf_obj *in_res,
+	fz_matrix transform,
+	pdf_filter_options *options,
+	int struct_parents,
+	fz_buffer **out_buf,
+	pdf_obj **out_res,
+	pdf_cycle_list *cycle_up)
+{
+	pdf_processor *writer = NULL;
+	pdf_processor *head = NULL;
+	pdf_processor **chain = NULL;
+	int nfilters = 0;
+	int k;
+	fz_var(writer);
+	*out_buf = NULL;
+	*out_res = NULL;
+	/* The factory list ends at the first entry whose filter is NULL. */
+	if (options->filters)
+	{
+		while (options->filters[nfilters].filter != NULL)
+			nfilters++;
+	}
+	if (nfilters > 0)
+		chain = fz_calloc(ctx, nfilters, sizeof(pdf_processor *));
+	fz_try(ctx)
+	{
+		*out_buf = fz_new_buffer(ctx, 1024);
+		writer = pdf_new_buffer_processor(ctx, *out_buf, options->ascii, options->newlines);
+		head = writer;
+		/* Build back to front: each filter feeds the one created before it,
+		 * so filters[0] ends up as the head of the chain. */
+		for (k = nfilters; k-- > 0; )
+		{
+			chain[k] = options->filters[k].filter(ctx, doc, head, struct_parents, transform, options, options->filters[k].options);
+			head = chain[k];
+		}
+		pdf_process_contents(ctx, head, doc, in_res, in_stm, NULL, out_res);
+		pdf_close_processor(ctx, head);
+		pdf_filter_resources(ctx, doc, in_res, *out_res, options, cycle_up);
+	}
+	fz_always(ctx)
+	{
+		for (k = 0; k < nfilters; k++)
+			pdf_drop_processor(ctx, chain[k]);
+		pdf_drop_processor(ctx, writer);
+		fz_free(ctx, chain);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_buffer(ctx, *out_buf);
+		*out_buf = NULL;
+		pdf_drop_obj(ctx, *out_res);
+		*out_res = NULL;
+		fz_rethrow(ctx);
+	}
+}
+/*
+	Clean a Type 3 font's CharProcs content streams. This works almost
+	exactly like pdf_filter_content_stream, but the resource dictionary is
+	shared between all of the CharProcs: one resource scope is pushed
+	before the first glyph and popped after the last one.
+	'obj' is the font dictionary; 'page_res' is used when the font has no
+	/Resources of its own. Unless options->no_update is set, every
+	CharProc stream is replaced by its filtered form and the font's
+	/Resources entry is replaced by the popped dictionary. With no_update
+	the document is left untouched, matching pdf_filter_xobject and
+	pdf_filter_page_contents.
+*/
+static void
+pdf_filter_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *page_res, pdf_filter_options *options, pdf_cycle_list *cycle_up)
+{
+	pdf_cycle_list cycle;
+	pdf_processor *proc_buffer = NULL;
+	pdf_obj *in_res;
+	pdf_obj *charprocs;
+	int i, n;
+	int num_filters = 0;
+	int pushed = 0;
+	pdf_processor **list = NULL;
+	fz_buffer *buffer = NULL;
+	pdf_processor *top = NULL;
+	pdf_obj *res = NULL;
+	fz_buffer *new_buf = NULL;
+	/* Everything assigned inside fz_try and read in fz_always/fz_catch
+	 * must be protected with fz_var. */
+	fz_var(proc_buffer);
+	fz_var(top);
+	fz_var(pushed);
+	fz_var(buffer);
+	fz_var(res);
+	fz_var(new_buf);
+	/* We cannot combine instancing with type3 fonts. The new names for
+	 * instanced form/image resources would clash, since they start over for
+	 * each content stream. This is not a problem for now, because we only
+	 * use instancing with redaction, and redaction doesn't clean type3
+	 * fonts.
+	 */
+	assert(!options->instance_forms);
+	/* Avoid recursive cycles! */
+	if (pdf_cycle(ctx, &cycle, cycle_up, obj))
+		return;
+	/* The factory list ends at the first entry whose filter is NULL. */
+	if (options->filters)
+		for (; options->filters[num_filters].filter != NULL; num_filters++);
+	if (num_filters > 0)
+		list = fz_calloc(ctx, num_filters, sizeof(pdf_processor *));
+	fz_try(ctx)
+	{
+		in_res = pdf_dict_get(ctx, obj, PDF_NAME(Resources));
+		if (!in_res)
+			in_res = page_res;
+		buffer = fz_new_buffer(ctx, 1024);
+		top = proc_buffer = pdf_new_buffer_processor(ctx, buffer, options->ascii, options->newlines);
+		if (num_filters > 0)
+		{
+			for (i = num_filters - 1; i >= 0; i--)
+				top = list[i] = options->filters[i].filter(ctx, doc, top, -1, fz_identity, options, options->filters[i].options);
+		}
+		pdf_processor_push_resources(ctx, top, in_res);
+		/* Only from here on is there a resource scope to pop again. */
+		pushed = 1;
+		charprocs = pdf_dict_get(ctx, obj, PDF_NAME(CharProcs));
+		n = pdf_dict_len(ctx, charprocs);
+		for (i = 0; i < n; i++)
+		{
+			pdf_obj *val = pdf_dict_get_val(ctx, charprocs, i);
+			/* The processor chain and output buffer are reused per glyph. */
+			if (i > 0)
+			{
+				pdf_reset_processor(ctx, top);
+				fz_clear_buffer(ctx, buffer);
+			}
+			pdf_process_raw_contents(ctx, top, doc, in_res, val, NULL);
+			pdf_close_processor(ctx, top);
+			if (!options->no_update)
+			{
+				/* Store a copy, since 'buffer' is cleared for the next glyph. */
+				new_buf = fz_clone_buffer(ctx, buffer);
+				pdf_update_stream(ctx, doc, val, new_buf, 0);
+				fz_drop_buffer(ctx, new_buf);
+				new_buf = NULL;
+			}
+		}
+	}
+	fz_always(ctx)
+	{
+		/* Do not pop if we failed before the push (top may even be NULL). */
+		if (pushed)
+			res = pdf_processor_pop_resources(ctx, top);
+		for (i = 0; i < num_filters; i++)
+			pdf_drop_processor(ctx, list[i]);
+		pdf_drop_processor(ctx, proc_buffer);
+		fz_free(ctx, list);
+		fz_drop_buffer(ctx, new_buf);
+		fz_drop_buffer(ctx, buffer);
+	}
+	fz_catch(ctx)
+	{
+		pdf_drop_obj(ctx, res);
+		fz_rethrow(ctx);
+	}
+	/* Honour no_update for the resources too: the unfiltered CharProcs
+	 * still need the resource dictionary they were written against. */
+	if (options->no_update)
+		pdf_drop_obj(ctx, res);
+	else
+		pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), res);
+}
+/*
+	Filter a form-like stream object (form XObject, pattern, soft mask
+	group or annotation appearance stream) in place.
+	The stream's own /Resources are used if present, 'page_res' otherwise.
+	Unless options->no_update is set, the stream data and its /Resources
+	entry are replaced by the filtered versions. Objects already on the
+	cycle list are skipped.
+*/
+static void
+pdf_filter_xobject(fz_context *ctx, pdf_document *doc, pdf_obj *stm, pdf_obj *page_res, pdf_filter_options *options, pdf_cycle_list *cycle_up)
+{
+	pdf_cycle_list cycle;
+	fz_buffer *filtered = NULL;
+	pdf_obj *filtered_res = NULL;
+	pdf_obj *res;
+	int struct_parents;
+	fz_var(filtered);
+	fz_var(filtered_res);
+	// TODO for RJW: XObject can also be a StructParent; how do we handle that case?
+	struct_parents = pdf_dict_get_int_default(ctx, stm, PDF_NAME(StructParents), -1);
+	res = pdf_dict_get(ctx, stm, PDF_NAME(Resources));
+	if (res == NULL)
+		res = page_res;
+	// TODO: don't clean objects more than once.
+	/* Stop here if this object is already being filtered further up. */
+	if (pdf_cycle(ctx, &cycle, cycle_up, stm))
+		return;
+	fz_try(ctx)
+	{
+		pdf_filter_content_stream(ctx, doc, stm, res, fz_identity, options, struct_parents, &filtered, &filtered_res, &cycle);
+		if (!options->no_update)
+		{
+			pdf_update_stream(ctx, doc, stm, filtered, 0);
+			pdf_dict_put(ctx, stm, PDF_NAME(Resources), filtered_res);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, filtered);
+		pdf_drop_obj(ctx, filtered_res);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
+/*
+	Filter one use ("instance") of a form XObject into a new object.
+	A copy of 'old_xobj' is added to the document, the content of
+	'old_xobj' is filtered with 'transform' pre-multiplied by the form's
+	/Matrix, and (unless options->no_update is set) the filtered stream
+	and resources are stored in the copy.
+	Returns a new reference to the copy. If 'old_xobj' is already on the
+	cycle list, a new reference to 'old_xobj' itself is returned instead.
+	On error nothing is returned and the error is rethrown.
+*/
+pdf_obj *
+pdf_filter_xobject_instance(fz_context *ctx, pdf_obj *old_xobj, pdf_obj *page_res, fz_matrix transform, pdf_filter_options *options, pdf_cycle_list *cycle_up)
+{
+	pdf_cycle_list cycle;
+	pdf_document *doc = pdf_get_bound_document(ctx, old_xobj);
+	/* These are dropped in fz_always/fz_catch, so they must start out as
+	 * NULL in case an error strikes before they are assigned. */
+	pdf_obj *new_xobj = NULL;
+	pdf_obj *new_res = NULL;
+	pdf_obj *old_res;
+	fz_buffer *new_buf = NULL;
+	int struct_parents;
+	fz_matrix matrix;
+	fz_var(new_xobj);
+	fz_var(new_buf);
+	fz_var(new_res);
+	// TODO for RJW: XObject can also be a StructParent; how do we handle that case?
+	// TODO for RJW: will we run into trouble by duplicating StructParents stuff?
+	struct_parents = pdf_dict_get_int_default(ctx, old_xobj, PDF_NAME(StructParents), -1);
+	old_res = pdf_dict_get(ctx, old_xobj, PDF_NAME(Resources));
+	if (!old_res)
+		old_res = page_res;
+	/* Avoid recursive cycles: hand back the unfiltered form. */
+	if (pdf_cycle(ctx, &cycle, cycle_up, old_xobj))
+		return pdf_keep_obj(ctx, old_xobj);
+	matrix = pdf_dict_get_matrix(ctx, old_xobj, PDF_NAME(Matrix));
+	transform = fz_concat(matrix, transform);
+	fz_try(ctx)
+	{
+		new_xobj = pdf_add_object_drop(ctx, doc, pdf_copy_dict(ctx, old_xobj));
+		pdf_filter_content_stream(ctx, doc, old_xobj, old_res, transform, options, struct_parents, &new_buf, &new_res, &cycle);
+		if (!options->no_update)
+		{
+			pdf_update_stream(ctx, doc, new_xobj, new_buf, 0);
+			pdf_dict_put(ctx, new_xobj, PDF_NAME(Resources), new_res);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, new_buf);
+		pdf_drop_obj(ctx, new_res);
+	}
+	fz_catch(ctx)
+	{
+		pdf_drop_obj(ctx, new_xobj);
+		fz_rethrow(ctx);
+	}
+	return new_xobj;
+}
+/*
+	Filter the content stream(s) of a page.
+	The page contents are run through pdf_filter_content_stream. If
+	options->complete is set it is then called with the filtered buffer
+	and options->opaque. Unless options->no_update is set, the page's
+	/Contents is replaced by a newly created stream holding the filtered
+	data and its /Resources by the filtered resource dictionary.
+*/
+void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, pdf_filter_options *options)
+{
+	pdf_obj *stream;
+	pdf_obj *page_res;
+	pdf_obj *filtered_res;
+	fz_buffer *filtered;
+	int struct_parents;
+	struct_parents = pdf_dict_get_int_default(ctx, page->obj, PDF_NAME(StructParents), -1);
+	stream = pdf_page_contents(ctx, page);
+	page_res = pdf_page_resources(ctx, page);
+	/* Called outside the try: on failure it has already released its outputs. */
+	pdf_filter_content_stream(ctx, doc, stream, page_res, fz_identity, options, struct_parents, &filtered, &filtered_res, NULL);
+	fz_try(ctx)
+	{
+		if (options->complete)
+			options->complete(ctx, filtered, options->opaque);
+		if (!options->no_update)
+		{
+			/* A brand new stream object always replaces the page contents. That covers
+			   contents given as an array of streams, contents that are missing altogether,
+			   and contents shared with other pages. */
+			stream = pdf_add_object_drop(ctx, doc, pdf_new_dict(ctx, doc, 1));
+			pdf_dict_put_drop(ctx, page->obj, PDF_NAME(Contents), stream);
+			pdf_update_stream(ctx, doc, stream, filtered, 0);
+			pdf_dict_put(ctx, page->obj, PDF_NAME(Resources), filtered_res);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, filtered);
+		pdf_drop_obj(ctx, filtered_res);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
+/*
+	Filter the appearance streams of an annotation: every value of the
+	annotation's /AP dictionary that is a stream is filtered in place.
+	Nothing happens if /AP is not a dictionary.
+*/
+void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, pdf_filter_options *options)
+{
+	pdf_obj *appearances = pdf_dict_get(ctx, annot->obj, PDF_NAME(AP));
+	int idx, count;
+	if (!pdf_is_dict(ctx, appearances))
+		return;
+	count = pdf_dict_len(ctx, appearances);
+	for (idx = 0; idx < count; idx++)
+	{
+		pdf_obj *entry = pdf_dict_get_val(ctx, appearances, idx);
+		if (!pdf_is_stream(ctx, entry))
+			continue;
+		pdf_filter_xobject(ctx, doc, entry, NULL, options, NULL);
+	}
+}
+/* REDACTIONS */
+struct redact_filter_state {
+	pdf_filter_options filter_opts; /* options passed to pdf_filter_page_contents; its opaque points back at this struct */
+	pdf_sanitize_filter_options sanitize_opts; /* options for the sanitize filter; its opaque also points at this struct */
+	pdf_filter_factory filter_list[2]; /* [0] is the sanitize filter, [1] the NULL terminator */
+	pdf_page *page; /* page whose Redact annotations are being applied */
+	pdf_annot *target; // the one annotation to apply; NULL if all
+	int line_art; /* PDF_REDACT_LINE_ART_* mode, consulted by culler() */
+	int text; /* text mode copied from the redact options */
+};
+/*
+	Append the content of 'obj' to 'buffer' by running it through a buffer
+	processor. The object's own resources are used if it has any, the
+	page's resources otherwise.
+*/
+static void pdf_run_obj_to_buf(fz_context *ctx, fz_buffer *buffer, pdf_obj *obj, pdf_page *page)
+{
+	pdf_processor *writer = pdf_new_buffer_processor(ctx, buffer, 0, 0);
+	pdf_obj *resources;
+	fz_try(ctx)
+	{
+		resources = pdf_xobject_resources(ctx, obj);
+		if (!resources)
+			resources = pdf_page_resources(ctx, page);
+		pdf_process_contents(ctx, writer, page->doc, resources, obj, NULL, NULL);
+		pdf_close_processor(ctx, writer);
+	}
+	fz_always(ctx)
+	{
+		pdf_drop_processor(ctx, writer);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
+/*
+	'complete' callback installed when black boxes are requested: appends
+	drawing operators for every applicable Redact annotation to the end of
+	the filtered page content.
+	An annotation with an /RO entry has that object's content appended.
+	Otherwise each quad of its /QuadPoints is filled, or its /Rect when it
+	has no quad points.
+*/
+static void
+pdf_redact_end_page(fz_context *ctx, fz_buffer *buf, void *opaque)
+{
+	struct redact_filter_state *state = opaque;
+	pdf_page *page = state->page;
+	pdf_annot *annot;
+	/* Set the fill colour for the boxes emitted below. */
+	fz_append_string(ctx, buf, " 0 g\n");
+	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
+	{
+		pdf_obj *overlay;
+		pdf_obj *quads;
+		int count, k;
+		if (state->target != NULL && state->target != annot)
+			continue;
+		if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) != PDF_NAME(Redact))
+			continue;
+		overlay = pdf_dict_get(ctx, annot->obj, PDF_NAME(RO));
+		if (overlay)
+		{
+			pdf_run_obj_to_buf(ctx, buf, overlay, page);
+			continue;
+		}
+		quads = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
+		count = pdf_array_len(ctx, quads);
+		if (count > 0)
+		{
+			/* Eight numbers per quad. */
+			for (k = 0; k < count; k += 8)
+			{
+				fz_quad quad = pdf_to_quad(ctx, quads, k);
+				fz_append_printf(ctx, buf, "%g %g m\n", quad.ll.x, quad.ll.y);
+				fz_append_printf(ctx, buf, "%g %g l\n", quad.lr.x, quad.lr.y);
+				fz_append_printf(ctx, buf, "%g %g l\n", quad.ur.x, quad.ur.y);
+				fz_append_printf(ctx, buf, "%g %g l\n", quad.ul.x, quad.ul.y);
+				fz_append_string(ctx, buf, "f\n");
+			}
+		}
+		else
+		{
+			fz_rect box = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
+			fz_append_printf(ctx, buf, "%g %g m\n", box.x0, box.y0);
+			fz_append_printf(ctx, buf, "%g %g l\n", box.x1, box.y0);
+			fz_append_printf(ctx, buf, "%g %g l\n", box.x1, box.y1);
+			fz_append_printf(ctx, buf, "%g %g l\n", box.x0, box.y1);
+			fz_append_string(ctx, buf, "f\n");
+		}
+	}
+}
+/*
+	Text filter callback for redaction.
+	Returns 1 if the character box 'bbox' (transformed by trm and ctm, then
+	shrunk by a tenth of its size on every side) overlaps an applicable
+	Redact annotation, 0 otherwise. The annotation's /QuadPoints are tested
+	if it has any, its /Rect otherwise.
+*/
+static int
+pdf_redact_text_filter(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox)
+{
+	struct redact_filter_state *state = opaque;
+	pdf_page *page = state->page;
+	pdf_annot *annot;
+	float w, h;
+	trm = fz_concat(trm, ctm);
+	bbox = fz_transform_rect(bbox, trm);
+	/* Pull every edge of the character box inwards a little. */
+	w = bbox.x1 - bbox.x0;
+	h = bbox.y1 - bbox.y0;
+	bbox.x0 += w / 10;
+	bbox.x1 -= w / 10;
+	bbox.y0 += h / 10;
+	bbox.y1 -= h / 10;
+	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
+	{
+		pdf_obj *quads;
+		fz_rect zone;
+		int count, k;
+		if (state->target != NULL && state->target != annot)
+			continue;
+		if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) != PDF_NAME(Redact))
+			continue;
+		quads = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
+		count = pdf_array_len(ctx, quads);
+		/* The intersection only has to be a valid rectangle; it does NOT
+		 * have to be non-empty. Character boxes can be 'empty' (diacritics,
+		 * say): zero width, yet with a definite position on the plane, so
+		 * asking whether they are included still makes sense. */
+		if (count > 0)
+		{
+			for (k = 0; k < count; k += 8)
+			{
+				zone = fz_rect_from_quad(pdf_to_quad(ctx, quads, k));
+				if (fz_is_valid_rect(fz_intersect_rect(bbox, zone)))
+					return 1;
+			}
+			continue;
+		}
+		zone = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
+		if (fz_is_valid_rect(fz_intersect_rect(bbox, zone)))
+			return 1;
+	}
+	return 0;
+}
+/* Map quad q through the inverse of ctm onto the pixel grid of a w by h
+ * pixmap. The result is clamped to the pixmap and has its y axis flipped,
+ * so the covered rows run from y1 (inclusive) to y0 (exclusive). */
+static fz_irect
+pdf_redact_pixel_rect(fz_matrix ctm, fz_quad q, int w, int h)
+{
+	fz_matrix to_pixels = fz_post_scale(fz_invert_matrix(ctm), w, h);
+	fz_irect r = fz_round_rect(fz_transform_rect(fz_rect_from_quad(q), to_pixels));
+	r.x0 = fz_clampi(r.x0, 0, w);
+	r.x1 = fz_clampi(r.x1, 0, w);
+	r.y1 = fz_clampi(h - r.y1, 0, h);
+	r.y0 = fz_clampi(h - r.y0, 0, h);
+	return r;
+}
+/*
+	Blank out the part of an image covered by the quad 'q'.
+	'pixmap' is the working copy of the image samples; if NULL, a copy is
+	made from 'image' (alpha-inverted for image masks). '*pmask' is the
+	working copy of the image's mask; if NULL and the image has a mask, a
+	copy is made and stored in *pmask.
+	Covered pixels have their colour components set to white (0 for
+	subtractive colourspaces, 255 otherwise) and alpha to 255; covered
+	mask samples are set to 0xff.
+	Returns the working pixmap, which the caller owns.
+*/
+static fz_pixmap *
+pdf_redact_image_imp(fz_context *ctx, fz_matrix ctm, fz_image *image, fz_pixmap *pixmap, fz_pixmap **pmask, fz_quad q)
+{
+	fz_pixmap *mask = *pmask;
+	fz_irect area;
+	int made_pixmap = 0;
+	int row, col, c;
+	if (pixmap == NULL)
+	{
+		fz_pixmap *decoded = fz_get_pixmap_from_image(ctx, image, NULL, NULL, NULL, NULL);
+		int imagemask = image->imagemask;
+		fz_try(ctx)
+		{
+			pixmap = fz_clone_pixmap(ctx, decoded);
+			if (imagemask)
+				fz_invert_pixmap_alpha(ctx, pixmap);
+		}
+		fz_always(ctx)
+		{
+			fz_drop_pixmap(ctx, decoded);
+		}
+		fz_catch(ctx)
+		{
+			fz_rethrow(ctx);
+		}
+		made_pixmap = 1;
+	}
+	if (mask == NULL && image->mask)
+	{
+		fz_pixmap *decoded = fz_get_pixmap_from_image(ctx, image->mask, NULL, NULL, NULL, NULL);
+		fz_try(ctx)
+		{
+			mask = fz_clone_pixmap(ctx, decoded);
+			*pmask = mask;
+		}
+		fz_always(ctx)
+		{
+			fz_drop_pixmap(ctx, decoded);
+		}
+		fz_catch(ctx)
+		{
+			/* The caller never saw a pixmap created during this call. */
+			if (made_pixmap)
+				fz_drop_pixmap(ctx, pixmap);
+			fz_rethrow(ctx);
+		}
+	}
+	/* A 1x1 image with a mask applied is left alone: in that case it is
+	 * the mask that has to change. Only a small part of the image may be
+	 * covered, and whitening the single pixel would blank everything
+	 * outside the wanted area as well. */
+	if (!mask || pixmap->w > 1 || pixmap->h > 1)
+	{
+		int ncolor = pixmap->n - pixmap->alpha;
+		int pixel_size = pixmap->n;
+		unsigned char white = fz_colorspace_is_subtractive(ctx, pixmap->colorspace) ? 0 : 255;
+		area = pdf_redact_pixel_rect(ctm, q, pixmap->w, pixmap->h);
+		for (row = area.y1; row < area.y0; ++row)
+		{
+			for (col = area.x0; col < area.x1; ++col)
+			{
+				unsigned char *px = &pixmap->samples[(size_t)row * pixmap->stride + (size_t)col * pixel_size];
+				for (c = 0; c < ncolor; ++c)
+					px[c] = white;
+				if (pixmap->alpha)
+					px[ncolor] = 255;
+			}
+		}
+	}
+	if (mask)
+	{
+		area = pdf_redact_pixel_rect(ctm, q, mask->w, mask->h);
+		for (row = area.y1; row < area.y0; ++row)
+			memset(&mask->samples[(size_t)row * mask->stride + (size_t)area.x0], 0xff, area.x1-area.x0);
+	}
+	return pixmap;
+}
+/*
+	Image filter callback that removes touched images.
+	Returns NULL if the image's area (the unit square transformed by ctm)
+	has a non-empty intersection with any applicable Redact annotation;
+	otherwise returns a new reference to the unchanged image.
+*/
+static fz_image *
+pdf_redact_image_filter_remove(fz_context *ctx, void *opaque, fz_matrix ctm, const char *name, fz_image *image, fz_rect clip)
+{
+	struct redact_filter_state *state = opaque;
+	pdf_page *page = state->page;
+	fz_rect area = fz_transform_rect(fz_unit_rect, ctm);
+	pdf_annot *annot;
+	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
+	{
+		pdf_obj *quads;
+		fz_rect overlap;
+		int count, k;
+		if (state->target != NULL && state->target != annot)
+			continue;
+		if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) != PDF_NAME(Redact))
+			continue;
+		quads = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
+		count = pdf_array_len(ctx, quads);
+		if (count > 0)
+		{
+			for (k = 0; k < count; k += 8)
+			{
+				overlap = fz_intersect_rect(fz_rect_from_quad(pdf_to_quad(ctx, quads, k)), area);
+				if (!fz_is_empty_rect(overlap))
+					return NULL;
+			}
+			continue;
+		}
+		overlap = fz_intersect_rect(pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)), area);
+		if (!fz_is_empty_rect(overlap))
+			return NULL;
+	}
+	return fz_keep_image(ctx, image);
+}
+/*
+	Image filter callback that removes images whose visible part is
+	touched.
+	Works like pdf_redact_image_filter_remove, except that the image's
+	area is first intersected with 'clip', so only the part that can
+	actually be seen is tested against the redactions.
+*/
+static fz_image *
+pdf_redact_image_filter_remove_invisible(fz_context *ctx, void *opaque, fz_matrix ctm, const char *name, fz_image *image, fz_rect clip)
+{
+	struct redact_filter_state *state = opaque;
+	pdf_page *page = state->page;
+	fz_rect area;
+	pdf_annot *annot;
+	/* Limit the image's area to the part that is inside the clip. */
+	area = fz_intersect_rect(fz_transform_rect(fz_unit_rect, ctm), clip);
+	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
+	{
+		pdf_obj *quads;
+		fz_rect overlap;
+		int count, k;
+		if (state->target != NULL && state->target != annot)
+			continue;
+		if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) != PDF_NAME(Redact))
+			continue;
+		quads = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
+		count = pdf_array_len(ctx, quads);
+		if (count > 0)
+		{
+			for (k = 0; k < count; k += 8)
+			{
+				overlap = fz_intersect_rect(fz_rect_from_quad(pdf_to_quad(ctx, quads, k)), area);
+				if (!fz_is_empty_rect(overlap))
+					return NULL;
+			}
+			continue;
+		}
+		overlap = fz_intersect_rect(pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)), area);
+		if (!fz_is_empty_rect(overlap))
+			return NULL;
+	}
+	return fz_keep_image(ctx, image);
+}
+/*
+	Image filter callback that blanks out redacted pixels.
+	Returns NULL if the whole image area lies inside a single redaction
+	quad. Otherwise every redaction quad that intersects the image is
+	blanked out in a copy of the image (and of its mask, if any) and a new
+	image built from the copies is returned. If no redaction touches the
+	image, a new reference to the original image is returned.
+*/
+static fz_image *
+pdf_redact_image_filter_pixels(fz_context *ctx, void *opaque, fz_matrix ctm, const char *name, fz_image *image, fz_rect clip)
+{
+	struct redact_filter_state *state = opaque;
+	pdf_page *page = state->page;
+	fz_pixmap *pixels = NULL;
+	fz_pixmap *mask_pixels = NULL;
+	fz_image *imask = NULL;
+	pdf_annot *annot;
+	pdf_obj *quads;
+	fz_quad image_quad, zone;
+	int imagemask;
+	int count, k;
+	fz_var(pixels);
+	fz_var(mask_pixels);
+	fz_var(imask);
+	image_quad = fz_transform_quad(fz_quad_from_rect(fz_unit_rect), ctm);
+	/* Pass 1: an image entirely inside one redaction is dropped outright. */
+	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
+	{
+		if (state->target != NULL && state->target != annot)
+			continue;
+		if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) != PDF_NAME(Redact))
+			continue;
+		quads = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
+		count = pdf_array_len(ctx, quads);
+		if (count > 0)
+		{
+			for (k = 0; k < count; k += 8)
+			{
+				zone = pdf_to_quad(ctx, quads, k);
+				if (fz_is_quad_inside_quad(image_quad, zone))
+					return NULL;
+			}
+			continue;
+		}
+		zone = fz_quad_from_rect(pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)));
+		if (fz_is_quad_inside_quad(image_quad, zone))
+			return NULL;
+	}
+	/* Pass 2: blank whatever parts of the image the redactions cover. */
+	fz_try(ctx)
+	{
+		for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
+		{
+			if (state->target != NULL && state->target != annot)
+				continue;
+			if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) != PDF_NAME(Redact))
+				continue;
+			quads = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
+			count = pdf_array_len(ctx, quads);
+			if (count > 0)
+			{
+				for (k = 0; k < count; k += 8)
+				{
+					zone = pdf_to_quad(ctx, quads, k);
+					if (fz_is_quad_intersecting_quad(image_quad, zone))
+						pixels = pdf_redact_image_imp(ctx, ctm, image, pixels, &mask_pixels, zone);
+				}
+				continue;
+			}
+			zone = fz_quad_from_rect(pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)));
+			if (fz_is_quad_intersecting_quad(image_quad, zone))
+				pixels = pdf_redact_image_imp(ctx, ctm, image, pixels, &mask_pixels, zone);
+		}
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_pixmap(ctx, pixels);
+		fz_drop_pixmap(ctx, mask_pixels);
+		fz_rethrow(ctx);
+	}
+	/* Nothing was touched: hand back the image as it is. */
+	if (pixels == NULL)
+		return fz_keep_image(ctx, image);
+	imagemask = image->imagemask;
+	imask = fz_keep_image(ctx, image->mask);
+	fz_try(ctx)
+	{
+		/* A redacted mask replaces the original one. */
+		if (mask_pixels)
+		{
+			fz_drop_image(ctx, imask);
+			imask = NULL;
+			imask = fz_new_image_from_pixmap(ctx, mask_pixels, NULL);
+		}
+		image = fz_new_image_from_pixmap(ctx, pixels, NULL);
+		image->imagemask = imagemask;
+		/* The new image takes over our reference to the mask. */
+		image->mask = imask;
+		imask = NULL;
+	}
+	fz_always(ctx)
+	{
+		fz_drop_pixmap(ctx, pixels);
+		fz_drop_pixmap(ctx, mask_pixels);
+		fz_drop_image(ctx, imask);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+	return image;
+}
+/* Returns 0 if area does not intersect with any of our redactions.
+* Returns 2 if area is completely included within one of our redactions.
+* Returns 1 otherwise.
+*
+* Every applicable redaction is examined before settling for 1, so a
+* redaction that merely touches the area cannot hide a later one that
+* contains it completely. As before, a redaction is only considered at
+* all if its intersection with area is non-empty. */
+static int
+rect_touches_redactions(fz_context *ctx, fz_rect area, struct redact_filter_state *red)
+{
+	pdf_annot *annot;
+	pdf_obj *qp;
+	fz_rect r;
+	int i, n;
+	int touched = 0;
+	pdf_page *page = red->page;
+	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
+	{
+		if (red->target != NULL && red->target != annot)
+			continue;
+		if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) != PDF_NAME(Redact))
+			continue;
+		qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
+		n = pdf_array_len(ctx, qp);
+		if (n > 0)
+		{
+			/* Eight numbers per quad; each quad is tested by its bounding rect. */
+			for (i = 0; i < n; i += 8)
+			{
+				r = fz_rect_from_quad(pdf_to_quad(ctx, qp, i));
+				if (fz_is_empty_rect(fz_intersect_rect(r, area)))
+					continue;
+				if (fz_contains_rect(r, area))
+					return 2;
+				touched = 1;
+			}
+		}
+		else
+		{
+			r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
+			if (fz_is_empty_rect(fz_intersect_rect(r, area)))
+				continue;
+			if (fz_contains_rect(r, area))
+				return 2;
+			touched = 1;
+		}
+	}
+	return touched;
+}
+/*
+	Delete every Link entry of the page's /Annots array whose /Rect
+	touches one of the redactions being applied.
+*/
+static void
+pdf_redact_page_links(fz_context *ctx, struct redact_filter_state *red)
+{
+	pdf_obj *annots = pdf_dict_get(ctx, red->page->obj, PDF_NAME(Annots));
+	int idx;
+	/* The length is re-read on every pass because entries get deleted. */
+	for (idx = 0; idx < pdf_array_len(ctx, annots); )
+	{
+		pdf_obj *entry = pdf_array_get(ctx, annots, idx);
+		int remove = 0;
+		if (pdf_dict_get(ctx, entry, PDF_NAME(Subtype)) == PDF_NAME(Link))
+		{
+			fz_rect area = pdf_dict_get_rect(ctx, entry, PDF_NAME(Rect));
+			remove = rect_touches_redactions(ctx, area, red) != 0;
+		}
+		/* After a deletion the next entry has moved into this slot. */
+		if (remove)
+			pdf_array_delete(ctx, annots, idx);
+		else
+			++idx;
+	}
+}
+/*
+	Delete every FreeText annotation on the page whose /Rect touches one
+	of the redactions being applied.
+*/
+static void
+pdf_redact_page_annotations(fz_context *ctx, struct redact_filter_state *red)
+{
+	pdf_annot *annot = pdf_first_annot(ctx, red->page);
+	while (annot)
+	{
+		if (pdf_annot_type(ctx, annot) == PDF_ANNOT_FREE_TEXT)
+		{
+			fz_rect area = pdf_dict_get_rect(ctx, pdf_annot_obj(ctx, annot), PDF_NAME(Rect));
+			if (rect_touches_redactions(ctx, area, red))
+			{
+				/* Deleting changes the list, so start over from the top. */
+				pdf_delete_annot(ctx, red->page, annot);
+				annot = pdf_first_annot(ctx, red->page);
+				continue;
+			}
+		}
+		annot = pdf_next_annot(ctx, annot);
+	}
+}
+/*
+	Cull callback for redaction. Only path operations (fill, stroke and
+	their clip variants) are ever culled. Returns non-zero to cull: when
+	the line art mode is REMOVE_IF_COVERED the bbox must lie completely
+	inside a redaction, with REMOVE_IF_TOUCHED any contact is enough.
+	Everything else is kept.
+*/
+static int culler(fz_context *ctx, void *opaque, fz_rect bbox, fz_cull_type type)
+{
+	struct redact_filter_state *state = opaque;
+	int is_path;
+	switch (type)
+	{
+	case FZ_CULL_PATH_FILL:
+	case FZ_CULL_PATH_STROKE:
+	case FZ_CULL_PATH_FILL_STROKE:
+	case FZ_CULL_CLIP_PATH_FILL:
+	case FZ_CULL_CLIP_PATH_STROKE:
+	case FZ_CULL_CLIP_PATH_FILL_STROKE:
+		is_path = 1;
+		break;
+	default:
+		is_path = 0;
+		break;
+	}
+	if (!is_path)
+		return 0;
+	if (state->line_art == PDF_REDACT_LINE_ART_REMOVE_IF_COVERED)
+		return (rect_touches_redactions(ctx, bbox, state) == 2);
+	if (state->line_art == PDF_REDACT_LINE_ART_REMOVE_IF_TOUCHED)
+		return (rect_touches_redactions(ctx, bbox, state) != 0);
+	return 0;
+}
+/*
+	Fill in 'red' for redacting 'page'.
+	'redact_opts' may be NULL, in which case the defaults are: no black
+	boxes, PDF_REDACT_IMAGE_PIXELS, PDF_REDACT_LINE_ART_NONE and
+	PDF_REDACT_TEXT_REMOVE. 'target' restricts the redaction to a single
+	annotation; NULL means every Redact annotation on the page.
+*/
+static
+void init_redact_filter(fz_context *ctx, pdf_redact_options *redact_opts, struct redact_filter_state *red, pdf_page *page, pdf_annot *target)
+{
+	int black_boxes = 0;
+	int image_method = PDF_REDACT_IMAGE_PIXELS;
+	int line_art = PDF_REDACT_LINE_ART_NONE;
+	int text = PDF_REDACT_TEXT_REMOVE;
+	if (redact_opts)
+	{
+		black_boxes = redact_opts->black_boxes;
+		image_method = redact_opts->image_method;
+		line_art = redact_opts->line_art;
+		text = redact_opts->text;
+	}
+	red->page = page;
+	red->target = target;
+	red->line_art = line_art;
+	red->text = text;
+	/* Options for the filtering pass over the page contents. */
+	memset(&red->filter_opts, 0, sizeof red->filter_opts);
+	red->filter_opts.recurse = 0; /* patterns, softmasks and type3 fonts are not redacted */
+	red->filter_opts.instance_forms = 1; /* xobjects are redacted with instancing */
+	red->filter_opts.ascii = 1;
+	red->filter_opts.opaque = red;
+	red->filter_opts.filters = red->filter_list;
+	if (black_boxes)
+		red->filter_opts.complete = pdf_redact_end_page;
+	/* Options for the sanitize filter, which does the actual removal. */
+	memset(&red->sanitize_opts, 0, sizeof red->sanitize_opts);
+	red->sanitize_opts.opaque = red;
+	red->sanitize_opts.culler = culler;
+	if (text == PDF_REDACT_TEXT_REMOVE)
+		red->sanitize_opts.text_filter = pdf_redact_text_filter;
+	switch (image_method)
+	{
+	case PDF_REDACT_IMAGE_PIXELS:
+		red->sanitize_opts.image_filter = pdf_redact_image_filter_pixels;
+		break;
+	case PDF_REDACT_IMAGE_REMOVE:
+		red->sanitize_opts.image_filter = pdf_redact_image_filter_remove;
+		break;
+	case PDF_REDACT_IMAGE_REMOVE_UNLESS_INVISIBLE:
+		red->sanitize_opts.image_filter = pdf_redact_image_filter_remove_invisible;
+		break;
+	default:
+		/* Any other method leaves images unfiltered. */
+		break;
+	}
+	/* A single sanitize filter, followed by the NULL terminator. */
+	red->filter_list[0].filter = pdf_new_sanitize_filter;
+	red->filter_list[0].options = &red->sanitize_opts;
+	red->filter_list[1].filter = NULL;
+	red->filter_list[1].options = NULL;
+}
+/*
+	Apply the Redact annotations on 'page': all of them when 'target' is
+	NULL, otherwise only 'target'.
+	Returns 0 without touching the document if there is nothing to apply.
+	Otherwise, as one undoable operation, the page contents are filtered,
+	touched links and FreeText annotations are removed, the applied Redact
+	annotations are deleted and the document is flagged as redacted; 1 is
+	returned. On error the operation is abandoned and the error rethrown.
+*/
+static int
+pdf_apply_redaction_imp(fz_context *ctx, pdf_page *page, pdf_annot *target, pdf_redact_options *redact_opts)
+{
+	struct redact_filter_state red;
+	pdf_document *doc = page->doc;
+	pdf_annot *annot;
+	int found = 0;
+	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
+	{
+		if (target != NULL && target != annot)
+			continue;
+		if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
+			found = 1;
+	}
+	if (!found)
+		return 0;
+	init_redact_filter(ctx, redact_opts, &red, page, target);
+	pdf_begin_operation(ctx, doc, target ? "Apply redaction" : "Apply redactions on page");
+	fz_try(ctx)
+	{
+		pdf_filter_page_contents(ctx, doc, page, &red.filter_opts);
+		pdf_redact_page_links(ctx, &red);
+		pdf_redact_page_annotations(ctx, &red);
+		/* Remove the redaction annotations that have now been applied. */
+		annot = pdf_first_annot(ctx, page);
+		while (annot)
+		{
+			int selected = (target == NULL || annot == target);
+			if (selected && pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
+			{
+				/* Deleting changes the list, so start over from the top. */
+				pdf_delete_annot(ctx, page, annot);
+				annot = pdf_first_annot(ctx, page);
+			}
+			else
+			{
+				annot = pdf_next_annot(ctx, annot);
+			}
+		}
+		doc->redacted = 1;
+		pdf_end_operation(ctx, doc);
+	}
+	fz_catch(ctx)
+	{
+		pdf_abandon_operation(ctx, doc);
+		fz_rethrow(ctx);
+	}
+	return 1;
+}
+/*
+	Apply every Redact annotation on 'page'.
+	Throws FZ_ERROR_ARGUMENT if 'page' is NULL or does not belong to
+	'doc'. Returns 1 if any redaction was applied, 0 otherwise.
+*/
+int
+pdf_redact_page(fz_context *ctx, pdf_document *doc, pdf_page *page, pdf_redact_options *redact_opts)
+{
+	int page_in_doc = (page != NULL && page->doc == doc);
+	if (!page_in_doc)
+		fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't redact a page not from the doc");
+	return pdf_apply_redaction_imp(ctx, page, NULL, redact_opts);
+}
+/*
+	Apply the single Redact annotation 'annot' to the page it lives on.
+	Returns 1 if a redaction was applied, 0 otherwise.
+*/
+int
+pdf_apply_redaction(fz_context *ctx, pdf_annot *annot, pdf_redact_options *redact_opts)
+{
+	pdf_page *page = annot->page;
+	return pdf_apply_redaction_imp(ctx, page, annot, redact_opts);
+}
+/* Hard clipping of pages */
+struct clip_filter_state {
+	pdf_filter_options filter_opts; /* options passed to pdf_filter_page_contents; its opaque points back at this struct */
+	pdf_sanitize_filter_options sanitize_opts; /* options for the sanitize filter; its opaque also points at this struct */
+	pdf_filter_factory filter_list[2]; /* [0] is the sanitize filter, [1] the NULL terminator */
+	pdf_page *page; /* page being clipped */
+	fz_rect clip; /* copy of the caller's clip rectangle; anything whose bbox misses it is removed */
+};
+/*
+	Cull callback for hard clipping. Paths (including clip paths), glyphs,
+	images and shadings are culled when their bbox has an empty
+	intersection with the clip rectangle. Everything else is kept.
+*/
+static int clip_culler(fz_context *ctx, void *opaque, fz_rect bbox, fz_cull_type type)
+{
+	struct clip_filter_state *state = opaque;
+	int cullable;
+	switch (type)
+	{
+	case FZ_CULL_PATH_FILL:
+	case FZ_CULL_PATH_STROKE:
+	case FZ_CULL_PATH_FILL_STROKE:
+	case FZ_CULL_CLIP_PATH_FILL:
+	case FZ_CULL_CLIP_PATH_STROKE:
+	case FZ_CULL_CLIP_PATH_FILL_STROKE:
+	case FZ_CULL_GLYPH:
+	case FZ_CULL_IMAGE:
+	case FZ_CULL_SHADING:
+		cullable = 1;
+		break;
+	default:
+		cullable = 0;
+		break;
+	}
+	if (!cullable)
+		return 0;
+	return (fz_is_empty_rect(fz_intersect_rect(bbox, state->clip)));
+}
+/*
+	Fill in 'hc' for hard clipping 'page' to the rectangle '*clip'. The
+	rectangle is copied, so the caller's value need not outlive the call.
+*/
+static
+void init_clip_filter(fz_context *ctx, struct clip_filter_state *hc, pdf_page *page, fz_rect *clip)
+{
+	hc->page = page;
+	hc->clip = *clip;
+	/* Options for the filtering pass over the page contents. */
+	memset(&hc->filter_opts, 0, sizeof hc->filter_opts);
+	hc->filter_opts.recurse = 0; /* patterns, softmasks and type3 fonts are left alone */
+	hc->filter_opts.instance_forms = 1; /* xobjects are filtered with instancing */
+	hc->filter_opts.ascii = 0;
+	hc->filter_opts.opaque = hc;
+	hc->filter_opts.filters = hc->filter_list;
+	/* Options for the sanitize filter: all it gets is the culler. */
+	memset(&hc->sanitize_opts, 0, sizeof hc->sanitize_opts);
+	hc->sanitize_opts.opaque = hc;
+	hc->sanitize_opts.culler = clip_culler;
+	/* A single sanitize filter, followed by the NULL terminator. */
+	hc->filter_list[0].filter = pdf_new_sanitize_filter;
+	hc->filter_list[0].options = &hc->sanitize_opts;
+	hc->filter_list[1].filter = NULL;
+	hc->filter_list[1].options = NULL;
+}
+/*
+	Delete every Link entry of the page's /Annots array whose /Rect has an
+	empty intersection with the clip rectangle.
+*/
+static void
+pdf_clip_page_links(fz_context *ctx, struct clip_filter_state *hc)
+{
+	pdf_obj *annots = pdf_dict_get(ctx, hc->page->obj, PDF_NAME(Annots));
+	int idx;
+	/* The length is re-read on every pass because entries get deleted. */
+	for (idx = 0; idx < pdf_array_len(ctx, annots); )
+	{
+		pdf_obj *entry = pdf_array_get(ctx, annots, idx);
+		int remove = 0;
+		if (pdf_dict_get(ctx, entry, PDF_NAME(Subtype)) == PDF_NAME(Link))
+		{
+			fz_rect area = pdf_dict_get_rect(ctx, entry, PDF_NAME(Rect));
+			remove = fz_is_empty_rect(fz_intersect_rect(area, hc->clip)) != 0;
+		}
+		/* After a deletion the next entry has moved into this slot. */
+		if (remove)
+			pdf_array_delete(ctx, annots, idx);
+		else
+			++idx;
+	}
+}
+/*
+	Delete every FreeText annotation on the page whose /Rect has an empty
+	intersection with the clip rectangle.
+*/
+static void
+pdf_clip_page_annotations(fz_context *ctx, struct clip_filter_state *hc)
+{
+	pdf_annot *annot = pdf_first_annot(ctx, hc->page);
+	while (annot)
+	{
+		if (pdf_annot_type(ctx, annot) == PDF_ANNOT_FREE_TEXT)
+		{
+			fz_rect area = pdf_dict_get_rect(ctx, pdf_annot_obj(ctx, annot), PDF_NAME(Rect));
+			if (fz_is_empty_rect(fz_intersect_rect(area, hc->clip)))
+			{
+				/* Deleting changes the list, so start over from the top. */
+				pdf_delete_annot(ctx, hc->page, annot);
+				annot = pdf_first_annot(ctx, hc->page);
+				continue;
+			}
+		}
+		annot = pdf_next_annot(ctx, annot);
+	}
+}
+/*
+	Hard clip a page to the rectangle '*clip'.
+	As one undoable operation, page content whose bbox misses the clip
+	rectangle is culled, and links and FreeText annotations whose /Rect
+	misses it are deleted. On error the operation is abandoned and the
+	error rethrown.
+	Does nothing if 'page' or 'clip' is NULL.
+*/
+void
+pdf_clip_page(fz_context *ctx, pdf_page *page, fz_rect *clip)
+{
+	pdf_document *doc;
+	struct clip_filter_state hc;
+	/* Without a rectangle there is nothing to clip to; init_clip_filter
+	 * would dereference the NULL pointer. */
+	if (page == NULL || clip == NULL)
+		return;
+	doc = page->doc;
+	init_clip_filter(ctx, &hc, page, clip);
+	pdf_begin_operation(ctx, doc, "Apply hard clip to page");
+	fz_try(ctx)
+	{
+		pdf_filter_page_contents(ctx, doc, page, &hc.filter_opts);
+		pdf_clip_page_links(ctx, &hc);
+		pdf_clip_page_annotations(ctx, &hc);
+		pdf_end_operation(ctx, doc);
+	}
+	fz_catch(ctx)
+	{
+		pdf_abandon_operation(ctx, doc);
+		fz_rethrow(ctx);
+	}
+}

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/source/pdf/pdf-clean.c @ 2:b50eed0cc0ef upstream