comparison mupdf-source/source/pdf/pdf-clean.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "pdf-annot-imp.h"
25
26 #include <string.h>
27 #include <assert.h>
28
29 static void
30 pdf_filter_xobject(fz_context *ctx, pdf_document *doc, pdf_obj *xobj, pdf_obj *page_res, pdf_filter_options *options, pdf_cycle_list *cycle_up);
31
32 static void
33 pdf_filter_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *page_res, pdf_filter_options *options, pdf_cycle_list *cycle_up);
34
35 static void
36 pdf_filter_resources(fz_context *ctx, pdf_document *doc, pdf_obj *in_res, pdf_obj *res, pdf_filter_options *options, pdf_cycle_list *cycle_up)
37 {
38 pdf_obj *obj;
39 int i, n;
40
41 if (!options->recurse)
42 return;
43
44 /* ExtGState */
45 obj = pdf_dict_get(ctx, res, PDF_NAME(ExtGState));
46 if (obj)
47 {
48 n = pdf_dict_len(ctx, obj);
49 for (i = 0; i < n; i++)
50 {
51 pdf_obj *smask = pdf_dict_get(ctx, pdf_dict_get_val(ctx, obj, i), PDF_NAME(SMask));
52 if (smask)
53 {
54 pdf_obj *g = pdf_dict_get(ctx, smask, PDF_NAME(G));
55 if (g)
56 {
57 /* Transparency group XObject */
58 pdf_filter_xobject(ctx, doc, g, in_res, options, cycle_up);
59 }
60 }
61 }
62 }
63
64 /* Pattern */
65 obj = pdf_dict_get(ctx, res, PDF_NAME(Pattern));
66 if (obj)
67 {
68 n = pdf_dict_len(ctx, obj);
69 for (i = 0; i < n; i++)
70 {
71 pdf_obj *pat = pdf_dict_get_val(ctx, obj, i);
72 if (pat && pdf_dict_get_int(ctx, pat, PDF_NAME(PatternType)) == 1)
73 {
74 pdf_filter_xobject(ctx, doc, pat, in_res, options, cycle_up);
75 }
76 }
77 }
78
79 /* XObject */
80 if (!options->instance_forms)
81 {
82 obj = pdf_dict_get(ctx, res, PDF_NAME(XObject));
83 if (obj)
84 {
85 n = pdf_dict_len(ctx, obj);
86 for (i = 0; i < n; i++)
87 {
88 pdf_obj *xobj = pdf_dict_get_val(ctx, obj, i);
89 if (xobj && pdf_dict_get(ctx, xobj, PDF_NAME(Subtype)) == PDF_NAME(Form))
90 {
91 pdf_filter_xobject(ctx, doc, xobj, in_res, options, cycle_up);
92 }
93 }
94 }
95 }
96
97 /* Font */
98 obj = pdf_dict_get(ctx, res, PDF_NAME(Font));
99 if (obj)
100 {
101 n = pdf_dict_len(ctx, obj);
102 for (i = 0; i < n; i++)
103 {
104 pdf_obj *font = pdf_dict_get_val(ctx, obj, i);
105 if (font && pdf_dict_get(ctx, font, PDF_NAME(Subtype)) == PDF_NAME(Type3))
106 {
107 pdf_filter_type3(ctx, doc, font, in_res, options, cycle_up);
108 }
109 }
110 }
111
112 }
113
114 /*
115 Clean a content stream's rendering operations, with an optional post
116 processing step.
117
118 Firstly, this filters the PDF operators used to avoid (some cases of)
119 repetition, and leaves the content stream in a balanced state with an
120 unchanged top level matrix etc. At the same time, the resources actually
121 used are collected into a new resource dictionary.
122
123 Next, the resources themselves are recursively cleaned (as appropriate)
124 in the same way, if the 'recurse' flag is set.
125 */
126 static void
127 pdf_filter_content_stream(
128 fz_context *ctx,
129 pdf_document *doc,
130 pdf_obj *in_stm,
131 pdf_obj *in_res,
132 fz_matrix transform,
133 pdf_filter_options *options,
134 int struct_parents,
135 fz_buffer **out_buf,
136 pdf_obj **out_res,
137 pdf_cycle_list *cycle_up)
138 {
139 pdf_processor *proc_buffer = NULL;
140 pdf_processor *top = NULL;
141 pdf_processor **list = NULL;
142 int num_filters = 0;
143 int i;
144
145 fz_var(proc_buffer);
146
147 *out_buf = NULL;
148 *out_res = NULL;
149
150 if (options->filters)
151 for (; options->filters[num_filters].filter != NULL; num_filters++);
152
153 if (num_filters > 0)
154 list = fz_calloc(ctx, num_filters, sizeof(pdf_processor *));
155
156 fz_try(ctx)
157 {
158 *out_buf = fz_new_buffer(ctx, 1024);
159 top = proc_buffer = pdf_new_buffer_processor(ctx, *out_buf, options->ascii, options->newlines);
160 if (num_filters > 0)
161 {
162 for (i = num_filters - 1; i >= 0; i--)
163 top = list[i] = options->filters[i].filter(ctx, doc, top, struct_parents, transform, options, options->filters[i].options);
164 }
165
166 pdf_process_contents(ctx, top, doc, in_res, in_stm, NULL, out_res);
167 pdf_close_processor(ctx, top);
168
169 pdf_filter_resources(ctx, doc, in_res, *out_res, options, cycle_up);
170 }
171 fz_always(ctx)
172 {
173 for (i = 0; i < num_filters; i++)
174 pdf_drop_processor(ctx, list[i]);
175 pdf_drop_processor(ctx, proc_buffer);
176 fz_free(ctx, list);
177 }
178 fz_catch(ctx)
179 {
180 fz_drop_buffer(ctx, *out_buf);
181 *out_buf = NULL;
182 pdf_drop_obj(ctx, *out_res);
183 *out_res = NULL;
184 fz_rethrow(ctx);
185 }
186 }
187
188 /*
189 Clean a Type 3 font's CharProcs content streams. This works almost
190 exactly like pdf_filter_content_stream, but the resource dictionary is
191 shared between all off the CharProcs.
192 */
193 static void
194 pdf_filter_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *page_res, pdf_filter_options *options, pdf_cycle_list *cycle_up)
195 {
196 pdf_cycle_list cycle;
197 pdf_processor *proc_buffer = NULL;
198 pdf_processor *proc_filter = NULL;
199 pdf_obj *in_res;
200 pdf_obj *out_res = NULL;
201 pdf_obj *charprocs;
202 int i, n;
203 int num_filters = 0;
204 pdf_processor **list = NULL;
205 fz_buffer *buffer = NULL;
206 pdf_processor *top = NULL;
207 pdf_obj *res = NULL;
208 fz_buffer *new_buf = NULL;
209
210 fz_var(out_res);
211 fz_var(proc_buffer);
212 fz_var(proc_filter);
213 fz_var(buffer);
214 fz_var(res);
215 fz_var(new_buf);
216
217 /* We cannot combine instancing with type3 fonts. The new names for
218 * instanced form/image resources would clash, since they start over for
219 * each content stream. This is not a problem for now, because we only
220 * use instancing with redaction, and redaction doesn't clean type3
221 * fonts.
222 */
223 assert(!options->instance_forms);
224
225 /* Avoid recursive cycles! */
226 if (pdf_cycle(ctx, &cycle, cycle_up, obj))
227 return;
228
229 if (options->filters)
230 for (; options->filters[num_filters].filter != NULL; num_filters++);
231
232 if (num_filters > 0)
233 list = fz_calloc(ctx, num_filters, sizeof(pdf_processor *));
234
235 fz_try(ctx)
236 {
237 in_res = pdf_dict_get(ctx, obj, PDF_NAME(Resources));
238 if (!in_res)
239 in_res = page_res;
240
241 buffer = fz_new_buffer(ctx, 1024);
242 top = proc_buffer = pdf_new_buffer_processor(ctx, buffer, options->ascii, options->newlines);
243 if (num_filters > 0)
244 {
245 for (i = num_filters - 1; i >= 0; i--)
246 top = list[i] = options->filters[i].filter(ctx, doc, top, -1, fz_identity, options, options->filters[i].options);
247 }
248
249 pdf_processor_push_resources(ctx, top, in_res);
250 charprocs = pdf_dict_get(ctx, obj, PDF_NAME(CharProcs));
251 n = pdf_dict_len(ctx, charprocs);
252 for (i = 0; i < n; i++)
253 {
254 pdf_obj *val = pdf_dict_get_val(ctx, charprocs, i);
255
256 if (i > 0)
257 {
258 pdf_reset_processor(ctx, top);
259 fz_clear_buffer(ctx, buffer);
260 }
261 pdf_process_raw_contents(ctx, top, doc, in_res, val, NULL);
262
263 pdf_close_processor(ctx, top);
264
265 if (!options->no_update)
266 {
267 new_buf = fz_clone_buffer(ctx, buffer);
268 pdf_update_stream(ctx, doc, val, new_buf, 0);
269 fz_drop_buffer(ctx, new_buf);
270 new_buf = NULL;
271 }
272 }
273
274 }
275 fz_always(ctx)
276 {
277 res = pdf_processor_pop_resources(ctx, top);
278 for (i = 0; i < num_filters; i++)
279 pdf_drop_processor(ctx, list[i]);
280 pdf_drop_processor(ctx, proc_buffer);
281 fz_free(ctx, list);
282 fz_drop_buffer(ctx, new_buf);
283 fz_drop_buffer(ctx, buffer);
284 }
285 fz_catch(ctx)
286 {
287 pdf_drop_obj(ctx, res);
288 fz_rethrow(ctx);
289 }
290 pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), res);
291 }
292
293 static void
294 pdf_filter_xobject(fz_context *ctx, pdf_document *doc, pdf_obj *stm, pdf_obj *page_res, pdf_filter_options *options, pdf_cycle_list *cycle_up)
295 {
296 pdf_cycle_list cycle;
297 int struct_parents;
298 pdf_obj *new_res = NULL;
299 fz_buffer *new_buf = NULL;
300 pdf_obj *old_res;
301
302 fz_var(new_buf);
303 fz_var(new_res);
304
305 // TODO for RJW: XObject can also be a StructParent; how do we handle that case?
306
307 struct_parents = pdf_dict_get_int_default(ctx, stm, PDF_NAME(StructParents), -1);
308
309 old_res = pdf_dict_get(ctx, stm, PDF_NAME(Resources));
310 if (!old_res)
311 old_res = page_res;
312
313 // TODO: don't clean objects more than once.
314
315 /* Avoid recursive cycles! */
316 if (pdf_cycle(ctx, &cycle, cycle_up, stm))
317 return;
318 fz_try(ctx)
319 {
320 pdf_filter_content_stream(ctx, doc, stm, old_res, fz_identity, options, struct_parents, &new_buf, &new_res, &cycle);
321 if (!options->no_update)
322 {
323 pdf_update_stream(ctx, doc, stm, new_buf, 0);
324 pdf_dict_put(ctx, stm, PDF_NAME(Resources), new_res);
325 }
326 }
327 fz_always(ctx)
328 {
329 fz_drop_buffer(ctx, new_buf);
330 pdf_drop_obj(ctx, new_res);
331 }
332 fz_catch(ctx)
333 fz_rethrow(ctx);
334 }
335
336 pdf_obj *
337 pdf_filter_xobject_instance(fz_context *ctx, pdf_obj *old_xobj, pdf_obj *page_res, fz_matrix transform, pdf_filter_options *options, pdf_cycle_list *cycle_up)
338 {
339 pdf_cycle_list cycle;
340 pdf_document *doc = pdf_get_bound_document(ctx, old_xobj);
341 pdf_obj *new_xobj;
342 pdf_obj *new_res, *old_res;
343 fz_buffer *new_buf;
344 int struct_parents;
345 fz_matrix matrix;
346
347 fz_var(new_xobj);
348 fz_var(new_buf);
349 fz_var(new_res);
350
351 // TODO for RJW: XObject can also be a StructParent; how do we handle that case?
352 // TODO for RJW: will we run into trouble by duplicating StructParents stuff?
353
354 struct_parents = pdf_dict_get_int_default(ctx, old_xobj, PDF_NAME(StructParents), -1);
355
356 old_res = pdf_dict_get(ctx, old_xobj, PDF_NAME(Resources));
357 if (!old_res)
358 old_res = page_res;
359
360 if (pdf_cycle(ctx, &cycle, cycle_up, old_xobj))
361 return pdf_keep_obj(ctx, old_xobj);
362
363 matrix = pdf_dict_get_matrix(ctx, old_xobj, PDF_NAME(Matrix));
364 transform = fz_concat(matrix, transform);
365
366 fz_try(ctx)
367 {
368 new_xobj = pdf_add_object_drop(ctx, doc, pdf_copy_dict(ctx, old_xobj));
369 pdf_filter_content_stream(ctx, doc, old_xobj, old_res, transform, options, struct_parents, &new_buf, &new_res, &cycle);
370 if (!options->no_update)
371 {
372 pdf_update_stream(ctx, doc, new_xobj, new_buf, 0);
373 pdf_dict_put(ctx, new_xobj, PDF_NAME(Resources), new_res);
374 }
375 }
376 fz_always(ctx)
377 {
378 fz_drop_buffer(ctx, new_buf);
379 pdf_drop_obj(ctx, new_res);
380 }
381 fz_catch(ctx)
382 {
383 pdf_drop_obj(ctx, new_xobj);
384 fz_rethrow(ctx);
385 }
386
387 return new_xobj;
388 }
389
390 void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, pdf_filter_options *options)
391 {
392 pdf_obj *contents, *old_res;
393 pdf_obj *new_res;
394 fz_buffer *buffer;
395 int struct_parents;
396
397 struct_parents = pdf_dict_get_int_default(ctx, page->obj, PDF_NAME(StructParents), -1);
398
399 contents = pdf_page_contents(ctx, page);
400 old_res = pdf_page_resources(ctx, page);
401
402 pdf_filter_content_stream(ctx, doc, contents, old_res, fz_identity, options, struct_parents, &buffer, &new_res, NULL);
403
404 fz_try(ctx)
405 {
406 if (options->complete)
407 options->complete(ctx, buffer, options->opaque);
408 if (!options->no_update)
409 {
410 /* Always create a new stream object to replace the page contents. This is useful
411 both if the contents is an array of streams, is entirely missing or if the contents
412 are shared between pages. */
413 contents = pdf_add_object_drop(ctx, doc, pdf_new_dict(ctx, doc, 1));
414 pdf_dict_put_drop(ctx, page->obj, PDF_NAME(Contents), contents);
415 pdf_update_stream(ctx, doc, contents, buffer, 0);
416 pdf_dict_put(ctx, page->obj, PDF_NAME(Resources), new_res);
417 }
418 }
419 fz_always(ctx)
420 {
421 fz_drop_buffer(ctx, buffer);
422 pdf_drop_obj(ctx, new_res);
423 }
424 fz_catch(ctx)
425 fz_rethrow(ctx);
426 }
427
428 void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, pdf_filter_options *options)
429 {
430 pdf_obj *ap = pdf_dict_get(ctx, annot->obj, PDF_NAME(AP));
431 if (pdf_is_dict(ctx, ap))
432 {
433 int i, n = pdf_dict_len(ctx, ap);
434 for (i = 0; i < n; i++)
435 {
436 pdf_obj *stm = pdf_dict_get_val(ctx, ap, i);
437 if (pdf_is_stream(ctx, stm))
438 {
439 pdf_filter_xobject(ctx, doc, stm, NULL, options, NULL);
440 }
441 }
442 }
443 }
444
445 /* REDACTIONS */
446
447 struct redact_filter_state {
448 pdf_filter_options filter_opts;
449 pdf_sanitize_filter_options sanitize_opts;
450 pdf_filter_factory filter_list[2];
451 pdf_page *page;
452 pdf_annot *target; // NULL if all
453 int line_art;
454 int text;
455 };
456
457
458 static void pdf_run_obj_to_buf(fz_context *ctx, fz_buffer *buffer, pdf_obj *obj, pdf_page *page)
459 {
460 pdf_processor *proc = pdf_new_buffer_processor(ctx, buffer, 0, 0);
461 pdf_obj *res;
462
463
464 fz_try(ctx)
465 {
466 res = pdf_xobject_resources(ctx, obj);
467 if (res == NULL)
468 res = pdf_page_resources(ctx, page);
469
470 pdf_process_contents(ctx, proc, page->doc, res, obj, NULL, NULL);
471 pdf_close_processor(ctx, proc);
472 }
473 fz_always(ctx)
474 pdf_drop_processor(ctx, proc);
475 fz_catch(ctx)
476 fz_rethrow(ctx);
477 }
478
479 static void
480 pdf_redact_end_page(fz_context *ctx, fz_buffer *buf, void *opaque)
481 {
482 struct redact_filter_state *red = opaque;
483 pdf_page *page = red->page;
484 pdf_annot *annot;
485 pdf_obj *qp;
486 int i, n;
487
488 fz_append_string(ctx, buf, " 0 g\n");
489
490 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
491 {
492 if (red->target != NULL && red->target != annot)
493 continue;
494 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
495 {
496 pdf_obj *ro = pdf_dict_get(ctx, annot->obj, PDF_NAME(RO));
497 if (ro)
498 {
499 pdf_run_obj_to_buf(ctx, buf, ro, page);
500 }
501 else
502 {
503 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
504 n = pdf_array_len(ctx, qp);
505 if (n > 0)
506 {
507 for (i = 0; i < n; i += 8)
508 {
509 fz_quad q = pdf_to_quad(ctx, qp, i);
510 fz_append_printf(ctx, buf, "%g %g m\n", q.ll.x, q.ll.y);
511 fz_append_printf(ctx, buf, "%g %g l\n", q.lr.x, q.lr.y);
512 fz_append_printf(ctx, buf, "%g %g l\n", q.ur.x, q.ur.y);
513 fz_append_printf(ctx, buf, "%g %g l\n", q.ul.x, q.ul.y);
514 fz_append_string(ctx, buf, "f\n");
515 }
516 }
517 else
518 {
519 fz_rect r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
520 fz_append_printf(ctx, buf, "%g %g m\n", r.x0, r.y0);
521 fz_append_printf(ctx, buf, "%g %g l\n", r.x1, r.y0);
522 fz_append_printf(ctx, buf, "%g %g l\n", r.x1, r.y1);
523 fz_append_printf(ctx, buf, "%g %g l\n", r.x0, r.y1);
524 fz_append_string(ctx, buf, "f\n");
525 }
526 }
527 }
528 }
529 }
530
531 static int
532 pdf_redact_text_filter(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox)
533 {
534 struct redact_filter_state *red = opaque;
535 pdf_page *page = red->page;
536 pdf_annot *annot;
537 pdf_obj *qp;
538 fz_rect r;
539 fz_quad q;
540 int i, n;
541 float w, h;
542
543 trm = fz_concat(trm, ctm);
544 bbox = fz_transform_rect(bbox, trm);
545
546 /* Shrink character bbox a bit */
547 w = bbox.x1 - bbox.x0;
548 h = bbox.y1 - bbox.y0;
549 bbox.x0 += w / 10;
550 bbox.x1 -= w / 10;
551 bbox.y0 += h / 10;
552 bbox.y1 -= h / 10;
553
554 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
555 {
556 if (red->target != NULL && red->target != annot)
557 continue;
558 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
559 {
560 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
561 n = pdf_array_len(ctx, qp);
562 /* Note, we test for the intersection being a valid rectangle, NOT
563 * a non-empty one. This is because we can have 'empty' character
564 * boxes (say for diacritics), that while 0 width, do have a defined
565 * position on the plane, and hence inclusion makes sense. */
566 if (n > 0)
567 {
568 for (i = 0; i < n; i += 8)
569 {
570 q = pdf_to_quad(ctx, qp, i);
571 r = fz_rect_from_quad(q);
572 if (fz_is_valid_rect(fz_intersect_rect(bbox, r)))
573 return 1;
574 }
575 }
576 else
577 {
578 r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
579 if (fz_is_valid_rect(fz_intersect_rect(bbox, r)))
580 return 1;
581 }
582 }
583 }
584
585 return 0;
586 }
587
588 static fz_pixmap *
589 pdf_redact_image_imp(fz_context *ctx, fz_matrix ctm, fz_image *image, fz_pixmap *pixmap, fz_pixmap **pmask, fz_quad q)
590 {
591 fz_matrix inv_ctm;
592 fz_irect r;
593 int x, y, k, n, bpp;
594 unsigned char white;
595 fz_pixmap *mask = *pmask;
596 int pixmap_cloned = 0;
597
598 if (!pixmap)
599 {
600 fz_pixmap *original = fz_get_pixmap_from_image(ctx, image, NULL, NULL, NULL, NULL);
601 int imagemask = image->imagemask;
602
603 fz_try(ctx)
604 {
605 pixmap = fz_clone_pixmap(ctx, original);
606 if (imagemask)
607 fz_invert_pixmap_alpha(ctx, pixmap);
608 }
609 fz_always(ctx)
610 fz_drop_pixmap(ctx, original);
611 fz_catch(ctx)
612 fz_rethrow(ctx);
613 pixmap_cloned = 1;
614 }
615
616 if (!mask && image->mask)
617 {
618 fz_pixmap *original = fz_get_pixmap_from_image(ctx, image->mask, NULL, NULL, NULL, NULL);
619
620 fz_try(ctx)
621 {
622 mask = fz_clone_pixmap(ctx, original);
623 *pmask = mask;
624 }
625 fz_always(ctx)
626 {
627 fz_drop_pixmap(ctx, original);
628 }
629 fz_catch(ctx)
630 {
631 if (pixmap_cloned)
632 fz_drop_pixmap(ctx, pixmap);
633 fz_rethrow(ctx);
634 }
635 }
636
637 /* If we have a 1x1 image, to which a mask is being applied
638 * then it's the mask we really want to change, not the
639 * image. We might have just a small section of the image
640 * being covered, and setting the whole thing to white
641 * will blank stuff outside the desired area. */
642 if (!mask || pixmap->w > 1 || pixmap->h > 1)
643 {
644 n = pixmap->n - pixmap->alpha;
645 bpp = pixmap->n;
646 if (fz_colorspace_is_subtractive(ctx, pixmap->colorspace))
647 white = 0;
648 else
649 white = 255;
650
651 inv_ctm = fz_post_scale(fz_invert_matrix(ctm), pixmap->w, pixmap->h);
652 r = fz_round_rect(fz_transform_rect(fz_rect_from_quad(q), inv_ctm));
653 r.x0 = fz_clampi(r.x0, 0, pixmap->w);
654 r.x1 = fz_clampi(r.x1, 0, pixmap->w);
655 r.y1 = fz_clampi(pixmap->h - r.y1, 0, pixmap->h);
656 r.y0 = fz_clampi(pixmap->h - r.y0, 0, pixmap->h);
657 for (y = r.y1; y < r.y0; ++y)
658 {
659 for (x = r.x0; x < r.x1; ++x)
660 {
661 unsigned char *s = &pixmap->samples[(size_t)y * pixmap->stride + (size_t)x * bpp];
662 for (k = 0; k < n; ++k)
663 s[k] = white;
664 if (pixmap->alpha)
665 s[k] = 255;
666 }
667 }
668 }
669
670 if (mask)
671 {
672 inv_ctm = fz_post_scale(fz_invert_matrix(ctm), mask->w, mask->h);
673 r = fz_round_rect(fz_transform_rect(fz_rect_from_quad(q), inv_ctm));
674 r.x0 = fz_clampi(r.x0, 0, mask->w);
675 r.x1 = fz_clampi(r.x1, 0, mask->w);
676 r.y1 = fz_clampi(mask->h - r.y1, 0, mask->h);
677 r.y0 = fz_clampi(mask->h - r.y0, 0, mask->h);
678 for (y = r.y1; y < r.y0; ++y)
679 {
680 unsigned char *s = &mask->samples[(size_t)y * mask->stride + (size_t)r.x0];
681 memset(s, 0xff, r.x1-r.x0);
682 }
683 }
684
685 return pixmap;
686 }
687
688 static fz_image *
689 pdf_redact_image_filter_remove(fz_context *ctx, void *opaque, fz_matrix ctm, const char *name, fz_image *image, fz_rect clip)
690 {
691 fz_pixmap *redacted = NULL;
692 struct redact_filter_state *red = opaque;
693 pdf_page *page = red->page;
694 pdf_annot *annot;
695 pdf_obj *qp;
696 fz_rect area;
697 fz_rect r;
698 int i, n;
699
700 fz_var(redacted);
701
702 area = fz_transform_rect(fz_unit_rect, ctm);
703
704 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
705 {
706 if (red->target != NULL && red->target != annot)
707 continue;
708 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
709 {
710 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
711 n = pdf_array_len(ctx, qp);
712 if (n > 0)
713 {
714 for (i = 0; i < n; i += 8)
715 {
716 r = fz_rect_from_quad(pdf_to_quad(ctx, qp, i));
717 r = fz_intersect_rect(r, area);
718 if (!fz_is_empty_rect(r))
719 return NULL;
720 }
721 }
722 else
723 {
724 r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
725 r = fz_intersect_rect(r, area);
726 if (!fz_is_empty_rect(r))
727 return NULL;
728 }
729 }
730 }
731
732 return fz_keep_image(ctx, image);
733 }
734
735 static fz_image *
736 pdf_redact_image_filter_remove_invisible(fz_context *ctx, void *opaque, fz_matrix ctm, const char *name, fz_image *image, fz_rect clip)
737 {
738 fz_pixmap *redacted = NULL;
739 struct redact_filter_state *red = opaque;
740 pdf_page *page = red->page;
741 pdf_annot *annot;
742 pdf_obj *qp;
743 fz_rect area;
744 fz_rect r;
745 int i, n;
746
747 fz_var(redacted);
748
749 area = fz_transform_rect(fz_unit_rect, ctm);
750
751 /* Restrict the are of the image to that which can actually be seen. */
752 area = fz_intersect_rect(area, clip);
753
754 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
755 {
756 if (red->target != NULL && red->target != annot)
757 continue;
758 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
759 {
760 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
761 n = pdf_array_len(ctx, qp);
762 if (n > 0)
763 {
764 for (i = 0; i < n; i += 8)
765 {
766 r = fz_rect_from_quad(pdf_to_quad(ctx, qp, i));
767 r = fz_intersect_rect(r, area);
768 if (!fz_is_empty_rect(r))
769 return NULL;
770 }
771 }
772 else
773 {
774 r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
775 r = fz_intersect_rect(r, area);
776 if (!fz_is_empty_rect(r))
777 return NULL;
778 }
779 }
780 }
781
782 return fz_keep_image(ctx, image);
783 }
784
785 static fz_image *
786 pdf_redact_image_filter_pixels(fz_context *ctx, void *opaque, fz_matrix ctm, const char *name, fz_image *image, fz_rect clip)
787 {
788 fz_pixmap *redacted = NULL;
789 fz_pixmap *mask = NULL;
790 struct redact_filter_state *red = opaque;
791 pdf_page *page = red->page;
792 pdf_annot *annot;
793 pdf_obj *qp;
794 fz_quad area, q;
795 fz_rect r;
796 int i, n;
797
798 fz_var(redacted);
799 fz_var(mask);
800
801 area = fz_transform_quad(fz_quad_from_rect(fz_unit_rect), ctm);
802
803 /* First see if we can redact the image completely */
804 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
805 {
806 if (red->target != NULL && red->target != annot)
807 continue;
808 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
809 {
810 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
811 n = pdf_array_len(ctx, qp);
812 if (n > 0)
813 {
814 for (i = 0; i < n; i += 8)
815 {
816 q = pdf_to_quad(ctx, qp, i);
817 if (fz_is_quad_inside_quad(area, q))
818 return NULL;
819 }
820 }
821 else
822 {
823 r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
824 q = fz_quad_from_rect(r);
825 if (fz_is_quad_inside_quad(area, q))
826 return NULL;
827 }
828 }
829 }
830
831 /* Blank out redacted parts of the image if necessary */
832 fz_try(ctx)
833 {
834 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
835 {
836 if (red->target != NULL && red->target != annot)
837 continue;
838 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
839 {
840 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
841 n = pdf_array_len(ctx, qp);
842 if (n > 0)
843 {
844 for (i = 0; i < n; i += 8)
845 {
846 q = pdf_to_quad(ctx, qp, i);
847 if (fz_is_quad_intersecting_quad(area, q))
848 redacted = pdf_redact_image_imp(ctx, ctm, image, redacted, &mask, q);
849 }
850 }
851 else
852 {
853 r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
854 q = fz_quad_from_rect(r);
855 if (fz_is_quad_intersecting_quad(area, q))
856 redacted = pdf_redact_image_imp(ctx, ctm, image, redacted, &mask, q);
857 }
858 }
859 }
860 }
861 fz_catch(ctx)
862 {
863 fz_drop_pixmap(ctx, redacted);
864 fz_drop_pixmap(ctx, mask);
865 fz_rethrow(ctx);
866 }
867
868 if (redacted)
869 {
870 int imagemask = image->imagemask;
871 fz_image *imask = fz_keep_image(ctx, image->mask);
872
873 fz_var(imask);
874
875 fz_try(ctx)
876 {
877 if (mask)
878 {
879 fz_drop_image(ctx, imask);
880 imask = NULL;
881 imask = fz_new_image_from_pixmap(ctx, mask, NULL);
882 }
883 image = fz_new_image_from_pixmap(ctx, redacted, NULL);
884 image->imagemask = imagemask;
885 image->mask = imask;
886 imask = NULL;
887 }
888 fz_always(ctx)
889 {
890 fz_drop_pixmap(ctx, redacted);
891 fz_drop_pixmap(ctx, mask);
892 fz_drop_image(ctx, imask);
893 }
894 fz_catch(ctx)
895 fz_rethrow(ctx);
896 return image;
897 }
898
899 return fz_keep_image(ctx, image);
900 }
901
902 /* Returns 0 if area does not intersect with any of our redactions.
903 * Returns 2 if area is completely included within one of our redactions.
904 * Returns 1 otherwise. */
905 static int
906 rect_touches_redactions(fz_context *ctx, fz_rect area, struct redact_filter_state *red)
907 {
908 pdf_annot *annot;
909 pdf_obj *qp;
910 fz_quad q;
911 fz_rect r, s;
912 int i, n;
913 pdf_page *page = red->page;
914
915 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
916 {
917 if (red->target != NULL && red->target != annot)
918 continue;
919 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
920 {
921 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
922 n = pdf_array_len(ctx, qp);
923 if (n > 0)
924 {
925 for (i = 0; i < n; i += 8)
926 {
927 q = pdf_to_quad(ctx, qp, i);
928 r = fz_rect_from_quad(q);
929 s = fz_intersect_rect(r, area);
930 if (!fz_is_empty_rect(s))
931 {
932 if (fz_contains_rect(r, area))
933 return 2;
934 return 1;
935 }
936 }
937 }
938 else
939 {
940 r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
941 s = fz_intersect_rect(r, area);
942 if (!fz_is_empty_rect(s))
943 {
944 if (fz_contains_rect(r, area))
945 return 2;
946 return 1;
947 }
948 }
949 }
950 }
951 return 0;
952 }
953
954 static void
955 pdf_redact_page_links(fz_context *ctx, struct redact_filter_state *red)
956 {
957 pdf_obj *annots;
958 pdf_obj *link;
959 fz_rect area;
960 int k;
961
962 annots = pdf_dict_get(ctx, red->page->obj, PDF_NAME(Annots));
963 k = 0;
964 while (k < pdf_array_len(ctx, annots))
965 {
966 link = pdf_array_get(ctx, annots, k);
967 if (pdf_dict_get(ctx, link, PDF_NAME(Subtype)) == PDF_NAME(Link))
968 {
969 area = pdf_dict_get_rect(ctx, link, PDF_NAME(Rect));
970 if (rect_touches_redactions(ctx, area, red))
971 {
972 pdf_array_delete(ctx, annots, k);
973 continue;
974 }
975 }
976 ++k;
977 }
978 }
979
980 static void
981 pdf_redact_page_annotations(fz_context *ctx, struct redact_filter_state *red)
982 {
983 pdf_annot *annot;
984 fz_rect area;
985
986 restart:
987 for (annot = pdf_first_annot(ctx, red->page); annot; annot = pdf_next_annot(ctx, annot))
988 {
989 if (pdf_annot_type(ctx, annot) == PDF_ANNOT_FREE_TEXT)
990 {
991 area = pdf_dict_get_rect(ctx, pdf_annot_obj(ctx, annot), PDF_NAME(Rect));
992 if (rect_touches_redactions(ctx, area, red))
993 {
994 pdf_delete_annot(ctx, red->page, annot);
995 goto restart;
996 }
997 }
998 }
999 }
1000
1001 static int culler(fz_context *ctx, void *opaque, fz_rect bbox, fz_cull_type type)
1002 {
1003 struct redact_filter_state *red = opaque;
1004
1005 switch (type)
1006 {
1007 case FZ_CULL_PATH_FILL:
1008 case FZ_CULL_PATH_STROKE:
1009 case FZ_CULL_PATH_FILL_STROKE:
1010 case FZ_CULL_CLIP_PATH_FILL:
1011 case FZ_CULL_CLIP_PATH_STROKE:
1012 case FZ_CULL_CLIP_PATH_FILL_STROKE:
1013 if (red->line_art == PDF_REDACT_LINE_ART_REMOVE_IF_COVERED)
1014 return (rect_touches_redactions(ctx, bbox, red) == 2);
1015 else if (red->line_art == PDF_REDACT_LINE_ART_REMOVE_IF_TOUCHED)
1016 return (rect_touches_redactions(ctx, bbox, red) != 0);
1017 return 0;
1018 default:
1019 return 0;
1020 }
1021 }
1022
1023 static
1024 void init_redact_filter(fz_context *ctx, pdf_redact_options *redact_opts, struct redact_filter_state *red, pdf_page *page, pdf_annot *target)
1025 {
1026 int black_boxes = redact_opts ? redact_opts->black_boxes : 0;
1027 int image_method = redact_opts ? redact_opts->image_method : PDF_REDACT_IMAGE_PIXELS;
1028 int line_art = redact_opts ? redact_opts->line_art : PDF_REDACT_LINE_ART_NONE;
1029 int text = redact_opts ? redact_opts->text : PDF_REDACT_TEXT_REMOVE;
1030
1031 memset(&red->filter_opts, 0, sizeof red->filter_opts);
1032 memset(&red->sanitize_opts, 0, sizeof red->sanitize_opts);
1033
1034 red->filter_opts.recurse = 0; /* don't redact patterns, softmasks, and type3 fonts */
1035 red->filter_opts.instance_forms = 1; /* redact xobjects with instancing */
1036 red->filter_opts.ascii = 1;
1037 red->filter_opts.opaque = red;
1038 red->filter_opts.filters = red->filter_list;
1039 if (black_boxes)
1040 red->filter_opts.complete = pdf_redact_end_page;
1041 red->line_art = line_art;
1042 red->text = text;
1043
1044 red->sanitize_opts.opaque = red;
1045 if (text == PDF_REDACT_TEXT_REMOVE)
1046 red->sanitize_opts.text_filter = pdf_redact_text_filter;
1047 if (image_method == PDF_REDACT_IMAGE_PIXELS)
1048 red->sanitize_opts.image_filter = pdf_redact_image_filter_pixels;
1049 if (image_method == PDF_REDACT_IMAGE_REMOVE)
1050 red->sanitize_opts.image_filter = pdf_redact_image_filter_remove;
1051 if (image_method == PDF_REDACT_IMAGE_REMOVE_UNLESS_INVISIBLE)
1052 red->sanitize_opts.image_filter = pdf_redact_image_filter_remove_invisible;
1053 red->sanitize_opts.culler = culler;
1054
1055 red->filter_list[0].filter = pdf_new_sanitize_filter;
1056 red->filter_list[0].options = &red->sanitize_opts;
1057 red->filter_list[1].filter = NULL;
1058 red->filter_list[1].options = NULL;
1059
1060 red->page = page;
1061 red->target = target;
1062 }
1063
1064 static int
1065 pdf_apply_redaction_imp(fz_context *ctx, pdf_page *page, pdf_annot *target, pdf_redact_options *redact_opts)
1066 {
1067 pdf_annot *annot;
1068 int has_redactions = 0;
1069 struct redact_filter_state red;
1070 pdf_document *doc = page->doc;
1071
1072 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot)) {
1073 if (target != NULL && target != annot)
1074 continue;
1075 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
1076 has_redactions = 1;
1077 }
1078
1079 if (!has_redactions)
1080 return 0;
1081
1082 init_redact_filter(ctx, redact_opts, &red, page, target);
1083
1084 if (target)
1085 pdf_begin_operation(ctx, doc, "Apply redaction");
1086 else
1087 pdf_begin_operation(ctx, doc, "Apply redactions on page");
1088 fz_try(ctx)
1089 {
1090 pdf_filter_page_contents(ctx, doc, page, &red.filter_opts);
1091 pdf_redact_page_links(ctx, &red);
1092 pdf_redact_page_annotations(ctx, &red);
1093
1094 annot = pdf_first_annot(ctx, page);
1095 while (annot)
1096 {
1097 if (target == NULL || annot == target)
1098 {
1099 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
1100 {
1101 pdf_delete_annot(ctx, page, annot);
1102 annot = pdf_first_annot(ctx, page);
1103 continue;
1104 }
1105 }
1106 annot = pdf_next_annot(ctx, annot);
1107 }
1108
1109 doc->redacted = 1;
1110 pdf_end_operation(ctx, doc);
1111 }
1112 fz_catch(ctx)
1113 {
1114 pdf_abandon_operation(ctx, doc);
1115 fz_rethrow(ctx);
1116 }
1117
1118 return 1;
1119 }
1120
1121 int
1122 pdf_redact_page(fz_context *ctx, pdf_document *doc, pdf_page *page, pdf_redact_options *redact_opts)
1123 {
1124 if (page == NULL || page->doc != doc)
1125 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't redact a page not from the doc");
1126 return pdf_apply_redaction_imp(ctx, page, NULL, redact_opts);
1127 }
1128
1129 int
1130 pdf_apply_redaction(fz_context *ctx, pdf_annot *annot, pdf_redact_options *redact_opts)
1131 {
1132 return pdf_apply_redaction_imp(ctx, annot->page, annot, redact_opts);
1133 }
1134
1135 /* Hard clipping of pages */
1136
1137 struct clip_filter_state {
1138 pdf_filter_options filter_opts;
1139 pdf_sanitize_filter_options sanitize_opts;
1140 pdf_filter_factory filter_list[2];
1141 pdf_page *page;
1142 fz_rect clip;
1143 };
1144
1145 static int clip_culler(fz_context *ctx, void *opaque, fz_rect bbox, fz_cull_type type)
1146 {
1147 struct clip_filter_state *hc = opaque;
1148
1149 switch (type)
1150 {
1151 case FZ_CULL_PATH_FILL:
1152 case FZ_CULL_PATH_STROKE:
1153 case FZ_CULL_PATH_FILL_STROKE:
1154 case FZ_CULL_CLIP_PATH_FILL:
1155 case FZ_CULL_CLIP_PATH_STROKE:
1156 case FZ_CULL_CLIP_PATH_FILL_STROKE:
1157 case FZ_CULL_GLYPH:
1158 case FZ_CULL_IMAGE:
1159 case FZ_CULL_SHADING:
1160 return (fz_is_empty_rect(fz_intersect_rect(bbox, hc->clip)));
1161 default:
1162 return 0;
1163 }
1164 }
1165
1166 static
1167 void init_clip_filter(fz_context *ctx, struct clip_filter_state *hc, pdf_page *page, fz_rect *clip)
1168 {
1169 memset(&hc->filter_opts, 0, sizeof hc->filter_opts);
1170 memset(&hc->sanitize_opts, 0, sizeof hc->sanitize_opts);
1171
1172 hc->filter_opts.recurse = 0; /* don't redact patterns, softmasks, and type3 fonts */
1173 hc->filter_opts.instance_forms = 1; /* redact xobjects with instancing */
1174 hc->filter_opts.ascii = 0;
1175 hc->filter_opts.opaque = hc;
1176 hc->filter_opts.filters = hc->filter_list;
1177 hc->clip = *clip;
1178
1179 hc->sanitize_opts.opaque = hc;
1180 hc->sanitize_opts.culler = clip_culler;
1181
1182 hc->filter_list[0].filter = pdf_new_sanitize_filter;
1183 hc->filter_list[0].options = &hc->sanitize_opts;
1184 hc->filter_list[1].filter = NULL;
1185 hc->filter_list[1].options = NULL;
1186
1187 hc->page = page;
1188 }
1189
1190 static void
1191 pdf_clip_page_links(fz_context *ctx, struct clip_filter_state *hc)
1192 {
1193 pdf_obj *annots;
1194 pdf_obj *link;
1195 fz_rect area;
1196 int k;
1197
1198 annots = pdf_dict_get(ctx, hc->page->obj, PDF_NAME(Annots));
1199 k = 0;
1200 while (k < pdf_array_len(ctx, annots))
1201 {
1202 link = pdf_array_get(ctx, annots, k);
1203 if (pdf_dict_get(ctx, link, PDF_NAME(Subtype)) == PDF_NAME(Link))
1204 {
1205 area = pdf_dict_get_rect(ctx, link, PDF_NAME(Rect));
1206 if (fz_is_empty_rect(fz_intersect_rect(area, hc->clip)))
1207 {
1208 pdf_array_delete(ctx, annots, k);
1209 continue;
1210 }
1211 }
1212 ++k;
1213 }
1214 }
1215
1216 static void
1217 pdf_clip_page_annotations(fz_context *ctx, struct clip_filter_state *hc)
1218 {
1219 pdf_annot *annot;
1220 fz_rect area;
1221
1222 restart:
1223 for (annot = pdf_first_annot(ctx, hc->page); annot; annot = pdf_next_annot(ctx, annot))
1224 {
1225 if (pdf_annot_type(ctx, annot) == PDF_ANNOT_FREE_TEXT)
1226 {
1227 area = pdf_dict_get_rect(ctx, pdf_annot_obj(ctx, annot), PDF_NAME(Rect));
1228 if (fz_is_empty_rect(fz_intersect_rect(area, hc->clip)))
1229 {
1230 pdf_delete_annot(ctx, hc->page, annot);
1231 goto restart;
1232 }
1233 }
1234 }
1235 }
1236
1237 void
1238 pdf_clip_page(fz_context *ctx, pdf_page *page, fz_rect *clip)
1239 {
1240 pdf_document *doc;
1241 struct clip_filter_state hc;
1242
1243 if (page == NULL)
1244 return;
1245
1246 doc = page->doc;
1247
1248 init_clip_filter(ctx, &hc, page, clip);
1249
1250 pdf_begin_operation(ctx, doc, "Apply hard clip to page");
1251 fz_try(ctx)
1252 {
1253 pdf_filter_page_contents(ctx, doc, page, &hc.filter_opts);
1254 pdf_clip_page_links(ctx, &hc);
1255 pdf_clip_page_annotations(ctx, &hc);
1256 pdf_end_operation(ctx, doc);
1257 }
1258 fz_catch(ctx)
1259 {
1260 pdf_abandon_operation(ctx, doc);
1261 fz_rethrow(ctx);
1262 }
1263 }