comparison mupdf-source/source/pdf/pdf-clean-file.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "mupdf/pdf.h"
25
26 #include <string.h>
27
28 static int
29 string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list)
30 {
31 int n = pdf_array_len(ctx, names_list);
32 int i;
33 char *str = pdf_to_str_buf(ctx, p);
34
35 for (i = 0; i < n ; i += 2)
36 {
37 if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str))
38 return 1;
39 }
40 return 0;
41 }
42
43 /*
44 * Recreate page tree to only retain specified pages.
45 */
46
47 static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parentobj, pdf_obj *kids, int page, pdf_obj *structparents, pdf_obj *ostructparents)
48 {
49 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page);
50
51 pdf_flatten_inheritable_page_items(ctx, pageref);
52
53 pdf_dict_put(ctx, pageref, PDF_NAME(Parent), parentobj);
54
55 /* Store page object in new kids array */
56 pdf_array_push(ctx, kids, pageref);
57
58 if (structparents)
59 {
60 int parentnum = pdf_dict_get_int(ctx, pageref, PDF_NAME(StructParents));
61 pdf_obj *parent = pdf_lookup_number(ctx, ostructparents, parentnum);
62 pdf_obj *nums = pdf_dict_get(ctx, structparents, PDF_NAME(Nums));
63 pdf_obj *limits = pdf_dict_get(ctx, structparents, PDF_NAME(Limits));
64 int min, max;
65 pdf_array_push_int(ctx, nums, parentnum);
66 pdf_array_push(ctx, nums, parent);
67 if (limits == NULL)
68 {
69 min = max = parentnum;
70 limits = pdf_new_array(ctx, doc, 2);
71 pdf_dict_put_drop(ctx, structparents, PDF_NAME(Limits), limits);
72 }
73 else
74 {
75 min = pdf_array_get_int(ctx, limits, 0);
76 max = pdf_array_get_int(ctx, limits, 1);
77 if (min > parentnum)
78 min = parentnum;
79 if (max < parentnum)
80 max = parentnum;
81 }
82 pdf_array_put_int(ctx, limits, 0, min);
83 pdf_array_put_int(ctx, limits, 1, max);
84 }
85 }
86
87 static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount)
88 {
89 int i;
90 int num = pdf_to_num(ctx, obj);
91
92 if (num == 0)
93 return 0;
94 for (i = 0; i < pagecount; i++)
95 {
96 if (page_object_nums[i] == num)
97 return 1;
98 }
99 return 0;
100 }
101
102 static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list)
103 {
104 pdf_obj *p;
105
106 p = pdf_dict_get(ctx, o, PDF_NAME(A));
107 if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME(S)), PDF_NAME(GoTo)))
108 {
109 pdf_obj *d = pdf_dict_get(ctx, p, PDF_NAME(D));
110 if (pdf_is_array(ctx, d) && !dest_is_valid_page(ctx, pdf_array_get(ctx, d, 0), page_object_nums, page_count))
111 return 0;
112 else if (pdf_is_string(ctx, d) && !string_in_names_list(ctx, d, names_list))
113 return 0;
114 }
115
116 p = pdf_dict_get(ctx, o, PDF_NAME(Dest));
117 if (p == NULL)
118 return 1; /* A name with no dest counts as valid. */
119 else if (pdf_is_string(ctx, p))
120 return string_in_names_list(ctx, p, names_list);
121 else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count))
122 return 0;
123
124 return 1;
125 }
126
127 static int strip_stale_annot_refs(fz_context *ctx, pdf_obj *field, int page_count, int *page_object_nums)
128 {
129 pdf_obj *kids = pdf_dict_get(ctx, field, PDF_NAME(Kids));
130 int len = pdf_array_len(ctx, kids);
131 int j;
132
133 if (kids)
134 {
135 for (j = 0; j < len; j++)
136 {
137 if (strip_stale_annot_refs(ctx, pdf_array_get(ctx, kids, j), page_count, page_object_nums))
138 {
139 pdf_array_delete(ctx, kids, j);
140 len--;
141 j--;
142 }
143 }
144
145 return pdf_array_len(ctx, kids) == 0;
146 }
147 else
148 {
149 pdf_obj *page = pdf_dict_get(ctx, field, PDF_NAME(P));
150 int page_num = pdf_to_num(ctx, page);
151
152 for (j = 0; j < page_count; j++)
153 if (page_num == page_object_nums[j])
154 return 0;
155
156 return 1;
157 }
158 }
159
160 static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_mark_bits *marks);
161
162 static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast, pdf_mark_bits *marks)
163 {
164 pdf_obj *prev = NULL;
165 pdf_obj *first = NULL;
166 pdf_obj *current;
167 int count = 0;
168
169 for (current = outlines; current != NULL; )
170 {
171 int nc;
172
173 /* Strip any children to start with. This takes care of
174 * First/Last/Count for us. */
175 nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list, marks);
176
177 if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list))
178 {
179 if (nc == 0)
180 {
181 /* Outline with invalid dest and no children. Drop it by
182 * pulling the next one in here. */
183 pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME(Next));
184 if (!pdf_is_dict(ctx, next))
185 {
186 /* There is no next one to pull in */
187 if (prev != NULL)
188 pdf_dict_del(ctx, prev, PDF_NAME(Next));
189 }
190 else if (prev != NULL)
191 {
192 pdf_dict_put(ctx, prev, PDF_NAME(Next), next);
193 pdf_dict_put(ctx, next, PDF_NAME(Prev), prev);
194 }
195 else
196 {
197 pdf_dict_del(ctx, next, PDF_NAME(Prev));
198 }
199 current = next;
200 }
201 else
202 {
203 /* Outline with invalid dest, but children. Just drop the dest. */
204 pdf_dict_del(ctx, current, PDF_NAME(Dest));
205 pdf_dict_del(ctx, current, PDF_NAME(A));
206 current = pdf_dict_get(ctx, current, PDF_NAME(Next));
207 }
208 }
209 else
210 {
211 /* Keep this one */
212 if (first == NULL)
213 first = current;
214 prev = current;
215 current = pdf_dict_get(ctx, current, PDF_NAME(Next));
216 count++;
217 }
218 }
219
220 *pfirst = first;
221 *plast = prev;
222
223 return count;
224 }
225
226 static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_mark_bits *marks)
227 {
228 int nc;
229 pdf_obj *first;
230 pdf_obj *last;
231
232 if (!pdf_is_dict(ctx, outlines))
233 return 0;
234
235 if (pdf_mark_bits_set(ctx, marks, outlines))
236 fz_throw(ctx, FZ_ERROR_FORMAT, "Cycle detected in outlines");
237
238 first = pdf_dict_get(ctx, outlines, PDF_NAME(First));
239 if (!pdf_is_dict(ctx, first))
240 nc = 0;
241 else
242 nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last, marks);
243
244 if (nc == 0)
245 {
246 pdf_dict_del(ctx, outlines, PDF_NAME(First));
247 pdf_dict_del(ctx, outlines, PDF_NAME(Last));
248 pdf_dict_del(ctx, outlines, PDF_NAME(Count));
249 }
250 else
251 {
252 int old_count = pdf_dict_get_int(ctx, outlines, PDF_NAME(Count));
253 pdf_dict_put(ctx, outlines, PDF_NAME(First), first);
254 pdf_dict_put(ctx, outlines, PDF_NAME(Last), last);
255 pdf_dict_put_int(ctx, outlines, PDF_NAME(Count), old_count > 0 ? nc : -nc);
256 }
257
258 return nc;
259 }
260
261 static void pdf_rearrange_pages_imp(fz_context *ctx, pdf_document *doc, int count, const int *new_page_list, pdf_clean_options_structure structure)
262 {
263 pdf_obj *oldroot, *pages, *kids, *olddests;
264 pdf_obj *root = NULL;
265 pdf_obj *names_list = NULL;
266 pdf_obj *outlines;
267 pdf_obj *ocproperties;
268 pdf_obj *allfields = NULL;
269 int pagecount, i;
270 int *page_object_nums = NULL;
271 pdf_obj *structtreeroot = NULL;
272 pdf_obj *ostructparents = NULL;
273 pdf_obj *structparents = NULL;
274 pdf_mark_bits *marks = NULL;
275
276 /* Keep only pages/type and (reduced) dest entries to avoid
277 * references to unretained pages */
278 oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
279 pages = pdf_dict_get(ctx, oldroot, PDF_NAME(Pages));
280 olddests = pdf_load_name_tree(ctx, doc, PDF_NAME(Dests));
281 outlines = pdf_dict_get(ctx, oldroot, PDF_NAME(Outlines));
282 ocproperties = pdf_dict_get(ctx, oldroot, PDF_NAME(OCProperties));
283 if (structure == PDF_CLEAN_STRUCTURE_KEEP)
284 {
285 structtreeroot = pdf_dict_get(ctx, oldroot, PDF_NAME(StructTreeRoot));
286 ostructparents = pdf_dict_get(ctx, structtreeroot, PDF_NAME(ParentTree));
287 if (structtreeroot)
288 structparents = pdf_new_dict(ctx, doc, 3);
289 }
290
291 fz_var(root);
292 fz_var(names_list);
293 fz_var(allfields);
294 fz_var(page_object_nums);
295 fz_var(kids);
296 fz_var(marks);
297
298 fz_try(ctx)
299 {
300 root = pdf_new_dict(ctx, doc, 3);
301 pdf_dict_put(ctx, root, PDF_NAME(Type), pdf_dict_get(ctx, oldroot, PDF_NAME(Type)));
302 pdf_dict_put(ctx, root, PDF_NAME(Pages), pdf_dict_get(ctx, oldroot, PDF_NAME(Pages)));
303 if (structtreeroot)
304 {
305 pdf_dict_put(ctx, root, PDF_NAME(StructTreeRoot), structtreeroot);
306 pdf_dict_put(ctx, structtreeroot, PDF_NAME(ParentTree), structparents);
307 pdf_dict_put_array(ctx, structparents, PDF_NAME(Nums), 2);
308 }
309 if (outlines)
310 pdf_dict_put(ctx, root, PDF_NAME(Outlines), outlines);
311 if (ocproperties)
312 pdf_dict_put(ctx, root, PDF_NAME(OCProperties), ocproperties);
313
314 pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root);
315
316 /* Create a new kids array with only the pages we want to keep */
317 kids = pdf_new_array(ctx, doc, 1);
318
319 /* Retain pages specified */
320 for (i = 0; i < count; ++i)
321 retainpage(ctx, doc, pages, kids, new_page_list[i], structparents, ostructparents);
322
323 /* Update page count */
324 pdf_dict_put_int(ctx, pages, PDF_NAME(Count), pdf_array_len(ctx, kids));
325 pdf_dict_put(ctx, pages, PDF_NAME(Kids), kids);
326
327 pagecount = pdf_count_pages(ctx, doc);
328 page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums));
329 for (i = 0; i < pagecount; i++)
330 {
331 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
332 page_object_nums[i] = pdf_to_num(ctx, pageref);
333 }
334
335 /* If we had an old Dests tree (now reformed as an olddests
336 * dictionary), keep any entries in there that point to
337 * valid pages. This may mean we keep more than we need, but
338 * it's safe at least. */
339 if (olddests)
340 {
341 pdf_obj *names, *dests;
342 int len = pdf_dict_len(ctx, olddests);
343
344 names = pdf_dict_put_dict(ctx, root, PDF_NAME(Names), 1);
345 dests = pdf_dict_put_dict(ctx, names, PDF_NAME(Dests), 1);
346 names_list = pdf_dict_put_array(ctx, dests, PDF_NAME(Names), 32);
347
348 for (i = 0; i < len; i++)
349 {
350 pdf_obj *key = pdf_dict_get_key(ctx, olddests, i);
351 pdf_obj *val = pdf_dict_get_val(ctx, olddests, i);
352 pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME(D));
353
354 dest = pdf_array_get(ctx, dest ? dest : val, 0);
355 if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount))
356 {
357 pdf_array_push_string(ctx, names_list, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key)));
358 pdf_array_push(ctx, names_list, val);
359 }
360 }
361
362 pdf_drop_obj(ctx, olddests);
363 }
364
365 /* Edit each pages /Annot list to remove any links that point to nowhere. */
366 for (i = 0; i < pagecount; i++)
367 {
368 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
369
370 pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots));
371
372 int len = pdf_array_len(ctx, annots);
373 int j;
374
375 for (j = 0; j < len; j++)
376 {
377 pdf_obj *o = pdf_array_get(ctx, annots, j);
378
379 if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME(Subtype)), PDF_NAME(Link)))
380 continue;
381
382 if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list))
383 {
384 /* Remove this annotation */
385 pdf_array_delete(ctx, annots, j);
386 len--;
387 j--;
388 }
389 }
390 }
391
392 /* Locate all fields on retained pages */
393 allfields = pdf_new_array(ctx, doc, 1);
394 for (i = 0; i < pagecount; i++)
395 {
396 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
397
398 pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots));
399
400 int len = pdf_array_len(ctx, annots);
401 int j;
402
403 for (j = 0; j < len; j++)
404 {
405 pdf_obj *f = pdf_array_get(ctx, annots, j);
406
407 if (pdf_dict_get(ctx, f, PDF_NAME(Subtype)) == PDF_NAME(Widget))
408 pdf_array_push(ctx, allfields, f);
409 }
410 }
411
412 /* From non-terminal widget fields, strip out annot references not
413 * belonging to any retained page. */
414 for (i = 0; i < pdf_array_len(ctx, allfields); i++)
415 {
416 pdf_obj *f = pdf_array_get(ctx, allfields, i);
417
418 while (pdf_dict_get(ctx, f, PDF_NAME(Parent)))
419 f = pdf_dict_get(ctx, f, PDF_NAME(Parent));
420
421 strip_stale_annot_refs(ctx, f, pagecount, page_object_nums);
422 }
423
424 /* For terminal fields, if action destination is not valid,
425 * remove the action */
426 for (i = 0; i < pdf_array_len(ctx, allfields); i++)
427 {
428 pdf_obj *f = pdf_array_get(ctx, allfields, i);
429
430 if (!dest_is_valid(ctx, f, pagecount, page_object_nums, names_list))
431 pdf_dict_del(ctx, f, PDF_NAME(A));
432 }
433
434 marks = pdf_new_mark_bits(ctx, doc);
435 if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list, marks) == 0)
436 {
437 pdf_dict_del(ctx, root, PDF_NAME(Outlines));
438 }
439 }
440 fz_always(ctx)
441 {
442 pdf_drop_mark_bits(ctx, marks);
443 fz_free(ctx, page_object_nums);
444 pdf_drop_obj(ctx, allfields);
445 pdf_drop_obj(ctx, root);
446 pdf_drop_obj(ctx, kids);
447 pdf_drop_obj(ctx, structparents);
448 }
449 fz_catch(ctx)
450 {
451 fz_rethrow(ctx);
452 }
453 }
454
455 void pdf_rearrange_pages(fz_context *ctx, pdf_document *doc, int count, const int *new_page_list, pdf_clean_options_structure structure)
456 {
457 if (structure < PDF_CLEAN_STRUCTURE_DROP || structure > PDF_CLEAN_STRUCTURE_KEEP)
458 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Invalid structure argument");
459
460 pdf_begin_operation(ctx, doc, "Rearrange pages");
461 fz_try(ctx)
462 {
463 pdf_rearrange_pages_imp(ctx, doc, count, new_page_list, structure);
464 pdf_end_operation(ctx, doc);
465 }
466 fz_catch(ctx)
467 {
468 pdf_abandon_operation(ctx, doc);
469 pdf_sync_open_pages(ctx, doc);
470 fz_rethrow(ctx);
471 }
472 pdf_sync_open_pages(ctx, doc);
473 }
474
475 void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_clean_options *opts, int argc, char *argv[])
476 {
477 pdf_clean_options default_opts = { 0 };
478 pdf_document *pdf = NULL;
479 int *pages = NULL;
480 int cap, len, page;
481
482 fz_var(pdf);
483 fz_var(pages);
484
485 if (opts == NULL)
486 opts = &default_opts;
487 if (argc > 0 && argv == NULL)
488 fz_throw(ctx, FZ_ERROR_ARGUMENT, "arguments array must be set if arguments exist");
489
490 fz_try(ctx)
491 {
492 pdf = pdf_open_document(ctx, infile);
493 if (pdf_needs_password(ctx, pdf))
494 if (!pdf_authenticate_password(ctx, pdf, password))
495 fz_throw(ctx, FZ_ERROR_ARGUMENT, "cannot authenticate password: %s", infile);
496
497 len = cap = 0;
498
499 /* Only retain the specified subset of the pages */
500 if (argc)
501 {
502 int pagecount = pdf_count_pages(ctx, pdf);
503 int argidx = 0;
504
505 while (argc - argidx)
506 {
507 int spage, epage;
508 const char *pagelist = argv[argidx];
509
510 while ((pagelist = fz_parse_page_range(ctx, pagelist, &spage, &epage, pagecount)))
511 {
512 if (len + (epage - spage + 1) >= cap)
513 {
514 int n = cap ? cap * 2 : 8;
515 while (len + (epage - spage + 1) >= n)
516 n *= 2;
517 pages = fz_realloc_array(ctx, pages, n, int);
518 cap = n;
519 }
520
521 if (spage < epage)
522 for (page = spage; page <= epage; ++page)
523 pages[len++] = page - 1;
524 else
525 for (page = spage; page >= epage; --page)
526 pages[len++] = page - 1;
527 }
528
529 argidx++;
530 }
531
532 pdf_rearrange_pages(ctx, pdf, len, pages, opts->structure);
533 }
534
535 pdf_rewrite_images(ctx, pdf, &opts->image);
536
537 if (opts->subset_fonts)
538 pdf_subset_fonts(ctx, pdf, len, pages);
539
540 pdf_save_document(ctx, pdf, outfile, &opts->write);
541 }
542 fz_always(ctx)
543 {
544 fz_free(ctx, pages);
545 pdf_drop_document(ctx, pdf);
546 }
547 fz_catch(ctx)
548 {
549 fz_rethrow(ctx);
550 }
551 }