comparison mupdf-source/source/tools/pdfclean.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 /*
24 * PDF cleaning tool: general purpose pdf syntax washer.
25 *
26 * Rewrite PDF with pretty printed objects.
27 * Garbage collect unreachable objects.
28 * Inflate compressed streams.
29 * Create subset documents.
30 *
31 * TODO: linearize document for fast web view
32 */
33
34 #include "mupdf/fitz.h"
35 #include "mupdf/pdf.h"
36
37 #include <string.h>
38 #include <stdlib.h>
39 #include <stdio.h>
40
/*
 * Print the command line help for "mutool clean" to stderr.
 *
 * Always returns 1, so callers can write "return usage();" to report a
 * command line error and produce a failing exit status in one step.
 */
static int usage(void)
{
	/* The help text contains no format directives, so it is written verbatim. */
	static const char help_text[] =
		"usage: mutool clean [options] input.pdf [output.pdf] [pages]\n"
		"\t-p -\tpassword\n"
		"\t-g\tgarbage collect unused objects\n"
		"\t-gg\tin addition to -g compact xref table\n"
		"\t-ggg\tin addition to -gg merge duplicate objects\n"
		"\t-gggg\tin addition to -ggg check streams for duplication\n"
		"\t-l\tlinearize PDF (no longer supported!)\n"
		"\t-D\tsave file without encryption\n"
		"\t-E -\tsave file with new encryption (rc4-40, rc4-128, aes-128, or aes-256)\n"
		"\t-O -\towner password (only if encrypting)\n"
		"\t-U -\tuser password (only if encrypting)\n"
		"\t-P -\tpermission flags (only if encrypting)\n"
		"\t-a\tascii hex encode binary streams\n"
		"\t-d\tdecompress streams\n"
		"\t-z\tdeflate uncompressed streams\n"
		"\t-e -\tcompression \"effort\" (0 = default, 1 = min, 100 = max)\n"
		"\t-f\tcompress font streams\n"
		"\t-i\tcompress image streams\n"
		"\t-c\tclean content streams\n"
		"\t-s\tsanitize content streams\n"
		"\t-t\tcompact object syntax\n"
		"\t-tt\tindented object syntax\n"
		"\t-L\twrite object labels\n"
		"\t-A\tcreate appearance streams for annotations\n"
		"\t-AA\trecreate appearance streams for annotations\n"
		"\t-m\tpreserve metadata\n"
		"\t-S\tsubset fonts if possible [EXPERIMENTAL!]\n"
		"\t-Z\tuse objstms if possible for extra compression\n"
		"\t--{color,gray,bitonal}-{,lossy-,lossless-}image-subsample-method -\n\t\taverage, bicubic\n"
		"\t--{color,gray,bitonal}-{,lossy-,lossless-}image-subsample-dpi -[,-]\n\t\tDPI at which to subsample [+ target dpi]\n"
		"\t--{color,gray,bitonal}-{,lossy-,lossless-}image-recompress-method -[:quality]\n\t\tnever, same, lossless, jpeg, j2k, fax, jbig2\n"
		"\t--structure=keep|drop\tKeep or drop the structure tree\n"
		"\tpages\tcomma separated list of page numbers and ranges\n";

	fputs(help_text, stderr);
	return 1;
}
80
81 static int encrypt_method_from_string(const char *name)
82 {
83 if (!strcmp(name, "rc4-40")) return PDF_ENCRYPT_RC4_40;
84 if (!strcmp(name, "rc4-128")) return PDF_ENCRYPT_RC4_128;
85 if (!strcmp(name, "aes-128")) return PDF_ENCRYPT_AES_128;
86 if (!strcmp(name, "aes-256")) return PDF_ENCRYPT_AES_256;
87 return PDF_ENCRYPT_UNKNOWN;
88 }
89
90 int pdfclean_main(int argc, char **argv)
91 {
92 char *infile;
93 char *outfile = "out.pdf";
94 char *password = "";
95 int c;
96 int pretty = -1;
97 pdf_clean_options opts = { 0 };
98 int errors = 0;
99 fz_context *ctx;
100 int structure;
101 const fz_getopt_long_options longopts[] =
102 {
103 { "color-lossy-image-subsample-method=average|bicubic", &opts.image.color_lossy_image_subsample_method, (void *)1 },
104 { "color-lossless-image-subsample-method=average|bicubic", &opts.image.color_lossless_image_subsample_method, (void *)2 },
105 { "color-image-subsample-method=average|bicubic", &opts.image.color_lossy_image_subsample_method, (void *)3 },
106 { "color-lossy-image-subsample-dpi:", &opts.image.color_lossy_image_subsample_threshold, (void *)4 },
107 { "color-lossless-image-subsample-dpi:", &opts.image.color_lossless_image_subsample_threshold, (void *)5 },
108 { "color-image-subsample-dpi:", &opts.image.color_lossless_image_subsample_threshold, (void *)6 },
109 { "color-lossy-image-recompress-method=never|same|lossless|jpeg:|j2k:|fax|jbig2", &opts.image.color_lossy_image_recompress_method, (void *)7 },
110 { "color-lossless-image-recompress-method=never|same|lossless|jpeg:|j2k:|fax|jbig2", &opts.image.color_lossless_image_recompress_method, (void *)8 },
111 { "color-image-recompress-method=never|same|lossless|jpeg:|j2k:|fax|jbig2", &opts.image.color_lossless_image_recompress_method, (void *)9 },
112
113 { "gray-lossy-image-subsample-method=average|bicubic", &opts.image.gray_lossy_image_subsample_method, (void *)10 },
114 { "gray-lossless-image-subsample-method=average|bicubic", &opts.image.gray_lossless_image_subsample_method, (void *)11 },
115 { "gray-image-subsample-method=average|bicubic", &opts.image.gray_lossy_image_subsample_method, (void *)12 },
116 { "gray-lossy-image-subsample-dpi:", &opts.image.gray_lossy_image_subsample_threshold, (void *)13 },
117 { "gray-lossless-image-subsample-dpi:", &opts.image.gray_lossless_image_subsample_threshold, (void *)14 },
118 { "gray-image-subsample-dpi:", &opts.image.gray_lossless_image_subsample_threshold, (void *)15 },
119 { "gray-lossy-image-recompress-method=never|same|lossless|jpeg:|j2k:|fax|jbig2", &opts.image.gray_lossy_image_recompress_method, (void *)16 },
120 { "gray-lossless-image-recompress-method=never|same|lossless|jpeg:|j2k:|fax|jbig2", &opts.image.gray_lossless_image_recompress_method, (void *)17 },
121 { "gray-image-recompress-method=never|same|lossless|jpeg:|j2k:|fax|jbig2", &opts.image.gray_lossless_image_recompress_method, (void *)18 },
122
123 { "bitonal-image-subsample-method=average|bicubic", &opts.image.bitonal_image_subsample_method, (void *)19 },
124 { "bitonal-image-subsample-dpi:", &opts.image.bitonal_image_subsample_threshold, (void *)20 },
125 { "bitonal-image-recompress-method=never|same|lossless|jpeg:|j2k:|fax|jbig2", &opts.image.bitonal_image_recompress_method, (void *)21 },
126
127 { "structure=drop|keep", &structure, (void *)22 },
128
129 { NULL, NULL, NULL }
130 };
131
132
133 opts.write = pdf_default_write_options;
134 opts.write.dont_regenerate_id = 1;
135
136 while ((c = fz_getopt_long(argc, argv, "ade:fgilmp:stczDAE:LO:U:P:SZ", longopts)) != -1)
137 {
138 switch (c)
139 {
140 case 'p': password = fz_optarg; break;
141
142 case 'd': opts.write.do_decompress += 1; break;
143 case 'z': opts.write.do_compress += 1; break;
144 case 'f': opts.write.do_compress_fonts += 1; break;
145 case 'i': opts.write.do_compress_images += 1; break;
146 case 'a': opts.write.do_ascii += 1; break;
147 case 'e': opts.write.compression_effort = fz_atoi(fz_optarg); break;
148 case 'g': opts.write.do_garbage += 1; break;
149 case 'l': opts.write.do_linear += 1; break;
150 case 'c': opts.write.do_clean += 1; break;
151 case 's': opts.write.do_sanitize += 1; break;
152 case 't': pretty = (pretty < 0) ? 0 : 1; break;
153 case 'A': opts.write.do_appearance += 1; break;
154 case 'L': opts.write.do_labels = 1; break;
155
156 case 'D': opts.write.do_encrypt = PDF_ENCRYPT_NONE; break;
157 case 'E': opts.write.do_encrypt = encrypt_method_from_string(fz_optarg); break;
158 case 'P': opts.write.permissions = fz_atoi(fz_optarg); break;
159 case 'O': fz_strlcpy(opts.write.opwd_utf8, fz_optarg, sizeof opts.write.opwd_utf8); break;
160 case 'U': fz_strlcpy(opts.write.upwd_utf8, fz_optarg, sizeof opts.write.upwd_utf8); break;
161 case 'm': opts.write.do_preserve_metadata = 1; break;
162 case 'S': opts.subset_fonts = 1; break;
163 case 'Z': opts.write.do_use_objstms = 1; break;
164 case 0:
165 {
166 switch((int)(intptr_t)fz_optlong->opaque)
167 {
168 default:
169 case 0:
170 assert(!"Never happens");
171 break;
172
173 case 1: /* color-lossy-image-subsample-method */
174 case 2: /* color-lossless-image-subsample-method */
175 break;
176 case 3: /* color-image-subsample-method */
177 opts.image.color_lossless_image_subsample_method = opts.image.color_lossy_image_subsample_method;
178 break;
179 case 4: /* color-lossy-image-subsample-dpi */
180 opts.image.color_lossy_image_subsample_to = (fz_optarg ? fz_atoi(fz_optarg) : opts.image.color_lossy_image_subsample_threshold);
181 break;
182 case 5: /* color-lossless-image-subsample-dpi */
183 opts.image.color_lossless_image_subsample_to = (fz_optarg ? fz_atoi(fz_optarg) : opts.image.color_lossless_image_subsample_threshold);
184 break;
185 case 6: /* color-image-subsample-dpi */
186 opts.image.color_lossless_image_subsample_to = (fz_optarg ? fz_atoi(fz_optarg) : opts.image.color_lossless_image_subsample_threshold);
187 opts.image.color_lossy_image_subsample_threshold = opts.image.color_lossless_image_subsample_threshold;
188 opts.image.color_lossy_image_subsample_to = opts.image.color_lossless_image_subsample_to;
189 break;
190 case 7: /* color-lossy-image-recompress-method */
191 opts.image.color_lossless_image_recompress_quality = fz_optarg;
192 break;
193 case 8: /* color-lossless-image-recompress-method */
194 opts.image.color_lossy_image_recompress_quality = fz_optarg;
195 break;
196 case 9: /* color-image-recompress-method */
197 opts.image.color_lossless_image_recompress_quality = fz_optarg;
198 opts.image.color_lossy_image_recompress_method = opts.image.color_lossless_image_recompress_method;
199 opts.image.color_lossy_image_recompress_quality = opts.image.color_lossless_image_recompress_quality;
200 break;
201
202 case 10: /* gray-lossy-image-subsample-method */
203 case 11: /* gray-lossless-image-subsample-method */
204 break;
205 case 12: /* gray-image-subsample-method */
206 opts.image.gray_lossless_image_subsample_method = opts.image.gray_lossy_image_subsample_method;
207 break;
208 case 13: /* gray-lossy-image-subsample-dpi */
209 opts.image.gray_lossy_image_subsample_to = (fz_optarg ? fz_atoi(fz_optarg) : opts.image.gray_lossless_image_subsample_threshold);
210 break;
211 case 14: /* gray-lossless-image-subsample-dpi */
212 opts.image.gray_lossless_image_subsample_to = (fz_optarg ? fz_atoi(fz_optarg) : opts.image.gray_lossless_image_subsample_threshold);
213 break;
214 case 15: /* gray-image-subsample-dpi */
215 opts.image.gray_lossless_image_subsample_to = (fz_optarg ? fz_atoi(fz_optarg) : opts.image.gray_lossy_image_subsample_threshold);
216 opts.image.gray_lossy_image_subsample_threshold = opts.image.gray_lossless_image_subsample_threshold;
217 opts.image.gray_lossy_image_subsample_to = opts.image.gray_lossless_image_subsample_to;
218 break;
219 case 16: /* gray-lossy-image-recompress-method */
220 opts.image.gray_lossless_image_recompress_quality = fz_optarg;
221 break;
222 case 17: /* gray-lossless-image-recompress-method */
223 opts.image.gray_lossy_image_recompress_quality = fz_optarg;
224 break;
225 case 18: /* gray-image-recompress-method */
226 opts.image.gray_lossless_image_recompress_quality = fz_optarg;
227 opts.image.gray_lossy_image_recompress_method = opts.image.gray_lossless_image_recompress_method;
228 opts.image.gray_lossy_image_recompress_quality = opts.image.gray_lossless_image_recompress_quality;
229 break;
230
231 case 19: /* bitonal-image-subsample-method */
232 break;
233 case 20: /* bitonal-image-subsample-dpi */
234 opts.image.bitonal_image_subsample_to = (fz_optarg ? fz_atoi(fz_optarg) : opts.image.bitonal_image_subsample_threshold);
235 break;
236 case 21: /* bitonal-image-recompress-method */
237 opts.image.bitonal_image_recompress_quality = fz_optarg;
238 if (fz_optarg)
239 return usage();
240 break;
241 case 22: /* structure */
242 opts.structure = structure; /* Allow for int/enum size mismatch. */
243 break;
244 }
245 break;
246 }
247 default: return usage();
248 }
249 }
250
251 if (pretty < 0)
252 {
253 if ((opts.write.do_ascii || opts.write.do_decompress) && !opts.write.do_compress)
254 pretty = 1;
255 else
256 pretty = 0;
257 }
258 opts.write.do_pretty = pretty;
259
260 if (argc - fz_optind < 1)
261 return usage();
262
263 infile = argv[fz_optind++];
264
265 if (argc - fz_optind > 0 &&
266 (strstr(argv[fz_optind], ".pdf") || strstr(argv[fz_optind], ".PDF")))
267 {
268 outfile = fz_optpath(argv[fz_optind++]);
269 }
270
271 ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
272 if (!ctx)
273 {
274 fprintf(stderr, "cannot initialise context\n");
275 exit(1);
276 }
277
278 if (opts.write.do_compress > 1)
279 fz_warn(ctx, "Brotli compression is currently non-standard and experimental. Files may not be readable in other software.");
280
281 fz_try(ctx)
282 {
283 pdf_clean_file(ctx, infile, outfile, password, &opts, argc - fz_optind, &argv[fz_optind]);
284 }
285 fz_catch(ctx)
286 {
287 fz_report_error(ctx);
288 errors++;
289 }
290 fz_drop_context(ctx);
291
292 return errors != 0;
293 }