comparison mupdf-source/source/tools/pdfextract.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 /*
24 * pdfextract -- the ultimate way to extract images and fonts from pdfs
25 */
26
27 #include "mupdf/fitz.h"
28 #include "mupdf/pdf.h"
29
30 #include <stdlib.h>
31 #include <stdio.h>
32
33 static pdf_document *doc = NULL;
34 static fz_context *ctx = NULL;
35 static int dorgb = 0;
36 static int doalpha = 0;
37 static int doicc = 1;
38
/*
 * Print the command-line synopsis to stderr.
 *
 * Always returns 1, so callers can hand the result straight back as the
 * tool's exit status.
 */
static int usage(void)
{
	fputs(
		"usage: mutool extract [options] file.pdf [object numbers]\n"
		"\t-p\tpassword\n"
		"\t-r\tconvert images to rgb\n"
		"\t-a\tembed SMasks as alpha channel\n"
		"\t-N\tdo not use ICC color conversions\n",
		stderr);
	return 1;
}
48
49 static int isimage(pdf_obj *obj)
50 {
51 pdf_obj *type = pdf_dict_get(ctx, obj, PDF_NAME(Subtype));
52 return pdf_name_eq(ctx, type, PDF_NAME(Image));
53 }
54
55 static int isfontdesc(pdf_obj *obj)
56 {
57 pdf_obj *type = pdf_dict_get(ctx, obj, PDF_NAME(Type));
58 return pdf_name_eq(ctx, type, PDF_NAME(FontDescriptor));
59 }
60
61 static void writepixmap(fz_pixmap *pix, char *file)
62 {
63 char buf[1024];
64 fz_pixmap *rgb = NULL;
65
66 if (!pix)
67 return;
68
69 if (dorgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx))
70 {
71 rgb = fz_convert_pixmap(ctx, pix, fz_device_rgb(ctx), NULL, NULL, fz_default_color_params /* FIXME */, 1);
72 pix = rgb;
73 }
74
75 if (!pix->colorspace || pix->colorspace->type == FZ_COLORSPACE_GRAY || pix->colorspace->type == FZ_COLORSPACE_RGB)
76 {
77 fz_snprintf(buf, sizeof(buf), "%s.png", file);
78 printf("extracting %s\n", buf);
79 fz_save_pixmap_as_png(ctx, pix, buf);
80 }
81 else
82 {
83 fz_snprintf(buf, sizeof(buf), "%s.pam", file);
84 printf("extracting %s\n", buf);
85 fz_save_pixmap_as_pam(ctx, pix, buf);
86 }
87
88 fz_drop_pixmap(ctx, rgb);
89 }
90
91 static void
92 writejpeg(const unsigned char *data, size_t len, const char *file)
93 {
94 char buf[1024];
95 fz_output *out;
96
97 fz_snprintf(buf, sizeof(buf), "%s.jpg", file);
98
99 out = fz_new_output_with_path(ctx, buf, 0);
100 fz_try(ctx)
101 {
102 printf("extracting %s\n", buf);
103 fz_write_data(ctx, out, data, len);
104 fz_close_output(ctx, out);
105 }
106 fz_always(ctx)
107 fz_drop_output(ctx, out);
108 fz_catch(ctx)
109 fz_rethrow(ctx);
110 }
111
112 static void saveimage(pdf_obj *ref)
113 {
114 fz_image *image = NULL;
115 fz_pixmap *pix = NULL;
116 fz_pixmap *mask = NULL;
117 char buf[32];
118 fz_compressed_buffer *cbuf;
119 int type;
120
121 fz_var(image);
122 fz_var(mask);
123 fz_var(pix);
124
125 fz_try(ctx)
126 {
127 image = pdf_load_image(ctx, doc, ref);
128 cbuf = fz_compressed_image_buffer(ctx, image);
129 fz_snprintf(buf, sizeof(buf), "image-%04d", pdf_to_num(ctx, ref));
130 type = cbuf == NULL ? FZ_IMAGE_UNKNOWN : cbuf->params.type;
131
132 if (image->use_colorkey)
133 type = FZ_IMAGE_UNKNOWN;
134 if (image->use_decode)
135 type = FZ_IMAGE_UNKNOWN;
136 if (image->mask)
137 type = FZ_IMAGE_UNKNOWN;
138 if (dorgb)
139 {
140 enum fz_colorspace_type ctype = fz_colorspace_type(ctx, image->colorspace);
141 if (ctype != FZ_COLORSPACE_RGB && ctype != FZ_COLORSPACE_GRAY)
142 type = FZ_IMAGE_UNKNOWN;
143 }
144
145 if (type == FZ_IMAGE_JPEG)
146 {
147 unsigned char *data;
148 size_t len = fz_buffer_storage(ctx, cbuf->buffer, &data);
149 writejpeg(data, len, buf);
150 }
151 else
152 {
153 pix = fz_get_pixmap_from_image(ctx, image, NULL, NULL, 0, 0);
154 if (image->mask && doalpha)
155 {
156 mask = fz_get_pixmap_from_image(ctx, image->mask, NULL, NULL, 0, 0);
157 if (mask->w == pix->w && mask->h == pix->h)
158 {
159 fz_pixmap *apix = fz_new_pixmap_from_color_and_mask(ctx, pix, mask);
160 fz_drop_pixmap(ctx, pix);
161 pix = apix;
162 }
163 else
164 {
165 fz_warn(ctx, "cannot combine image with smask if different resolution");
166 }
167 }
168 writepixmap(pix, buf);
169 }
170 }
171 fz_always(ctx)
172 {
173 fz_drop_image(ctx, image);
174 fz_drop_pixmap(ctx, mask);
175 fz_drop_pixmap(ctx, pix);
176 }
177 fz_catch(ctx)
178 fz_rethrow(ctx);
179 }
180
181 static void savefont(pdf_obj *dict)
182 {
183 char namebuf[100];
184 fz_buffer *buf;
185 pdf_obj *stream = NULL;
186 pdf_obj *obj;
187 char *ext = "";
188 fz_output *out;
189 size_t len;
190 unsigned char *data;
191
192 obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile));
193 if (obj)
194 {
195 stream = obj;
196 ext = "pfa";
197 }
198
199 obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile2));
200 if (obj)
201 {
202 stream = obj;
203 ext = "ttf";
204 }
205
206 obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile3));
207 if (obj)
208 {
209 stream = obj;
210
211 obj = pdf_dict_get(ctx, obj, PDF_NAME(Subtype));
212 if (obj && !pdf_is_name(ctx, obj))
213 fz_throw(ctx, FZ_ERROR_FORMAT, "invalid font descriptor subtype");
214
215 if (pdf_name_eq(ctx, obj, PDF_NAME(Type1C)))
216 ext = "cff";
217 else if (pdf_name_eq(ctx, obj, PDF_NAME(CIDFontType0C)))
218 ext = "cid";
219 else if (pdf_name_eq(ctx, obj, PDF_NAME(OpenType)))
220 ext = "otf";
221 else
222 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unhandled font type '%s'", pdf_to_name(ctx, obj));
223 }
224
225 if (!stream)
226 {
227 return;
228 }
229
230 buf = pdf_load_stream(ctx, stream);
231 len = fz_buffer_storage(ctx, buf, &data);
232 fz_try(ctx)
233 {
234 fz_snprintf(namebuf, sizeof(namebuf), "font-%04d.%s", pdf_to_num(ctx, dict), ext);
235 printf("extracting %s\n", namebuf);
236 out = fz_new_output_with_path(ctx, namebuf, 0);
237 fz_try(ctx)
238 {
239 fz_write_data(ctx, out, data, len);
240 fz_close_output(ctx, out);
241 }
242 fz_always(ctx)
243 fz_drop_output(ctx, out);
244 fz_catch(ctx)
245 fz_rethrow(ctx);
246 }
247 fz_always(ctx)
248 fz_drop_buffer(ctx, buf);
249 fz_catch(ctx)
250 fz_rethrow(ctx);
251 }
252
253 static void extractobject(int num)
254 {
255 pdf_obj *ref = NULL;
256
257 fz_var(ref);
258
259 fz_try(ctx)
260 {
261 ref = pdf_new_indirect(ctx, doc, num, 0);
262 if (isimage(ref))
263 saveimage(ref);
264 if (isfontdesc(ref))
265 savefont(ref);
266
267 fz_empty_store(ctx);
268 }
269 fz_always(ctx)
270 pdf_drop_obj(ctx, ref);
271 fz_catch(ctx)
272 {
273 fz_report_error(ctx);
274 fz_warn(ctx, "ignoring object %d", num);
275 }
276 }
277
278 int pdfextract_main(int argc, char **argv)
279 {
280 char *infile;
281 char *password = "";
282 int c, o, ret = 0;
283
284 while ((c = fz_getopt(argc, argv, "p:raN")) != -1)
285 {
286 switch (c)
287 {
288 case 'p': password = fz_optarg; break;
289 case 'r': dorgb++; break;
290 case 'a': doalpha++; break;
291 case 'N': doicc^=1; break;
292 default: return usage();
293 }
294 }
295
296 if (fz_optind == argc)
297 return usage();
298
299 infile = argv[fz_optind++];
300
301 ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
302 if (!ctx)
303 {
304 fprintf(stderr, "cannot initialise context\n");
305 exit(1);
306 }
307
308 if (doicc)
309 fz_enable_icc(ctx);
310 else
311 fz_disable_icc(ctx);
312
313 fz_var(doc);
314
315 fz_try(ctx)
316 {
317 doc = pdf_open_document(ctx, infile);
318 if (pdf_needs_password(ctx, doc))
319 if (!pdf_authenticate_password(ctx, doc, password))
320 fz_throw(ctx, FZ_ERROR_ARGUMENT, "cannot authenticate password: %s", infile);
321
322 if (fz_optind == argc)
323 {
324 int len = pdf_count_objects(ctx, doc);
325 for (o = 1; o < len; o++)
326 extractobject(o);
327 }
328 else
329 {
330 while (fz_optind < argc)
331 {
332 extractobject(atoi(argv[fz_optind]));
333 fz_optind++;
334 }
335 }
336 }
337 fz_always(ctx)
338 pdf_drop_document(ctx, doc);
339 fz_catch(ctx)
340 {
341 fz_report_error(ctx);
342 ret = 1;
343 }
344
345 fz_flush_warnings(ctx);
346 fz_drop_context(ctx);
347
348 return ret;
349 }