diff mupdf-source/source/tools/pdfextract.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/tools/pdfextract.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,349 @@
+// Copyright (C) 2004-2024 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+/*
+ * pdfextract -- the ultimate way to extract images and fonts from pdfs
+ */
+
+#include "mupdf/fitz.h"
+#include "mupdf/pdf.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+static pdf_document *doc = NULL;
+static fz_context *ctx = NULL;
+static int dorgb = 0;
+static int doalpha = 0;
+static int doicc = 1;
+
+static int usage(void)
+{
+	fprintf(stderr, "usage: mutool extract [options] file.pdf [object numbers]\n");
+	fprintf(stderr, "\t-p\tpassword\n");
+	fprintf(stderr, "\t-r\tconvert images to rgb\n");
+	fprintf(stderr, "\t-a\tembed SMasks as alpha channel\n");
+	fprintf(stderr, "\t-N\tdo not use ICC color conversions\n");
+	return 1;
+}
+
+static int isimage(pdf_obj *obj)
+{
+	pdf_obj *type = pdf_dict_get(ctx, obj, PDF_NAME(Subtype));
+	return pdf_name_eq(ctx, type, PDF_NAME(Image));
+}
+
+static int isfontdesc(pdf_obj *obj)
+{
+	pdf_obj *type = pdf_dict_get(ctx, obj, PDF_NAME(Type));
+	return pdf_name_eq(ctx, type, PDF_NAME(FontDescriptor));
+}
+
+static void writepixmap(fz_pixmap *pix, char *file)
+{
+	char buf[1024];
+	fz_pixmap *rgb = NULL;
+
+	if (!pix)
+		return;
+
+	if (dorgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx))
+	{
+		rgb = fz_convert_pixmap(ctx, pix, fz_device_rgb(ctx), NULL, NULL, fz_default_color_params /* FIXME */, 1);
+		pix = rgb;
+	}
+
+	if (!pix->colorspace || pix->colorspace->type == FZ_COLORSPACE_GRAY || pix->colorspace->type == FZ_COLORSPACE_RGB)
+	{
+		fz_snprintf(buf, sizeof(buf), "%s.png", file);
+		printf("extracting %s\n", buf);
+		fz_save_pixmap_as_png(ctx, pix, buf);
+	}
+	else
+	{
+		fz_snprintf(buf, sizeof(buf), "%s.pam", file);
+		printf("extracting %s\n", buf);
+		fz_save_pixmap_as_pam(ctx, pix, buf);
+	}
+
+	fz_drop_pixmap(ctx, rgb);
+}
+
+static void
+writejpeg(const unsigned char *data, size_t len, const char *file)
+{
+	char buf[1024];
+	fz_output *out;
+
+	fz_snprintf(buf, sizeof(buf), "%s.jpg", file);
+
+	out = fz_new_output_with_path(ctx, buf, 0);
+	fz_try(ctx)
+	{
+		printf("extracting %s\n", buf);
+		fz_write_data(ctx, out, data, len);
+		fz_close_output(ctx, out);
+	}
+	fz_always(ctx)
+		fz_drop_output(ctx, out);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static void saveimage(pdf_obj *ref)
+{
+	fz_image *image = NULL;
+	fz_pixmap *pix = NULL;
+	fz_pixmap *mask = NULL;
+	char buf[32];
+	fz_compressed_buffer *cbuf;
+	int type;
+
+	fz_var(image);
+	fz_var(mask);
+	fz_var(pix);
+
+	fz_try(ctx)
+	{
+		image = pdf_load_image(ctx, doc, ref);
+		cbuf = fz_compressed_image_buffer(ctx, image);
+		fz_snprintf(buf, sizeof(buf), "image-%04d", pdf_to_num(ctx, ref));
+		type = cbuf == NULL ? FZ_IMAGE_UNKNOWN : cbuf->params.type;
+
+		if (image->use_colorkey)
+			type = FZ_IMAGE_UNKNOWN;
+		if (image->use_decode)
+			type = FZ_IMAGE_UNKNOWN;
+		if (image->mask)
+			type = FZ_IMAGE_UNKNOWN;
+		if (dorgb)
+		{
+			enum fz_colorspace_type ctype = fz_colorspace_type(ctx, image->colorspace);
+			if (ctype != FZ_COLORSPACE_RGB && ctype != FZ_COLORSPACE_GRAY)
+				type = FZ_IMAGE_UNKNOWN;
+		}
+
+		if (type == FZ_IMAGE_JPEG)
+		{
+			unsigned char *data;
+			size_t len = fz_buffer_storage(ctx, cbuf->buffer, &data);
+			writejpeg(data, len, buf);
+		}
+		else
+		{
+			pix = fz_get_pixmap_from_image(ctx, image, NULL, NULL, 0, 0);
+			if (image->mask && doalpha)
+			{
+				mask = fz_get_pixmap_from_image(ctx, image->mask, NULL, NULL, 0, 0);
+				if (mask->w == pix->w && mask->h == pix->h)
+				{
+					fz_pixmap *apix = fz_new_pixmap_from_color_and_mask(ctx, pix, mask);
+					fz_drop_pixmap(ctx, pix);
+					pix = apix;
+				}
+				else
+				{
+					fz_warn(ctx, "cannot combine image with smask if different resolution");
+				}
+			}
+			writepixmap(pix, buf);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_image(ctx, image);
+		fz_drop_pixmap(ctx, mask);
+		fz_drop_pixmap(ctx, pix);
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static void savefont(pdf_obj *dict)
+{
+	char namebuf[100];
+	fz_buffer *buf;
+	pdf_obj *stream = NULL;
+	pdf_obj *obj;
+	char *ext = "";
+	fz_output *out;
+	size_t len;
+	unsigned char *data;
+
+	obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile));
+	if (obj)
+	{
+		stream = obj;
+		ext = "pfa";
+	}
+
+	obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile2));
+	if (obj)
+	{
+		stream = obj;
+		ext = "ttf";
+	}
+
+	obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile3));
+	if (obj)
+	{
+		stream = obj;
+
+		obj = pdf_dict_get(ctx, obj, PDF_NAME(Subtype));
+		if (obj && !pdf_is_name(ctx, obj))
+			fz_throw(ctx, FZ_ERROR_FORMAT, "invalid font descriptor subtype");
+
+		if (pdf_name_eq(ctx, obj, PDF_NAME(Type1C)))
+			ext = "cff";
+		else if (pdf_name_eq(ctx, obj, PDF_NAME(CIDFontType0C)))
+			ext = "cid";
+		else if (pdf_name_eq(ctx, obj, PDF_NAME(OpenType)))
+			ext = "otf";
+		else
+			fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unhandled font type '%s'", pdf_to_name(ctx, obj));
+	}
+
+	if (!stream)
+	{
+		return;
+	}
+
+	buf = pdf_load_stream(ctx, stream);
+	len = fz_buffer_storage(ctx, buf, &data);
+	fz_try(ctx)
+	{
+		fz_snprintf(namebuf, sizeof(namebuf), "font-%04d.%s", pdf_to_num(ctx, dict), ext);
+		printf("extracting %s\n", namebuf);
+		out = fz_new_output_with_path(ctx, namebuf, 0);
+		fz_try(ctx)
+		{
+			fz_write_data(ctx, out, data, len);
+			fz_close_output(ctx, out);
+		}
+		fz_always(ctx)
+			fz_drop_output(ctx, out);
+		fz_catch(ctx)
+			fz_rethrow(ctx);
+	}
+	fz_always(ctx)
+		fz_drop_buffer(ctx, buf);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+static void extractobject(int num)
+{
+	pdf_obj *ref = NULL;
+
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		ref = pdf_new_indirect(ctx, doc, num, 0);
+		if (isimage(ref))
+			saveimage(ref);
+		if (isfontdesc(ref))
+			savefont(ref);
+
+		fz_empty_store(ctx);
+	}
+	fz_always(ctx)
+		pdf_drop_obj(ctx, ref);
+	fz_catch(ctx)
+	{
+		fz_report_error(ctx);
+		fz_warn(ctx, "ignoring object %d", num);
+	}
+}
+
+int pdfextract_main(int argc, char **argv)
+{
+	char *infile;
+	char *password = "";
+	int c, o, ret = 0;
+
+	while ((c = fz_getopt(argc, argv, "p:raN")) != -1)
+	{
+		switch (c)
+		{
+		case 'p': password = fz_optarg; break;
+		case 'r': dorgb++; break;
+		case 'a': doalpha++; break;
+		case 'N': doicc^=1; break;
+		default: return usage();
+		}
+	}
+
+	if (fz_optind == argc)
+		return usage();
+
+	infile = argv[fz_optind++];
+
+	ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
+	if (!ctx)
+	{
+		fprintf(stderr, "cannot initialise context\n");
+		exit(1);
+	}
+
+	if (doicc)
+		fz_enable_icc(ctx);
+	else
+		fz_disable_icc(ctx);
+
+	fz_var(doc);
+
+	fz_try(ctx)
+	{
+		doc = pdf_open_document(ctx, infile);
+		if (pdf_needs_password(ctx, doc))
+			if (!pdf_authenticate_password(ctx, doc, password))
+				fz_throw(ctx, FZ_ERROR_ARGUMENT, "cannot authenticate password: %s", infile);
+
+		if (fz_optind == argc)
+		{
+			int len = pdf_count_objects(ctx, doc);
+			for (o = 1; o < len; o++)
+				extractobject(o);
+		}
+		else
+		{
+			while (fz_optind < argc)
+			{
+				extractobject(atoi(argv[fz_optind]));
+				fz_optind++;
+			}
+		}
+	}
+	fz_always(ctx)
+		pdf_drop_document(ctx, doc);
+	fz_catch(ctx)
+	{
+		fz_report_error(ctx);
+		ret = 1;
+	}
+
+	fz_flush_warnings(ctx);
+	fz_drop_context(ctx);
+
+	return ret;
+}