Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/source/pdf/pdf-stream.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/pdf/pdf-stream.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,764 @@
+// Copyright (C) 2004-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+#include "mupdf/pdf.h"
+
+#include <string.h>
+
+/*
+ * Report whether object number 'num' in 'doc' has stream data attached.
+ *
+ * Returns 1 when the cached xref entry has a non-zero stream offset or an
+ * in-memory stream buffer, and 0 otherwise. Numbers outside the range
+ * 1..pdf_xref_len()-1 yield 0 without touching the xref.
+ *
+ * FZ_ERROR_TRYLATER and FZ_ERROR_SYSTEM raised while caching the object are
+ * rethrown; any other error is reported and treated as "not a stream".
+ */
+int
+pdf_obj_num_is_stream(fz_context *ctx, pdf_document *doc, int num)
+{
+	pdf_xref_entry *xref_entry;
+
+	if (num <= 0)
+		return 0;
+	if (num >= pdf_xref_len(ctx, doc))
+		return 0;
+
+	fz_try(ctx)
+	{
+		xref_entry = pdf_cache_object(ctx, doc, num);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
+		fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
+		fz_report_error(ctx);
+		return 0;
+	}
+
+	if (xref_entry->stm_ofs != 0)
+		return 1;
+	return xref_entry->stm_buf ? 1 : 0;
+}
+
+/*
+ * Report whether 'ref' refers to a stream object.
+ *
+ * Objects for which pdf_get_indirect_document() yields no document are
+ * never streams; otherwise the answer comes from pdf_obj_num_is_stream().
+ */
+int
+pdf_is_stream(fz_context *ctx, pdf_obj *ref)
+{
+	pdf_document *owner = pdf_get_indirect_document(ctx, ref);
+
+	if (owner == NULL)
+		return 0;
+	return pdf_obj_num_is_stream(ctx, owner, pdf_to_num(ctx, ref));
+}
+
+/*
+ * Check whether a stream dictionary names an explicit /Crypt filter.
+ *
+ * The /Filter (or abbreviated /F) entry is examined: it matches either when
+ * it is the name Crypt itself, or when it is an array holding the name Crypt
+ * at any position. Returns 1 on a match, 0 otherwise.
+ */
+static int
+pdf_stream_has_crypt(fz_context *ctx, pdf_obj *stm)
+{
+	pdf_obj *filter_entry = pdf_dict_geta(ctx, stm, PDF_NAME(Filter), PDF_NAME(F));
+	int idx, count;
+
+	if (!filter_entry)
+		return 0;
+	if (pdf_name_eq(ctx, filter_entry, PDF_NAME(Crypt)))
+		return 1;
+	if (!pdf_is_array(ctx, filter_entry))
+		return 0;
+
+	count = pdf_array_len(ctx, filter_entry);
+	for (idx = 0; idx < count; idx++)
+	{
+		pdf_obj *item = pdf_array_get(ctx, filter_entry, idx);
+		if (pdf_name_eq(ctx, item, PDF_NAME(Crypt)))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Load the JBIG2 globals held in the stream object 'dict'.
+ *
+ * An item previously stored for 'dict' is returned directly when
+ * pdf_find_item() finds one. Otherwise the stream is loaded, converted with
+ * fz_load_jbig2_globals() and, if that produced a result, stored for later
+ * lookups. 'dict' is marked while loading so that a cyclic reference is
+ * reported as FZ_ERROR_FORMAT; the mark is always cleared again.
+ */
+static fz_jbig2_globals *
+pdf_load_jbig2_globals(fz_context *ctx, pdf_obj *dict)
+{
+	fz_jbig2_globals *result;
+	fz_buffer *data = NULL;
+
+	fz_var(data);
+
+	result = pdf_find_item(ctx, fz_drop_jbig2_globals_imp, dict);
+	if (result != NULL)
+		return result;
+
+	if (pdf_mark_obj(ctx, dict))
+		fz_throw(ctx, FZ_ERROR_FORMAT, "cyclic reference when loading JBIG2 globals");
+
+	fz_try(ctx)
+	{
+		data = pdf_load_stream(ctx, dict);
+		result = fz_load_jbig2_globals(ctx, data);
+		if (result != NULL)
+		{
+			size_t cost = fz_buffer_storage(ctx, data, NULL);
+			pdf_store_item(ctx, dict, result, cost);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, data);
+		pdf_unmark_obj(ctx, dict);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	return result;
+}
+
+/*
+ * Describe the filter named 'f', with decode parameter dictionary 'p', as
+ * an fz_compression_params record.
+ *
+ * params->type stays FZ_IMAGE_RAW when 'f' is none of the filter names
+ * handled here. For each recognised filter the matching union member is
+ * filled from 'p', using the defaults given in the calls below. For
+ * JBIG2Decode the /JBIG2Globals stream is loaded when present and valid.
+ */
+static void
+build_compression_params(fz_context *ctx, pdf_obj *f, pdf_obj *p, fz_compression_params *params)
+{
+	/* Start from "not recognised"; a matching name below overrides it. */
+	params->type = FZ_IMAGE_RAW;
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(CCITTFaxDecode)) || pdf_name_eq(ctx, f, PDF_NAME(CCF)))
+	{
+		params->type = FZ_IMAGE_FAX;
+		params->u.fax.k = pdf_dict_get_int_default(ctx, p, PDF_NAME(K), 0);
+		params->u.fax.end_of_line = pdf_dict_get_bool_default(ctx, p, PDF_NAME(EndOfLine), 0);
+		params->u.fax.encoded_byte_align = pdf_dict_get_bool_default(ctx, p, PDF_NAME(EncodedByteAlign), 0);
+		params->u.fax.columns = pdf_dict_get_int_default(ctx, p, PDF_NAME(Columns), 1728);
+		params->u.fax.rows = pdf_dict_get_int_default(ctx, p, PDF_NAME(Rows), 0);
+		params->u.fax.end_of_block = pdf_dict_get_bool_default(ctx, p, PDF_NAME(EndOfBlock), 1);
+		params->u.fax.black_is_1 = pdf_dict_get_bool_default(ctx, p, PDF_NAME(BlackIs1), 0);
+		return;
+	}
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(DCTDecode)) || pdf_name_eq(ctx, f, PDF_NAME(DCT)))
+	{
+		params->type = FZ_IMAGE_JPEG;
+		params->u.jpeg.color_transform = pdf_dict_get_int_default(ctx, p, PDF_NAME(ColorTransform), -1);
+		params->u.jpeg.invert_cmyk = 0;
+		return;
+	}
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(RunLengthDecode)) || pdf_name_eq(ctx, f, PDF_NAME(RL)))
+	{
+		params->type = FZ_IMAGE_RLD;
+		return;
+	}
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(FlateDecode)) || pdf_name_eq(ctx, f, PDF_NAME(Fl)))
+	{
+		params->type = FZ_IMAGE_FLATE;
+		params->u.flate.predictor = pdf_dict_get_int_default(ctx, p, PDF_NAME(Predictor), 1);
+		params->u.flate.columns = pdf_dict_get_int_default(ctx, p, PDF_NAME(Columns), 1);
+		params->u.flate.colors = pdf_dict_get_int_default(ctx, p, PDF_NAME(Colors), 1);
+		params->u.flate.bpc = pdf_dict_get_int_default(ctx, p, PDF_NAME(BitsPerComponent), 8);
+		return;
+	}
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(BrotliDecode)) || pdf_name_eq(ctx, f, PDF_NAME(Br)))
+	{
+		params->type = FZ_IMAGE_BROTLI;
+		params->u.brotli.predictor = pdf_dict_get_int_default(ctx, p, PDF_NAME(Predictor), 1);
+		params->u.brotli.columns = pdf_dict_get_int_default(ctx, p, PDF_NAME(Columns), 1);
+		params->u.brotli.colors = pdf_dict_get_int_default(ctx, p, PDF_NAME(Colors), 1);
+		params->u.brotli.bpc = pdf_dict_get_int_default(ctx, p, PDF_NAME(BitsPerComponent), 8);
+		return;
+	}
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(LZWDecode)) || pdf_name_eq(ctx, f, PDF_NAME(LZW)))
+	{
+		params->type = FZ_IMAGE_LZW;
+		params->u.lzw.predictor = pdf_dict_get_int_default(ctx, p, PDF_NAME(Predictor), 1);
+		params->u.lzw.columns = pdf_dict_get_int_default(ctx, p, PDF_NAME(Columns), 1);
+		params->u.lzw.colors = pdf_dict_get_int_default(ctx, p, PDF_NAME(Colors), 1);
+		params->u.lzw.bpc = pdf_dict_get_int_default(ctx, p, PDF_NAME(BitsPerComponent), 8);
+		params->u.lzw.early_change = pdf_dict_get_int_default(ctx, p, PDF_NAME(EarlyChange), 1);
+		return;
+	}
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(JBIG2Decode)))
+	{
+		pdf_obj *globals_ref = pdf_dict_get(ctx, p, PDF_NAME(JBIG2Globals));
+
+		params->type = FZ_IMAGE_JBIG2;
+		params->u.jbig2.globals = NULL;
+		/* jbig2 streams are always embedded without file headers */
+		params->u.jbig2.embedded = 1;
+		if (globals_ref == NULL)
+			return;
+		if (pdf_is_stream(ctx, globals_ref))
+			params->u.jbig2.globals = pdf_load_jbig2_globals(ctx, globals_ref);
+		else
+			fz_warn(ctx, "jbig2 globals is not a stream, skipping globals");
+	}
+}
+
+/*
+ * Build the decoding stream for a single filter.
+ *
+ * 'f' is the filter name and 'p' its parameter dictionary; 'chain' is the
+ * stream to read encoded data from. The returned stream is a new reference
+ * in every case (either a new filter or a kept reference to 'chain'); the
+ * caller's reference to 'chain' is not consumed.
+ *
+ * If 'params' is supplied and the filter is a recognised compression type,
+ * no decoder is inserted: 'params' is filled in and 'chain' is returned so
+ * the caller can decode later. 'num' and 'gen' are only passed on to the
+ * Crypt filter. When 'might_be_image' is zero, image-only filters produce a
+ * warning and an empty stream.
+ */
+static fz_stream *
+build_filter(fz_context *ctx, fz_stream *chain, pdf_document *doc, pdf_obj *f, pdf_obj *p, int num, int gen, fz_compression_params *params, int might_be_image)
+{
+	fz_compression_params scratch;
+
+	/* The JBIG2 branch drops scratch's globals, so they must start NULL. */
+	scratch.u.jbig2.globals = NULL;
+	if (params == NULL)
+		params = &scratch;
+
+	if (!might_be_image)
+	{
+		if (pdf_name_eq(ctx, f, PDF_NAME(CCITTFaxDecode)) ||
+			pdf_name_eq(ctx, f, PDF_NAME(CCF)) ||
+			pdf_name_eq(ctx, f, PDF_NAME(DCTDecode)) ||
+			pdf_name_eq(ctx, f, PDF_NAME(DCT)) ||
+			pdf_name_eq(ctx, f, PDF_NAME(JBIG2Decode)) ||
+			pdf_name_eq(ctx, f, PDF_NAME(JPXDecode)))
+		{
+			fz_warn(ctx, "Can't open image only stream for non-image purposes");
+			return fz_open_memory(ctx, (unsigned char *)"", 0);
+		}
+	}
+
+	build_compression_params(ctx, f, p, params);
+
+	if (params->type != FZ_IMAGE_RAW)
+	{
+		/* Caller-supplied params and a recognised type: hand back the
+		 * undecoded stream and let the caller decode it. */
+		if (params != &scratch)
+			return fz_keep_stream(ctx, chain);
+
+		if (params->type == FZ_IMAGE_JBIG2)
+		{
+			fz_stream *decoded;
+			fz_try(ctx)
+			{
+				decoded = fz_open_image_decomp_stream(ctx, chain, params, NULL);
+			}
+			fz_always(ctx)
+			{
+				fz_drop_jbig2_globals(ctx, scratch.u.jbig2.globals);
+			}
+			fz_catch(ctx)
+			{
+				fz_rethrow(ctx);
+			}
+			return decoded;
+		}
+
+		return fz_open_image_decomp_stream(ctx, chain, params, NULL);
+	}
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(ASCIIHexDecode)) || pdf_name_eq(ctx, f, PDF_NAME(AHx)))
+		return fz_open_ahxd(ctx, chain);
+
+	if (pdf_name_eq(ctx, f, PDF_NAME(ASCII85Decode)) || pdf_name_eq(ctx, f, PDF_NAME(A85)))
+		return fz_open_a85d(ctx, chain);
+
+	/* JPX decoding is special cased in the image loading code */
+	if (pdf_name_eq(ctx, f, PDF_NAME(JPXDecode)))
+		return fz_keep_stream(ctx, chain);
+
+	if (!pdf_name_eq(ctx, f, PDF_NAME(Crypt)))
+	{
+		fz_warn(ctx, "unknown filter name (%s)", pdf_to_name(ctx, f));
+	}
+	else if (!doc->crypt)
+	{
+		fz_warn(ctx, "crypt filter in unencrypted document");
+	}
+	else
+	{
+		pdf_obj *crypt_name = pdf_dict_get(ctx, p, PDF_NAME(Name));
+		if (pdf_is_name(ctx, crypt_name))
+			return pdf_open_crypt_with_filter(ctx, chain, doc->crypt, crypt_name, num, gen);
+	}
+
+	/* Nothing usable was built: pass the data through unchanged. */
+	return fz_keep_stream(ctx, chain);
+}
+
+/*
+ * Same as build_filter(), except that the caller's reference to 'tail' is
+ * consumed: it is dropped whether or not building the filter succeeds.
+ */
+static fz_stream *
+build_filter_drop(fz_context *ctx, fz_stream *tail, pdf_document *doc, pdf_obj *f, pdf_obj *p, int num, int gen, fz_compression_params *params, int might_be_image)
+{
+	fz_stream *filtered;
+
+	fz_try(ctx)
+	{
+		filtered = build_filter(ctx, tail, doc, f, p, num, gen, params, might_be_image);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_stream(ctx, tail);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	return filtered;
+}
+
+/*
+ * Build a chain of filters given filter names and param dicts.
+ * If chain is given, start filter chain with it.
+ * Assume ownership of chain.
+ *
+ * 'fs' is the array of filter names and 'ps' the parallel array of
+ * parameter dictionaries. 'params' is only offered to the last filter in
+ * the array; all earlier filters are built with NULL params.
+ *
+ * On error the stream owned at that point is released before rethrowing,
+ * so the caller never has to clean up 'chain' itself.
+ */
+static fz_stream *
+build_filter_chain_drop(fz_context *ctx, fz_stream *chain, pdf_document *doc, pdf_obj *fs, pdf_obj *ps, int num, int gen, fz_compression_params *params, int might_be_image)
+{
+	fz_var(chain);
+	fz_try(ctx)
+	{
+		int i, n = pdf_array_len(ctx, fs);
+		for (i = 0; i < n; i++)
+		{
+			pdf_obj *f = pdf_array_get(ctx, fs, i);
+			pdf_obj *p = pdf_array_get(ctx, ps, i);
+			fz_stream *tail = chain;
+
+			/* build_filter_drop() drops 'tail' even when it throws, so
+			 * forget our pointer first to avoid a double drop below. */
+			chain = NULL;
+			chain = build_filter_drop(ctx, tail, doc, f, p, num, gen, (i == n-1 ? params : NULL), might_be_image);
+		}
+	}
+	fz_catch(ctx)
+	{
+		/* Still non-NULL only if the failure happened while we held the
+		 * reference (e.g. while reading the filter arrays). */
+		fz_drop_stream(ctx, chain);
+		fz_rethrow(ctx);
+	}
+	return chain;
+}
+
+/*
+ * Build a filter chain on top of 'chain' without consuming the caller's
+ * reference: an extra reference is taken and handed to
+ * build_filter_chain_drop().
+ */
+static fz_stream *
+build_filter_chain(fz_context *ctx, fz_stream *chain, pdf_document *doc, pdf_obj *fs, pdf_obj *ps, int num, int gen, fz_compression_params *params, int might_be_image)
+{
+	fz_stream *kept = fz_keep_stream(ctx, chain);
+
+	return build_filter_chain_drop(ctx, kept, doc, fs, ps, num, gen, params, might_be_image);
+}
+
+/*
+ * Open a stream delivering the raw (still filter-encoded) data of a stream
+ * object.
+ *
+ * If the xref entry for 'num' holds an in-memory buffer, that buffer is
+ * opened directly. Otherwise the result is an endstream filter over
+ * 'file_stm' starting at 'offset' and limited by the dictionary's /Length
+ * (negative lengths are treated as 0), wrapped in a decryption filter when
+ * the document is encrypted and the stream has no explicit /Crypt filter.
+ *
+ * orig_num and orig_gen are outputs; they are used purely to seed the
+ * encryption.
+ */
+static fz_stream *
+pdf_open_raw_filter(fz_context *ctx, fz_stream *file_stm, pdf_document *doc, pdf_obj *stmobj, int num, int *orig_num, int *orig_gen, int64_t offset)
+{
+	pdf_xref_entry *entry = NULL;
+	fz_stream *limited, *decrypted;
+	int explicit_crypt;
+	int64_t declared_len;
+
+	if (num > 0 && num < pdf_xref_len(ctx, doc))
+		entry = pdf_get_xref_entry(ctx, doc, num);
+
+	if (entry != NULL)
+	{
+		*orig_num = entry->num;
+		*orig_gen = entry->gen;
+		if (entry->stm_buf)
+			return fz_open_buffer(ctx, entry->stm_buf);
+	}
+	else
+	{
+		/* We only end up here when called from pdf_open_stream_with_offset to parse new format XRef sections. */
+		/* New style XRef sections must have generation number 0. */
+		*orig_num = num;
+		*orig_gen = 0;
+	}
+
+	explicit_crypt = pdf_stream_has_crypt(ctx, stmobj);
+	declared_len = pdf_dict_get_int64(ctx, stmobj, PDF_NAME(Length));
+	if (declared_len < 0)
+		declared_len = 0;
+	limited = fz_open_endstream_filter(ctx, file_stm, (uint64_t)declared_len, offset);
+
+	/* No document-level decryption needed (or the stream handles it
+	 * itself through an explicit /Crypt filter). */
+	if (!doc->crypt || explicit_crypt)
+		return limited;
+
+	fz_try(ctx)
+	{
+		decrypted = pdf_open_crypt(ctx, limited, doc->crypt, *orig_num, *orig_gen);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_stream(ctx, limited);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+	return decrypted;
+}
+
+/*
+ * Open a stream that decodes a stream object: the raw filter (length
+ * constraint plus decryption) with the filters named by /Filter (or /F)
+ * stacked on top, using /DecodeParms (or /DP) as their parameters.
+ *
+ * When the dictionary names no filter, the raw stream is returned and
+ * 'imparams', if given, is marked FZ_IMAGE_RAW.
+ */
+static fz_stream *
+pdf_open_filter(fz_context *ctx, pdf_document *doc, fz_stream *file_stm, pdf_obj *stmobj, int num, int64_t offset, fz_compression_params *imparams, int might_be_image)
+{
+	pdf_obj *filters = pdf_dict_geta(ctx, stmobj, PDF_NAME(Filter), PDF_NAME(F));
+	pdf_obj *params = pdf_dict_geta(ctx, stmobj, PDF_NAME(DecodeParms), PDF_NAME(DP));
+	int seed_num, seed_gen;
+	fz_stream *raw, *decoded;
+
+	raw = pdf_open_raw_filter(ctx, file_stm, doc, stmobj, num, &seed_num, &seed_gen, offset);
+	fz_try(ctx)
+	{
+		if (pdf_is_name(ctx, filters))
+		{
+			decoded = build_filter(ctx, raw, doc, filters, params, seed_num, seed_gen, imparams, might_be_image);
+		}
+		else if (pdf_array_len(ctx, filters) <= 0)
+		{
+			/* No filters at all: the raw data is the decoded data. */
+			if (imparams != NULL)
+				imparams->type = FZ_IMAGE_RAW;
+			decoded = fz_keep_stream(ctx, raw);
+		}
+		else
+		{
+			decoded = build_filter_chain(ctx, raw, doc, filters, params, seed_num, seed_gen, imparams, might_be_image);
+		}
+	}
+	fz_always(ctx)
+	{
+		/* Every branch above took its own reference to 'raw'. */
+		fz_drop_stream(ctx, raw);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	return decoded;
+}
+
+/*
+ * Open a decoding stream for inline stream data read from 'file_stm'.
+ *
+ * Filters named in 'stmobj' are stacked directly on 'file_stm' (with object
+ * number and generation 0, and image filters allowed). Without any filter,
+ * 'imparams' (if given) is marked FZ_IMAGE_RAW and a null filter limited to
+ * 'length' bytes from the current position of 'file_stm' is returned.
+ */
+fz_stream *
+pdf_open_inline_stream(fz_context *ctx, pdf_document *doc, pdf_obj *stmobj, int length, fz_stream *file_stm, fz_compression_params *imparams)
+{
+	pdf_obj *filters = pdf_dict_geta(ctx, stmobj, PDF_NAME(Filter), PDF_NAME(F));
+	pdf_obj *params = pdf_dict_geta(ctx, stmobj, PDF_NAME(DecodeParms), PDF_NAME(DP));
+
+	if (pdf_is_name(ctx, filters))
+		return build_filter(ctx, file_stm, doc, filters, params, 0, 0, imparams, 1);
+
+	if (pdf_array_len(ctx, filters) <= 0)
+	{
+		if (imparams != NULL)
+			imparams->type = FZ_IMAGE_RAW;
+		return fz_open_null_filter(ctx, file_stm, length, fz_tell(ctx, file_stm));
+	}
+
+	return build_filter_chain(ctx, file_stm, doc, filters, params, 0, 0, imparams, 1);
+}
+
+/*
+ * Read an inline image from 'file_stm' and attach its compressed data to
+ * 'image'.
+ *
+ * The inline stream is opened with the compression parameters captured in a
+ * new compressed buffer, routed through a leecher stream that is given the
+ * buffer (presumably recording the bytes read - confirm against
+ * fz_open_leecher), and decoded once. The decoded pixmap is only a
+ * by-product and is dropped; the compressed buffer is handed to 'image'.
+ * On error the compressed buffer is dropped and the error rethrown.
+ */
+void
+pdf_load_compressed_inline_image(fz_context *ctx, pdf_document *doc, pdf_obj *dict, int length, fz_stream *file_stm, int indexed, fz_compressed_image *image)
+{
+	fz_stream *inline_stm = NULL;
+	fz_stream *recorder = NULL;
+	fz_stream *decoder = NULL;
+	fz_pixmap *decoded = NULL;
+	fz_compressed_buffer *cbuf;
+	int unused_l2factor = 0;
+
+	fz_var(inline_stm);
+	fz_var(recorder);
+	fz_var(decoder);
+	fz_var(decoded);
+
+	cbuf = fz_new_compressed_buffer(ctx);
+	fz_try(ctx)
+	{
+		cbuf->buffer = fz_new_buffer(ctx, 1024);
+		inline_stm = pdf_open_inline_stream(ctx, doc, dict, length, file_stm, &cbuf->params);
+		recorder = fz_open_leecher(ctx, inline_stm, cbuf->buffer);
+		decoder = fz_open_image_decomp_stream(ctx, recorder, &cbuf->params, &unused_l2factor);
+		decoded = fz_decomp_image_from_stream(ctx, decoder, image, NULL, indexed, 0, NULL);
+		fz_set_compressed_image_buffer(ctx, image, cbuf);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_stream(ctx, inline_stm);
+		fz_drop_stream(ctx, recorder);
+		fz_drop_stream(ctx, decoder);
+		fz_drop_pixmap(ctx, decoded);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_compressed_buffer(ctx, cbuf);
+		fz_rethrow(ctx);
+	}
+}
+
+/*
+ * Open the raw (undecoded, but decrypted) data of stream object 'num'.
+ *
+ * An object counts as a stream when its xref entry has either a file
+ * offset for the stream data or an in-memory stream buffer; this is the
+ * same test pdf_obj_num_is_stream() and pdf_open_image_stream() apply.
+ * pdf_open_raw_filter() serves the in-memory buffer when there is one, so
+ * an entry with a buffer but no file offset is opened from that buffer.
+ *
+ * Throws FZ_ERROR_FORMAT if the object has no stream data at all.
+ */
+fz_stream *
+pdf_open_raw_stream_number(fz_context *ctx, pdf_document *doc, int num)
+{
+	pdf_xref_entry *x;
+	int orig_num, orig_gen;
+
+	x = pdf_cache_object(ctx, doc, num);
+	if (x->stm_ofs == 0 && x->stm_buf == NULL)
+		fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
+
+	/* orig_num/orig_gen are outputs of the raw filter that are not
+	 * needed by this caller. */
+	return pdf_open_raw_filter(ctx, doc->file, doc, x->obj, num, &orig_num, &orig_gen, x->stm_ofs);
+}
+
+/*
+ * Open the decoded data of stream object 'num', optionally shortstopping
+ * image compression into 'params' (see build_filter).
+ *
+ * Throws FZ_ERROR_FORMAT when the xref entry has neither a stream offset
+ * nor an in-memory stream buffer.
+ */
+static fz_stream *
+pdf_open_image_stream(fz_context *ctx, pdf_document *doc, int num, fz_compression_params *params, int might_be_image)
+{
+	pdf_xref_entry *entry = pdf_cache_object(ctx, doc, num);
+	int has_stream_data = (entry->stm_ofs != 0 || entry->stm_buf != NULL);
+
+	if (!has_stream_data)
+		fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
+
+	return pdf_open_filter(ctx, doc, doc->file, entry->obj, num, entry->stm_ofs, params, might_be_image);
+}
+
+/*
+ * Open the fully decoded data of stream object 'num'. No compression
+ * parameters are captured and image filters are permitted.
+ */
+fz_stream *
+pdf_open_stream_number(fz_context *ctx, pdf_document *doc, int num)
+{
+	fz_stream *decoded = pdf_open_image_stream(ctx, doc, num, NULL, 1);
+
+	return decoded;
+}
+
+/*
+ * Open the decoded data of a stream whose dictionary 'dict' and file offset
+ * 'stm_ofs' are supplied by the caller instead of being read from the
+ * xref. Throws FZ_ERROR_FORMAT when 'stm_ofs' is 0.
+ */
+fz_stream *
+pdf_open_stream_with_offset(fz_context *ctx, pdf_document *doc, int num, pdf_obj *dict, int64_t stm_ofs)
+{
+	if (stm_ofs != 0)
+		return pdf_open_filter(ctx, doc, doc->file, dict, num, stm_ofs, NULL, 1);
+
+	fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
+}
+
+/*
+ * Load the raw (undecoded) data of stream object 'num' into a buffer.
+ *
+ * If the xref entry already holds an in-memory stream buffer, a new
+ * reference to it is returned. Otherwise the raw stream is read in full,
+ * using the dictionary's /Length as the initial size (1024 if /Length is
+ * negative).
+ */
+fz_buffer *
+pdf_load_raw_stream_number(fz_context *ctx, pdf_document *doc, int num)
+{
+	fz_stream *raw;
+	pdf_obj *stmobj;
+	int64_t size_hint;
+	fz_buffer *data = NULL;
+
+	if (num > 0 && num < pdf_xref_len(ctx, doc))
+	{
+		pdf_xref_entry *cached = pdf_get_xref_entry_no_null(ctx, doc, num);
+		if (cached->stm_buf)
+			return fz_keep_buffer(ctx, cached->stm_buf);
+	}
+
+	stmobj = pdf_load_object(ctx, doc, num);
+
+	fz_try(ctx)
+	{
+		size_hint = pdf_dict_get_int64(ctx, stmobj, PDF_NAME(Length));
+	}
+	fz_always(ctx)
+	{
+		pdf_drop_obj(ctx, stmobj);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	if (size_hint < 0)
+		size_hint = 1024;
+
+	raw = pdf_open_raw_stream_number(ctx, doc, num);
+
+	fz_try(ctx)
+	{
+		data = fz_read_all(ctx, raw, (size_t)size_hint);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_stream(ctx, raw);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	return data;
+}
+
+/*
+ * Estimate the size of the data produced by running 'len' bytes through
+ * the filter called 'filter' (full filter name, e.g. "FlateDecode").
+ *
+ * ASCIIHexDecode and ASCII85Decode shrink the data (to 1/2 and 4/5); the
+ * compression filters are assumed to expand it by a fixed factor. Unknown
+ * names leave the estimate unchanged. The result is only a sizing hint: if
+ * the expanded estimate would not fit in a size_t, 'len' itself is
+ * returned, and the reader will grow its buffer as it goes.
+ */
+static size_t
+pdf_guess_filter_length(size_t len, const char *filter)
+{
+	size_t factor;
+
+	/* These get smaller. The ASCII85 ratio is computed as quotient plus
+	 * remainder so that it equals floor(len * 4 / 5) without ever forming
+	 * len * 4, which could wrap. */
+	if (!strcmp(filter, "ASCIIHexDecode"))
+		return len / 2;
+	if (!strcmp(filter, "ASCII85Decode"))
+		return (len / 5) * 4 + ((len % 5) * 4) / 5;
+
+	if (!strcmp(filter, "FlateDecode"))
+		factor = 3;
+	else if (!strcmp(filter, "BrotliDecode"))
+		factor = 4;
+	else if (!strcmp(filter, "RunLengthDecode"))
+		factor = 3;
+	else if (!strcmp(filter, "LZWDecode"))
+		factor = 2;
+	else
+		return len;
+
+	/* Live with a bad estimate rather than a wrapped one. Checking before
+	 * multiplying catches every overflow; comparing the product with len
+	 * afterwards misses wraps that land above len. */
+	if (len > (size_t)-1 / factor)
+		return len;
+
+	return len * factor;
+}
+
+/* Check if an entry has a cached stream and return whether it is directly
+ * reusable. A buffer is directly reusable only if the stream is
+ * uncompressed, or if it is compressed purely by a compression method we
+ * can return details of in fz_compression_params.
+ *
+ * The filter may be given as a single name or as an array holding exactly
+ * one name; in the array form the first element of the decode parameter
+ * array is used as its parameters.
+ *
+ * If the stream is reusable return 1, and set params as required, otherwise
+ * return 0. */
+static int
+can_reuse_buffer(fz_context *ctx, pdf_xref_entry *entry, fz_compression_params *params)
+{
+	pdf_obj *f;
+	pdf_obj *p;
+
+	if (!entry || !entry->obj || !entry->stm_buf)
+		return 0;
+
+	if (params)
+		params->type = FZ_IMAGE_RAW;
+
+	f = pdf_dict_geta(ctx, entry->obj, PDF_NAME(Filter), PDF_NAME(F));
+	/* If there are no filters, it's uncompressed, and we can use it */
+	if (!f)
+		return 1;
+
+	p = pdf_dict_geta(ctx, entry->obj, PDF_NAME(DecodeParms), PDF_NAME(DP));
+	if (pdf_is_array(ctx, f))
+	{
+		int len = pdf_array_len(ctx, f);
+
+		/* Empty array of filters. It's uncompressed. We can cope. */
+		if (len == 0)
+			return 1;
+		/* 1 filter is the most we can hope to cope with - if there
+		 * are more, the buffer cannot be reused. */
+		if (len != 1)
+			return 0;
+		/* Unwrap the single filter together with its parameters, so
+		 * that the name checks below see the filter itself rather
+		 * than the enclosing array. */
+		f = pdf_array_get(ctx, f, 0);
+		p = pdf_array_get(ctx, p, 0);
+	}
+	if (pdf_is_null(ctx, f))
+		return 1; /* Null filter is uncompressed */
+	if (!pdf_is_name(ctx, f))
+		return 0;
+
+	/* There are filters, so unless we have the option of shortstopping,
+	 * we can't use the existing buffer. */
+	if (!params)
+		return 0;
+
+	build_compression_params(ctx, f, p, params);
+
+	return (params->type == FZ_IMAGE_RAW) ? 0 : 1;
+}
+
+/*
+ * Load the data of stream object 'num' into a buffer.
+ *
+ * When 'params' is given, recognised image compression is left in place
+ * and described in 'params' instead of being decoded (see build_filter).
+ * A cached in-memory buffer is returned directly when can_reuse_buffer()
+ * allows it. Otherwise the stream is opened and read with fz_read_best(),
+ * starting from a size estimate derived from /Length and the names found
+ * under /Filter; 'truncated' and 'worst_case' are passed through to
+ * fz_read_best().
+ *
+ * Throws FZ_ERROR_LIMIT if /Length does not fit in a size_t.
+ */
+static fz_buffer *
+pdf_load_image_stream(fz_context *ctx, pdf_document *doc, int num, fz_compression_params *params, int *truncated, size_t worst_case)
+{
+	fz_stream *stm = NULL;
+	pdf_obj *stmobj, *filter;
+	int idx, count;
+	size_t estimate;
+	fz_buffer *data;
+
+	fz_var(data);
+
+	if (num > 0 && num < pdf_xref_len(ctx, doc))
+	{
+		pdf_xref_entry *cached = pdf_get_xref_entry(ctx, doc, num);
+		/* Return ref to existing buffer, but only if uncompressed,
+		 * or shortstoppable */
+		if (can_reuse_buffer(ctx, cached, params))
+			return fz_keep_buffer(ctx, cached->stm_buf);
+	}
+
+	stmobj = pdf_load_object(ctx, doc, num);
+	fz_try(ctx)
+	{
+		int64_t declared = pdf_dict_get_int64(ctx, stmobj, PDF_NAME(Length));
+		if (declared < 0)
+			declared = 0;
+		estimate = (size_t)declared;
+		/* In 32 bit builds, we might find a length being too
+		 * large for a size_t. */
+		if ((int64_t)estimate != declared)
+			fz_throw(ctx, FZ_ERROR_LIMIT, "Stream too large");
+
+		/* Scale the estimate for a single filter name first, then for
+		 * each entry of a filter array. */
+		filter = pdf_dict_get(ctx, stmobj, PDF_NAME(Filter));
+		estimate = pdf_guess_filter_length(estimate, pdf_to_name(ctx, filter));
+		count = pdf_array_len(ctx, filter);
+		for (idx = 0; idx < count; idx++)
+		{
+			const char *name = pdf_array_get_name(ctx, filter, idx);
+			estimate = pdf_guess_filter_length(estimate, name);
+		}
+	}
+	fz_always(ctx)
+	{
+		pdf_drop_obj(ctx, stmobj);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	stm = pdf_open_image_stream(ctx, doc, num, params, 1);
+
+	fz_try(ctx)
+	{
+		data = fz_read_best(ctx, stm, estimate, truncated, worst_case);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_stream(ctx, stm);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	return data;
+}
+
+/*
+ * Load the fully decoded data of stream object 'num' into a buffer. No
+ * compression parameters are captured, no truncation flag is requested and
+ * the worst-case size passed on is 0.
+ */
+fz_buffer *
+pdf_load_stream_number(fz_context *ctx, pdf_document *doc, int num)
+{
+	fz_buffer *decoded = pdf_load_image_stream(ctx, doc, num, NULL, NULL, 0);
+
+	return decoded;
+}
+
+/*
+ * Load stream object 'num' as a compressed buffer: the data is left in its
+ * image compression format where possible, with the format described in
+ * the returned buffer's params. 'worst_case' is passed through to
+ * pdf_load_image_stream().
+ *
+ * On error the compressed buffer is released with
+ * fz_drop_compressed_buffer(), the same call the inline-image loader in
+ * this file uses, rather than a bare fz_free(). Loading may already have
+ * stored JBIG2 globals in bc->params before the failure (see
+ * build_compression_params), and a bare free would leak them.
+ */
+fz_compressed_buffer *
+pdf_load_compressed_stream(fz_context *ctx, pdf_document *doc, int num, size_t worst_case)
+{
+	fz_compressed_buffer *bc = fz_new_compressed_buffer(ctx);
+
+	fz_try(ctx)
+	{
+		bc->buffer = pdf_load_image_stream(ctx, doc, num, &bc->params, NULL, worst_case);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_compressed_buffer(ctx, bc);
+		fz_rethrow(ctx);
+	}
+	return bc;
+}
+
+/*
+ * Open an array of stream objects as a single concatenated stream.
+ *
+ * Each element of 'list' is opened with pdf_open_stream() and pushed onto
+ * the concatenation. A part that fails to open is skipped with a warning,
+ * except for FZ_ERROR_TRYLATER and FZ_ERROR_SYSTEM, which release the
+ * concatenated stream and are rethrown.
+ */
+static fz_stream *
+pdf_open_object_array(fz_context *ctx, pdf_document *doc, pdf_obj *list)
+{
+	int count = pdf_array_len(ctx, list);
+	fz_stream *concat = fz_open_concat(ctx, count, 1);
+	int idx;
+
+	for (idx = 0; idx < count; idx++)
+	{
+		pdf_obj *part = pdf_array_get(ctx, list, idx);
+		fz_try(ctx)
+		{
+			fz_concat_push_drop(ctx, concat, pdf_open_stream(ctx, part));
+		}
+		fz_catch(ctx)
+		{
+			int code = fz_caught(ctx);
+			if (code == FZ_ERROR_TRYLATER || code == FZ_ERROR_SYSTEM)
+			{
+				fz_drop_stream(ctx, concat);
+				fz_rethrow(ctx);
+			}
+			fz_report_error(ctx);
+			fz_warn(ctx, "cannot load content stream part %d/%d", idx + 1, count);
+		}
+	}
+
+	return concat;
+}
+
+/*
+ * Open a content stream given either a single stream object or an array of
+ * stream objects.
+ *
+ * Arrays are opened as one concatenated stream. A single stream is opened
+ * with image-only filters disallowed. Anything else produces a warning and
+ * an empty stream.
+ */
+fz_stream *
+pdf_open_contents_stream(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
+{
+	int num;
+
+	if (pdf_is_array(ctx, obj))
+		return pdf_open_object_array(ctx, doc, obj);
+
+	num = pdf_to_num(ctx, obj);
+	if (!pdf_is_stream(ctx, obj))
+	{
+		fz_warn(ctx, "content stream is not a stream (%d 0 R)", num);
+		return fz_open_memory(ctx, (unsigned char *)"", 0);
+	}
+
+	return pdf_open_image_stream(ctx, doc, num, NULL, 0);
+}
+
+/*
+ * Load the raw (undecoded) data of the stream referenced by 'ref' into a
+ * buffer. Throws FZ_ERROR_FORMAT if 'ref' is not a stream.
+ */
+fz_buffer *pdf_load_raw_stream(fz_context *ctx, pdf_obj *ref)
+{
+	if (!pdf_is_stream(ctx, ref))
+		fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
+
+	return pdf_load_raw_stream_number(ctx, pdf_get_indirect_document(ctx, ref), pdf_to_num(ctx, ref));
+}
+
+/*
+ * Load the decoded data of the stream referenced by 'ref' into a buffer.
+ * Throws FZ_ERROR_FORMAT if 'ref' is not a stream.
+ */
+fz_buffer *pdf_load_stream(fz_context *ctx, pdf_obj *ref)
+{
+	if (!pdf_is_stream(ctx, ref))
+		fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
+
+	return pdf_load_stream_number(ctx, pdf_get_indirect_document(ctx, ref), pdf_to_num(ctx, ref));
+}
+
+/*
+ * Open the raw (undecoded) data of the stream referenced by 'ref'.
+ * Throws FZ_ERROR_FORMAT if 'ref' is not a stream.
+ */
+fz_stream *pdf_open_raw_stream(fz_context *ctx, pdf_obj *ref)
+{
+	if (!pdf_is_stream(ctx, ref))
+		fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
+
+	return pdf_open_raw_stream_number(ctx, pdf_get_indirect_document(ctx, ref), pdf_to_num(ctx, ref));
+}
+
+/*
+ * Open the decoded data of the stream referenced by 'ref'.
+ * Throws FZ_ERROR_FORMAT if 'ref' is not a stream.
+ */
+fz_stream *pdf_open_stream(fz_context *ctx, pdf_obj *ref)
+{
+	if (!pdf_is_stream(ctx, ref))
+		fz_throw(ctx, FZ_ERROR_FORMAT, "object is not a stream");
+
+	return pdf_open_stream_number(ctx, pdf_get_indirect_document(ctx, ref), pdf_to_num(ctx, ref));
+}
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children