Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/pdf/pdf-repair.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/pdf/pdf-repair.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,987 @@ +// Copyright (C) 2004-2025 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" +#include "pdf-imp.h" + +#include <string.h> + +/* Scan file for objects and reconstruct xref table */ + +struct entry +{ + int num; + int gen; + int64_t ofs; + int64_t stm_ofs; + int64_t stm_len; +}; + +typedef struct +{ + int max; + int len; + pdf_obj **roots; +} pdf_root_list; + +static void +add_root(fz_context *ctx, pdf_root_list *roots, pdf_obj *obj) +{ + if (roots->max == roots->len) + { + int new_max_roots = roots->max * 2; + if (new_max_roots == 0) + new_max_roots = 4; + roots->roots = fz_realloc(ctx, roots->roots, new_max_roots * sizeof(roots->roots[0])); + roots->max = new_max_roots; + } + roots->roots[roots->len] = pdf_keep_obj(ctx, obj); + roots->len++; +} + +static pdf_root_list * +fz_new_root_list(fz_context *ctx) +{ + return fz_malloc_struct(ctx, pdf_root_list); +} + +static void +pdf_drop_root_list(fz_context *ctx, pdf_root_list *roots) +{ + int i, n; + + if (roots == NULL) + return; + + n = roots->len; + for (i = 0; i < n; i++) + pdf_drop_obj(ctx, roots->roots[i]); + fz_free(ctx, roots->roots); + fz_free(ctx, roots); +} + +int +pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int64_t *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root) +{ + fz_stream *file = doc->file; + pdf_token tok; + int64_t stm_len; + int64_t local_ofs; + + if (tmpofs == NULL) + tmpofs = &local_ofs; + if (stmofsp == NULL) + stmofsp = &local_ofs; + + *stmofsp = 0; + if (stmlenp) + *stmlenp = -1; + + stm_len = 0; + + *tmpofs = fz_tell(ctx, file); + if (*tmpofs < 0) + fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); + + /* On entry to this function, we know that we've just seen + * '<int> <int> obj'. We expect the next thing we see to be a + * pdf object. Regardless of the type of thing we meet next + * we only need to fully parse it if it is a dictionary. */ + tok = pdf_lex(ctx, file, buf); + + /* Don't let a truncated object at EOF overwrite a good one */ + if (tok == PDF_TOK_EOF) + fz_throw(ctx, FZ_ERROR_SYNTAX, "truncated object"); + + if (tok == PDF_TOK_OPEN_DICT) + { + pdf_obj *obj, *dict = NULL; + + fz_try(ctx) + { + dict = pdf_parse_dict(ctx, doc, file, buf); + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + /* Don't let a broken object at EOF overwrite a good one */ + if (file->eof) + fz_rethrow(ctx); + /* Silently swallow the error */ + fz_report_error(ctx); + dict = pdf_new_dict(ctx, doc, 2); + } + + /* We must be careful not to try to resolve any indirections + * here. We have just read dict, so we know it to be a non + * indirected dictionary. Before we look at any values that + * we get back from looking up in it, we need to check they + * aren't indirected. */ + + if (encrypt || id || root) + { + obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); + if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef))) + { + if (encrypt) + { + obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)); + if (obj) + { + pdf_drop_obj(ctx, *encrypt); + *encrypt = pdf_keep_obj(ctx, obj); + } + } + + if (id) + { + obj = pdf_dict_get(ctx, dict, PDF_NAME(ID)); + if (obj) + { + pdf_drop_obj(ctx, *id); + *id = pdf_keep_obj(ctx, obj); + } + } + + if (root) + *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root))); + } + } + + obj = pdf_dict_get(ctx, dict, PDF_NAME(Length)); + if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj)) + stm_len = pdf_to_int64(ctx, obj); + + if (doc->file_reading_linearly && page) + { + obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); + if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page))) + { + pdf_drop_obj(ctx, *page); + *page = pdf_keep_obj(ctx, dict); + } + } + + pdf_drop_obj(ctx, dict); + } + + while ( tok != PDF_TOK_STREAM && + tok != PDF_TOK_ENDOBJ && + tok != PDF_TOK_ERROR && + tok != PDF_TOK_EOF && + tok != PDF_TOK_INT ) + { + *tmpofs = fz_tell(ctx, file); + if (*tmpofs < 0) + fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); + tok = pdf_lex(ctx, file, buf); + } + + if (tok == PDF_TOK_STREAM) + { + int c = fz_read_byte(ctx, file); + if (c == '\r') { + c = fz_peek_byte(ctx, file); + if (c == '\n') + fz_read_byte(ctx, file); + } + + *stmofsp = fz_tell(ctx, file); + if (*stmofsp < 0) + fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); + + if (stm_len > 0) + { + fz_seek(ctx, file, *stmofsp + stm_len, 0); + fz_try(ctx) + { + tok = pdf_lex(ctx, file, buf); + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "cannot find endstream token, falling back to scanning"); + } + if (tok == PDF_TOK_ENDSTREAM) + goto atobjend; + fz_seek(ctx, file, *stmofsp, 0); + } + + (void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9); + + while (memcmp(buf->scratch, "endstream", 9) != 0) + { + c = fz_read_byte(ctx, file); + if (c == EOF) + break; + memmove(&buf->scratch[0], &buf->scratch[1], 8); + buf->scratch[8] = c; + } + + if (stmlenp) + *stmlenp = fz_tell(ctx, file) - *stmofsp - 9; + +atobjend: + *tmpofs = fz_tell(ctx, file); + if (*tmpofs < 0) + fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); + tok = pdf_lex(ctx, file, buf); + if (tok != PDF_TOK_ENDOBJ) + fz_warn(ctx, "object missing 'endobj' token"); + else + { + /* Read another token as we always return the next one */ + *tmpofs = fz_tell(ctx, file); + if (*tmpofs < 0) + fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); + tok = pdf_lex(ctx, file, buf); + } + } + return tok; +} + +static int64_t +entry_offset(fz_context *ctx, pdf_document *doc, int num) +{ + pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, num); + + if (entry->type == 0 || entry->type == 'f') + return 0; + if (entry->type == 'n') + return entry->ofs; + assert(entry->type == 'o'); + + /* It must be in a stream. Return the entry of that stream. */ + entry = pdf_get_populating_xref_entry(ctx, doc, entry->ofs); + /* If it's NOT in a stream, then we'll invalidate this entry in a moment. + * For now, just return an illegal offset. */ + if (entry->type != 'n') + return -1; + + return entry->ofs; +} + +static void +pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num) +{ + pdf_obj *obj; + fz_stream *stm = NULL; + pdf_token tok; + int i, n, count; + pdf_lexbuf buf; + + fz_var(stm); + + pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL); + + fz_try(ctx) + { + obj = pdf_load_object(ctx, doc, stm_num); + + count = pdf_dict_get_int(ctx, obj, PDF_NAME(N)); + + pdf_drop_obj(ctx, obj); + + stm = pdf_open_stream_number(ctx, doc, stm_num); + + for (i = 0; i < count; i++) + { + pdf_xref_entry *entry; + int replace; + + tok = pdf_lex(ctx, stm, &buf); + if (tok != PDF_TOK_INT) + fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num); + + n = buf.i; + if (n < 0) + { + fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i); + continue; + } + else if (n >= PDF_MAX_OBJECT_NUMBER) + { + fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i); + continue; + } + + entry = pdf_get_populating_xref_entry(ctx, doc, n); + + /* Bug 708286: Do not allow an object from an ObjStm to override an object + * that isn't in an ObjStm that we've already read, that occurs after it + * in the file. */ + replace = 1; + if (entry->type != 0 && entry->type != 'f') + { + int64_t existing_entry_offset = entry_offset(ctx, doc, n); + + if (existing_entry_offset < 0) + { + /* The existing entry is invalid. Anything must be better than that! */ + } + else + { + int64_t this_entry_offset = entry_offset(ctx, doc, stm_num); + + if (existing_entry_offset > this_entry_offset) + replace = 0; + } + } + + if (replace) + { + entry->ofs = stm_num; + entry->gen = i; + entry->num = n; + entry->stm_ofs = 0; + pdf_drop_obj(ctx, entry->obj); + entry->obj = NULL; + entry->type = 'o'; + } + + tok = pdf_lex(ctx, stm, &buf); + if (tok != PDF_TOK_INT) + fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num); + } + } + fz_always(ctx) + { + fz_drop_stream(ctx, stm); + pdf_lexbuf_fin(ctx, &buf); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } +} + +static void +orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj) +{ + if (doc->orphans_count == doc->orphans_max) + { + int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32); + + fz_try(ctx) + { + doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*); + doc->orphans_max = new_max; + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, obj); + fz_rethrow(ctx); + } + } + doc->orphans[doc->orphans_count++] = obj; +} + +static int is_white(int c) +{ + return c == '\x00' || c == '\x09' || c == '\x0a' || c == '\x0c' || c == '\x0d' || c == '\x20'; +} + +static pdf_root_list * +pdf_repair_xref_base(fz_context *ctx, pdf_document *doc) +{ + pdf_obj *dict, *obj = NULL; + pdf_obj *length; + + pdf_obj *encrypt = NULL; + pdf_obj *id = NULL; + pdf_obj *info = NULL; + pdf_root_list *roots = NULL; + + struct entry *list = NULL; + int listlen; + int listcap; + int maxnum = 0; + + int num = 0; + int gen = 0; + int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0; + int64_t stm_len; + pdf_token tok; + int next; + int i; + size_t j, n; + int c; + pdf_lexbuf *buf = &doc->lexbuf.base; + + fz_var(encrypt); + fz_var(id); + fz_var(info); + fz_var(list); + fz_var(obj); + fz_var(roots); + + if (!doc->is_fdf) + fz_warn(ctx, "repairing PDF document"); + + if (doc->repair_attempted) + fz_throw(ctx, FZ_ERROR_FORMAT, "Repair failed already - not trying again"); + + doc->bias = 0; // reset bias! + + doc->repair_attempted = 1; + doc->repair_in_progress = 1; + + pdf_drop_page_tree_internal(ctx, doc); + doc->page_tree_broken = 0; + pdf_forget_xref(ctx, doc); + + fz_seek(ctx, doc->file, 0, 0); + + fz_try(ctx) + { + pdf_xref_entry *entry; + listlen = 0; + listcap = 1024; + list = fz_malloc_array(ctx, listcap, struct entry); + + roots = fz_new_root_list(ctx); + + /* look for '%PDF' version marker within first kilobyte of file */ + n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_minz(buf->size, 1024)); + + fz_seek(ctx, doc->file, 0, 0); + if (n >= 5) + { + for (j = 0; j < n - 5; j++) + { + if (memcmp(&buf->scratch[j], "%PDF-", 5) == 0 || memcmp(&buf->scratch[j], "%FDF-", 5) == 0) + { + fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */ + break; + } + } + } + + /* skip comment line after version marker since some generators + * forget to terminate the comment with a newline */ + c = fz_read_byte(ctx, doc->file); + while (c >= 0 && (c == ' ' || c == '%')) + c = fz_read_byte(ctx, doc->file); + if (c != EOF) + fz_unread_byte(ctx, doc->file); + + while (1) + { + tmpofs = fz_tell(ctx, doc->file); + if (tmpofs < 0) + fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); + + fz_try(ctx) + tok = pdf_lex_no_string(ctx, doc->file, buf); + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "skipping ahead to next token"); + do + c = fz_read_byte(ctx, doc->file); + while (c != EOF && !is_white(c)); + if (c == EOF) + tok = PDF_TOK_EOF; + else + continue; + } + + /* If we have the next token already, then we'll jump + * back here, rather than going through the top of + * the loop. */ + have_next_token: + + if (tok == PDF_TOK_INT) + { + if (buf->i < 0) + { + num = 0; + gen = 0; + continue; + } + numofs = genofs; + num = gen; + genofs = tmpofs; + gen = buf->i; + } + + else if (tok == PDF_TOK_OBJ) + { + pdf_obj *root = NULL; + + fz_try(ctx) + { + stm_len = 0; + stm_ofs = 0; + tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root); + if (root) + add_root(ctx, roots, root); + } + fz_always(ctx) + { + pdf_drop_obj(ctx, root); + } + fz_catch(ctx) + { + int errcode = fz_caught(ctx); + /* If we haven't seen a root yet, there is nothing + * we can do, but give up. Otherwise, we'll make + * do. */ + if (roots->len == 0 || + errcode == FZ_ERROR_TRYLATER || + errcode == FZ_ERROR_SYSTEM) + { + pdf_drop_root_list(ctx, roots); + roots = NULL; + fz_rethrow(ctx); + } + fz_report_error(ctx); + fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen); + break; + } + + if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER) + { + fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen); + goto have_next_token; + } + + gen = fz_clampi(gen, 0, 65535); + + if (listlen + 1 == listcap) + { + listcap = (listcap * 3) / 2; + list = fz_realloc_array(ctx, list, listcap, struct entry); + } + + list[listlen].num = num; + list[listlen].gen = gen; + list[listlen].ofs = numofs; + list[listlen].stm_ofs = stm_ofs; + list[listlen].stm_len = stm_len; + listlen ++; + + if (num > maxnum) + maxnum = num; + + goto have_next_token; + } + + /* If we find a dictionary it is probably the trailer, + * but could be a stream (or bogus) dictionary caused + * by a corrupt file. */ + else if (tok == PDF_TOK_OPEN_DICT) + { + pdf_obj *dictobj; + + fz_try(ctx) + { + dict = pdf_parse_dict(ctx, doc, doc->file, buf); + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + /* If this was the real trailer dict + * it was broken, in which case we are + * in trouble. Keep going though in + * case this was just a bogus dict. */ + fz_report_error(ctx); + continue; + } + + fz_try(ctx) + { + dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)); + if (dictobj) + { + pdf_drop_obj(ctx, encrypt); + encrypt = pdf_keep_obj(ctx, dictobj); + } + + dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID)); + if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)))) + { + pdf_drop_obj(ctx, id); + id = pdf_keep_obj(ctx, dictobj); + } + + dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root)); + if (dictobj) + add_root(ctx, roots, dictobj); + + dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info)); + if (dictobj) + { + pdf_drop_obj(ctx, info); + info = pdf_keep_obj(ctx, dictobj); + } + } + fz_always(ctx) + pdf_drop_obj(ctx, dict); + fz_catch(ctx) + fz_rethrow(ctx); + } + + else if (tok == PDF_TOK_EOF) + { + break; + } + + else + { + num = 0; + gen = 0; + } + } + + if (listlen == 0) + fz_throw(ctx, FZ_ERROR_FORMAT, "no objects found"); + + /* make xref reasonable */ + + /* + Dummy access to entry to assure sufficient space in the xref table + and avoid repeated reallocs in the loop + */ + /* Ensure that the first xref table is a 'solid' one from + * 0 to maxnum. */ + pdf_ensure_solid_xref(ctx, doc, maxnum); + + for (i = 1; i < maxnum; i++) + { + entry = pdf_get_populating_xref_entry(ctx, doc, i); + if (entry->obj != NULL) + continue; + entry->type = 'f'; + entry->ofs = 0; + entry->gen = 0; + entry->num = 0; + + entry->stm_ofs = 0; + } + + for (i = 0; i < listlen; i++) + { + entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num); + entry->type = 'n'; + entry->ofs = list[i].ofs; + entry->gen = list[i].gen; + entry->num = list[i].num; + + entry->stm_ofs = list[i].stm_ofs; + + /* correct stream length for unencrypted documents */ + if (!encrypt && list[i].stm_len >= 0) + { + pdf_obj *old_obj = NULL; + dict = pdf_load_object(ctx, doc, list[i].num); + + fz_try(ctx) + { + length = pdf_new_int(ctx, list[i].stm_len); + pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj); + if (old_obj) + orphan_object(ctx, doc, old_obj); + } + fz_always(ctx) + pdf_drop_obj(ctx, dict); + fz_catch(ctx) + fz_rethrow(ctx); + } + } + + entry = pdf_get_populating_xref_entry(ctx, doc, 0); + entry->type = 'f'; + entry->ofs = 0; + entry->gen = 65535; + entry->num = 0; + entry->stm_ofs = 0; + + next = 0; + for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--) + { + entry = pdf_get_populating_xref_entry(ctx, doc, i); + if (entry->type == 'f') + { + entry->ofs = next; + if (entry->gen < 65535) + entry->gen ++; + next = i; + } + } + + /* create a repaired trailer, Root will be added later */ + + obj = pdf_new_dict(ctx, doc, 5); + /* During repair there is only a single xref section */ + pdf_set_populating_xref_trailer(ctx, doc, obj); + pdf_drop_obj(ctx, obj); + obj = NULL; + + pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), maxnum + 1); + + if (info) + { + pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info); + pdf_drop_obj(ctx, info); + info = NULL; + } + + if (encrypt) + { + if (pdf_is_indirect(ctx, encrypt)) + { + /* create new reference with non-NULL xref pointer */ + obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt)); + pdf_drop_obj(ctx, encrypt); + encrypt = obj; + obj = NULL; + } + pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt); + pdf_drop_obj(ctx, encrypt); + encrypt = NULL; + } + + if (id) + { + if (pdf_is_indirect(ctx, id)) + { + /* create new reference with non-NULL xref pointer */ + obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id)); + pdf_drop_obj(ctx, id); + id = obj; + obj = NULL; + } + pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id); + pdf_drop_obj(ctx, id); + id = NULL; + } + } + fz_always(ctx) + { + fz_free(ctx, list); + doc->repair_in_progress = 0; + } + fz_catch(ctx) + { + pdf_drop_root_list(ctx, roots); + pdf_drop_obj(ctx, encrypt); + pdf_drop_obj(ctx, id); + pdf_drop_obj(ctx, obj); + pdf_drop_obj(ctx, info); + if (ctx->throw_on_repair) + fz_throw(ctx, FZ_ERROR_REPAIRED, "Error during repair attempt"); + fz_rethrow(ctx); + } + + if (ctx->throw_on_repair) + { + pdf_drop_root_list(ctx, roots); + fz_throw(ctx, FZ_ERROR_REPAIRED, "File repaired"); + } + + return roots; +} + +static void +pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc) +{ + pdf_obj *dict; + int i; + int xref_len = pdf_xref_len(ctx, doc); + + for (i = 0; i < xref_len; i++) + { + pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i); + + if (entry->stm_ofs) + { + dict = pdf_load_object(ctx, doc, i); + fz_try(ctx) + { + if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm))) + pdf_repair_obj_stm(ctx, doc, i); + } + fz_always(ctx) + pdf_drop_obj(ctx, dict); + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i); + } + } + } + + /* Ensure that streamed objects reside inside a known non-streamed object */ + for (i = 0; i < xref_len; i++) + { + pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i); + + if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n') + { + fz_warn(ctx, "invalid reference to non-object-stream: %d, assuming %d 0 R is a freed object", (int)entry->ofs, i); + entry->type = 'f'; + } + } +} + +static void +pdf_repair_roots(fz_context *ctx, pdf_document *doc, pdf_root_list *roots) +{ + int i; + + for (i = roots->len-1; i >= 0; i--) + { + if (pdf_is_indirect(ctx, roots->roots[i]) && pdf_is_dict(ctx, roots->roots[i])) + { + pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots->roots[i]); + break; + } + } +} + +static void +pdf_repair_trailer(fz_context *ctx, pdf_document *doc) +{ + int hasroot, hasinfo; + pdf_obj *obj, *nobj; + pdf_obj *dict = NULL; + int i; + + int xref_len = pdf_xref_len(ctx, doc); + + hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL); + hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL); + + fz_var(dict); + + fz_try(ctx) + { + /* Scan from the end so we have a better chance of finding + * newer objects if there are multiple instances of Info and + * Root objects. + */ + for (i = xref_len - 1; i > 0 && (!hasinfo || !hasroot); --i) + { + pdf_xref_entry *entry = pdf_get_xref_entry_no_null(ctx, doc, i); + if (entry->type == 0 || entry->type == 'f') + continue; + + fz_try(ctx) + { + dict = pdf_load_object(ctx, doc, i); + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); + fz_report_error(ctx); + fz_warn(ctx, "ignoring broken object (%d 0 R)", i); + continue; + } + + if (!hasroot) + { + obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); + if (obj == PDF_NAME(Catalog)) + { + nobj = pdf_new_indirect(ctx, doc, i, 0); + pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj); + hasroot = 1; + } + } + + if (!hasinfo) + { + if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer))) + { + nobj = pdf_new_indirect(ctx, doc, i, 0); + pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj); + hasinfo = 1; + } + } + + pdf_drop_obj(ctx, dict); + dict = NULL; + } + } + fz_always(ctx) + { + /* ensure that strings are not used in their repaired, non-decrypted form */ + if (doc->crypt) + { + pdf_crypt *tmp; + pdf_clear_xref(ctx, doc); + + /* ensure that Encryption dictionary and ID are cached without decryption, + otherwise a decrypted Encryption dictionary and ID may be used when saving + the PDF causing it to be inconsistent (since strings/streams are encrypted + with the actual encryption key, not the decrypted encryption key). */ + tmp = doc->crypt; + doc->crypt = NULL; + fz_try(ctx) + { + (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt))); + (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID))); + } + fz_always(ctx) + doc->crypt = tmp; + fz_catch(ctx) + { + fz_rethrow(ctx); + } + } + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, dict); + fz_rethrow(ctx); + } +} + +void pdf_repair_xref_aux(fz_context *ctx, pdf_document *doc, void (*mid)(fz_context *ctx, pdf_document *doc)) +{ + pdf_root_list *roots = NULL; + + fz_var(roots); + + fz_try(ctx) + { + roots = pdf_repair_xref_base(ctx, doc); + if (mid) + mid(ctx, doc); + pdf_repair_obj_stms(ctx, doc); + pdf_repair_roots(ctx, doc, roots); + pdf_repair_trailer(ctx, doc); + } + fz_always(ctx) + pdf_drop_root_list(ctx, roots); + fz_catch(ctx) + fz_rethrow(ctx); +}
