Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/pdf/pdf-repair.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
#include "mupdf/fitz.h"
#include "pdf-imp.h"

#include <stdint.h>
#include <string.h>
| 27 | |
| 28 /* Scan file for objects and reconstruct xref table */ | |
| 29 | |
/*
	One object found while scanning the file during repair.
	Filled in by pdf_repair_xref_base and later copied into the
	rebuilt xref table.
*/
struct entry
{
	int num;		/* object number */
	int gen;		/* generation number (clamped to 0..65535 by the scanner) */
	int64_t ofs;		/* file offset of the object number token */
	int64_t stm_ofs;	/* offset of the stream data, 0 if none was seen */
	int64_t stm_len;	/* stream length found by scanning, or -1 */
};
| 38 | |
| 39 typedef struct | |
| 40 { | |
| 41 int max; | |
| 42 int len; | |
| 43 pdf_obj **roots; | |
| 44 } pdf_root_list; | |
| 45 | |
| 46 static void | |
| 47 add_root(fz_context *ctx, pdf_root_list *roots, pdf_obj *obj) | |
| 48 { | |
| 49 if (roots->max == roots->len) | |
| 50 { | |
| 51 int new_max_roots = roots->max * 2; | |
| 52 if (new_max_roots == 0) | |
| 53 new_max_roots = 4; | |
| 54 roots->roots = fz_realloc(ctx, roots->roots, new_max_roots * sizeof(roots->roots[0])); | |
| 55 roots->max = new_max_roots; | |
| 56 } | |
| 57 roots->roots[roots->len] = pdf_keep_obj(ctx, obj); | |
| 58 roots->len++; | |
| 59 } | |
| 60 | |
| 61 static pdf_root_list * | |
| 62 fz_new_root_list(fz_context *ctx) | |
| 63 { | |
| 64 return fz_malloc_struct(ctx, pdf_root_list); | |
| 65 } | |
| 66 | |
| 67 static void | |
| 68 pdf_drop_root_list(fz_context *ctx, pdf_root_list *roots) | |
| 69 { | |
| 70 int i, n; | |
| 71 | |
| 72 if (roots == NULL) | |
| 73 return; | |
| 74 | |
| 75 n = roots->len; | |
| 76 for (i = 0; i < n; i++) | |
| 77 pdf_drop_obj(ctx, roots->roots[i]); | |
| 78 fz_free(ctx, roots->roots); | |
| 79 fz_free(ctx, roots); | |
| 80 } | |
| 81 | |
| 82 int | |
| 83 pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int64_t *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root) | |
| 84 { | |
| 85 fz_stream *file = doc->file; | |
| 86 pdf_token tok; | |
| 87 int64_t stm_len; | |
| 88 int64_t local_ofs; | |
| 89 | |
| 90 if (tmpofs == NULL) | |
| 91 tmpofs = &local_ofs; | |
| 92 if (stmofsp == NULL) | |
| 93 stmofsp = &local_ofs; | |
| 94 | |
| 95 *stmofsp = 0; | |
| 96 if (stmlenp) | |
| 97 *stmlenp = -1; | |
| 98 | |
| 99 stm_len = 0; | |
| 100 | |
| 101 *tmpofs = fz_tell(ctx, file); | |
| 102 if (*tmpofs < 0) | |
| 103 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); | |
| 104 | |
| 105 /* On entry to this function, we know that we've just seen | |
| 106 * '<int> <int> obj'. We expect the next thing we see to be a | |
| 107 * pdf object. Regardless of the type of thing we meet next | |
| 108 * we only need to fully parse it if it is a dictionary. */ | |
| 109 tok = pdf_lex(ctx, file, buf); | |
| 110 | |
| 111 /* Don't let a truncated object at EOF overwrite a good one */ | |
| 112 if (tok == PDF_TOK_EOF) | |
| 113 fz_throw(ctx, FZ_ERROR_SYNTAX, "truncated object"); | |
| 114 | |
| 115 if (tok == PDF_TOK_OPEN_DICT) | |
| 116 { | |
| 117 pdf_obj *obj, *dict = NULL; | |
| 118 | |
| 119 fz_try(ctx) | |
| 120 { | |
| 121 dict = pdf_parse_dict(ctx, doc, file, buf); | |
| 122 } | |
| 123 fz_catch(ctx) | |
| 124 { | |
| 125 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); | |
| 126 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 127 /* Don't let a broken object at EOF overwrite a good one */ | |
| 128 if (file->eof) | |
| 129 fz_rethrow(ctx); | |
| 130 /* Silently swallow the error */ | |
| 131 fz_report_error(ctx); | |
| 132 dict = pdf_new_dict(ctx, doc, 2); | |
| 133 } | |
| 134 | |
| 135 /* We must be careful not to try to resolve any indirections | |
| 136 * here. We have just read dict, so we know it to be a non | |
| 137 * indirected dictionary. Before we look at any values that | |
| 138 * we get back from looking up in it, we need to check they | |
| 139 * aren't indirected. */ | |
| 140 | |
| 141 if (encrypt || id || root) | |
| 142 { | |
| 143 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); | |
| 144 if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef))) | |
| 145 { | |
| 146 if (encrypt) | |
| 147 { | |
| 148 obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)); | |
| 149 if (obj) | |
| 150 { | |
| 151 pdf_drop_obj(ctx, *encrypt); | |
| 152 *encrypt = pdf_keep_obj(ctx, obj); | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 if (id) | |
| 157 { | |
| 158 obj = pdf_dict_get(ctx, dict, PDF_NAME(ID)); | |
| 159 if (obj) | |
| 160 { | |
| 161 pdf_drop_obj(ctx, *id); | |
| 162 *id = pdf_keep_obj(ctx, obj); | |
| 163 } | |
| 164 } | |
| 165 | |
| 166 if (root) | |
| 167 *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root))); | |
| 168 } | |
| 169 } | |
| 170 | |
| 171 obj = pdf_dict_get(ctx, dict, PDF_NAME(Length)); | |
| 172 if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj)) | |
| 173 stm_len = pdf_to_int64(ctx, obj); | |
| 174 | |
| 175 if (doc->file_reading_linearly && page) | |
| 176 { | |
| 177 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); | |
| 178 if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page))) | |
| 179 { | |
| 180 pdf_drop_obj(ctx, *page); | |
| 181 *page = pdf_keep_obj(ctx, dict); | |
| 182 } | |
| 183 } | |
| 184 | |
| 185 pdf_drop_obj(ctx, dict); | |
| 186 } | |
| 187 | |
| 188 while ( tok != PDF_TOK_STREAM && | |
| 189 tok != PDF_TOK_ENDOBJ && | |
| 190 tok != PDF_TOK_ERROR && | |
| 191 tok != PDF_TOK_EOF && | |
| 192 tok != PDF_TOK_INT ) | |
| 193 { | |
| 194 *tmpofs = fz_tell(ctx, file); | |
| 195 if (*tmpofs < 0) | |
| 196 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); | |
| 197 tok = pdf_lex(ctx, file, buf); | |
| 198 } | |
| 199 | |
| 200 if (tok == PDF_TOK_STREAM) | |
| 201 { | |
| 202 int c = fz_read_byte(ctx, file); | |
| 203 if (c == '\r') { | |
| 204 c = fz_peek_byte(ctx, file); | |
| 205 if (c == '\n') | |
| 206 fz_read_byte(ctx, file); | |
| 207 } | |
| 208 | |
| 209 *stmofsp = fz_tell(ctx, file); | |
| 210 if (*stmofsp < 0) | |
| 211 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); | |
| 212 | |
| 213 if (stm_len > 0) | |
| 214 { | |
| 215 fz_seek(ctx, file, *stmofsp + stm_len, 0); | |
| 216 fz_try(ctx) | |
| 217 { | |
| 218 tok = pdf_lex(ctx, file, buf); | |
| 219 } | |
| 220 fz_catch(ctx) | |
| 221 { | |
| 222 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); | |
| 223 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 224 fz_report_error(ctx); | |
| 225 fz_warn(ctx, "cannot find endstream token, falling back to scanning"); | |
| 226 } | |
| 227 if (tok == PDF_TOK_ENDSTREAM) | |
| 228 goto atobjend; | |
| 229 fz_seek(ctx, file, *stmofsp, 0); | |
| 230 } | |
| 231 | |
| 232 (void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9); | |
| 233 | |
| 234 while (memcmp(buf->scratch, "endstream", 9) != 0) | |
| 235 { | |
| 236 c = fz_read_byte(ctx, file); | |
| 237 if (c == EOF) | |
| 238 break; | |
| 239 memmove(&buf->scratch[0], &buf->scratch[1], 8); | |
| 240 buf->scratch[8] = c; | |
| 241 } | |
| 242 | |
| 243 if (stmlenp) | |
| 244 *stmlenp = fz_tell(ctx, file) - *stmofsp - 9; | |
| 245 | |
| 246 atobjend: | |
| 247 *tmpofs = fz_tell(ctx, file); | |
| 248 if (*tmpofs < 0) | |
| 249 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); | |
| 250 tok = pdf_lex(ctx, file, buf); | |
| 251 if (tok != PDF_TOK_ENDOBJ) | |
| 252 fz_warn(ctx, "object missing 'endobj' token"); | |
| 253 else | |
| 254 { | |
| 255 /* Read another token as we always return the next one */ | |
| 256 *tmpofs = fz_tell(ctx, file); | |
| 257 if (*tmpofs < 0) | |
| 258 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); | |
| 259 tok = pdf_lex(ctx, file, buf); | |
| 260 } | |
| 261 } | |
| 262 return tok; | |
| 263 } | |
| 264 | |
| 265 static int64_t | |
| 266 entry_offset(fz_context *ctx, pdf_document *doc, int num) | |
| 267 { | |
| 268 pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, num); | |
| 269 | |
| 270 if (entry->type == 0 || entry->type == 'f') | |
| 271 return 0; | |
| 272 if (entry->type == 'n') | |
| 273 return entry->ofs; | |
| 274 assert(entry->type == 'o'); | |
| 275 | |
| 276 /* It must be in a stream. Return the entry of that stream. */ | |
| 277 entry = pdf_get_populating_xref_entry(ctx, doc, entry->ofs); | |
| 278 /* If it's NOT in a stream, then we'll invalidate this entry in a moment. | |
| 279 * For now, just return an illegal offset. */ | |
| 280 if (entry->type != 'n') | |
| 281 return -1; | |
| 282 | |
| 283 return entry->ofs; | |
| 284 } | |
| 285 | |
| 286 static void | |
| 287 pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num) | |
| 288 { | |
| 289 pdf_obj *obj; | |
| 290 fz_stream *stm = NULL; | |
| 291 pdf_token tok; | |
| 292 int i, n, count; | |
| 293 pdf_lexbuf buf; | |
| 294 | |
| 295 fz_var(stm); | |
| 296 | |
| 297 pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL); | |
| 298 | |
| 299 fz_try(ctx) | |
| 300 { | |
| 301 obj = pdf_load_object(ctx, doc, stm_num); | |
| 302 | |
| 303 count = pdf_dict_get_int(ctx, obj, PDF_NAME(N)); | |
| 304 | |
| 305 pdf_drop_obj(ctx, obj); | |
| 306 | |
| 307 stm = pdf_open_stream_number(ctx, doc, stm_num); | |
| 308 | |
| 309 for (i = 0; i < count; i++) | |
| 310 { | |
| 311 pdf_xref_entry *entry; | |
| 312 int replace; | |
| 313 | |
| 314 tok = pdf_lex(ctx, stm, &buf); | |
| 315 if (tok != PDF_TOK_INT) | |
| 316 fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num); | |
| 317 | |
| 318 n = buf.i; | |
| 319 if (n < 0) | |
| 320 { | |
| 321 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i); | |
| 322 continue; | |
| 323 } | |
| 324 else if (n >= PDF_MAX_OBJECT_NUMBER) | |
| 325 { | |
| 326 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i); | |
| 327 continue; | |
| 328 } | |
| 329 | |
| 330 entry = pdf_get_populating_xref_entry(ctx, doc, n); | |
| 331 | |
| 332 /* Bug 708286: Do not allow an object from an ObjStm to override an object | |
| 333 * that isn't in an ObjStm that we've already read, that occurs after it | |
| 334 * in the file. */ | |
| 335 replace = 1; | |
| 336 if (entry->type != 0 && entry->type != 'f') | |
| 337 { | |
| 338 int64_t existing_entry_offset = entry_offset(ctx, doc, n); | |
| 339 | |
| 340 if (existing_entry_offset < 0) | |
| 341 { | |
| 342 /* The existing entry is invalid. Anything must be better than that! */ | |
| 343 } | |
| 344 else | |
| 345 { | |
| 346 int64_t this_entry_offset = entry_offset(ctx, doc, stm_num); | |
| 347 | |
| 348 if (existing_entry_offset > this_entry_offset) | |
| 349 replace = 0; | |
| 350 } | |
| 351 } | |
| 352 | |
| 353 if (replace) | |
| 354 { | |
| 355 entry->ofs = stm_num; | |
| 356 entry->gen = i; | |
| 357 entry->num = n; | |
| 358 entry->stm_ofs = 0; | |
| 359 pdf_drop_obj(ctx, entry->obj); | |
| 360 entry->obj = NULL; | |
| 361 entry->type = 'o'; | |
| 362 } | |
| 363 | |
| 364 tok = pdf_lex(ctx, stm, &buf); | |
| 365 if (tok != PDF_TOK_INT) | |
| 366 fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num); | |
| 367 } | |
| 368 } | |
| 369 fz_always(ctx) | |
| 370 { | |
| 371 fz_drop_stream(ctx, stm); | |
| 372 pdf_lexbuf_fin(ctx, &buf); | |
| 373 } | |
| 374 fz_catch(ctx) | |
| 375 { | |
| 376 fz_rethrow(ctx); | |
| 377 } | |
| 378 } | |
| 379 | |
| 380 static void | |
| 381 orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj) | |
| 382 { | |
| 383 if (doc->orphans_count == doc->orphans_max) | |
| 384 { | |
| 385 int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32); | |
| 386 | |
| 387 fz_try(ctx) | |
| 388 { | |
| 389 doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*); | |
| 390 doc->orphans_max = new_max; | |
| 391 } | |
| 392 fz_catch(ctx) | |
| 393 { | |
| 394 pdf_drop_obj(ctx, obj); | |
| 395 fz_rethrow(ctx); | |
| 396 } | |
| 397 } | |
| 398 doc->orphans[doc->orphans_count++] = obj; | |
| 399 } | |
| 400 | |
/*
	Return 1 if c is one of NUL, TAB, LF, FF, CR or SPACE, else 0.
*/
static int is_white(int c)
{
	switch (c)
	{
	case '\x00':
	case '\x09':
	case '\x0a':
	case '\x0c':
	case '\x0d':
	case '\x20':
		return 1;
	default:
		return 0;
	}
}
| 405 | |
| 406 static pdf_root_list * | |
| 407 pdf_repair_xref_base(fz_context *ctx, pdf_document *doc) | |
| 408 { | |
| 409 pdf_obj *dict, *obj = NULL; | |
| 410 pdf_obj *length; | |
| 411 | |
| 412 pdf_obj *encrypt = NULL; | |
| 413 pdf_obj *id = NULL; | |
| 414 pdf_obj *info = NULL; | |
| 415 pdf_root_list *roots = NULL; | |
| 416 | |
| 417 struct entry *list = NULL; | |
| 418 int listlen; | |
| 419 int listcap; | |
| 420 int maxnum = 0; | |
| 421 | |
| 422 int num = 0; | |
| 423 int gen = 0; | |
| 424 int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0; | |
| 425 int64_t stm_len; | |
| 426 pdf_token tok; | |
| 427 int next; | |
| 428 int i; | |
| 429 size_t j, n; | |
| 430 int c; | |
| 431 pdf_lexbuf *buf = &doc->lexbuf.base; | |
| 432 | |
| 433 fz_var(encrypt); | |
| 434 fz_var(id); | |
| 435 fz_var(info); | |
| 436 fz_var(list); | |
| 437 fz_var(obj); | |
| 438 fz_var(roots); | |
| 439 | |
| 440 if (!doc->is_fdf) | |
| 441 fz_warn(ctx, "repairing PDF document"); | |
| 442 | |
| 443 if (doc->repair_attempted) | |
| 444 fz_throw(ctx, FZ_ERROR_FORMAT, "Repair failed already - not trying again"); | |
| 445 | |
| 446 doc->bias = 0; // reset bias! | |
| 447 | |
| 448 doc->repair_attempted = 1; | |
| 449 doc->repair_in_progress = 1; | |
| 450 | |
| 451 pdf_drop_page_tree_internal(ctx, doc); | |
| 452 doc->page_tree_broken = 0; | |
| 453 pdf_forget_xref(ctx, doc); | |
| 454 | |
| 455 fz_seek(ctx, doc->file, 0, 0); | |
| 456 | |
| 457 fz_try(ctx) | |
| 458 { | |
| 459 pdf_xref_entry *entry; | |
| 460 listlen = 0; | |
| 461 listcap = 1024; | |
| 462 list = fz_malloc_array(ctx, listcap, struct entry); | |
| 463 | |
| 464 roots = fz_new_root_list(ctx); | |
| 465 | |
| 466 /* look for '%PDF' version marker within first kilobyte of file */ | |
| 467 n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_minz(buf->size, 1024)); | |
| 468 | |
| 469 fz_seek(ctx, doc->file, 0, 0); | |
| 470 if (n >= 5) | |
| 471 { | |
| 472 for (j = 0; j < n - 5; j++) | |
| 473 { | |
| 474 if (memcmp(&buf->scratch[j], "%PDF-", 5) == 0 || memcmp(&buf->scratch[j], "%FDF-", 5) == 0) | |
| 475 { | |
| 476 fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */ | |
| 477 break; | |
| 478 } | |
| 479 } | |
| 480 } | |
| 481 | |
| 482 /* skip comment line after version marker since some generators | |
| 483 * forget to terminate the comment with a newline */ | |
| 484 c = fz_read_byte(ctx, doc->file); | |
| 485 while (c >= 0 && (c == ' ' || c == '%')) | |
| 486 c = fz_read_byte(ctx, doc->file); | |
| 487 if (c != EOF) | |
| 488 fz_unread_byte(ctx, doc->file); | |
| 489 | |
| 490 while (1) | |
| 491 { | |
| 492 tmpofs = fz_tell(ctx, doc->file); | |
| 493 if (tmpofs < 0) | |
| 494 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); | |
| 495 | |
| 496 fz_try(ctx) | |
| 497 tok = pdf_lex_no_string(ctx, doc->file, buf); | |
| 498 fz_catch(ctx) | |
| 499 { | |
| 500 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); | |
| 501 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 502 fz_report_error(ctx); | |
| 503 fz_warn(ctx, "skipping ahead to next token"); | |
| 504 do | |
| 505 c = fz_read_byte(ctx, doc->file); | |
| 506 while (c != EOF && !is_white(c)); | |
| 507 if (c == EOF) | |
| 508 tok = PDF_TOK_EOF; | |
| 509 else | |
| 510 continue; | |
| 511 } | |
| 512 | |
| 513 /* If we have the next token already, then we'll jump | |
| 514 * back here, rather than going through the top of | |
| 515 * the loop. */ | |
| 516 have_next_token: | |
| 517 | |
| 518 if (tok == PDF_TOK_INT) | |
| 519 { | |
| 520 if (buf->i < 0) | |
| 521 { | |
| 522 num = 0; | |
| 523 gen = 0; | |
| 524 continue; | |
| 525 } | |
| 526 numofs = genofs; | |
| 527 num = gen; | |
| 528 genofs = tmpofs; | |
| 529 gen = buf->i; | |
| 530 } | |
| 531 | |
| 532 else if (tok == PDF_TOK_OBJ) | |
| 533 { | |
| 534 pdf_obj *root = NULL; | |
| 535 | |
| 536 fz_try(ctx) | |
| 537 { | |
| 538 stm_len = 0; | |
| 539 stm_ofs = 0; | |
| 540 tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root); | |
| 541 if (root) | |
| 542 add_root(ctx, roots, root); | |
| 543 } | |
| 544 fz_always(ctx) | |
| 545 { | |
| 546 pdf_drop_obj(ctx, root); | |
| 547 } | |
| 548 fz_catch(ctx) | |
| 549 { | |
| 550 int errcode = fz_caught(ctx); | |
| 551 /* If we haven't seen a root yet, there is nothing | |
| 552 * we can do, but give up. Otherwise, we'll make | |
| 553 * do. */ | |
| 554 if (roots->len == 0 || | |
| 555 errcode == FZ_ERROR_TRYLATER || | |
| 556 errcode == FZ_ERROR_SYSTEM) | |
| 557 { | |
| 558 pdf_drop_root_list(ctx, roots); | |
| 559 roots = NULL; | |
| 560 fz_rethrow(ctx); | |
| 561 } | |
| 562 fz_report_error(ctx); | |
| 563 fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen); | |
| 564 break; | |
| 565 } | |
| 566 | |
| 567 if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER) | |
| 568 { | |
| 569 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen); | |
| 570 goto have_next_token; | |
| 571 } | |
| 572 | |
| 573 gen = fz_clampi(gen, 0, 65535); | |
| 574 | |
| 575 if (listlen + 1 == listcap) | |
| 576 { | |
| 577 listcap = (listcap * 3) / 2; | |
| 578 list = fz_realloc_array(ctx, list, listcap, struct entry); | |
| 579 } | |
| 580 | |
| 581 list[listlen].num = num; | |
| 582 list[listlen].gen = gen; | |
| 583 list[listlen].ofs = numofs; | |
| 584 list[listlen].stm_ofs = stm_ofs; | |
| 585 list[listlen].stm_len = stm_len; | |
| 586 listlen ++; | |
| 587 | |
| 588 if (num > maxnum) | |
| 589 maxnum = num; | |
| 590 | |
| 591 goto have_next_token; | |
| 592 } | |
| 593 | |
| 594 /* If we find a dictionary it is probably the trailer, | |
| 595 * but could be a stream (or bogus) dictionary caused | |
| 596 * by a corrupt file. */ | |
| 597 else if (tok == PDF_TOK_OPEN_DICT) | |
| 598 { | |
| 599 pdf_obj *dictobj; | |
| 600 | |
| 601 fz_try(ctx) | |
| 602 { | |
| 603 dict = pdf_parse_dict(ctx, doc, doc->file, buf); | |
| 604 } | |
| 605 fz_catch(ctx) | |
| 606 { | |
| 607 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); | |
| 608 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 609 /* If this was the real trailer dict | |
| 610 * it was broken, in which case we are | |
| 611 * in trouble. Keep going though in | |
| 612 * case this was just a bogus dict. */ | |
| 613 fz_report_error(ctx); | |
| 614 continue; | |
| 615 } | |
| 616 | |
| 617 fz_try(ctx) | |
| 618 { | |
| 619 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)); | |
| 620 if (dictobj) | |
| 621 { | |
| 622 pdf_drop_obj(ctx, encrypt); | |
| 623 encrypt = pdf_keep_obj(ctx, dictobj); | |
| 624 } | |
| 625 | |
| 626 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID)); | |
| 627 if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)))) | |
| 628 { | |
| 629 pdf_drop_obj(ctx, id); | |
| 630 id = pdf_keep_obj(ctx, dictobj); | |
| 631 } | |
| 632 | |
| 633 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root)); | |
| 634 if (dictobj) | |
| 635 add_root(ctx, roots, dictobj); | |
| 636 | |
| 637 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info)); | |
| 638 if (dictobj) | |
| 639 { | |
| 640 pdf_drop_obj(ctx, info); | |
| 641 info = pdf_keep_obj(ctx, dictobj); | |
| 642 } | |
| 643 } | |
| 644 fz_always(ctx) | |
| 645 pdf_drop_obj(ctx, dict); | |
| 646 fz_catch(ctx) | |
| 647 fz_rethrow(ctx); | |
| 648 } | |
| 649 | |
| 650 else if (tok == PDF_TOK_EOF) | |
| 651 { | |
| 652 break; | |
| 653 } | |
| 654 | |
| 655 else | |
| 656 { | |
| 657 num = 0; | |
| 658 gen = 0; | |
| 659 } | |
| 660 } | |
| 661 | |
| 662 if (listlen == 0) | |
| 663 fz_throw(ctx, FZ_ERROR_FORMAT, "no objects found"); | |
| 664 | |
| 665 /* make xref reasonable */ | |
| 666 | |
| 667 /* | |
| 668 Dummy access to entry to assure sufficient space in the xref table | |
| 669 and avoid repeated reallocs in the loop | |
| 670 */ | |
| 671 /* Ensure that the first xref table is a 'solid' one from | |
| 672 * 0 to maxnum. */ | |
| 673 pdf_ensure_solid_xref(ctx, doc, maxnum); | |
| 674 | |
| 675 for (i = 1; i < maxnum; i++) | |
| 676 { | |
| 677 entry = pdf_get_populating_xref_entry(ctx, doc, i); | |
| 678 if (entry->obj != NULL) | |
| 679 continue; | |
| 680 entry->type = 'f'; | |
| 681 entry->ofs = 0; | |
| 682 entry->gen = 0; | |
| 683 entry->num = 0; | |
| 684 | |
| 685 entry->stm_ofs = 0; | |
| 686 } | |
| 687 | |
| 688 for (i = 0; i < listlen; i++) | |
| 689 { | |
| 690 entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num); | |
| 691 entry->type = 'n'; | |
| 692 entry->ofs = list[i].ofs; | |
| 693 entry->gen = list[i].gen; | |
| 694 entry->num = list[i].num; | |
| 695 | |
| 696 entry->stm_ofs = list[i].stm_ofs; | |
| 697 | |
| 698 /* correct stream length for unencrypted documents */ | |
| 699 if (!encrypt && list[i].stm_len >= 0) | |
| 700 { | |
| 701 pdf_obj *old_obj = NULL; | |
| 702 dict = pdf_load_object(ctx, doc, list[i].num); | |
| 703 | |
| 704 fz_try(ctx) | |
| 705 { | |
| 706 length = pdf_new_int(ctx, list[i].stm_len); | |
| 707 pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj); | |
| 708 if (old_obj) | |
| 709 orphan_object(ctx, doc, old_obj); | |
| 710 } | |
| 711 fz_always(ctx) | |
| 712 pdf_drop_obj(ctx, dict); | |
| 713 fz_catch(ctx) | |
| 714 fz_rethrow(ctx); | |
| 715 } | |
| 716 } | |
| 717 | |
| 718 entry = pdf_get_populating_xref_entry(ctx, doc, 0); | |
| 719 entry->type = 'f'; | |
| 720 entry->ofs = 0; | |
| 721 entry->gen = 65535; | |
| 722 entry->num = 0; | |
| 723 entry->stm_ofs = 0; | |
| 724 | |
| 725 next = 0; | |
| 726 for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--) | |
| 727 { | |
| 728 entry = pdf_get_populating_xref_entry(ctx, doc, i); | |
| 729 if (entry->type == 'f') | |
| 730 { | |
| 731 entry->ofs = next; | |
| 732 if (entry->gen < 65535) | |
| 733 entry->gen ++; | |
| 734 next = i; | |
| 735 } | |
| 736 } | |
| 737 | |
| 738 /* create a repaired trailer, Root will be added later */ | |
| 739 | |
| 740 obj = pdf_new_dict(ctx, doc, 5); | |
| 741 /* During repair there is only a single xref section */ | |
| 742 pdf_set_populating_xref_trailer(ctx, doc, obj); | |
| 743 pdf_drop_obj(ctx, obj); | |
| 744 obj = NULL; | |
| 745 | |
| 746 pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), maxnum + 1); | |
| 747 | |
| 748 if (info) | |
| 749 { | |
| 750 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info); | |
| 751 pdf_drop_obj(ctx, info); | |
| 752 info = NULL; | |
| 753 } | |
| 754 | |
| 755 if (encrypt) | |
| 756 { | |
| 757 if (pdf_is_indirect(ctx, encrypt)) | |
| 758 { | |
| 759 /* create new reference with non-NULL xref pointer */ | |
| 760 obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt)); | |
| 761 pdf_drop_obj(ctx, encrypt); | |
| 762 encrypt = obj; | |
| 763 obj = NULL; | |
| 764 } | |
| 765 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt); | |
| 766 pdf_drop_obj(ctx, encrypt); | |
| 767 encrypt = NULL; | |
| 768 } | |
| 769 | |
| 770 if (id) | |
| 771 { | |
| 772 if (pdf_is_indirect(ctx, id)) | |
| 773 { | |
| 774 /* create new reference with non-NULL xref pointer */ | |
| 775 obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id)); | |
| 776 pdf_drop_obj(ctx, id); | |
| 777 id = obj; | |
| 778 obj = NULL; | |
| 779 } | |
| 780 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id); | |
| 781 pdf_drop_obj(ctx, id); | |
| 782 id = NULL; | |
| 783 } | |
| 784 } | |
| 785 fz_always(ctx) | |
| 786 { | |
| 787 fz_free(ctx, list); | |
| 788 doc->repair_in_progress = 0; | |
| 789 } | |
| 790 fz_catch(ctx) | |
| 791 { | |
| 792 pdf_drop_root_list(ctx, roots); | |
| 793 pdf_drop_obj(ctx, encrypt); | |
| 794 pdf_drop_obj(ctx, id); | |
| 795 pdf_drop_obj(ctx, obj); | |
| 796 pdf_drop_obj(ctx, info); | |
| 797 if (ctx->throw_on_repair) | |
| 798 fz_throw(ctx, FZ_ERROR_REPAIRED, "Error during repair attempt"); | |
| 799 fz_rethrow(ctx); | |
| 800 } | |
| 801 | |
| 802 if (ctx->throw_on_repair) | |
| 803 { | |
| 804 pdf_drop_root_list(ctx, roots); | |
| 805 fz_throw(ctx, FZ_ERROR_REPAIRED, "File repaired"); | |
| 806 } | |
| 807 | |
| 808 return roots; | |
| 809 } | |
| 810 | |
| 811 static void | |
| 812 pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc) | |
| 813 { | |
| 814 pdf_obj *dict; | |
| 815 int i; | |
| 816 int xref_len = pdf_xref_len(ctx, doc); | |
| 817 | |
| 818 for (i = 0; i < xref_len; i++) | |
| 819 { | |
| 820 pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i); | |
| 821 | |
| 822 if (entry->stm_ofs) | |
| 823 { | |
| 824 dict = pdf_load_object(ctx, doc, i); | |
| 825 fz_try(ctx) | |
| 826 { | |
| 827 if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm))) | |
| 828 pdf_repair_obj_stm(ctx, doc, i); | |
| 829 } | |
| 830 fz_always(ctx) | |
| 831 pdf_drop_obj(ctx, dict); | |
| 832 fz_catch(ctx) | |
| 833 { | |
| 834 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 835 fz_report_error(ctx); | |
| 836 fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i); | |
| 837 } | |
| 838 } | |
| 839 } | |
| 840 | |
| 841 /* Ensure that streamed objects reside inside a known non-streamed object */ | |
| 842 for (i = 0; i < xref_len; i++) | |
| 843 { | |
| 844 pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i); | |
| 845 | |
| 846 if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n') | |
| 847 { | |
| 848 fz_warn(ctx, "invalid reference to non-object-stream: %d, assuming %d 0 R is a freed object", (int)entry->ofs, i); | |
| 849 entry->type = 'f'; | |
| 850 } | |
| 851 } | |
| 852 } | |
| 853 | |
| 854 static void | |
| 855 pdf_repair_roots(fz_context *ctx, pdf_document *doc, pdf_root_list *roots) | |
| 856 { | |
| 857 int i; | |
| 858 | |
| 859 for (i = roots->len-1; i >= 0; i--) | |
| 860 { | |
| 861 if (pdf_is_indirect(ctx, roots->roots[i]) && pdf_is_dict(ctx, roots->roots[i])) | |
| 862 { | |
| 863 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots->roots[i]); | |
| 864 break; | |
| 865 } | |
| 866 } | |
| 867 } | |
| 868 | |
| 869 static void | |
| 870 pdf_repair_trailer(fz_context *ctx, pdf_document *doc) | |
| 871 { | |
| 872 int hasroot, hasinfo; | |
| 873 pdf_obj *obj, *nobj; | |
| 874 pdf_obj *dict = NULL; | |
| 875 int i; | |
| 876 | |
| 877 int xref_len = pdf_xref_len(ctx, doc); | |
| 878 | |
| 879 hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL); | |
| 880 hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL); | |
| 881 | |
| 882 fz_var(dict); | |
| 883 | |
| 884 fz_try(ctx) | |
| 885 { | |
| 886 /* Scan from the end so we have a better chance of finding | |
| 887 * newer objects if there are multiple instances of Info and | |
| 888 * Root objects. | |
| 889 */ | |
| 890 for (i = xref_len - 1; i > 0 && (!hasinfo || !hasroot); --i) | |
| 891 { | |
| 892 pdf_xref_entry *entry = pdf_get_xref_entry_no_null(ctx, doc, i); | |
| 893 if (entry->type == 0 || entry->type == 'f') | |
| 894 continue; | |
| 895 | |
| 896 fz_try(ctx) | |
| 897 { | |
| 898 dict = pdf_load_object(ctx, doc, i); | |
| 899 } | |
| 900 fz_catch(ctx) | |
| 901 { | |
| 902 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); | |
| 903 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 904 fz_report_error(ctx); | |
| 905 fz_warn(ctx, "ignoring broken object (%d 0 R)", i); | |
| 906 continue; | |
| 907 } | |
| 908 | |
| 909 if (!hasroot) | |
| 910 { | |
| 911 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); | |
| 912 if (obj == PDF_NAME(Catalog)) | |
| 913 { | |
| 914 nobj = pdf_new_indirect(ctx, doc, i, 0); | |
| 915 pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj); | |
| 916 hasroot = 1; | |
| 917 } | |
| 918 } | |
| 919 | |
| 920 if (!hasinfo) | |
| 921 { | |
| 922 if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer))) | |
| 923 { | |
| 924 nobj = pdf_new_indirect(ctx, doc, i, 0); | |
| 925 pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj); | |
| 926 hasinfo = 1; | |
| 927 } | |
| 928 } | |
| 929 | |
| 930 pdf_drop_obj(ctx, dict); | |
| 931 dict = NULL; | |
| 932 } | |
| 933 } | |
| 934 fz_always(ctx) | |
| 935 { | |
| 936 /* ensure that strings are not used in their repaired, non-decrypted form */ | |
| 937 if (doc->crypt) | |
| 938 { | |
| 939 pdf_crypt *tmp; | |
| 940 pdf_clear_xref(ctx, doc); | |
| 941 | |
| 942 /* ensure that Encryption dictionary and ID are cached without decryption, | |
| 943 otherwise a decrypted Encryption dictionary and ID may be used when saving | |
| 944 the PDF causing it to be inconsistent (since strings/streams are encrypted | |
| 945 with the actual encryption key, not the decrypted encryption key). */ | |
| 946 tmp = doc->crypt; | |
| 947 doc->crypt = NULL; | |
| 948 fz_try(ctx) | |
| 949 { | |
| 950 (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt))); | |
| 951 (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID))); | |
| 952 } | |
| 953 fz_always(ctx) | |
| 954 doc->crypt = tmp; | |
| 955 fz_catch(ctx) | |
| 956 { | |
| 957 fz_rethrow(ctx); | |
| 958 } | |
| 959 } | |
| 960 } | |
| 961 fz_catch(ctx) | |
| 962 { | |
| 963 pdf_drop_obj(ctx, dict); | |
| 964 fz_rethrow(ctx); | |
| 965 } | |
| 966 } | |
| 967 | |
| 968 void pdf_repair_xref_aux(fz_context *ctx, pdf_document *doc, void (*mid)(fz_context *ctx, pdf_document *doc)) | |
| 969 { | |
| 970 pdf_root_list *roots = NULL; | |
| 971 | |
| 972 fz_var(roots); | |
| 973 | |
| 974 fz_try(ctx) | |
| 975 { | |
| 976 roots = pdf_repair_xref_base(ctx, doc); | |
| 977 if (mid) | |
| 978 mid(ctx, doc); | |
| 979 pdf_repair_obj_stms(ctx, doc); | |
| 980 pdf_repair_roots(ctx, doc, roots); | |
| 981 pdf_repair_trailer(ctx, doc); | |
| 982 } | |
| 983 fz_always(ctx) | |
| 984 pdf_drop_root_list(ctx, roots); | |
| 985 fz_catch(ctx) | |
| 986 fz_rethrow(ctx); | |
| 987 } |
