comparison mupdf-source/source/pdf/pdf-repair.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "pdf-imp.h"
25
26 #include <string.h>
27
28 /* Scan file for objects and reconstruct xref table */
29
/*
 * One object located by the linear scan of the file. The scan
 * collects these in an array and only afterwards copies them into
 * the document's xref table.
 */
struct entry
{
	int num, gen;		/* object number and generation taken from '<num> <gen> obj' */
	int64_t ofs;		/* file offset recorded for the object-number token */
	int64_t stm_ofs;	/* stream-data offset reported by pdf_repair_obj, 0 if none */
	int64_t stm_len;	/* stream length reported by pdf_repair_obj; negative means "leave /Length alone" */
};
38
39 typedef struct
40 {
41 int max;
42 int len;
43 pdf_obj **roots;
44 } pdf_root_list;
45
46 static void
47 add_root(fz_context *ctx, pdf_root_list *roots, pdf_obj *obj)
48 {
49 if (roots->max == roots->len)
50 {
51 int new_max_roots = roots->max * 2;
52 if (new_max_roots == 0)
53 new_max_roots = 4;
54 roots->roots = fz_realloc(ctx, roots->roots, new_max_roots * sizeof(roots->roots[0]));
55 roots->max = new_max_roots;
56 }
57 roots->roots[roots->len] = pdf_keep_obj(ctx, obj);
58 roots->len++;
59 }
60
61 static pdf_root_list *
62 fz_new_root_list(fz_context *ctx)
63 {
64 return fz_malloc_struct(ctx, pdf_root_list);
65 }
66
67 static void
68 pdf_drop_root_list(fz_context *ctx, pdf_root_list *roots)
69 {
70 int i, n;
71
72 if (roots == NULL)
73 return;
74
75 n = roots->len;
76 for (i = 0; i < n; i++)
77 pdf_drop_obj(ctx, roots->roots[i]);
78 fz_free(ctx, roots->roots);
79 fz_free(ctx, roots);
80 }
81
82 int
83 pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int64_t *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root)
84 {
85 fz_stream *file = doc->file;
86 pdf_token tok;
87 int64_t stm_len;
88 int64_t local_ofs;
89
90 if (tmpofs == NULL)
91 tmpofs = &local_ofs;
92 if (stmofsp == NULL)
93 stmofsp = &local_ofs;
94
95 *stmofsp = 0;
96 if (stmlenp)
97 *stmlenp = -1;
98
99 stm_len = 0;
100
101 *tmpofs = fz_tell(ctx, file);
102 if (*tmpofs < 0)
103 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
104
105 /* On entry to this function, we know that we've just seen
106 * '<int> <int> obj'. We expect the next thing we see to be a
107 * pdf object. Regardless of the type of thing we meet next
108 * we only need to fully parse it if it is a dictionary. */
109 tok = pdf_lex(ctx, file, buf);
110
111 /* Don't let a truncated object at EOF overwrite a good one */
112 if (tok == PDF_TOK_EOF)
113 fz_throw(ctx, FZ_ERROR_SYNTAX, "truncated object");
114
115 if (tok == PDF_TOK_OPEN_DICT)
116 {
117 pdf_obj *obj, *dict = NULL;
118
119 fz_try(ctx)
120 {
121 dict = pdf_parse_dict(ctx, doc, file, buf);
122 }
123 fz_catch(ctx)
124 {
125 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
126 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
127 /* Don't let a broken object at EOF overwrite a good one */
128 if (file->eof)
129 fz_rethrow(ctx);
130 /* Silently swallow the error */
131 fz_report_error(ctx);
132 dict = pdf_new_dict(ctx, doc, 2);
133 }
134
135 /* We must be careful not to try to resolve any indirections
136 * here. We have just read dict, so we know it to be a non
137 * indirected dictionary. Before we look at any values that
138 * we get back from looking up in it, we need to check they
139 * aren't indirected. */
140
141 if (encrypt || id || root)
142 {
143 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
144 if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef)))
145 {
146 if (encrypt)
147 {
148 obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
149 if (obj)
150 {
151 pdf_drop_obj(ctx, *encrypt);
152 *encrypt = pdf_keep_obj(ctx, obj);
153 }
154 }
155
156 if (id)
157 {
158 obj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
159 if (obj)
160 {
161 pdf_drop_obj(ctx, *id);
162 *id = pdf_keep_obj(ctx, obj);
163 }
164 }
165
166 if (root)
167 *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root)));
168 }
169 }
170
171 obj = pdf_dict_get(ctx, dict, PDF_NAME(Length));
172 if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
173 stm_len = pdf_to_int64(ctx, obj);
174
175 if (doc->file_reading_linearly && page)
176 {
177 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
178 if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page)))
179 {
180 pdf_drop_obj(ctx, *page);
181 *page = pdf_keep_obj(ctx, dict);
182 }
183 }
184
185 pdf_drop_obj(ctx, dict);
186 }
187
188 while ( tok != PDF_TOK_STREAM &&
189 tok != PDF_TOK_ENDOBJ &&
190 tok != PDF_TOK_ERROR &&
191 tok != PDF_TOK_EOF &&
192 tok != PDF_TOK_INT )
193 {
194 *tmpofs = fz_tell(ctx, file);
195 if (*tmpofs < 0)
196 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
197 tok = pdf_lex(ctx, file, buf);
198 }
199
200 if (tok == PDF_TOK_STREAM)
201 {
202 int c = fz_read_byte(ctx, file);
203 if (c == '\r') {
204 c = fz_peek_byte(ctx, file);
205 if (c == '\n')
206 fz_read_byte(ctx, file);
207 }
208
209 *stmofsp = fz_tell(ctx, file);
210 if (*stmofsp < 0)
211 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
212
213 if (stm_len > 0)
214 {
215 fz_seek(ctx, file, *stmofsp + stm_len, 0);
216 fz_try(ctx)
217 {
218 tok = pdf_lex(ctx, file, buf);
219 }
220 fz_catch(ctx)
221 {
222 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
223 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
224 fz_report_error(ctx);
225 fz_warn(ctx, "cannot find endstream token, falling back to scanning");
226 }
227 if (tok == PDF_TOK_ENDSTREAM)
228 goto atobjend;
229 fz_seek(ctx, file, *stmofsp, 0);
230 }
231
232 (void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);
233
234 while (memcmp(buf->scratch, "endstream", 9) != 0)
235 {
236 c = fz_read_byte(ctx, file);
237 if (c == EOF)
238 break;
239 memmove(&buf->scratch[0], &buf->scratch[1], 8);
240 buf->scratch[8] = c;
241 }
242
243 if (stmlenp)
244 *stmlenp = fz_tell(ctx, file) - *stmofsp - 9;
245
246 atobjend:
247 *tmpofs = fz_tell(ctx, file);
248 if (*tmpofs < 0)
249 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
250 tok = pdf_lex(ctx, file, buf);
251 if (tok != PDF_TOK_ENDOBJ)
252 fz_warn(ctx, "object missing 'endobj' token");
253 else
254 {
255 /* Read another token as we always return the next one */
256 *tmpofs = fz_tell(ctx, file);
257 if (*tmpofs < 0)
258 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
259 tok = pdf_lex(ctx, file, buf);
260 }
261 }
262 return tok;
263 }
264
265 static int64_t
266 entry_offset(fz_context *ctx, pdf_document *doc, int num)
267 {
268 pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, num);
269
270 if (entry->type == 0 || entry->type == 'f')
271 return 0;
272 if (entry->type == 'n')
273 return entry->ofs;
274 assert(entry->type == 'o');
275
276 /* It must be in a stream. Return the entry of that stream. */
277 entry = pdf_get_populating_xref_entry(ctx, doc, entry->ofs);
278 /* If it's NOT in a stream, then we'll invalidate this entry in a moment.
279 * For now, just return an illegal offset. */
280 if (entry->type != 'n')
281 return -1;
282
283 return entry->ofs;
284 }
285
286 static void
287 pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num)
288 {
289 pdf_obj *obj;
290 fz_stream *stm = NULL;
291 pdf_token tok;
292 int i, n, count;
293 pdf_lexbuf buf;
294
295 fz_var(stm);
296
297 pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
298
299 fz_try(ctx)
300 {
301 obj = pdf_load_object(ctx, doc, stm_num);
302
303 count = pdf_dict_get_int(ctx, obj, PDF_NAME(N));
304
305 pdf_drop_obj(ctx, obj);
306
307 stm = pdf_open_stream_number(ctx, doc, stm_num);
308
309 for (i = 0; i < count; i++)
310 {
311 pdf_xref_entry *entry;
312 int replace;
313
314 tok = pdf_lex(ctx, stm, &buf);
315 if (tok != PDF_TOK_INT)
316 fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num);
317
318 n = buf.i;
319 if (n < 0)
320 {
321 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
322 continue;
323 }
324 else if (n >= PDF_MAX_OBJECT_NUMBER)
325 {
326 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
327 continue;
328 }
329
330 entry = pdf_get_populating_xref_entry(ctx, doc, n);
331
332 /* Bug 708286: Do not allow an object from an ObjStm to override an object
333 * that isn't in an ObjStm that we've already read, that occurs after it
334 * in the file. */
335 replace = 1;
336 if (entry->type != 0 && entry->type != 'f')
337 {
338 int64_t existing_entry_offset = entry_offset(ctx, doc, n);
339
340 if (existing_entry_offset < 0)
341 {
342 /* The existing entry is invalid. Anything must be better than that! */
343 }
344 else
345 {
346 int64_t this_entry_offset = entry_offset(ctx, doc, stm_num);
347
348 if (existing_entry_offset > this_entry_offset)
349 replace = 0;
350 }
351 }
352
353 if (replace)
354 {
355 entry->ofs = stm_num;
356 entry->gen = i;
357 entry->num = n;
358 entry->stm_ofs = 0;
359 pdf_drop_obj(ctx, entry->obj);
360 entry->obj = NULL;
361 entry->type = 'o';
362 }
363
364 tok = pdf_lex(ctx, stm, &buf);
365 if (tok != PDF_TOK_INT)
366 fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num);
367 }
368 }
369 fz_always(ctx)
370 {
371 fz_drop_stream(ctx, stm);
372 pdf_lexbuf_fin(ctx, &buf);
373 }
374 fz_catch(ctx)
375 {
376 fz_rethrow(ctx);
377 }
378 }
379
380 static void
381 orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
382 {
383 if (doc->orphans_count == doc->orphans_max)
384 {
385 int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32);
386
387 fz_try(ctx)
388 {
389 doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*);
390 doc->orphans_max = new_max;
391 }
392 fz_catch(ctx)
393 {
394 pdf_drop_obj(ctx, obj);
395 fz_rethrow(ctx);
396 }
397 }
398 doc->orphans[doc->orphans_count++] = obj;
399 }
400
/* Return 1 if c is NUL, tab, line feed, form feed, carriage return
 * or space; 0 for anything else. */
static int is_white(int c)
{
	switch (c)
	{
	case '\x00':
	case '\x09':
	case '\x0a':
	case '\x0c':
	case '\x0d':
	case '\x20':
		return 1;
	default:
		return 0;
	}
}
405
406 static pdf_root_list *
407 pdf_repair_xref_base(fz_context *ctx, pdf_document *doc)
408 {
409 pdf_obj *dict, *obj = NULL;
410 pdf_obj *length;
411
412 pdf_obj *encrypt = NULL;
413 pdf_obj *id = NULL;
414 pdf_obj *info = NULL;
415 pdf_root_list *roots = NULL;
416
417 struct entry *list = NULL;
418 int listlen;
419 int listcap;
420 int maxnum = 0;
421
422 int num = 0;
423 int gen = 0;
424 int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0;
425 int64_t stm_len;
426 pdf_token tok;
427 int next;
428 int i;
429 size_t j, n;
430 int c;
431 pdf_lexbuf *buf = &doc->lexbuf.base;
432
433 fz_var(encrypt);
434 fz_var(id);
435 fz_var(info);
436 fz_var(list);
437 fz_var(obj);
438 fz_var(roots);
439
440 if (!doc->is_fdf)
441 fz_warn(ctx, "repairing PDF document");
442
443 if (doc->repair_attempted)
444 fz_throw(ctx, FZ_ERROR_FORMAT, "Repair failed already - not trying again");
445
446 doc->bias = 0; // reset bias!
447
448 doc->repair_attempted = 1;
449 doc->repair_in_progress = 1;
450
451 pdf_drop_page_tree_internal(ctx, doc);
452 doc->page_tree_broken = 0;
453 pdf_forget_xref(ctx, doc);
454
455 fz_seek(ctx, doc->file, 0, 0);
456
457 fz_try(ctx)
458 {
459 pdf_xref_entry *entry;
460 listlen = 0;
461 listcap = 1024;
462 list = fz_malloc_array(ctx, listcap, struct entry);
463
464 roots = fz_new_root_list(ctx);
465
466 /* look for '%PDF' version marker within first kilobyte of file */
467 n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_minz(buf->size, 1024));
468
469 fz_seek(ctx, doc->file, 0, 0);
470 if (n >= 5)
471 {
472 for (j = 0; j < n - 5; j++)
473 {
474 if (memcmp(&buf->scratch[j], "%PDF-", 5) == 0 || memcmp(&buf->scratch[j], "%FDF-", 5) == 0)
475 {
476 fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */
477 break;
478 }
479 }
480 }
481
482 /* skip comment line after version marker since some generators
483 * forget to terminate the comment with a newline */
484 c = fz_read_byte(ctx, doc->file);
485 while (c >= 0 && (c == ' ' || c == '%'))
486 c = fz_read_byte(ctx, doc->file);
487 if (c != EOF)
488 fz_unread_byte(ctx, doc->file);
489
490 while (1)
491 {
492 tmpofs = fz_tell(ctx, doc->file);
493 if (tmpofs < 0)
494 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
495
496 fz_try(ctx)
497 tok = pdf_lex_no_string(ctx, doc->file, buf);
498 fz_catch(ctx)
499 {
500 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
501 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
502 fz_report_error(ctx);
503 fz_warn(ctx, "skipping ahead to next token");
504 do
505 c = fz_read_byte(ctx, doc->file);
506 while (c != EOF && !is_white(c));
507 if (c == EOF)
508 tok = PDF_TOK_EOF;
509 else
510 continue;
511 }
512
513 /* If we have the next token already, then we'll jump
514 * back here, rather than going through the top of
515 * the loop. */
516 have_next_token:
517
518 if (tok == PDF_TOK_INT)
519 {
520 if (buf->i < 0)
521 {
522 num = 0;
523 gen = 0;
524 continue;
525 }
526 numofs = genofs;
527 num = gen;
528 genofs = tmpofs;
529 gen = buf->i;
530 }
531
532 else if (tok == PDF_TOK_OBJ)
533 {
534 pdf_obj *root = NULL;
535
536 fz_try(ctx)
537 {
538 stm_len = 0;
539 stm_ofs = 0;
540 tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root);
541 if (root)
542 add_root(ctx, roots, root);
543 }
544 fz_always(ctx)
545 {
546 pdf_drop_obj(ctx, root);
547 }
548 fz_catch(ctx)
549 {
550 int errcode = fz_caught(ctx);
551 /* If we haven't seen a root yet, there is nothing
552 * we can do, but give up. Otherwise, we'll make
553 * do. */
554 if (roots->len == 0 ||
555 errcode == FZ_ERROR_TRYLATER ||
556 errcode == FZ_ERROR_SYSTEM)
557 {
558 pdf_drop_root_list(ctx, roots);
559 roots = NULL;
560 fz_rethrow(ctx);
561 }
562 fz_report_error(ctx);
563 fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
564 break;
565 }
566
567 if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER)
568 {
569 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
570 goto have_next_token;
571 }
572
573 gen = fz_clampi(gen, 0, 65535);
574
575 if (listlen + 1 == listcap)
576 {
577 listcap = (listcap * 3) / 2;
578 list = fz_realloc_array(ctx, list, listcap, struct entry);
579 }
580
581 list[listlen].num = num;
582 list[listlen].gen = gen;
583 list[listlen].ofs = numofs;
584 list[listlen].stm_ofs = stm_ofs;
585 list[listlen].stm_len = stm_len;
586 listlen ++;
587
588 if (num > maxnum)
589 maxnum = num;
590
591 goto have_next_token;
592 }
593
594 /* If we find a dictionary it is probably the trailer,
595 * but could be a stream (or bogus) dictionary caused
596 * by a corrupt file. */
597 else if (tok == PDF_TOK_OPEN_DICT)
598 {
599 pdf_obj *dictobj;
600
601 fz_try(ctx)
602 {
603 dict = pdf_parse_dict(ctx, doc, doc->file, buf);
604 }
605 fz_catch(ctx)
606 {
607 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
608 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
609 /* If this was the real trailer dict
610 * it was broken, in which case we are
611 * in trouble. Keep going though in
612 * case this was just a bogus dict. */
613 fz_report_error(ctx);
614 continue;
615 }
616
617 fz_try(ctx)
618 {
619 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
620 if (dictobj)
621 {
622 pdf_drop_obj(ctx, encrypt);
623 encrypt = pdf_keep_obj(ctx, dictobj);
624 }
625
626 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
627 if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt))))
628 {
629 pdf_drop_obj(ctx, id);
630 id = pdf_keep_obj(ctx, dictobj);
631 }
632
633 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root));
634 if (dictobj)
635 add_root(ctx, roots, dictobj);
636
637 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info));
638 if (dictobj)
639 {
640 pdf_drop_obj(ctx, info);
641 info = pdf_keep_obj(ctx, dictobj);
642 }
643 }
644 fz_always(ctx)
645 pdf_drop_obj(ctx, dict);
646 fz_catch(ctx)
647 fz_rethrow(ctx);
648 }
649
650 else if (tok == PDF_TOK_EOF)
651 {
652 break;
653 }
654
655 else
656 {
657 num = 0;
658 gen = 0;
659 }
660 }
661
662 if (listlen == 0)
663 fz_throw(ctx, FZ_ERROR_FORMAT, "no objects found");
664
665 /* make xref reasonable */
666
667 /*
668 Dummy access to entry to assure sufficient space in the xref table
669 and avoid repeated reallocs in the loop
670 */
671 /* Ensure that the first xref table is a 'solid' one from
672 * 0 to maxnum. */
673 pdf_ensure_solid_xref(ctx, doc, maxnum);
674
675 for (i = 1; i < maxnum; i++)
676 {
677 entry = pdf_get_populating_xref_entry(ctx, doc, i);
678 if (entry->obj != NULL)
679 continue;
680 entry->type = 'f';
681 entry->ofs = 0;
682 entry->gen = 0;
683 entry->num = 0;
684
685 entry->stm_ofs = 0;
686 }
687
688 for (i = 0; i < listlen; i++)
689 {
690 entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
691 entry->type = 'n';
692 entry->ofs = list[i].ofs;
693 entry->gen = list[i].gen;
694 entry->num = list[i].num;
695
696 entry->stm_ofs = list[i].stm_ofs;
697
698 /* correct stream length for unencrypted documents */
699 if (!encrypt && list[i].stm_len >= 0)
700 {
701 pdf_obj *old_obj = NULL;
702 dict = pdf_load_object(ctx, doc, list[i].num);
703
704 fz_try(ctx)
705 {
706 length = pdf_new_int(ctx, list[i].stm_len);
707 pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj);
708 if (old_obj)
709 orphan_object(ctx, doc, old_obj);
710 }
711 fz_always(ctx)
712 pdf_drop_obj(ctx, dict);
713 fz_catch(ctx)
714 fz_rethrow(ctx);
715 }
716 }
717
718 entry = pdf_get_populating_xref_entry(ctx, doc, 0);
719 entry->type = 'f';
720 entry->ofs = 0;
721 entry->gen = 65535;
722 entry->num = 0;
723 entry->stm_ofs = 0;
724
725 next = 0;
726 for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--)
727 {
728 entry = pdf_get_populating_xref_entry(ctx, doc, i);
729 if (entry->type == 'f')
730 {
731 entry->ofs = next;
732 if (entry->gen < 65535)
733 entry->gen ++;
734 next = i;
735 }
736 }
737
738 /* create a repaired trailer, Root will be added later */
739
740 obj = pdf_new_dict(ctx, doc, 5);
741 /* During repair there is only a single xref section */
742 pdf_set_populating_xref_trailer(ctx, doc, obj);
743 pdf_drop_obj(ctx, obj);
744 obj = NULL;
745
746 pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), maxnum + 1);
747
748 if (info)
749 {
750 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info);
751 pdf_drop_obj(ctx, info);
752 info = NULL;
753 }
754
755 if (encrypt)
756 {
757 if (pdf_is_indirect(ctx, encrypt))
758 {
759 /* create new reference with non-NULL xref pointer */
760 obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt));
761 pdf_drop_obj(ctx, encrypt);
762 encrypt = obj;
763 obj = NULL;
764 }
765 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt);
766 pdf_drop_obj(ctx, encrypt);
767 encrypt = NULL;
768 }
769
770 if (id)
771 {
772 if (pdf_is_indirect(ctx, id))
773 {
774 /* create new reference with non-NULL xref pointer */
775 obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id));
776 pdf_drop_obj(ctx, id);
777 id = obj;
778 obj = NULL;
779 }
780 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id);
781 pdf_drop_obj(ctx, id);
782 id = NULL;
783 }
784 }
785 fz_always(ctx)
786 {
787 fz_free(ctx, list);
788 doc->repair_in_progress = 0;
789 }
790 fz_catch(ctx)
791 {
792 pdf_drop_root_list(ctx, roots);
793 pdf_drop_obj(ctx, encrypt);
794 pdf_drop_obj(ctx, id);
795 pdf_drop_obj(ctx, obj);
796 pdf_drop_obj(ctx, info);
797 if (ctx->throw_on_repair)
798 fz_throw(ctx, FZ_ERROR_REPAIRED, "Error during repair attempt");
799 fz_rethrow(ctx);
800 }
801
802 if (ctx->throw_on_repair)
803 {
804 pdf_drop_root_list(ctx, roots);
805 fz_throw(ctx, FZ_ERROR_REPAIRED, "File repaired");
806 }
807
808 return roots;
809 }
810
811 static void
812 pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc)
813 {
814 pdf_obj *dict;
815 int i;
816 int xref_len = pdf_xref_len(ctx, doc);
817
818 for (i = 0; i < xref_len; i++)
819 {
820 pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
821
822 if (entry->stm_ofs)
823 {
824 dict = pdf_load_object(ctx, doc, i);
825 fz_try(ctx)
826 {
827 if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm)))
828 pdf_repair_obj_stm(ctx, doc, i);
829 }
830 fz_always(ctx)
831 pdf_drop_obj(ctx, dict);
832 fz_catch(ctx)
833 {
834 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
835 fz_report_error(ctx);
836 fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
837 }
838 }
839 }
840
841 /* Ensure that streamed objects reside inside a known non-streamed object */
842 for (i = 0; i < xref_len; i++)
843 {
844 pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
845
846 if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n')
847 {
848 fz_warn(ctx, "invalid reference to non-object-stream: %d, assuming %d 0 R is a freed object", (int)entry->ofs, i);
849 entry->type = 'f';
850 }
851 }
852 }
853
854 static void
855 pdf_repair_roots(fz_context *ctx, pdf_document *doc, pdf_root_list *roots)
856 {
857 int i;
858
859 for (i = roots->len-1; i >= 0; i--)
860 {
861 if (pdf_is_indirect(ctx, roots->roots[i]) && pdf_is_dict(ctx, roots->roots[i]))
862 {
863 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots->roots[i]);
864 break;
865 }
866 }
867 }
868
869 static void
870 pdf_repair_trailer(fz_context *ctx, pdf_document *doc)
871 {
872 int hasroot, hasinfo;
873 pdf_obj *obj, *nobj;
874 pdf_obj *dict = NULL;
875 int i;
876
877 int xref_len = pdf_xref_len(ctx, doc);
878
879 hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL);
880 hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL);
881
882 fz_var(dict);
883
884 fz_try(ctx)
885 {
886 /* Scan from the end so we have a better chance of finding
887 * newer objects if there are multiple instances of Info and
888 * Root objects.
889 */
890 for (i = xref_len - 1; i > 0 && (!hasinfo || !hasroot); --i)
891 {
892 pdf_xref_entry *entry = pdf_get_xref_entry_no_null(ctx, doc, i);
893 if (entry->type == 0 || entry->type == 'f')
894 continue;
895
896 fz_try(ctx)
897 {
898 dict = pdf_load_object(ctx, doc, i);
899 }
900 fz_catch(ctx)
901 {
902 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
903 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
904 fz_report_error(ctx);
905 fz_warn(ctx, "ignoring broken object (%d 0 R)", i);
906 continue;
907 }
908
909 if (!hasroot)
910 {
911 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
912 if (obj == PDF_NAME(Catalog))
913 {
914 nobj = pdf_new_indirect(ctx, doc, i, 0);
915 pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj);
916 hasroot = 1;
917 }
918 }
919
920 if (!hasinfo)
921 {
922 if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer)))
923 {
924 nobj = pdf_new_indirect(ctx, doc, i, 0);
925 pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj);
926 hasinfo = 1;
927 }
928 }
929
930 pdf_drop_obj(ctx, dict);
931 dict = NULL;
932 }
933 }
934 fz_always(ctx)
935 {
936 /* ensure that strings are not used in their repaired, non-decrypted form */
937 if (doc->crypt)
938 {
939 pdf_crypt *tmp;
940 pdf_clear_xref(ctx, doc);
941
942 /* ensure that Encryption dictionary and ID are cached without decryption,
943 otherwise a decrypted Encryption dictionary and ID may be used when saving
944 the PDF causing it to be inconsistent (since strings/streams are encrypted
945 with the actual encryption key, not the decrypted encryption key). */
946 tmp = doc->crypt;
947 doc->crypt = NULL;
948 fz_try(ctx)
949 {
950 (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt)));
951 (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID)));
952 }
953 fz_always(ctx)
954 doc->crypt = tmp;
955 fz_catch(ctx)
956 {
957 fz_rethrow(ctx);
958 }
959 }
960 }
961 fz_catch(ctx)
962 {
963 pdf_drop_obj(ctx, dict);
964 fz_rethrow(ctx);
965 }
966 }
967
968 void pdf_repair_xref_aux(fz_context *ctx, pdf_document *doc, void (*mid)(fz_context *ctx, pdf_document *doc))
969 {
970 pdf_root_list *roots = NULL;
971
972 fz_var(roots);
973
974 fz_try(ctx)
975 {
976 roots = pdf_repair_xref_base(ctx, doc);
977 if (mid)
978 mid(ctx, doc);
979 pdf_repair_obj_stms(ctx, doc);
980 pdf_repair_roots(ctx, doc, roots);
981 pdf_repair_trailer(ctx, doc);
982 }
983 fz_always(ctx)
984 pdf_drop_root_list(ctx, roots);
985 fz_catch(ctx)
986 fz_rethrow(ctx);
987 }