comparison mupdf-source/source/pdf/pdf-xref.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "pdf-annot-imp.h"
25 #include "pdf-imp.h"
26
27 #include <assert.h>
28 #include <limits.h>
29 #include <string.h>
30
/* Tracing is compiled out by default: the switch is explicitly undefined
 * here. Replace the #undef with a #define to turn the messages on. */
#undef DEBUG_PROGESSIVE_ADVANCE

/* DEBUGMESS takes a complete, parenthesised fz_warn argument list, e.g.
 * DEBUGMESS((ctx, "fmt", ...)). It expands to an empty statement unless
 * the switch above is defined. */
#if defined(DEBUG_PROGESSIVE_ADVANCE)
#define DEBUGMESS(A) do { fz_warn A; } while (0)
#else
#define DEBUGMESS(A) do { } while (0)
#endif
38
/* Locale-independent test for an ASCII decimal digit.
 * Both the argument and the whole expansion are parenthesised so that
 * compound argument expressions (e.g. `x | y`) group as intended.
 * The argument is still evaluated twice: do not pass expressions with
 * side effects. */
#define isdigit(c) ((c) >= '0' && (c) <= '9')
40
/* Return 1 when ch is one of the bytes NUL, TAB, LF, FF, CR or SPACE,
 * and 0 for every other value. */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000':
	case '\011':
	case '\012':
	case '\014':
	case '\015':
	case '\040':
		return 1;
	default:
		return 0;
	}
}
47
48 /*
49 * xref tables
50 */
51
52 static void
53 pdf_drop_xref_subsec(fz_context *ctx, pdf_xref *xref)
54 {
55 pdf_xref_subsec *sub = xref->subsec;
56 pdf_unsaved_sig *usig;
57 int e;
58
59 while (sub != NULL)
60 {
61 pdf_xref_subsec *next_sub = sub->next;
62 for (e = 0; e < sub->len; e++)
63 {
64 pdf_xref_entry *entry = &sub->table[e];
65 pdf_drop_obj(ctx, entry->obj);
66 fz_drop_buffer(ctx, entry->stm_buf);
67 }
68 fz_free(ctx, sub->table);
69 fz_free(ctx, sub);
70 sub = next_sub;
71 }
72
73 pdf_drop_obj(ctx, xref->pre_repair_trailer);
74 pdf_drop_obj(ctx, xref->trailer);
75
76 while ((usig = xref->unsaved_sigs) != NULL)
77 {
78 xref->unsaved_sigs = usig->next;
79 pdf_drop_obj(ctx, usig->field);
80 pdf_drop_signer(ctx, usig->signer);
81 fz_free(ctx, usig);
82 }
83 }
84
85 static void pdf_drop_xref_sections_imp(fz_context *ctx, pdf_document *doc, pdf_xref *xref_sections, int num_xref_sections)
86 {
87 int x;
88
89 for (x = 0; x < num_xref_sections; x++)
90 pdf_drop_xref_subsec(ctx, &xref_sections[x]);
91
92 fz_free(ctx, xref_sections);
93 }
94
95 static void pdf_drop_xref_sections(fz_context *ctx, pdf_document *doc)
96 {
97 pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections);
98 pdf_drop_xref_sections_imp(ctx, doc, doc->xref_sections, doc->num_xref_sections);
99
100 doc->saved_xref_sections = NULL;
101 doc->saved_num_xref_sections = 0;
102 doc->xref_sections = NULL;
103 doc->num_xref_sections = 0;
104 doc->num_incremental_sections = 0;
105 }
106
107 static void
108 extend_xref_index(fz_context *ctx, pdf_document *doc, int newlen)
109 {
110 int i;
111
112 doc->xref_index = fz_realloc_array(ctx, doc->xref_index, newlen, int);
113 for (i = doc->max_xref_len; i < newlen; i++)
114 {
115 doc->xref_index[i] = 0;
116 }
117 doc->max_xref_len = newlen;
118 }
119
120 static void
121 resize_xref_sub(fz_context *ctx, pdf_xref *xref, int base, int newlen)
122 {
123 pdf_xref_subsec *sub;
124 int i;
125
126 assert(xref != NULL);
127 sub = xref->subsec;
128 assert(sub->next == NULL && sub->start == base && sub->len+base == xref->num_objects);
129 assert(newlen+base > xref->num_objects);
130
131 sub->table = fz_realloc_array(ctx, sub->table, newlen, pdf_xref_entry);
132 for (i = sub->len; i < newlen; i++)
133 {
134 sub->table[i].type = 0;
135 sub->table[i].ofs = 0;
136 sub->table[i].gen = 0;
137 sub->table[i].num = 0;
138 sub->table[i].stm_ofs = 0;
139 sub->table[i].stm_buf = NULL;
140 sub->table[i].obj = NULL;
141 }
142 sub->len = newlen;
143 if (newlen+base > xref->num_objects)
144 xref->num_objects = newlen+base;
145 }
146
147 /* This is only ever called when we already have an incremental
148 * xref. This means there will only be 1 subsec, and it will be
149 * a complete subsec. */
150 static void pdf_resize_xref(fz_context *ctx, pdf_document *doc, int newlen)
151 {
152 pdf_xref *xref = &doc->xref_sections[doc->xref_base];
153
154 resize_xref_sub(ctx, xref, 0, newlen);
155 if (doc->max_xref_len < newlen)
156 extend_xref_index(ctx, doc, newlen);
157 }
158
159 static void pdf_populate_next_xref_level(fz_context *ctx, pdf_document *doc)
160 {
161 pdf_xref *xref;
162 doc->xref_sections = fz_realloc_array(ctx, doc->xref_sections, doc->num_xref_sections + 1, pdf_xref);
163 doc->num_xref_sections++;
164
165 xref = &doc->xref_sections[doc->num_xref_sections - 1];
166 xref->subsec = NULL;
167 xref->num_objects = 0;
168 xref->trailer = NULL;
169 xref->pre_repair_trailer = NULL;
170 xref->unsaved_sigs = NULL;
171 xref->unsaved_sigs_end = NULL;
172 }
173
174 pdf_obj *pdf_trailer(fz_context *ctx, pdf_document *doc)
175 {
176 /* Return the document's trailer (of the appropriate vintage) */
177 pdf_xref *xrefs = doc->xref_sections;
178
179 return xrefs ? xrefs[doc->xref_base].trailer : NULL;
180 }
181
182 void pdf_set_populating_xref_trailer(fz_context *ctx, pdf_document *doc, pdf_obj *trailer)
183 {
184 /* Update the trailer of the xref section being populated */
185 pdf_xref *xref = &doc->xref_sections[doc->num_xref_sections - 1];
186 if (xref->trailer)
187 {
188 pdf_drop_obj(ctx, xref->pre_repair_trailer);
189 xref->pre_repair_trailer = xref->trailer;
190 }
191 xref->trailer = pdf_keep_obj(ctx, trailer);
192 }
193
194 int pdf_xref_len(fz_context *ctx, pdf_document *doc)
195 {
196 int i = doc->xref_base;
197 int xref_len = 0;
198
199 if (doc->local_xref && doc->local_xref_nesting > 0)
200 xref_len = doc->local_xref->num_objects;
201
202 while (i < doc->num_xref_sections)
203 xref_len = fz_maxi(xref_len, doc->xref_sections[i++].num_objects);
204
205 return xref_len;
206 }
207
208 /* Ensure that the given xref has a single subsection
209 * that covers the entire range. */
210 static void
211 ensure_solid_xref(fz_context *ctx, pdf_document *doc, int num, int which)
212 {
213 pdf_xref *xref = &doc->xref_sections[which];
214 pdf_xref_subsec *sub = xref->subsec;
215 pdf_xref_subsec *new_sub;
216
217 if (num < xref->num_objects)
218 num = xref->num_objects;
219
220 if (sub != NULL && sub->next == NULL && sub->start == 0 && sub->len >= num)
221 return;
222
223 new_sub = fz_malloc_struct(ctx, pdf_xref_subsec);
224 fz_try(ctx)
225 {
226 new_sub->table = fz_malloc_struct_array(ctx, num, pdf_xref_entry);
227 new_sub->start = 0;
228 new_sub->len = num;
229 new_sub->next = NULL;
230 }
231 fz_catch(ctx)
232 {
233 fz_free(ctx, new_sub);
234 fz_rethrow(ctx);
235 }
236
237 /* Move objects over to the new subsection and destroy the old
238 * ones */
239 sub = xref->subsec;
240 while (sub != NULL)
241 {
242 pdf_xref_subsec *next = sub->next;
243 int i;
244
245 for (i = 0; i < sub->len; i++)
246 {
247 new_sub->table[i+sub->start] = sub->table[i];
248 }
249 fz_free(ctx, sub->table);
250 fz_free(ctx, sub);
251 sub = next;
252 }
253 xref->num_objects = num;
254 xref->subsec = new_sub;
255 if (doc->max_xref_len < num)
256 extend_xref_index(ctx, doc, num);
257 }
258
259 static pdf_xref_entry *
260 pdf_get_local_xref_entry(fz_context *ctx, pdf_document *doc, int num)
261 {
262 pdf_xref *xref = doc->local_xref;
263 pdf_xref_subsec *sub;
264
265 if (xref == NULL || doc->local_xref_nesting == 0)
266 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Local xref not present!");
267
268 /* Local xrefs only ever have 1 section, and it should be solid. */
269 sub = xref->subsec;
270 assert(sub && !sub->next);
271 if (num >= sub->start && num < sub->start + sub->len)
272 return &sub->table[num - sub->start];
273
274 /* Expand the xref so we can return a pointer. */
275 resize_xref_sub(ctx, xref, 0, num+1);
276 sub = xref->subsec;
277 return &sub->table[num - sub->start];
278 }
279
280 pdf_xref_entry *pdf_get_populating_xref_entry(fz_context *ctx, pdf_document *doc, int num)
281 {
282 /* Return an entry within the xref currently being populated */
283 pdf_xref *xref;
284 pdf_xref_subsec *sub;
285
286 if (doc->num_xref_sections == 0)
287 {
288 doc->xref_sections = fz_malloc_struct(ctx, pdf_xref);
289 doc->num_xref_sections = 1;
290 }
291
292 if (doc->local_xref && doc->local_xref_nesting > 0)
293 return pdf_get_local_xref_entry(ctx, doc, num);
294
295 /* Prevent accidental heap underflow */
296 if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
297 fz_throw(ctx, FZ_ERROR_ARGUMENT, "object number out of range (%d)", num);
298
299 /* Return the pointer to the entry in the last section. */
300 xref = &doc->xref_sections[doc->num_xref_sections-1];
301
302 for (sub = xref->subsec; sub != NULL; sub = sub->next)
303 {
304 if (num >= sub->start && num < sub->start + sub->len)
305 return &sub->table[num-sub->start];
306 }
307
308 /* We've been asked for an object that's not in a subsec. */
309 ensure_solid_xref(ctx, doc, num+1, doc->num_xref_sections-1);
310 xref = &doc->xref_sections[doc->num_xref_sections-1];
311 sub = xref->subsec;
312
313 return &sub->table[num-sub->start];
314 }
315
316 /* It is vital that pdf_get_xref_entry_aux called with !solidify_if_needed
317 * and a value object number, does NOT try/catch or throw. */
318 static
319 pdf_xref_entry *pdf_get_xref_entry_aux(fz_context *ctx, pdf_document *doc, int i, int solidify_if_needed)
320 {
321 pdf_xref *xref = NULL;
322 pdf_xref_subsec *sub;
323 int j;
324
325 if (i < 0)
326 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Negative object number requested");
327
328 if (i < doc->max_xref_len)
329 j = doc->xref_index[i];
330 else
331 j = 0;
332
333 /* If we have an active local xref, check there first. */
334 if (doc->local_xref && doc->local_xref_nesting > 0)
335 {
336 xref = doc->local_xref;
337
338 if (i < xref->num_objects)
339 {
340 for (sub = xref->subsec; sub != NULL; sub = sub->next)
341 {
342 pdf_xref_entry *entry;
343
344 if (i < sub->start || i >= sub->start + sub->len)
345 continue;
346
347 entry = &sub->table[i - sub->start];
348 if (entry->type)
349 return entry;
350 }
351 }
352 }
353
354 /* We may be accessing an earlier version of the document using xref_base
355 * and j may be an index into a later xref section */
356 if (doc->xref_base > j)
357 j = doc->xref_base;
358 else
359 j = 0;
360
361
362 /* Find the first xref section where the entry is defined. */
363 for (; j < doc->num_xref_sections; j++)
364 {
365 xref = &doc->xref_sections[j];
366
367 if (i < xref->num_objects)
368 {
369 for (sub = xref->subsec; sub != NULL; sub = sub->next)
370 {
371 pdf_xref_entry *entry;
372
373 if (i < sub->start || i >= sub->start + sub->len)
374 continue;
375
376 entry = &sub->table[i - sub->start];
377 if (entry->type)
378 {
379 /* Don't update xref_index if xref_base may have
380 * influenced the value of j */
381 if (doc->xref_base == 0)
382 doc->xref_index[i] = j;
383 return entry;
384 }
385 }
386 }
387 }
388
389 /* Didn't find the entry in any section. Return the entry from
390 * the local_xref (if there is one active), or the final section. */
391 if (doc->local_xref && doc->local_xref_nesting > 0)
392 {
393 if (xref == NULL || i < xref->num_objects)
394 {
395 xref = doc->local_xref;
396 sub = xref->subsec;
397 assert(sub != NULL && sub->next == NULL);
398 if (i >= sub->start && i < sub->start + sub->len)
399 return &sub->table[i - sub->start];
400 }
401
402 /* Expand the xref so we can return a pointer. */
403 resize_xref_sub(ctx, xref, 0, i+1);
404 sub = xref->subsec;
405 return &sub->table[i - sub->start];
406 }
407
408 doc->xref_index[i] = 0;
409 if (xref == NULL || i < xref->num_objects)
410 {
411 xref = &doc->xref_sections[doc->xref_base];
412 for (sub = xref->subsec; sub != NULL; sub = sub->next)
413 {
414 if (i >= sub->start && i < sub->start + sub->len)
415 return &sub->table[i - sub->start];
416 }
417 }
418
419 /* Some really hairy code here. When we are reading the file in
420 * initially, we read from 'newest' to 'oldest' (i.e. from 0 to
421 * doc->num_xref_sections-1). Each section is created initially
422 * with num_objects == 0 in it, and remains like that while we
423 * are parsing the stream from the file. This is the only time
424 * we'll ever have xref_sections with 0 objects in them. */
425 if (doc->xref_sections[doc->num_xref_sections-1].num_objects == 0)
426 {
427 /* The oldest xref section has 0 objects in it. So we are
428 * parsing an xref stream while loading. We don't want to
429 * solidify the xref we are currently parsing for (as it'll
430 * get very confused, and end up a different 'shape' in
431 * memory to that which is in the file, and would hence
432 * render 'fingerprinting' for snapshotting invalid) so
433 * just give up at this point. */
434 return NULL;
435 }
436
437 if (!solidify_if_needed)
438 return NULL;
439
440 /* At this point, we solidify the xref. This ensures that we
441 * can return a pointer. This is the only case where this function
442 * might throw an exception, and it will never happen when we are
443 * working within a 'solid' xref. */
444 ensure_solid_xref(ctx, doc, i+1, 0);
445 xref = &doc->xref_sections[0];
446 sub = xref->subsec;
447 return &sub->table[i - sub->start];
448 }
449
450 pdf_xref_entry *pdf_get_xref_entry(fz_context *ctx, pdf_document *doc, int i)
451 {
452 return pdf_get_xref_entry_aux(ctx, doc, i, 1);
453 }
454
455 pdf_xref_entry *pdf_get_xref_entry_no_change(fz_context *ctx, pdf_document *doc, int i)
456 {
457 return pdf_get_xref_entry_aux(ctx, doc, i, 0);
458 }
459
460 pdf_xref_entry *pdf_get_xref_entry_no_null(fz_context *ctx, pdf_document *doc, int i)
461 {
462 pdf_xref_entry *entry = pdf_get_xref_entry(ctx, doc, i);
463 if (entry != NULL)
464 return entry;
465 fz_throw(ctx, FZ_ERROR_ARGUMENT, "cannot find object in xref (%d 0 R), but not allowed to return NULL", i);
466 }
467
468 void pdf_xref_entry_map(fz_context *ctx, pdf_document *doc, void (*fn)(fz_context *, pdf_xref_entry *, int, pdf_document *, void *), void *arg)
469 {
470 int i, j;
471 pdf_xref_subsec *sub;
472 int xref_base = doc->xref_base;
473
474 fz_try(ctx)
475 {
476 /* Map over any active local xref first. */
477 if (doc->local_xref && doc->local_xref_nesting > 0)
478 {
479 pdf_xref *xref = doc->local_xref;
480
481 for (sub = xref->subsec; sub != NULL; sub = sub->next)
482 {
483 for (i = sub->start; i < sub->start + sub->len; i++)
484 {
485 pdf_xref_entry *entry = &sub->table[i - sub->start];
486 if (entry->type)
487 fn(ctx, entry, i, doc, arg);
488 }
489 }
490 }
491
492 for (j = 0; j < doc->num_xref_sections; j++)
493 {
494 pdf_xref *xref = &doc->xref_sections[j];
495 doc->xref_base = j;
496
497 for (sub = xref->subsec; sub != NULL; sub = sub->next)
498 {
499 for (i = sub->start; i < sub->start + sub->len; i++)
500 {
501 pdf_xref_entry *entry = &sub->table[i - sub->start];
502 if (entry->type)
503 fn(ctx, entry, i, doc, arg);
504 }
505 }
506 }
507 }
508 fz_always(ctx)
509 {
510 doc->xref_base = xref_base;
511 }
512 fz_catch(ctx)
513 fz_rethrow(ctx);
514 }
515
516 /*
517 Ensure we have an incremental xref section where we can store
518 updated versions of indirect objects. This is a new xref section
519 consisting of a single xref subsection.
520 */
521 static void ensure_incremental_xref(fz_context *ctx, pdf_document *doc)
522 {
523 /* If there are as yet no incremental sections, or if the most recent
524 * one has been used to sign a signature field, then we need a new one.
525 * After a signing, any further document changes require a new increment */
526 if ((doc->num_incremental_sections == 0 || doc->xref_sections[0].unsaved_sigs != NULL)
527 && !doc->disallow_new_increments)
528 {
529 pdf_xref *xref = &doc->xref_sections[0];
530 pdf_xref *pxref;
531 pdf_xref_entry *new_table = fz_malloc_struct_array(ctx, xref->num_objects, pdf_xref_entry);
532 pdf_xref_subsec *sub = NULL;
533 pdf_obj *trailer = NULL;
534 int i;
535
536 fz_var(trailer);
537 fz_var(sub);
538 fz_try(ctx)
539 {
540 sub = fz_malloc_struct(ctx, pdf_xref_subsec);
541 trailer = xref->trailer ? pdf_copy_dict(ctx, xref->trailer) : NULL;
542 doc->xref_sections = fz_realloc_array(ctx, doc->xref_sections, doc->num_xref_sections + 1, pdf_xref);
543 xref = &doc->xref_sections[0];
544 pxref = &doc->xref_sections[1];
545 memmove(pxref, xref, doc->num_xref_sections * sizeof(pdf_xref));
546 /* xref->num_objects is already correct */
547 xref->subsec = sub;
548 sub = NULL;
549 xref->trailer = trailer;
550 xref->pre_repair_trailer = NULL;
551 xref->unsaved_sigs = NULL;
552 xref->unsaved_sigs_end = NULL;
553 xref->subsec->next = NULL;
554 xref->subsec->len = xref->num_objects;
555 xref->subsec->start = 0;
556 xref->subsec->table = new_table;
557 doc->num_xref_sections++;
558 doc->num_incremental_sections++;
559 }
560 fz_catch(ctx)
561 {
562 fz_free(ctx, sub);
563 fz_free(ctx, new_table);
564 pdf_drop_obj(ctx, trailer);
565 fz_rethrow(ctx);
566 }
567
568 /* Update the xref_index */
569 for (i = 0; i < doc->max_xref_len; i++)
570 {
571 doc->xref_index[i]++;
572 }
573 }
574 }
575
576 /* Used when altering a document */
577 pdf_xref_entry *pdf_get_incremental_xref_entry(fz_context *ctx, pdf_document *doc, int i)
578 {
579 pdf_xref *xref;
580 pdf_xref_subsec *sub;
581
582 /* Make a new final xref section if we haven't already */
583 ensure_incremental_xref(ctx, doc);
584
585 xref = &doc->xref_sections[doc->xref_base];
586 if (i >= xref->num_objects)
587 pdf_resize_xref(ctx, doc, i + 1);
588
589 sub = xref->subsec;
590 assert(sub != NULL && sub->next == NULL);
591 assert(i >= sub->start && i < sub->start + sub->len);
592 doc->xref_index[i] = 0;
593 return &sub->table[i - sub->start];
594 }
595
596 int pdf_xref_is_incremental(fz_context *ctx, pdf_document *doc, int num)
597 {
598 pdf_xref *xref = &doc->xref_sections[doc->xref_base];
599 pdf_xref_subsec *sub = xref->subsec;
600
601 assert(sub != NULL && sub->next == NULL && sub->len == xref->num_objects && sub->start == 0);
602
603 return num < xref->num_objects && sub->table[num].type;
604 }
605
606 /* Used when clearing signatures. Removes the signature
607 from the list of unsaved signed signatures. */
608 void pdf_xref_remove_unsaved_signature(fz_context *ctx, pdf_document *doc, pdf_obj *field)
609 {
610 int num = pdf_to_num(ctx, field);
611 int idx = doc->xref_index[num];
612 pdf_xref *xref = &doc->xref_sections[idx];
613 pdf_unsaved_sig **usigptr = &xref->unsaved_sigs;
614 pdf_unsaved_sig *usig = xref->unsaved_sigs;
615
616 while (usig)
617 {
618 pdf_unsaved_sig **nextptr = &usig->next;
619 pdf_unsaved_sig *next = usig->next;
620
621 if (usig->field == field)
622 {
623 if (xref->unsaved_sigs_end == &usig->next)
624 {
625 if (usig->next)
626 xref->unsaved_sigs_end = &usig->next->next;
627 else
628 xref->unsaved_sigs_end = NULL;
629 }
630 if (usigptr)
631 *usigptr = usig->next;
632
633 usig->next = NULL;
634 pdf_drop_obj(ctx, usig->field);
635 pdf_drop_signer(ctx, usig->signer);
636 fz_free(ctx, usig);
637
638 break;
639 }
640
641 usig = next;
642 usigptr = nextptr;
643 }
644 }
645
646 void pdf_xref_store_unsaved_signature(fz_context *ctx, pdf_document *doc, pdf_obj *field, pdf_pkcs7_signer *signer)
647 {
648 pdf_xref *xref = &doc->xref_sections[0];
649 pdf_unsaved_sig *unsaved_sig;
650
651 /* Record details within the document structure so that contents
652 * and byte_range can be updated with their correct values at
653 * saving time */
654 unsaved_sig = fz_malloc_struct(ctx, pdf_unsaved_sig);
655 unsaved_sig->field = pdf_keep_obj(ctx, field);
656 unsaved_sig->signer = signer->keep(ctx, signer);
657 unsaved_sig->next = NULL;
658 if (xref->unsaved_sigs_end == NULL)
659 xref->unsaved_sigs_end = &xref->unsaved_sigs;
660
661 *xref->unsaved_sigs_end = unsaved_sig;
662 xref->unsaved_sigs_end = &unsaved_sig->next;
663 }
664
665 int pdf_xref_obj_is_unsaved_signature(pdf_document *doc, pdf_obj *obj)
666 {
667 int i;
668 for (i = 0; i < doc->num_incremental_sections; i++)
669 {
670 pdf_xref *xref = &doc->xref_sections[i];
671 pdf_unsaved_sig *usig;
672
673 for (usig = xref->unsaved_sigs; usig; usig = usig->next)
674 {
675 if (usig->field == obj)
676 return 1;
677 }
678 }
679
680 return 0;
681 }
682
683 void pdf_ensure_solid_xref(fz_context *ctx, pdf_document *doc, int num)
684 {
685 if (doc->num_xref_sections == 0)
686 pdf_populate_next_xref_level(ctx, doc);
687
688 ensure_solid_xref(ctx, doc, num, 0);
689 }
690
691 int pdf_xref_ensure_incremental_object(fz_context *ctx, pdf_document *doc, int num)
692 {
693 pdf_xref_entry *new_entry, *old_entry;
694 pdf_xref_subsec *sub = NULL;
695 int i;
696 pdf_obj *copy;
697
698 /* Make sure we have created an xref section for incremental updates */
699 ensure_incremental_xref(ctx, doc);
700
701 /* Search for the section that contains this object */
702 for (i = doc->xref_index[num]; i < doc->num_xref_sections; i++)
703 {
704 pdf_xref *xref = &doc->xref_sections[i];
705
706 if (num < 0 && num >= xref->num_objects)
707 break;
708 for (sub = xref->subsec; sub != NULL; sub = sub->next)
709 {
710 if (sub->start <= num && num < sub->start + sub->len && sub->table[num - sub->start].type)
711 break;
712 }
713 if (sub != NULL)
714 break;
715 }
716 /* sub == NULL implies we did not find it */
717
718 /* If we don't find it, or it's already in the incremental section, return */
719 if (i == 0 || sub == NULL)
720 return 0;
721
722 copy = pdf_deep_copy_obj(ctx, sub->table[num - sub->start].obj);
723
724 /* Move the object to the incremental section */
725 i = doc->xref_index[num];
726 doc->xref_index[num] = 0;
727 old_entry = &sub->table[num - sub->start];
728 fz_try(ctx)
729 new_entry = pdf_get_incremental_xref_entry(ctx, doc, num);
730 fz_catch(ctx)
731 {
732 pdf_drop_obj(ctx, copy);
733 doc->xref_index[num] = i;
734 fz_rethrow(ctx);
735 }
736 *new_entry = *old_entry;
737 if (new_entry->type == 'o')
738 {
739 new_entry->type = 'n';
740 new_entry->gen = 0;
741 }
742 /* Better keep a copy. We must override the old entry with
743 * the copy because the caller may be holding a reference to
744 * the original and expect it to end up in the new entry */
745 old_entry->obj = copy;
746 old_entry->stm_buf = NULL;
747
748 return 1;
749 }
750
751 void pdf_xref_ensure_local_object(fz_context *ctx, pdf_document *doc, int num)
752 {
753 pdf_xref_entry *new_entry, *old_entry;
754 pdf_xref_subsec *sub = NULL;
755 int i;
756 pdf_xref *xref;
757 pdf_obj *copy;
758
759 /* Is it in the local section already? */
760 xref = doc->local_xref;
761 for (sub = xref->subsec; sub != NULL; sub = sub->next)
762 {
763 if (sub->start <= num && num < sub->start + sub->len && sub->table[num - sub->start].type)
764 break;
765 }
766 /* If we found it, it's in the local section already. */
767 if (sub != NULL)
768 return;
769
770 /* Search for the section that contains this object */
771 for (i = doc->xref_index[num]; i < doc->num_xref_sections; i++)
772 {
773 xref = &doc->xref_sections[i];
774
775 if (num < 0 && num >= xref->num_objects)
776 break;
777 for (sub = xref->subsec; sub != NULL; sub = sub->next)
778 {
779 if (sub->start <= num && num < sub->start + sub->len && sub->table[num - sub->start].type)
780 break;
781 }
782 if (sub != NULL)
783 break;
784 }
785 /* sub == NULL implies we did not find it */
786 if (sub == NULL)
787 return; /* No object to find */
788
789 copy = pdf_deep_copy_obj(ctx, sub->table[num - sub->start].obj);
790
791 /* Copy the object to the local section */
792 i = doc->xref_index[num];
793 doc->xref_index[num] = 0;
794 old_entry = &sub->table[num - sub->start];
795 fz_try(ctx)
796 new_entry = pdf_get_local_xref_entry(ctx, doc, num);
797 fz_catch(ctx)
798 {
799 pdf_drop_obj(ctx, copy);
800 doc->xref_index[num] = i;
801 fz_rethrow(ctx);
802 }
803 *new_entry = *old_entry;
804 if (new_entry->type == 'o')
805 {
806 new_entry->type = 'n';
807 new_entry->gen = 0;
808 }
809 new_entry->stm_buf = NULL;
810 new_entry->obj = NULL;
811 /* old entry is incremental and may have changes.
812 * Better keep a copy. We must override the old entry with
813 * the copy because the caller may be holding a reference to
814 * the original and expect it to end up in the new entry */
815 new_entry->obj = old_entry->obj;
816 old_entry->obj = copy;
817 new_entry->stm_buf = NULL; /* FIXME */
818 }
819
820 void pdf_replace_xref(fz_context *ctx, pdf_document *doc, pdf_xref_entry *entries, int n)
821 {
822 int *xref_index = NULL;
823 pdf_xref *xref = NULL;
824 pdf_xref_subsec *sub;
825
826 fz_var(xref_index);
827 fz_var(xref);
828
829 fz_try(ctx)
830 {
831 xref_index = fz_calloc(ctx, n, sizeof(int));
832 xref = fz_malloc_struct(ctx, pdf_xref);
833 sub = fz_malloc_struct(ctx, pdf_xref_subsec);
834 }
835 fz_catch(ctx)
836 {
837 fz_free(ctx, xref);
838 fz_free(ctx, xref_index);
839 fz_rethrow(ctx);
840 }
841
842 sub->table = entries;
843 sub->start = 0;
844 sub->len = n;
845
846 xref->subsec = sub;
847 xref->num_objects = n;
848 xref->trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc));
849
850 /* The new table completely replaces the previous separate sections */
851 pdf_drop_xref_sections(ctx, doc);
852
853 doc->xref_sections = xref;
854 doc->num_xref_sections = 1;
855 doc->num_incremental_sections = 0;
856 doc->xref_base = 0;
857 doc->disallow_new_increments = 0;
858 doc->max_xref_len = n;
859
860 fz_free(ctx, doc->xref_index);
861 doc->xref_index = xref_index;
862 }
863
864 void pdf_forget_xref(fz_context *ctx, pdf_document *doc)
865 {
866 pdf_obj *trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc));
867
868 pdf_drop_local_xref_and_resources(ctx, doc);
869
870 if (doc->saved_xref_sections)
871 pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections);
872
873 doc->saved_xref_sections = doc->xref_sections;
874 doc->saved_num_xref_sections = doc->num_xref_sections;
875
876 doc->xref_sections = NULL;
877 doc->startxref = 0;
878 doc->num_xref_sections = 0;
879 doc->num_incremental_sections = 0;
880 doc->xref_base = 0;
881 doc->disallow_new_increments = 0;
882
883 fz_try(ctx)
884 {
885 pdf_get_populating_xref_entry(ctx, doc, 0);
886 }
887 fz_catch(ctx)
888 {
889 pdf_drop_obj(ctx, trailer);
890 fz_rethrow(ctx);
891 }
892
893 /* Set the trailer of the final xref section. */
894 doc->xref_sections[0].trailer = trailer;
895 }
896
897 /*
898 * magic version tag and startxref
899 */
900
901 int
902 pdf_version(fz_context *ctx, pdf_document *doc)
903 {
904 int version = doc->version;
905 fz_try(ctx)
906 {
907 pdf_obj *obj = pdf_dict_getl(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), PDF_NAME(Version), NULL);
908 const char *str = pdf_to_name(ctx, obj);
909 if (*str)
910 version = 10 * (fz_atof(str) + 0.05f);
911 }
912 fz_catch(ctx)
913 {
914 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
915 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
916 fz_report_error(ctx);
917 fz_warn(ctx, "Ignoring broken Root/Version number.");
918 }
919 return version;
920 }
921
922 static void
923 pdf_load_version(fz_context *ctx, pdf_document *doc)
924 {
925 char buf[1024];
926 char *s = NULL;
927 size_t i, n;
928
929 /* look for '%PDF' version marker within first kilobyte of file */
930 fz_seek(ctx, doc->file, 0, SEEK_SET);
931 n = fz_read(ctx, doc->file, (unsigned char*) buf, sizeof buf);
932 if (n < 5)
933 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find version marker");
934 buf[n-1] = 0;
935 for (i = 0; i < n - 5; i++)
936 {
937 if (memcmp(&buf[i], "%PDF-", 5) == 0 || memcmp(&buf[i], "%FDF-", 5) == 0)
938 {
939 s = buf + i;
940 break;
941 }
942 }
943 if (!s)
944 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find version marker");
945
946 if (s[1] == 'F')
947 doc->is_fdf = 1;
948
949 doc->version = 10 * (fz_atof(s+5) + 0.05f);
950 if ((doc->version < 10 || doc->version > 17) && doc->version != 20)
951 fz_warn(ctx, "unknown PDF version: %d.%d", doc->version / 10, doc->version % 10);
952
953 if (s != buf)
954 {
955 fz_warn(ctx, "garbage bytes before version marker");
956 doc->bias = s - buf;
957 }
958
959 fz_seek(ctx, doc->file, doc->bias, SEEK_SET);
960 }
961
962 static void
963 pdf_read_start_xref(fz_context *ctx, pdf_document *doc)
964 {
965 unsigned char buf[1024];
966 size_t i, n;
967 int64_t t;
968
969 fz_seek(ctx, doc->file, 0, SEEK_END);
970
971 doc->file_size = fz_tell(ctx, doc->file);
972
973 t = fz_maxi64(0, doc->file_size - (int64_t)sizeof buf);
974 fz_seek(ctx, doc->file, t, SEEK_SET);
975
976 n = fz_read(ctx, doc->file, buf, sizeof buf);
977 if (n < 9)
978 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find startxref");
979
980 i = n - 9;
981 do
982 {
983 if (memcmp(buf + i, "startxref", 9) == 0)
984 {
985 i += 9;
986 while (i < n && iswhite(buf[i]))
987 i ++;
988 doc->startxref = 0;
989 while (i < n && isdigit(buf[i]))
990 {
991 if (doc->startxref >= INT64_MAX/10)
992 fz_throw(ctx, FZ_ERROR_LIMIT, "startxref too large");
993 doc->startxref = doc->startxref * 10 + (buf[i++] - '0');
994 }
995 if (doc->startxref != 0)
996 return;
997 break;
998 }
999 } while (i-- > 0);
1000
1001 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find startxref");
1002 }
1003
1004 void fz_skip_space(fz_context *ctx, fz_stream *stm)
1005 {
1006 do
1007 {
1008 int c = fz_peek_byte(ctx, stm);
1009 if (c == EOF || c > 32)
1010 return;
1011 (void)fz_read_byte(ctx, stm);
1012 }
1013 while (1);
1014 }
1015
1016 int fz_skip_string(fz_context *ctx, fz_stream *stm, const char *str)
1017 {
1018 while (*str)
1019 {
1020 int c = fz_peek_byte(ctx, stm);
1021 if (c == EOF || c != *str++)
1022 return 1;
1023 (void)fz_read_byte(ctx, stm);
1024 }
1025 return 0;
1026 }
1027
1028 /*
1029 * trailer dictionary
1030 */
1031
1032 static int
1033 pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc)
1034 {
1035 int len;
1036 char *s;
1037 int64_t t;
1038 pdf_token tok;
1039 int c;
1040 int size = 0;
1041 int64_t ofs;
1042 pdf_obj *trailer = NULL;
1043 size_t n;
1044 pdf_lexbuf *buf = &doc->lexbuf.base;
1045 pdf_obj *obj = NULL;
1046
1047 fz_var(trailer);
1048
1049 /* Record the current file read offset so that we can reinstate it */
1050 ofs = fz_tell(ctx, doc->file);
1051
1052 fz_skip_space(ctx, doc->file);
1053 if (fz_skip_string(ctx, doc->file, "xref"))
1054 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find xref marker");
1055 fz_skip_space(ctx, doc->file);
1056
1057 while (1)
1058 {
1059 c = fz_peek_byte(ctx, doc->file);
1060 if (!isdigit(c))
1061 break;
1062
1063 fz_read_line(ctx, doc->file, buf->scratch, buf->size);
1064 s = buf->scratch;
1065 fz_strsep(&s, " "); /* ignore start */
1066 if (!s)
1067 fz_throw(ctx, FZ_ERROR_FORMAT, "xref subsection length missing");
1068 len = fz_atoi(fz_strsep(&s, " "));
1069 if (len < 0)
1070 fz_throw(ctx, FZ_ERROR_FORMAT, "xref subsection length must be positive");
1071
1072 /* broken pdfs where the section is not on a separate line */
1073 if (s && *s != '\0')
1074 fz_seek(ctx, doc->file, -(2 + (int)strlen(s)), SEEK_CUR);
1075
1076 t = fz_tell(ctx, doc->file);
1077 if (t < 0)
1078 fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
1079
1080 /* Spec says xref entries should be 20 bytes, but it's not infrequent
1081 * to see 19, in particular for some PCLm drivers. Cope. */
1082 if (len > 0)
1083 {
1084 n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, 20);
1085 if (n < 19)
1086 fz_throw(ctx, FZ_ERROR_FORMAT, "malformed xref table");
1087 if (n == 20 && buf->scratch[19] > 32)
1088 n = 19;
1089 }
1090 else
1091 n = 20;
1092
1093 if (len > (int64_t)((INT64_MAX - t) / n))
1094 fz_throw(ctx, FZ_ERROR_LIMIT, "xref has too many entries");
1095
1096 fz_seek(ctx, doc->file, t + n * (int64_t)len, SEEK_SET);
1097 }
1098
1099 fz_try(ctx)
1100 {
1101 tok = pdf_lex(ctx, doc->file, buf);
1102 if (tok != PDF_TOK_TRAILER)
1103 fz_throw(ctx, FZ_ERROR_FORMAT, "expected trailer marker");
1104
1105 tok = pdf_lex(ctx, doc->file, buf);
1106 if (tok != PDF_TOK_OPEN_DICT)
1107 fz_throw(ctx, FZ_ERROR_FORMAT, "expected trailer dictionary");
1108
1109 trailer = pdf_parse_dict(ctx, doc, doc->file, buf);
1110
1111 obj = pdf_dict_get(ctx, trailer, PDF_NAME(Size));
1112 if (pdf_is_indirect(ctx, obj))
1113 fz_throw(ctx, FZ_ERROR_FORMAT, "trailer Size entry is indirect");
1114
1115 size = pdf_dict_get_int(ctx, trailer, PDF_NAME(Size));
1116 if (size < 0 || size > PDF_MAX_OBJECT_NUMBER + 1)
1117 fz_throw(ctx, FZ_ERROR_FORMAT, "trailer Size entry out of range");
1118 }
1119 fz_always(ctx)
1120 {
1121 pdf_drop_obj(ctx, trailer);
1122 }
1123 fz_catch(ctx)
1124 {
1125 fz_rethrow(ctx);
1126 }
1127
1128 fz_seek(ctx, doc->file, ofs, SEEK_SET);
1129
1130 return size;
1131 }
1132
1133 static pdf_xref_entry *
1134 pdf_xref_find_subsection(fz_context *ctx, pdf_document *doc, int start, int len)
1135 {
1136 pdf_xref *xref = &doc->xref_sections[doc->num_xref_sections-1];
1137 pdf_xref_subsec *sub, *extend = NULL;
1138 int num_objects;
1139 int solidify = 0;
1140
1141 if (len == 0)
1142 return NULL;
1143
1144 /* Different cases here.
1145 * Case 1) We might be asking for a subsection (or a subset of a
1146 * subsection) that we already have - Just return it.
1147 * Case 2) We might be asking for a subsection that overlaps (or
1148 * extends) a subsection we already have - extend the existing one.
1149 * Case 3) We might be asking for a subsection that overlaps multiple
1150 * existing subsections - solidify the whole set.
1151 * Case 4) We might be asking for a completely new subsection - just
1152 * allocate it.
1153 */
1154
1155 /* Sanity check */
1156 for (sub = xref->subsec; sub != NULL; sub = sub->next)
1157 {
1158 if (start >= sub->start && start <= sub->start + sub->len)
1159 {
1160 /* 'start' is in (or immediately after) 'sub' */
1161 if (start + len <= sub->start + sub->len)
1162 {
1163 /* And so is start+len-1 - just return this! Case 1. */
1164 return &sub->table[start-sub->start];
1165 }
1166 /* So we overlap with sub. */
1167 if (extend == NULL)
1168 {
1169 /* Maybe we can extend sub? */
1170 extend = sub;
1171 }
1172 else
1173 {
1174 /* OK, so we've already found an overlapping one. We'll need to solidify. Case 3. */
1175 solidify = 1;
1176 break;
1177 }
1178 }
1179 else if (start + len > sub->start && start + len < sub->start + sub->len)
1180 {
1181 /* The end of the start+len range is in 'sub'. */
1182 /* For now, we won't support extending sub backwards. Just take this as
1183 * needing to solidify. Case 3. */
1184 solidify = 1;
1185 break;
1186 }
1187 else if (start < sub->start && start + len >= sub->start + sub->len)
1188 {
1189 /* The end of the start+len range is beyond 'sub'. */
1190 /* For now, we won't support extending sub backwards. Just take this as
1191 * needing to solidify. Another variant of case 3. */
1192 solidify = 1;
1193 break;
1194 }
1195 }
1196
1197 num_objects = xref->num_objects;
1198 if (num_objects < start + len)
1199 num_objects = start + len;
1200
1201 if (solidify)
1202 {
1203 /* Case 3: Solidify the xref */
1204 ensure_solid_xref(ctx, doc, num_objects, doc->num_xref_sections-1);
1205 xref = &doc->xref_sections[doc->num_xref_sections-1];
1206 sub = xref->subsec;
1207 }
1208 else if (extend)
1209 {
1210 /* Case 2: Extend the subsection */
1211 int newlen = start + len - extend->start;
1212 sub = extend;
1213 sub->table = fz_realloc_array(ctx, sub->table, newlen, pdf_xref_entry);
1214 memset(&sub->table[sub->len], 0, sizeof(pdf_xref_entry) * (newlen - sub->len));
1215 sub->len = newlen;
1216 if (xref->num_objects < sub->start + sub->len)
1217 xref->num_objects = sub->start + sub->len;
1218 if (doc->max_xref_len < sub->start + sub->len)
1219 extend_xref_index(ctx, doc, sub->start + sub->len);
1220 }
1221 else
1222 {
1223 /* Case 4 */
1224 sub = fz_malloc_struct(ctx, pdf_xref_subsec);
1225 fz_try(ctx)
1226 {
1227 sub->table = fz_malloc_struct_array(ctx, len, pdf_xref_entry);
1228 sub->start = start;
1229 sub->len = len;
1230 sub->next = xref->subsec;
1231 xref->subsec = sub;
1232 }
1233 fz_catch(ctx)
1234 {
1235 fz_free(ctx, sub);
1236 fz_rethrow(ctx);
1237 }
1238 if (xref->num_objects < num_objects)
1239 xref->num_objects = num_objects;
1240 if (doc->max_xref_len < num_objects)
1241 extend_xref_index(ctx, doc, num_objects);
1242 }
1243 return &sub->table[start-sub->start];
1244 }
1245
1246 static inline void
1247 validate_object_number_range(fz_context *ctx, int first, int len, const char *what)
1248 {
1249 if (first < 0 || first > PDF_MAX_OBJECT_NUMBER)
1250 fz_throw(ctx, FZ_ERROR_FORMAT, "first object number in %s out of range", what);
1251 if (len < 0 || len > PDF_MAX_OBJECT_NUMBER)
1252 fz_throw(ctx, FZ_ERROR_FORMAT, "number of objects in %s out of range", what);
1253 if (len > 0 && len - 1 > PDF_MAX_OBJECT_NUMBER - first)
1254 fz_throw(ctx, FZ_ERROR_FORMAT, "last object number in %s out of range", what);
1255 }
1256
1257 static pdf_obj *
1258 pdf_read_old_xref(fz_context *ctx, pdf_document *doc)
1259 {
1260 int start, len, c, i, xref_len, carried;
1261 fz_stream *file = doc->file;
1262 pdf_xref_entry *table;
1263 pdf_token tok;
1264 size_t n;
1265 char *s, *e;
1266 pdf_lexbuf *buf = &doc->lexbuf.base;
1267
1268 xref_len = pdf_xref_size_from_old_trailer(ctx, doc);
1269
1270 fz_skip_space(ctx, doc->file);
1271 if (fz_skip_string(ctx, doc->file, "xref"))
1272 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find xref marker");
1273 fz_skip_space(ctx, doc->file);
1274
1275 while (1)
1276 {
1277 c = fz_peek_byte(ctx, file);
1278 if (!isdigit(c))
1279 break;
1280
1281 fz_read_line(ctx, file, buf->scratch, buf->size);
1282 s = buf->scratch;
1283 start = fz_atoi(fz_strsep(&s, " "));
1284 len = fz_atoi(fz_strsep(&s, " "));
1285
1286 /* broken pdfs where the section is not on a separate line */
1287 if (s && *s != '\0')
1288 {
1289 fz_warn(ctx, "broken xref subsection. proceeding anyway.");
1290 fz_seek(ctx, file, -(2 + (int)strlen(s)), SEEK_CUR);
1291 }
1292
1293 validate_object_number_range(ctx, start, len, "xref subsection");
1294
1295 /* broken pdfs where size in trailer undershoots entries in xref sections */
1296 if (start + len > xref_len)
1297 {
1298 fz_warn(ctx, "broken xref subsection, proceeding anyway.");
1299 }
1300
1301 table = pdf_xref_find_subsection(ctx, doc, start, len);
1302
1303 /* Xref entries SHOULD be 20 bytes long, but we see 19 byte
1304 * ones more frequently than we'd like (e.g. PCLm drivers).
1305 * Cope with this by 'carrying' data forward. */
1306 carried = 0;
1307 for (i = 0; i < len; i++)
1308 {
1309 pdf_xref_entry *entry = &table[i];
1310 n = fz_read(ctx, file, (unsigned char *) buf->scratch + carried, 20-carried);
1311 if (n != (size_t)(20-carried))
1312 fz_throw(ctx, FZ_ERROR_FORMAT, "unexpected EOF in xref table");
1313 n += carried;
1314 buf->scratch[n] = '\0';
1315 if (!entry->type)
1316 {
1317 s = buf->scratch;
1318 e = s + n;
1319
1320 entry->num = start + i;
1321
1322 /* broken pdfs where line start with white space */
1323 while (s < e && iswhite(*s))
1324 s++;
1325
1326 if (s == e || !isdigit(*s))
1327 fz_throw(ctx, FZ_ERROR_FORMAT, "xref offset missing");
1328 while (s < e && isdigit(*s))
1329 entry->ofs = entry->ofs * 10 + *s++ - '0';
1330
1331 while (s < e && iswhite(*s))
1332 s++;
1333 if (s == e || !isdigit(*s))
1334 fz_throw(ctx, FZ_ERROR_FORMAT, "xref generation number missing");
1335 while (s < e && isdigit(*s))
1336 entry->gen = entry->gen * 10 + *s++ - '0';
1337
1338 while (s < e && iswhite(*s))
1339 s++;
1340 if (s == e || (*s != 'f' && *s != 'n' && *s != 'o'))
1341 fz_throw(ctx, FZ_ERROR_FORMAT, "unexpected xref type: 0x%x (%d %d R)", s == e ? 0 : *s, entry->num, entry->gen);
1342 entry->type = *s++;
1343
1344 /* If the last byte of our buffer isn't an EOL (or space), carry one byte forward */
1345 carried = buf->scratch[19] > 32;
1346 if (carried)
1347 buf->scratch[0] = buf->scratch[19];
1348 }
1349 }
1350 if (carried)
1351 fz_unread_byte(ctx, file);
1352 }
1353
1354 tok = pdf_lex(ctx, file, buf);
1355 if (tok != PDF_TOK_TRAILER)
1356 fz_throw(ctx, FZ_ERROR_FORMAT, "expected trailer marker");
1357
1358 tok = pdf_lex(ctx, file, buf);
1359 if (tok != PDF_TOK_OPEN_DICT)
1360 fz_throw(ctx, FZ_ERROR_FORMAT, "expected trailer dictionary");
1361
1362 doc->last_xref_was_old_style = 1;
1363
1364 return pdf_parse_dict(ctx, doc, file, buf);
1365 }
1366
1367 static void
1368 pdf_read_new_xref_section(fz_context *ctx, pdf_document *doc, fz_stream *stm, int i0, int i1, int w0, int w1, int w2)
1369 {
1370 pdf_xref_entry *table;
1371 int i, n;
1372
1373 validate_object_number_range(ctx, i0, i1, "xref subsection");
1374
1375 table = pdf_xref_find_subsection(ctx, doc, i0, i1);
1376 for (i = i0; i < i0 + i1; i++)
1377 {
1378 pdf_xref_entry *entry = &table[i-i0];
1379 int a = 0;
1380 int64_t b = 0;
1381 int c = 0;
1382
1383 if (fz_is_eof(ctx, stm))
1384 fz_throw(ctx, FZ_ERROR_FORMAT, "truncated xref stream");
1385
1386 for (n = 0; n < w0; n++)
1387 a = (a << 8) + fz_read_byte(ctx, stm);
1388 for (n = 0; n < w1; n++)
1389 b = (b << 8) + fz_read_byte(ctx, stm);
1390 for (n = 0; n < w2; n++)
1391 c = (c << 8) + fz_read_byte(ctx, stm);
1392
1393 if (!entry->type)
1394 {
1395 int t = w0 ? a : 1;
1396 entry->type = t == 0 ? 'f' : t == 1 ? 'n' : t == 2 ? 'o' : 0;
1397 entry->ofs = w1 ? b : 0;
1398 entry->gen = w2 ? c : 0;
1399 entry->num = i;
1400 }
1401 }
1402
1403 doc->last_xref_was_old_style = 0;
1404 }
1405
1406 /* Entered with file locked, remains locked throughout. */
1407 static pdf_obj *
1408 pdf_read_new_xref(fz_context *ctx, pdf_document *doc)
1409 {
1410 fz_stream *stm = NULL;
1411 pdf_obj *trailer = NULL;
1412 pdf_obj *index = NULL;
1413 pdf_obj *obj = NULL;
1414 int gen, num = 0;
1415 int64_t ofs, stm_ofs;
1416 int size, w0, w1, w2;
1417 int t;
1418
1419 fz_var(trailer);
1420 fz_var(stm);
1421
1422 fz_try(ctx)
1423 {
1424 ofs = fz_tell(ctx, doc->file);
1425 trailer = pdf_parse_ind_obj(ctx, doc, doc->file, &num, &gen, &stm_ofs, NULL);
1426 if (num == 0)
1427 fz_throw(ctx, FZ_ERROR_FORMAT, "Trailer object number cannot be 0\n");
1428 }
1429 fz_catch(ctx)
1430 {
1431 pdf_drop_obj(ctx, trailer);
1432 fz_rethrow(ctx);
1433 }
1434
1435 fz_try(ctx)
1436 {
1437 pdf_xref_entry *entry;
1438
1439 obj = pdf_dict_get(ctx, trailer, PDF_NAME(Size));
1440 if (!obj)
1441 fz_throw(ctx, FZ_ERROR_FORMAT, "xref stream missing Size entry (%d 0 R)", num);
1442
1443 size = pdf_to_int(ctx, obj);
1444
1445 /* Bug708176: If the PDF file producer has declared Size without
1446 * including this object, then increment it. */
1447 if (size == num)
1448 pdf_dict_put_int(ctx, trailer, PDF_NAME(Size), size+1);
1449
1450 obj = pdf_dict_get(ctx, trailer, PDF_NAME(W));
1451 if (!obj)
1452 fz_throw(ctx, FZ_ERROR_FORMAT, "xref stream missing W entry (%d R)", num);
1453
1454 if (pdf_is_indirect(ctx, pdf_array_get(ctx, obj, 0)))
1455 fz_throw(ctx, FZ_ERROR_FORMAT, "xref stream object type field width an indirect object");
1456 if (pdf_is_indirect(ctx, pdf_array_get(ctx, obj, 1)))
1457 fz_throw(ctx, FZ_ERROR_FORMAT, "xref stream object field 2 width an indirect object");
1458 if (pdf_is_indirect(ctx, pdf_array_get(ctx, obj, 2)))
1459 fz_throw(ctx, FZ_ERROR_FORMAT, "xref stream object field 3 width an indirect object");
1460
1461 if (doc->file_reading_linearly && pdf_dict_get(ctx, trailer, PDF_NAME(Encrypt)))
1462 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Cannot read linearly with encryption");
1463
1464 w0 = pdf_array_get_int(ctx, obj, 0);
1465 w1 = pdf_array_get_int(ctx, obj, 1);
1466 w2 = pdf_array_get_int(ctx, obj, 2);
1467
1468 if (w0 < 0)
1469 fz_warn(ctx, "xref stream objects have corrupt type");
1470 if (w1 < 0)
1471 fz_warn(ctx, "xref stream objects have corrupt offset");
1472 if (w2 < 0)
1473 fz_warn(ctx, "xref stream objects have corrupt generation");
1474
1475 w0 = w0 < 0 ? 0 : w0;
1476 w1 = w1 < 0 ? 0 : w1;
1477 w2 = w2 < 0 ? 0 : w2;
1478
1479 index = pdf_dict_get(ctx, trailer, PDF_NAME(Index));
1480
1481 stm = pdf_open_stream_with_offset(ctx, doc, num, trailer, stm_ofs);
1482
1483 if (!index)
1484 {
1485 pdf_read_new_xref_section(ctx, doc, stm, 0, size, w0, w1, w2);
1486 }
1487 else
1488 {
1489 int n = pdf_array_len(ctx, index);
1490 for (t = 0; t < n; t += 2)
1491 {
1492 int i0 = pdf_array_get_int(ctx, index, t + 0);
1493 int i1 = pdf_array_get_int(ctx, index, t + 1);
1494 pdf_read_new_xref_section(ctx, doc, stm, i0, i1, w0, w1, w2);
1495 }
1496 }
1497 entry = pdf_get_populating_xref_entry(ctx, doc, num);
1498 entry->ofs = ofs;
1499 entry->gen = gen;
1500 entry->num = num;
1501 entry->stm_ofs = stm_ofs;
1502 pdf_drop_obj(ctx, entry->obj);
1503 entry->obj = pdf_keep_obj(ctx, trailer);
1504 entry->type = 'n';
1505 pdf_set_obj_parent(ctx, trailer, num);
1506 }
1507 fz_always(ctx)
1508 {
1509 fz_drop_stream(ctx, stm);
1510 }
1511 fz_catch(ctx)
1512 {
1513 pdf_drop_obj(ctx, trailer);
1514 fz_rethrow(ctx);
1515 }
1516
1517 return trailer;
1518 }
1519
1520 static pdf_obj *
1521 pdf_read_xref(fz_context *ctx, pdf_document *doc, int64_t ofs)
1522 {
1523 pdf_obj *trailer;
1524 int c;
1525
1526 fz_seek(ctx, doc->file, doc->bias + ofs, SEEK_SET);
1527
1528 while (iswhite(fz_peek_byte(ctx, doc->file)))
1529 fz_read_byte(ctx, doc->file);
1530
1531 c = fz_peek_byte(ctx, doc->file);
1532 if (c == 'x')
1533 trailer = pdf_read_old_xref(ctx, doc);
1534 else if (isdigit(c))
1535 trailer = pdf_read_new_xref(ctx, doc);
1536 else
1537 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot recognize xref format");
1538
1539 return trailer;
1540 }
1541
1542 static int64_t
1543 read_xref_section(fz_context *ctx, pdf_document *doc, int64_t ofs)
1544 {
1545 pdf_obj *trailer = NULL;
1546 pdf_obj *prevobj;
1547 int64_t xrefstmofs = 0;
1548 int64_t prevofs = 0;
1549
1550 trailer = pdf_read_xref(ctx, doc, ofs);
1551 fz_try(ctx)
1552 {
1553 pdf_set_populating_xref_trailer(ctx, doc, trailer);
1554
1555 /* FIXME: do we overwrite free entries properly? */
1556 /* FIXME: Does this work properly with progression? */
1557 xrefstmofs = pdf_to_int64(ctx, pdf_dict_get(ctx, trailer, PDF_NAME(XRefStm)));
1558 if (xrefstmofs)
1559 {
1560 if (xrefstmofs < 0)
1561 fz_throw(ctx, FZ_ERROR_FORMAT, "negative xref stream offset");
1562
1563 /*
1564 Read the XRefStm stream, but throw away the resulting trailer. We do not
1565 follow any Prev tag therein, as specified on Page 108 of the PDF reference
1566 1.7
1567 */
1568 pdf_drop_obj(ctx, pdf_read_xref(ctx, doc, xrefstmofs));
1569 }
1570
1571 prevobj = pdf_dict_get(ctx, trailer, PDF_NAME(Prev));
1572 if (pdf_is_int(ctx, prevobj))
1573 {
1574 prevofs = pdf_to_int64(ctx, prevobj);
1575 if (prevofs <= 0)
1576 fz_throw(ctx, FZ_ERROR_FORMAT, "invalid offset for previous xref section");
1577 }
1578 }
1579 fz_always(ctx)
1580 pdf_drop_obj(ctx, trailer);
1581 fz_catch(ctx)
1582 fz_rethrow(ctx);
1583
1584 return prevofs;
1585 }
1586
1587 static void
1588 pdf_read_xref_sections(fz_context *ctx, pdf_document *doc, int64_t ofs, int read_previous)
1589 {
1590 int i, len, cap;
1591 int64_t *offsets;
1592 int populated = 0;
1593 int size, xref_len;
1594
1595 len = 0;
1596 cap = 10;
1597 offsets = fz_malloc_array(ctx, cap, int64_t);
1598
1599 fz_var(populated);
1600 fz_var(offsets);
1601
1602 fz_try(ctx)
1603 {
1604 while(ofs)
1605 {
1606 for (i = 0; i < len; i ++)
1607 {
1608 if (offsets[i] == ofs)
1609 break;
1610 }
1611 if (i < len)
1612 {
1613 fz_warn(ctx, "ignoring xref section recursion at offset %d", (int)ofs);
1614 break;
1615 }
1616 if (len == cap)
1617 {
1618 cap *= 2;
1619 offsets = fz_realloc_array(ctx, offsets, cap, int64_t);
1620 }
1621 offsets[len++] = ofs;
1622
1623 pdf_populate_next_xref_level(ctx, doc);
1624 populated = 1;
1625 ofs = read_xref_section(ctx, doc, ofs);
1626 if (!read_previous)
1627 break;
1628 }
1629
1630 /* For pathological files, such as chinese-example.pdf, where the original
1631 * xref in the file is highly fragmented, we can safely solidify it here
1632 * with no ill effects. */
1633 ensure_solid_xref(ctx, doc, 0, doc->num_xref_sections-1);
1634
1635 size = pdf_dict_get_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size));
1636 xref_len = pdf_xref_len(ctx, doc);
1637 if (xref_len > size)
1638 {
1639 if (xref_len == size+1)
1640 {
1641 /* Bug 708456 && Bug 708176. Allow for (sadly, quite common
1642 * PDF generators that can't get size right). */
1643 fz_warn(ctx, "Trailer Size is off-by-one. Ignoring.");
1644 pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), size+1);
1645 }
1646 else
1647 fz_throw(ctx, FZ_ERROR_FORMAT, "incorrect number of xref entries in trailer, repairing");
1648 }
1649 }
1650 fz_always(ctx)
1651 {
1652 fz_free(ctx, offsets);
1653 }
1654 fz_catch(ctx)
1655 {
1656 /* Undo pdf_populate_next_xref_level if we've done that already. */
1657 if (populated)
1658 {
1659 pdf_drop_xref_subsec(ctx, &doc->xref_sections[doc->num_xref_sections - 1]);
1660 doc->num_xref_sections--;
1661 }
1662 fz_rethrow(ctx);
1663 }
1664 }
1665
1666 void
1667 pdf_prime_xref_index(fz_context *ctx, pdf_document *doc)
1668 {
1669 int i, j;
1670 int *idx = doc->xref_index;
1671
1672 for (i = doc->num_xref_sections-1; i >= 0; i--)
1673 {
1674 pdf_xref *xref = &doc->xref_sections[i];
1675 pdf_xref_subsec *subsec = xref->subsec;
1676 while (subsec != NULL)
1677 {
1678 int start = subsec->start;
1679 int end = subsec->start + subsec->len;
1680 for (j = start; j < end; j++)
1681 {
1682 char t = subsec->table[j-start].type;
1683 if (t != 0 && t != 'f')
1684 idx[j] = i;
1685 }
1686
1687 subsec = subsec->next;
1688 }
1689 }
1690 }
1691
1692 static void
1693 check_xref_entry_offsets(fz_context *ctx, pdf_xref_entry *entry, int i, pdf_document *doc, void *arg)
1694 {
1695 int xref_len = (int)(intptr_t)arg;
1696
1697 if (entry->type == 'n')
1698 {
1699 /* Special case code: "0000000000 * n" means free,
1700 * according to some producers (inc Quartz) */
1701 if (entry->ofs == 0)
1702 entry->type = 'f';
1703 else if (entry->ofs <= 0 || entry->ofs >= doc->file_size)
1704 fz_throw(ctx, FZ_ERROR_FORMAT, "object offset out of range: %d (%d 0 R)", (int)entry->ofs, i);
1705 }
1706 else if (entry->type == 'o')
1707 {
1708 /* Read this into a local variable here, because pdf_get_xref_entry
1709 * may solidify the xref, hence invalidating "entry", meaning we
1710 * need a stashed value for the throw. */
1711 int64_t ofs = entry->ofs;
1712 if (ofs <= 0 || ofs >= xref_len || pdf_get_xref_entry_no_null(ctx, doc, ofs)->type != 'n')
1713 fz_throw(ctx, FZ_ERROR_FORMAT, "invalid reference to an objstm that does not exist: %d (%d 0 R)", (int)ofs, i);
1714 }
1715 }
1716
1717 /*
1718 * load xref tables from pdf
1719 *
1720 * File locked on entry, throughout and on exit.
1721 */
1722
1723 static void
1724 pdf_load_xref(fz_context *ctx, pdf_document *doc)
1725 {
1726 int xref_len;
1727 pdf_xref_entry *entry;
1728
1729 pdf_read_start_xref(ctx, doc);
1730
1731 pdf_read_xref_sections(ctx, doc, doc->startxref, 1);
1732
1733 if (pdf_xref_len(ctx, doc) == 0)
1734 fz_throw(ctx, FZ_ERROR_FORMAT, "found xref was empty");
1735
1736 pdf_prime_xref_index(ctx, doc);
1737
1738 entry = pdf_get_xref_entry_no_null(ctx, doc, 0);
1739 /* broken pdfs where first object is missing */
1740 if (!entry->type)
1741 {
1742 entry->type = 'f';
1743 entry->gen = 65535;
1744 entry->num = 0;
1745 }
1746 /* broken pdfs where first object is not free */
1747 else if (entry->type != 'f')
1748 fz_warn(ctx, "first object in xref is not free");
1749
1750 /* broken pdfs where object offsets are out of range */
1751 xref_len = pdf_xref_len(ctx, doc);
1752 pdf_xref_entry_map(ctx, doc, check_xref_entry_offsets, (void *)(intptr_t)xref_len);
1753 }
1754
1755 static void
1756 pdf_check_linear(fz_context *ctx, pdf_document *doc)
1757 {
1758 pdf_obj *dict = NULL;
1759 pdf_obj *o;
1760 int num, gen;
1761 int64_t stmofs;
1762
1763 fz_var(dict);
1764
1765 fz_try(ctx)
1766 {
1767 dict = pdf_parse_ind_obj(ctx, doc, doc->file, &num, &gen, &stmofs, NULL);
1768 if (!pdf_is_dict(ctx, dict))
1769 break;
1770 o = pdf_dict_get(ctx, dict, PDF_NAME(Linearized));
1771 if (o == NULL)
1772 break;
1773 if (pdf_to_int(ctx, o) != 1)
1774 break;
1775 doc->has_linearization_object = 1;
1776 }
1777 fz_always(ctx)
1778 pdf_drop_obj(ctx, dict);
1779 fz_catch(ctx)
1780 {
1781 /* Silently swallow this error. */
1782 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1783 fz_report_error(ctx);
1784 }
1785 }
1786
1787 static void
1788 pdf_load_linear(fz_context *ctx, pdf_document *doc)
1789 {
1790 pdf_obj *dict = NULL;
1791 pdf_obj *hint = NULL;
1792 pdf_obj *o;
1793 int num, gen, lin, len;
1794 int64_t stmofs;
1795
1796 fz_var(dict);
1797 fz_var(hint);
1798
1799 fz_try(ctx)
1800 {
1801 pdf_xref_entry *entry;
1802
1803 dict = pdf_parse_ind_obj(ctx, doc, doc->file, &num, &gen, &stmofs, NULL);
1804 if (!pdf_is_dict(ctx, dict))
1805 fz_throw(ctx, FZ_ERROR_FORMAT, "Failed to read linearized dictionary");
1806 o = pdf_dict_get(ctx, dict, PDF_NAME(Linearized));
1807 if (o == NULL)
1808 fz_throw(ctx, FZ_ERROR_FORMAT, "Failed to read linearized dictionary");
1809 lin = pdf_to_int(ctx, o);
1810 if (lin != 1)
1811 fz_throw(ctx, FZ_ERROR_FORMAT, "Unexpected version of Linearized tag (%d)", lin);
1812 doc->has_linearization_object = 1;
1813 len = pdf_dict_get_int(ctx, dict, PDF_NAME(L));
1814 if (len != doc->file_length)
1815 fz_throw(ctx, FZ_ERROR_ARGUMENT, "File has been updated since linearization");
1816
1817 pdf_read_xref_sections(ctx, doc, fz_tell(ctx, doc->file), 0);
1818
1819 doc->linear_page_count = pdf_dict_get_int(ctx, dict, PDF_NAME(N));
1820 doc->linear_page_refs = fz_realloc_array(ctx, doc->linear_page_refs, doc->linear_page_count, pdf_obj *);
1821 memset(doc->linear_page_refs, 0, doc->linear_page_count * sizeof(pdf_obj*));
1822 doc->linear_obj = dict;
1823 doc->linear_pos = fz_tell(ctx, doc->file);
1824 doc->linear_page1_obj_num = pdf_dict_get_int(ctx, dict, PDF_NAME(O));
1825 doc->linear_page_refs[0] = pdf_new_indirect(ctx, doc, doc->linear_page1_obj_num, 0);
1826 doc->linear_page_num = 0;
1827 hint = pdf_dict_get(ctx, dict, PDF_NAME(H));
1828 doc->hint_object_offset = pdf_array_get_int(ctx, hint, 0);
1829 doc->hint_object_length = pdf_array_get_int(ctx, hint, 1);
1830
1831 entry = pdf_get_populating_xref_entry(ctx, doc, 0);
1832 entry->type = 'f';
1833 }
1834 fz_catch(ctx)
1835 {
1836 pdf_drop_obj(ctx, dict);
1837 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1838 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1839 fz_report_error(ctx);
1840 /* Drop back to non linearized reading mode */
1841 doc->file_reading_linearly = 0;
1842 }
1843 }
1844
1845 static void
1846 id_and_password(fz_context *ctx, pdf_document *doc)
1847 {
1848 pdf_obj *encrypt, *id;
1849
1850 pdf_prime_xref_index(ctx, doc);
1851
1852 encrypt = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt));
1853 id = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID));
1854
1855 if (pdf_is_dict(ctx, encrypt))
1856 doc->crypt = pdf_new_crypt(ctx, encrypt, id);
1857
1858 /* Allow lazy clients to read encrypted files with a blank password */
1859 (void)pdf_authenticate_password(ctx, doc, "");
1860 }
1861
1862 /*
1863 * Initialize and load xref tables.
1864 * If password is not null, try to decrypt.
1865 */
1866 static void
1867 pdf_init_document(fz_context *ctx, pdf_document *doc)
1868 {
1869 int repaired = 0;
1870
1871 fz_try(ctx)
1872 {
1873 /* Check to see if we should work in progressive mode */
1874 if (doc->file->progressive)
1875 {
1876 doc->file_reading_linearly = 1;
1877 fz_seek(ctx, doc->file, 0, SEEK_END);
1878 doc->file_length = fz_tell(ctx, doc->file);
1879 if (doc->file_length < 0)
1880 doc->file_length = 0;
1881 fz_seek(ctx, doc->file, 0, SEEK_SET);
1882 }
1883
1884 pdf_load_version(ctx, doc);
1885
1886 if (doc->is_fdf)
1887 {
1888 doc->file_reading_linearly = 0;
1889 repaired = 1;
1890 break; /* skip to end of try/catch */
1891 }
1892
1893 /* Try to load the linearized file if we are in progressive
1894 * mode. */
1895 if (doc->file_reading_linearly)
1896 pdf_load_linear(ctx, doc);
1897 else
1898 /* Even if we're not in progressive mode, check to see
1899 * if the file claims to be linearized. This is important
1900 * for checking signatures later on. */
1901 pdf_check_linear(ctx, doc);
1902
1903 /* If we aren't in progressive mode (or the linear load failed
1904 * and has set us back to non-progressive mode), load normally.
1905 */
1906 if (!doc->file_reading_linearly)
1907 pdf_load_xref(ctx, doc);
1908 }
1909 fz_catch(ctx)
1910 {
1911 pdf_drop_xref_sections(ctx, doc);
1912 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1913 doc->file_reading_linearly = 0;
1914 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1915 fz_report_error(ctx);
1916 fz_warn(ctx, "trying to repair broken xref");
1917 repaired = 1;
1918 }
1919
1920 if (repaired)
1921 {
1922 /* pdf_repair_xref may access xref_index, so reset it properly */
1923 if (doc->xref_index)
1924 memset(doc->xref_index, 0, sizeof(int) * doc->max_xref_len);
1925 pdf_repair_xref_aux(ctx, doc, id_and_password);
1926 }
1927 else
1928 id_and_password(ctx, doc);
1929 }
1930
1931 void
1932 pdf_invalidate_xfa(fz_context *ctx, pdf_document *doc)
1933 {
1934 if (doc == NULL)
1935 return;
1936 fz_drop_xml(ctx, doc->xfa);
1937 doc->xfa = NULL;
1938 }
1939
1940 static void
1941 pdf_drop_document_imp(fz_context *ctx, fz_document *doc_)
1942 {
1943 pdf_document *doc = (pdf_document*)doc_;
1944 int i;
1945
1946 fz_defer_reap_start(ctx);
1947
1948 /* Type3 glyphs in the glyph cache can contain pdf_obj pointers
1949 * that we are about to destroy. Simplest solution is to bin the
1950 * glyph cache at this point. */
1951 fz_try(ctx)
1952 fz_purge_glyph_cache(ctx);
1953 fz_catch(ctx)
1954 {
1955 /* Swallow error, but continue dropping */
1956 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1957 fz_report_error(ctx);
1958 }
1959
1960 pdf_set_doc_event_callback(ctx, doc, NULL, NULL, NULL);
1961 pdf_drop_js(ctx, doc->js);
1962
1963 pdf_drop_journal(ctx, doc->journal);
1964
1965 pdf_drop_resource_tables(ctx, doc);
1966
1967 pdf_drop_local_xref(ctx, doc->local_xref);
1968
1969 pdf_drop_xref_sections(ctx, doc);
1970 fz_free(ctx, doc->xref_index);
1971
1972 fz_drop_stream(ctx, doc->file);
1973 pdf_drop_crypt(ctx, doc->crypt);
1974
1975 pdf_drop_obj(ctx, doc->linear_obj);
1976 if (doc->linear_page_refs)
1977 {
1978 for (i=0; i < doc->linear_page_count; i++)
1979 pdf_drop_obj(ctx, doc->linear_page_refs[i]);
1980
1981 fz_free(ctx, doc->linear_page_refs);
1982 }
1983
1984 fz_free(ctx, doc->hint_page);
1985 fz_free(ctx, doc->hint_shared_ref);
1986 fz_free(ctx, doc->hint_shared);
1987 fz_free(ctx, doc->hint_obj_offsets);
1988
1989 for (i=0; i < doc->num_type3_fonts; i++)
1990 {
1991 fz_try(ctx)
1992 fz_decouple_type3_font(ctx, doc->type3_fonts[i], (void *)doc);
1993 fz_always(ctx)
1994 fz_drop_font(ctx, doc->type3_fonts[i]);
1995 fz_catch(ctx)
1996 {
1997 /* Swallow error, but continue dropping */
1998 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1999 fz_report_error(ctx);
2000 }
2001 }
2002
2003 fz_free(ctx, doc->type3_fonts);
2004
2005 pdf_drop_ocg(ctx, doc);
2006
2007 pdf_empty_store(ctx, doc);
2008
2009 pdf_lexbuf_fin(ctx, &doc->lexbuf.base);
2010
2011 fz_drop_colorspace(ctx, doc->oi);
2012
2013 for (i = 0; i < doc->orphans_count; i++)
2014 pdf_drop_obj(ctx, doc->orphans[i]);
2015
2016 fz_free(ctx, doc->orphans);
2017
2018 pdf_drop_page_tree_internal(ctx, doc);
2019
2020 fz_defer_reap_end(ctx);
2021
2022 pdf_invalidate_xfa(ctx, doc);
2023 }
2024
2025 void
2026 pdf_drop_document(fz_context *ctx, pdf_document *doc)
2027 {
2028 fz_drop_document(ctx, &doc->super);
2029 }
2030
2031 pdf_document *
2032 pdf_keep_document(fz_context *ctx, pdf_document *doc)
2033 {
2034 return (pdf_document *)fz_keep_document(ctx, &doc->super);
2035 }
2036
2037 /*
2038 * compressed object streams
2039 */
2040
2041 /*
2042 Do not hold pdf_xref_entry's over call to this function as they
2043 may be invalidated!
2044 */
2045 static pdf_xref_entry *
2046 pdf_load_obj_stm(fz_context *ctx, pdf_document *doc, int num, pdf_lexbuf *buf, int target)
2047 {
2048 fz_stream *stm = NULL;
2049 pdf_obj *objstm = NULL;
2050 int *numbuf = NULL;
2051 int64_t *ofsbuf = NULL;
2052
2053 pdf_obj *obj;
2054 int64_t first;
2055 int count;
2056 int i;
2057 pdf_token tok;
2058 pdf_xref_entry *ret_entry = NULL;
2059 int ret_idx;
2060 int xref_len;
2061 int found;
2062 fz_stream *sub = NULL;
2063
2064 fz_var(numbuf);
2065 fz_var(ofsbuf);
2066 fz_var(objstm);
2067 fz_var(stm);
2068 fz_var(sub);
2069
2070 fz_try(ctx)
2071 {
2072 objstm = pdf_load_object(ctx, doc, num);
2073
2074 if (pdf_obj_marked(ctx, objstm))
2075 fz_throw(ctx, FZ_ERROR_FORMAT, "recursive object stream lookup");
2076 }
2077 fz_catch(ctx)
2078 {
2079 pdf_drop_obj(ctx, objstm);
2080 fz_rethrow(ctx);
2081 }
2082
2083 fz_try(ctx)
2084 {
2085 (void)pdf_mark_obj(ctx, objstm);
2086
2087 count = pdf_dict_get_int(ctx, objstm, PDF_NAME(N));
2088 first = pdf_dict_get_int(ctx, objstm, PDF_NAME(First));
2089
2090 if (count < 0 || count > PDF_MAX_OBJECT_NUMBER)
2091 fz_throw(ctx, FZ_ERROR_FORMAT, "number of objects in object stream out of range");
2092
2093 numbuf = fz_calloc(ctx, count, sizeof(*numbuf));
2094 ofsbuf = fz_calloc(ctx, count, sizeof(*ofsbuf));
2095
2096 xref_len = pdf_xref_len(ctx, doc);
2097
2098 found = 0;
2099
2100 stm = pdf_open_stream_number(ctx, doc, num);
2101 for (i = 0; i < count; i++)
2102 {
2103 tok = pdf_lex(ctx, stm, buf);
2104 if (tok != PDF_TOK_INT)
2105 fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", num);
2106 numbuf[found] = buf->i;
2107
2108 tok = pdf_lex(ctx, stm, buf);
2109 if (tok != PDF_TOK_INT)
2110 fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", num);
2111 ofsbuf[found] = buf->i;
2112
2113 if (numbuf[found] <= 0 || numbuf[found] >= xref_len)
2114 fz_warn(ctx, "object stream object out of range, skipping");
2115 else
2116 found++;
2117 }
2118
2119 ret_idx = -1;
2120 for (i = 0; i < found; i++)
2121 {
2122 pdf_xref_entry *entry;
2123 uint64_t length;
2124 int64_t offset;
2125
2126 offset = first + ofsbuf[i];
2127 if (i+1 < found)
2128 length = ofsbuf[i+1] - ofsbuf[i];
2129 else
2130 length = UINT64_MAX;
2131
2132 sub = fz_open_null_filter(ctx, stm, length, offset);
2133
2134 obj = pdf_parse_stm_obj(ctx, doc, sub, buf);
2135 fz_drop_stream(ctx, sub);
2136 sub = NULL;
2137
2138 entry = pdf_get_xref_entry_no_null(ctx, doc, numbuf[i]);
2139
2140 pdf_set_obj_parent(ctx, obj, numbuf[i]);
2141
2142 /* We may have set entry->type to be 'O' from being 'o' to avoid nasty
2143 * recursions in pdf_cache_object. Accept the type being 'O' here. */
2144 if ((entry->type == 'o' || entry->type == 'O') && entry->ofs == num)
2145 {
2146 /* If we already have an entry for this object,
2147 * we'd like to drop it and use the new one -
2148 * but this means that anyone currently holding
2149 * a pointer to the old one will be left with a
2150 * stale pointer. Instead, we drop the new one
2151 * and trust that the old one is correct. */
2152 if (entry->obj)
2153 {
2154 if (pdf_objcmp(ctx, entry->obj, obj))
2155 fz_warn(ctx, "Encountered new definition for object %d - keeping the original one", numbuf[i]);
2156 pdf_drop_obj(ctx, obj);
2157 }
2158 else
2159 {
2160 entry->obj = obj;
2161 /* If we've just read a 'null' object, don't leave this as a NULL 'o' object,
2162 * as that will a) confuse the code that called us into thinking that nothing
2163 * was loaded, and b) cause the entire objstm to be reloaded every time that
2164 * object is accessed. Instead, just mark it as an 'f'. */
2165 if (obj == NULL)
2166 entry->type = 'f';
2167 fz_drop_buffer(ctx, entry->stm_buf);
2168 entry->stm_buf = NULL;
2169 }
2170 if (numbuf[i] == target)
2171 ret_idx = i;
2172 }
2173 else
2174 {
2175 pdf_drop_obj(ctx, obj);
2176 }
2177 }
2178 /* Parsing our way through the stream can cause the xref to be
2179 * solidified, which will move an entry. We therefore can't
2180 * read the entry for returning until no more parsing is to be
2181 * done. Thus we end up reading this entry twice. */
2182 if (ret_idx >= 0)
2183 ret_entry = pdf_get_xref_entry_no_null(ctx, doc, numbuf[ret_idx]);
2184 }
2185 fz_always(ctx)
2186 {
2187 fz_drop_stream(ctx, stm);
2188 fz_drop_stream(ctx, sub);
2189 fz_free(ctx, ofsbuf);
2190 fz_free(ctx, numbuf);
2191 pdf_unmark_obj(ctx, objstm);
2192 pdf_drop_obj(ctx, objstm);
2193 }
2194 fz_catch(ctx)
2195 {
2196 fz_rethrow(ctx);
2197 }
2198 return ret_entry;
2199 }
2200
2201 /*
2202 * object loading
2203 */
2204 static int
2205 pdf_obj_read(fz_context *ctx, pdf_document *doc, int64_t *offset, int *nump, pdf_obj **page)
2206 {
2207 pdf_lexbuf *buf = &doc->lexbuf.base;
2208 int num, gen, tok;
2209 int64_t numofs, genofs, stmofs, tmpofs, newtmpofs;
2210 int xref_len;
2211 pdf_xref_entry *entry;
2212
2213 numofs = *offset;
2214 fz_seek(ctx, doc->file, doc->bias + numofs, SEEK_SET);
2215
2216 /* We expect to read 'num' here */
2217 tok = pdf_lex(ctx, doc->file, buf);
2218 genofs = fz_tell(ctx, doc->file);
2219 if (tok != PDF_TOK_INT)
2220 {
2221 /* Failed! */
2222 DEBUGMESS((ctx, "skipping unexpected data (tok=%d) at %d", tok, *offset));
2223 *offset = genofs;
2224 return tok == PDF_TOK_EOF;
2225 }
2226 *nump = num = buf->i;
2227
2228 /* We expect to read 'gen' here */
2229 tok = pdf_lex(ctx, doc->file, buf);
2230 tmpofs = fz_tell(ctx, doc->file);
2231 if (tok != PDF_TOK_INT)
2232 {
2233 /* Failed! */
2234 DEBUGMESS((ctx, "skipping unexpected data after \"%d\" (tok=%d) at %d", num, tok, *offset));
2235 *offset = tmpofs;
2236 return tok == PDF_TOK_EOF;
2237 }
2238 gen = buf->i;
2239
2240 /* We expect to read 'obj' here */
2241 do
2242 {
2243 tmpofs = fz_tell(ctx, doc->file);
2244 tok = pdf_lex(ctx, doc->file, buf);
2245 if (tok == PDF_TOK_OBJ)
2246 break;
2247 if (tok != PDF_TOK_INT)
2248 {
2249 DEBUGMESS((ctx, "skipping unexpected data (tok=%d) at %d", tok, tmpofs));
2250 *offset = fz_tell(ctx, doc->file);
2251 return tok == PDF_TOK_EOF;
2252 }
2253 DEBUGMESS((ctx, "skipping unexpected int %d at %d", num, numofs));
2254 *nump = num = gen;
2255 numofs = genofs;
2256 gen = buf->i;
2257 genofs = tmpofs;
2258 }
2259 while (1);
2260
2261 /* Now we read the actual object */
2262 xref_len = pdf_xref_len(ctx, doc);
2263
2264 /* When we are reading a progressive file, we typically see:
2265 * File Header
2266 * obj m (Linearization params)
2267 * xref #1 (refers to objects m-n)
2268 * obj m+1
2269 * ...
2270 * obj n
2271 * obj 1
2272 * ...
2273 * obj n-1
2274 * xref #2
2275 *
2276 * The linearisation params are read elsewhere, hence
2277 * whenever we read an object it should just go into the
2278 * previous xref.
2279 */
2280 tok = pdf_repair_obj(ctx, doc, buf, &stmofs, NULL, NULL, NULL, page, &newtmpofs, NULL);
2281
2282 do /* So we can break out of it */
2283 {
2284 if (num <= 0 || num >= xref_len)
2285 {
2286 fz_warn(ctx, "Not a valid object number (%d %d obj)", num, gen);
2287 break;
2288 }
2289 if (gen != 0)
2290 {
2291 fz_warn(ctx, "Unexpected non zero generation number in linearized file");
2292 }
2293 entry = pdf_get_populating_xref_entry(ctx, doc, num);
2294 if (entry->type != 0)
2295 {
2296 DEBUGMESS((ctx, "Duplicate object found (%d %d obj)", num, gen));
2297 break;
2298 }
2299 if (page && *page)
2300 {
2301 DEBUGMESS((ctx, "Successfully read object %d @ %d - and found page %d!", num, numofs, doc->linear_page_num));
2302 if (!entry->obj)
2303 entry->obj = pdf_keep_obj(ctx, *page);
2304
2305 if (doc->linear_page_refs[doc->linear_page_num] == NULL)
2306 doc->linear_page_refs[doc->linear_page_num] = pdf_new_indirect(ctx, doc, num, gen);
2307 }
2308 else
2309 {
2310 DEBUGMESS((ctx, "Successfully read object %d @ %d", num, numofs));
2311 }
2312 entry->type = 'n';
2313 entry->gen = gen; // XXX: was 0
2314 entry->num = num;
2315 entry->ofs = numofs;
2316 entry->stm_ofs = stmofs;
2317 }
2318 while (0);
2319 if (page && *page)
2320 doc->linear_page_num++;
2321
2322 if (tok == PDF_TOK_ENDOBJ)
2323 {
2324 *offset = fz_tell(ctx, doc->file);
2325 }
2326 else
2327 {
2328 *offset = newtmpofs;
2329 }
2330 return 0;
2331 }
2332
2333 static void
2334 pdf_load_hinted_page(fz_context *ctx, pdf_document *doc, int pagenum)
2335 {
2336 pdf_obj *page = NULL;
2337
2338 if (!doc->hints_loaded || !doc->linear_page_refs)
2339 return;
2340
2341 if (doc->linear_page_refs[pagenum])
2342 return;
2343
2344 fz_var(page);
2345
2346 fz_try(ctx)
2347 {
2348 int num = doc->hint_page[pagenum].number;
2349 page = pdf_load_object(ctx, doc, num);
2350 if (pdf_name_eq(ctx, PDF_NAME(Page), pdf_dict_get(ctx, page, PDF_NAME(Type))))
2351 {
2352 /* We have found the page object! */
2353 DEBUGMESS((ctx, "LoadHintedPage pagenum=%d num=%d", pagenum, num));
2354 doc->linear_page_refs[pagenum] = pdf_new_indirect(ctx, doc, num, 0);
2355 }
2356 }
2357 fz_always(ctx)
2358 pdf_drop_obj(ctx, page);
2359 fz_catch(ctx)
2360 {
2361 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2362 /* Swallow the error and proceed as normal */
2363 fz_report_error(ctx);
2364 }
2365 }
2366
2367 static int
2368 read_hinted_object(fz_context *ctx, pdf_document *doc, int num)
2369 {
2370 /* Try to find the object using our hint table. Find the closest
2371 * object <= the one we want that has a hint and read forward from
2372 * there. */
2373 int expected = num;
2374 int curr_pos;
2375 int64_t start, offset;
2376
2377 while (doc->hint_obj_offsets[expected] == 0 && expected > 0)
2378 expected--;
2379 if (expected != num)
2380 DEBUGMESS((ctx, "object %d is unhinted, will search forward from %d", expected, num));
2381 if (expected == 0) /* No hints found, just bail */
2382 return 0;
2383
2384 curr_pos = fz_tell(ctx, doc->file);
2385 offset = doc->hint_obj_offsets[expected];
2386
2387 fz_var(expected);
2388
2389 fz_try(ctx)
2390 {
2391 int found;
2392
2393 /* Try to read forward from there */
2394 do
2395 {
2396 start = offset;
2397 DEBUGMESS((ctx, "Searching for object %d @ %d", expected, offset));
2398 pdf_obj_read(ctx, doc, &offset, &found, 0);
2399 DEBUGMESS((ctx, "Found object %d - next will be @ %d", found, offset));
2400 if (found <= expected)
2401 {
2402 /* We found the right one (or one earlier than
2403 * we expected). Update the hints. */
2404 doc->hint_obj_offsets[expected] = offset;
2405 doc->hint_obj_offsets[found] = start;
2406 doc->hint_obj_offsets[found+1] = offset;
2407 /* Retry with the next one */
2408 expected = found+1;
2409 }
2410 else
2411 {
2412 /* We found one later than we expected. */
2413 doc->hint_obj_offsets[expected] = 0;
2414 doc->hint_obj_offsets[found] = start;
2415 doc->hint_obj_offsets[found+1] = offset;
2416 while (doc->hint_obj_offsets[expected] == 0 && expected > 0)
2417 expected--;
2418 if (expected == 0) /* No hints found, we give up */
2419 break;
2420 }
2421 }
2422 while (found != num);
2423 }
2424 fz_always(ctx)
2425 {
2426 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
2427 }
2428 fz_catch(ctx)
2429 {
2430 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2431 /* FIXME: Currently we ignore the hint. Perhaps we should
2432 * drop back to non-hinted operation here. */
2433 doc->hint_obj_offsets[expected] = 0;
2434 fz_rethrow(ctx);
2435 }
2436 return expected != 0;
2437 }
2438
2439 pdf_obj *
2440 pdf_load_unencrypted_object(fz_context *ctx, pdf_document *doc, int num)
2441 {
2442 pdf_xref_entry *x;
2443
2444 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2445 fz_throw(ctx, FZ_ERROR_FORMAT, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2446
2447 x = pdf_get_xref_entry_no_null(ctx, doc, num);
2448 if (x->type == 'n')
2449 {
2450 fz_seek(ctx, doc->file, doc->bias + x->ofs, SEEK_SET);
2451 return pdf_parse_ind_obj(ctx, doc, doc->file, NULL, NULL, NULL, NULL);
2452 }
2453 return NULL;
2454 }
2455
2456 int
2457 pdf_object_exists(fz_context *ctx, pdf_document *doc, int num)
2458 {
2459 pdf_xref_entry *x;
2460 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2461 return 0;
2462 x = pdf_get_xref_entry(ctx, doc, num);
2463 if (x && (x->type == 'n' || x->type == 'o'))
2464 return 1;
2465 return 0;
2466 }
2467
2468 pdf_xref_entry *
2469 pdf_cache_object(fz_context *ctx, pdf_document *doc, int num)
2470 {
2471 pdf_xref_entry *x;
2472 int rnum, rgen, try_repair;
2473
2474 fz_var(try_repair);
2475
2476 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2477 fz_throw(ctx, FZ_ERROR_FORMAT, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2478
2479 object_updated:
2480 try_repair = 0;
2481 rnum = num;
2482
2483 x = pdf_get_xref_entry(ctx, doc, num);
2484 if (x == NULL)
2485 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find object in xref (%d 0 R)", num);
2486
2487 if (x->obj != NULL)
2488 return x;
2489
2490 if (x->type == 'f')
2491 {
2492 x->obj = PDF_NULL;
2493 }
2494 else if (x->type == 'n')
2495 {
2496 fz_seek(ctx, doc->file, doc->bias + x->ofs, SEEK_SET);
2497
2498 fz_try(ctx)
2499 {
2500 x->obj = pdf_parse_ind_obj(ctx, doc, doc->file,
2501 &rnum, &rgen, &x->stm_ofs, &try_repair);
2502 }
2503 fz_catch(ctx)
2504 {
2505 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2506 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
2507 if (!try_repair)
2508 fz_rethrow(ctx);
2509 else
2510 fz_report_error(ctx);
2511 }
2512
2513 if (!try_repair && rnum != num)
2514 {
2515 pdf_drop_obj(ctx, x->obj);
2516 x->type = 'f';
2517 x->ofs = -1;
2518 x->gen = 0;
2519 x->num = 0;
2520 x->stm_ofs = 0;
2521 x->obj = NULL;
2522 try_repair = (doc->repair_attempted == 0);
2523 }
2524
2525 if (try_repair)
2526 {
2527 perform_repair:
2528 fz_try(ctx)
2529 pdf_repair_xref(ctx, doc);
2530 fz_catch(ctx)
2531 {
2532 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2533 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
2534 fz_rethrow_if(ctx, FZ_ERROR_REPAIRED);
2535 fz_report_error(ctx);
2536 if (rnum == num)
2537 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot parse object (%d 0 R)", num);
2538 else
2539 fz_throw(ctx, FZ_ERROR_FORMAT, "found object (%d 0 R) instead of (%d 0 R)", rnum, num);
2540 }
2541 goto object_updated;
2542 }
2543
2544 if (doc->crypt)
2545 pdf_crypt_obj(ctx, doc->crypt, x->obj, x->num, x->gen);
2546 }
2547 else if (x->type == 'o')
2548 {
2549 if (!x->obj)
2550 {
2551 pdf_xref_entry *orig_x = x;
2552 pdf_xref_entry *ox = x; /* This init is unused, but it shuts warnings up. */
2553 orig_x->type = 'O'; /* Mark this node so we know we're recursing. */
2554 fz_try(ctx)
2555 x = pdf_load_obj_stm(ctx, doc, x->ofs, &doc->lexbuf.base, num);
2556 fz_always(ctx)
2557 {
2558 /* Most of the time ox == orig_x, but if pdf_load_obj_stm performed a
2559 * repair, it may not be. It is safe to call pdf_get_xref_entry_no_change
2560 * here, as it does not try/catch. */
2561 ox = pdf_get_xref_entry_no_change(ctx, doc, num);
2562 /* Bug 706762: ox can be NULL if the object went away during a repair. */
2563 if (ox && ox->type == 'O')
2564 ox->type = 'o'; /* Not recursing any more. */
2565 }
2566 fz_catch(ctx)
2567 fz_rethrow(ctx);
2568 if (x == NULL)
2569 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot load object stream containing object (%d 0 R)", num);
2570 if (!x->obj)
2571 {
2572 x->type = 'f';
2573 if (ox)
2574 ox->type = 'f';
2575 if (doc->repair_attempted)
2576 fz_throw(ctx, FZ_ERROR_FORMAT, "object (%d 0 R) was not found in its object stream", num);
2577 goto perform_repair;
2578 }
2579 }
2580 }
2581 else if (doc->hint_obj_offsets && read_hinted_object(ctx, doc, num))
2582 {
2583 goto object_updated;
2584 }
2585 else if (doc->file_length && doc->linear_pos < doc->file_length)
2586 {
2587 fz_throw(ctx, FZ_ERROR_TRYLATER, "cannot find object in xref (%d 0 R) - not loaded yet?", num);
2588 }
2589 else
2590 {
2591 fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find object in xref (%d 0 R)", num);
2592 }
2593
2594 pdf_set_obj_parent(ctx, x->obj, num);
2595 return x;
2596 }
2597
2598 pdf_obj *
2599 pdf_load_object(fz_context *ctx, pdf_document *doc, int num)
2600 {
2601 pdf_xref_entry *entry = pdf_cache_object(ctx, doc, num);
2602 return pdf_keep_obj(ctx, entry->obj);
2603 }
2604
2605 pdf_obj *
2606 pdf_resolve_indirect(fz_context *ctx, pdf_obj *ref)
2607 {
2608 if (pdf_is_indirect(ctx, ref))
2609 {
2610 pdf_document *doc = pdf_get_indirect_document(ctx, ref);
2611 int num = pdf_to_num(ctx, ref);
2612 pdf_xref_entry *entry;
2613
2614 if (!doc)
2615 return NULL;
2616 if (num <= 0)
2617 {
2618 fz_warn(ctx, "invalid indirect reference (%d 0 R)", num);
2619 return NULL;
2620 }
2621
2622 fz_try(ctx)
2623 entry = pdf_cache_object(ctx, doc, num);
2624 fz_catch(ctx)
2625 {
2626 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2627 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
2628 fz_rethrow_if(ctx, FZ_ERROR_REPAIRED);
2629 fz_report_error(ctx);
2630 fz_warn(ctx, "cannot load object (%d 0 R) into cache", num);
2631 return NULL;
2632 }
2633
2634 ref = entry->obj;
2635 }
2636 return ref;
2637 }
2638
2639 pdf_obj *
2640 pdf_resolve_indirect_chain(fz_context *ctx, pdf_obj *ref)
2641 {
2642 int sanity = 10;
2643
2644 while (pdf_is_indirect(ctx, ref))
2645 {
2646 if (--sanity == 0)
2647 {
2648 fz_warn(ctx, "too many indirections (possible indirection cycle involving %d 0 R)", pdf_to_num(ctx, ref));
2649 return NULL;
2650 }
2651
2652 ref = pdf_resolve_indirect(ctx, ref);
2653 }
2654
2655 return ref;
2656 }
2657
2658 int
2659 pdf_count_objects(fz_context *ctx, pdf_document *doc)
2660 {
2661 return pdf_xref_len(ctx, doc);
2662 }
2663
2664 int
2665 pdf_is_local_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
2666 {
2667 pdf_xref *xref = doc->local_xref;
2668 pdf_xref_subsec *sub;
2669 int num;
2670
2671 if (!pdf_is_indirect(ctx, obj))
2672 return 0;
2673
2674 if (xref == NULL)
2675 return 0; /* no local xref present */
2676
2677 num = pdf_to_num(ctx, obj);
2678
2679 /* Local xrefs only ever have 1 section, and it should be solid. */
2680 sub = xref->subsec;
2681 if (num >= sub->start && num < sub->start + sub->len)
2682 return sub->table[num - sub->start].type != 0;
2683
2684 return 0;
2685 }
2686
2687 static int
2688 pdf_create_local_object(fz_context *ctx, pdf_document *doc)
2689 {
2690 /* TODO: reuse free object slots by properly linking free object chains in the ofs field */
2691 pdf_xref_entry *entry;
2692 int num;
2693
2694 num = doc->local_xref->num_objects;
2695
2696 entry = pdf_get_local_xref_entry(ctx, doc, num);
2697 entry->type = 'f';
2698 entry->ofs = -1;
2699 entry->gen = 0;
2700 entry->num = num;
2701 entry->stm_ofs = 0;
2702 entry->stm_buf = NULL;
2703 entry->obj = NULL;
2704 return num;
2705 }
2706
2707 int
2708 pdf_create_object(fz_context *ctx, pdf_document *doc)
2709 {
2710 /* TODO: reuse free object slots by properly linking free object chains in the ofs field */
2711 pdf_xref_entry *entry;
2712 int num;
2713
2714 if (doc->local_xref && doc->local_xref_nesting > 0)
2715 return pdf_create_local_object(ctx, doc);
2716
2717 num = pdf_xref_len(ctx, doc);
2718
2719 if (num > PDF_MAX_OBJECT_NUMBER)
2720 fz_throw(ctx, FZ_ERROR_LIMIT, "too many objects stored in pdf");
2721
2722 entry = pdf_get_incremental_xref_entry(ctx, doc, num);
2723 entry->type = 'f';
2724 entry->ofs = -1;
2725 entry->gen = 0;
2726 entry->num = num;
2727 entry->stm_ofs = 0;
2728 entry->stm_buf = NULL;
2729 entry->obj = NULL;
2730
2731 pdf_add_journal_fragment(ctx, doc, num, NULL, NULL, 1);
2732
2733 return num;
2734 }
2735
2736 static void
2737 pdf_delete_local_object(fz_context *ctx, pdf_document *doc, int num)
2738 {
2739 pdf_xref_entry *x;
2740
2741 if (doc->local_xref == NULL || doc->local_xref_nesting == 0)
2742 fz_throw(ctx, FZ_ERROR_ARGUMENT, "No local xref to delete from!");
2743
2744 if (num <= 0 || num >= doc->local_xref->num_objects)
2745 {
2746 fz_warn(ctx, "local object out of range (%d 0 R); xref size %d", num, doc->local_xref->num_objects);
2747 return;
2748 }
2749
2750 x = pdf_get_local_xref_entry(ctx, doc, num);
2751
2752 fz_drop_buffer(ctx, x->stm_buf);
2753 pdf_drop_obj(ctx, x->obj);
2754
2755 x->type = 'f';
2756 x->ofs = 0;
2757 x->gen += 1;
2758 x->num = 0;
2759 x->stm_ofs = 0;
2760 x->stm_buf = NULL;
2761 x->obj = NULL;
2762 }
2763
2764 void
2765 pdf_delete_object(fz_context *ctx, pdf_document *doc, int num)
2766 {
2767 pdf_xref_entry *x;
2768 pdf_xref *xref;
2769 int j;
2770
2771 if (doc->local_xref && doc->local_xref_nesting > 0)
2772 {
2773 pdf_delete_local_object(ctx, doc, num);
2774 return;
2775 }
2776
2777 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2778 {
2779 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2780 return;
2781 }
2782
2783 x = pdf_get_incremental_xref_entry(ctx, doc, num);
2784
2785 fz_drop_buffer(ctx, x->stm_buf);
2786 pdf_drop_obj(ctx, x->obj);
2787
2788 x->type = 'f';
2789 x->ofs = 0;
2790 x->gen += 1;
2791 x->num = 0;
2792 x->stm_ofs = 0;
2793 x->stm_buf = NULL;
2794 x->obj = NULL;
2795
2796 /* Currently we've left a 'free' object in the incremental
2797 * section. This is enough to cause us to think that the
2798 * document has changes. Check back in the non-incremental
2799 * sections to see if the last instance of the object there
2800 * was free (or if this object never appeared). If so, we
2801 * can mark this object as non-existent in the incremental
2802 * xref. This is important so we can 'undo' back to emptiness
2803 * after we save/when we reload a snapshot. */
2804 for (j = 1; j < doc->num_xref_sections; j++)
2805 {
2806 xref = &doc->xref_sections[j];
2807
2808 if (num < xref->num_objects)
2809 {
2810 pdf_xref_subsec *sub;
2811 for (sub = xref->subsec; sub != NULL; sub = sub->next)
2812 {
2813 pdf_xref_entry *entry;
2814
2815 if (num < sub->start || num >= sub->start + sub->len)
2816 continue;
2817
2818 entry = &sub->table[num - sub->start];
2819 if (entry->type)
2820 {
2821 if (entry->type == 'f')
2822 {
2823 /* It was free already! */
2824 x->type = 0;
2825 x->gen = 0;
2826 }
2827 /* It was a real object. */
2828 return;
2829 }
2830 }
2831 }
2832 }
2833 /* It never appeared before. */
2834 x->type = 0;
2835 x->gen = 0;
2836 }
2837
2838 static void
2839 pdf_update_local_object(fz_context *ctx, pdf_document *doc, int num, pdf_obj *newobj)
2840 {
2841 pdf_xref_entry *x;
2842
2843 if (doc->local_xref == NULL || doc->local_xref_nesting == 0)
2844 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't update local object without a local xref");
2845
2846 if (!newobj)
2847 {
2848 pdf_delete_local_object(ctx, doc, num);
2849 return;
2850 }
2851
2852 x = pdf_get_local_xref_entry(ctx, doc, num);
2853
2854 pdf_drop_obj(ctx, x->obj);
2855
2856 x->type = 'n';
2857 x->ofs = 0;
2858 x->obj = pdf_keep_obj(ctx, newobj);
2859
2860 pdf_set_obj_parent(ctx, newobj, num);
2861 }
2862
2863 void
2864 pdf_update_object(fz_context *ctx, pdf_document *doc, int num, pdf_obj *newobj)
2865 {
2866 pdf_xref_entry *x;
2867
2868 if (!doc)
2869 return;
2870
2871 if (doc->local_xref && doc->local_xref_nesting > 0)
2872 {
2873 pdf_update_local_object(ctx, doc, num, newobj);
2874 return;
2875 }
2876
2877 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2878 {
2879 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2880 return;
2881 }
2882
2883 if (!newobj)
2884 {
2885 pdf_delete_object(ctx, doc, num);
2886 return;
2887 }
2888
2889 x = pdf_get_incremental_xref_entry(ctx, doc, num);
2890
2891 pdf_drop_obj(ctx, x->obj);
2892
2893 x->type = 'n';
2894 x->ofs = 0;
2895 x->obj = pdf_keep_obj(ctx, newobj);
2896
2897 pdf_set_obj_parent(ctx, newobj, num);
2898 }
2899
2900 void
2901 pdf_update_stream(fz_context *ctx, pdf_document *doc, pdf_obj *obj, fz_buffer *newbuf, int compressed)
2902 {
2903 int num;
2904 pdf_xref_entry *x;
2905
2906 if (pdf_is_indirect(ctx, obj))
2907 num = pdf_to_num(ctx, obj);
2908 else
2909 num = pdf_obj_parent_num(ctx, obj);
2910
2911 /* Write the Length first, as this has the effect of moving the
2912 * old object into the journal for undo. This also moves the
2913 * stream buffer with it, keeping it consistent. */
2914 pdf_dict_put_int(ctx, obj, PDF_NAME(Length), fz_buffer_storage(ctx, newbuf, NULL));
2915
2916 if (doc->local_xref && doc->local_xref_nesting > 0)
2917 {
2918 x = pdf_get_local_xref_entry(ctx, doc, num);
2919 }
2920 else
2921 {
2922 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2923 {
2924 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2925 return;
2926 }
2927
2928 x = pdf_get_xref_entry_no_null(ctx, doc, num);
2929 }
2930
2931 fz_drop_buffer(ctx, x->stm_buf);
2932 x->stm_buf = fz_keep_buffer(ctx, newbuf);
2933
2934 if (!compressed)
2935 {
2936 pdf_dict_del(ctx, obj, PDF_NAME(Filter));
2937 pdf_dict_del(ctx, obj, PDF_NAME(DecodeParms));
2938 }
2939 }
2940
2941 int
2942 pdf_lookup_metadata(fz_context *ctx, pdf_document *doc, const char *key, char *buf, size_t size)
2943 {
2944 if (!strcmp(key, FZ_META_FORMAT))
2945 {
2946 int version = pdf_version(ctx, doc);
2947 return 1 + (int)fz_snprintf(buf, size, "PDF %d.%d", version/10, version % 10);
2948 }
2949
2950 if (!strcmp(key, FZ_META_ENCRYPTION))
2951 {
2952 if (doc->crypt)
2953 {
2954 const char *stream_method = pdf_crypt_stream_method(ctx, doc->crypt);
2955 const char *string_method = pdf_crypt_string_method(ctx, doc->crypt);
2956 if (stream_method == string_method)
2957 return 1 + (int)fz_snprintf(buf, size, "Standard V%d R%d %d-bit %s",
2958 pdf_crypt_version(ctx, doc->crypt),
2959 pdf_crypt_revision(ctx, doc->crypt),
2960 pdf_crypt_length(ctx, doc->crypt),
2961 pdf_crypt_string_method(ctx, doc->crypt));
2962 else
2963 return 1 + (int)fz_snprintf(buf, size, "Standard V%d R%d %d-bit streams: %s strings: %s",
2964 pdf_crypt_version(ctx, doc->crypt),
2965 pdf_crypt_revision(ctx, doc->crypt),
2966 pdf_crypt_length(ctx, doc->crypt),
2967 pdf_crypt_stream_method(ctx, doc->crypt),
2968 pdf_crypt_string_method(ctx, doc->crypt));
2969 }
2970 else
2971 return 1 + (int)fz_strlcpy(buf, "None", size);
2972 }
2973
2974 if (strstr(key, "info:") == key)
2975 {
2976 pdf_obj *info;
2977 const char *s;
2978 int n;
2979
2980 info = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info));
2981 if (!info)
2982 return -1;
2983
2984 info = pdf_dict_gets(ctx, info, key + 5);
2985 if (!info)
2986 return -1;
2987
2988 s = pdf_to_text_string(ctx, info);
2989 if (strlen(s) <= 0)
2990 return -1;
2991
2992 n = 1 + (int)fz_strlcpy(buf, s, size);
2993 return n;
2994 }
2995
2996 return -1;
2997 }
2998
2999 void
3000 pdf_set_metadata(fz_context *ctx, pdf_document *doc, const char *key, const char *value)
3001 {
3002
3003 pdf_obj *info = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info));
3004
3005 pdf_begin_operation(ctx, doc, "Set Metadata");
3006
3007 fz_try(ctx)
3008 {
3009 /* Ensure we have an Info dictionary. */
3010 if (!pdf_is_dict(ctx, info))
3011 {
3012 info = pdf_add_new_dict(ctx, doc, 8);
3013 pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info);
3014 }
3015
3016 if (!strcmp(key, FZ_META_INFO_TITLE))
3017 pdf_dict_put_text_string(ctx, info, PDF_NAME(Title), value);
3018 else if (!strcmp(key, FZ_META_INFO_AUTHOR))
3019 pdf_dict_put_text_string(ctx, info, PDF_NAME(Author), value);
3020 else if (!strcmp(key, FZ_META_INFO_SUBJECT))
3021 pdf_dict_put_text_string(ctx, info, PDF_NAME(Subject), value);
3022 else if (!strcmp(key, FZ_META_INFO_KEYWORDS))
3023 pdf_dict_put_text_string(ctx, info, PDF_NAME(Keywords), value);
3024 else if (!strcmp(key, FZ_META_INFO_CREATOR))
3025 pdf_dict_put_text_string(ctx, info, PDF_NAME(Creator), value);
3026 else if (!strcmp(key, FZ_META_INFO_PRODUCER))
3027 pdf_dict_put_text_string(ctx, info, PDF_NAME(Producer), value);
3028 else if (!strcmp(key, FZ_META_INFO_CREATIONDATE))
3029 {
3030 int64_t time = pdf_parse_date(ctx, value);
3031 if (time >= 0)
3032 pdf_dict_put_date(ctx, info, PDF_NAME(CreationDate), time);
3033 }
3034 else if (!strcmp(key, FZ_META_INFO_MODIFICATIONDATE))
3035 {
3036 int64_t time = pdf_parse_date(ctx, value);
3037 if (time >= 0)
3038 pdf_dict_put_date(ctx, info, PDF_NAME(ModDate), time);
3039 }
3040
3041 if (!strncmp(key, FZ_META_INFO, strlen(FZ_META_INFO)))
3042 key += strlen(FZ_META_INFO);
3043 pdf_dict_put_text_string(ctx, info, pdf_new_name(ctx, key), value);
3044 pdf_end_operation(ctx, doc);
3045 }
3046 fz_catch(ctx)
3047 {
3048 pdf_abandon_operation(ctx, doc);
3049 fz_rethrow(ctx);
3050 }
3051 }
3052
3053 static fz_link_dest
3054 pdf_resolve_link_imp(fz_context *ctx, fz_document *doc_, const char *uri)
3055 {
3056 pdf_document *doc = (pdf_document*)doc_;
3057 return pdf_resolve_link_dest(ctx, doc, uri);
3058 }
3059
3060 char *pdf_format_link_uri(fz_context *ctx, fz_document *doc, fz_link_dest dest)
3061 {
3062 return pdf_new_uri_from_explicit_dest(ctx, dest);
3063 }
3064
3065 static fz_document *
3066 as_pdf(fz_context *ctx, fz_document *doc)
3067 {
3068 return doc;
3069 }
3070
3071 /*
3072 Initializers for the fz_document interface.
3073
3074 The functions are split across two files to allow calls to a
3075 version of the constructor that does not link in the interpreter.
3076 The interpreter references the built-in font and cmap resources
3077 which are quite big. Not linking those into the mutool binary
3078 saves roughly 6MB of space.
3079 */
3080
3081 static fz_colorspace *pdf_document_output_intent_imp(fz_context *ctx, fz_document *doc)
3082 {
3083 return pdf_document_output_intent(ctx, (pdf_document*)doc);
3084 }
3085
3086 int pdf_needs_password_imp(fz_context *ctx, fz_document *doc)
3087 {
3088 return pdf_needs_password(ctx, (pdf_document*)doc);
3089 }
3090
3091 int pdf_authenticate_password_imp(fz_context *ctx, fz_document *doc, const char *pw)
3092 {
3093 return pdf_authenticate_password(ctx, (pdf_document*)doc, pw);
3094 }
3095
3096 int pdf_has_permission_imp(fz_context *ctx, fz_document *doc, fz_permission p)
3097 {
3098 return pdf_has_permission(ctx, (pdf_document*)doc, p);
3099 }
3100
3101 fz_outline_iterator *pdf_new_outline_iterator_imp(fz_context *ctx, fz_document *doc)
3102 {
3103 return pdf_new_outline_iterator(ctx, (pdf_document*)doc);
3104 }
3105
3106 int pdf_lookup_metadata_imp(fz_context *ctx, fz_document *doc, const char *key, char *ptr, size_t size)
3107 {
3108 return pdf_lookup_metadata(ctx, (pdf_document*)doc, key, ptr, size);
3109 }
3110
3111 void pdf_set_metadata_imp(fz_context *ctx, fz_document *doc, const char *key, const char *value)
3112 {
3113 pdf_set_metadata(ctx, (pdf_document*)doc, key, value);
3114 }
3115
3116 void pdf_run_document_structure_imp(fz_context *ctx, fz_document *doc, fz_device *dev, fz_cookie *cookie)
3117 {
3118 pdf_run_document_structure(ctx, (pdf_document*)doc, dev, cookie);
3119 }
3120
3121 #ifndef NDEBUG
3122 void pdf_verify_name_table_sanity(void);
3123 #endif
3124
3125
3126 static pdf_document *
3127 pdf_new_document(fz_context *ctx, fz_stream *file)
3128 {
3129 pdf_document *doc = fz_new_derived_document(ctx, pdf_document);
3130
3131 #ifndef NDEBUG
3132 pdf_verify_name_table_sanity();
3133 #endif
3134
3135 doc->super.drop_document = pdf_drop_document_imp;
3136 doc->super.get_output_intent = pdf_document_output_intent_imp;
3137 doc->super.needs_password = pdf_needs_password_imp;
3138 doc->super.authenticate_password = pdf_authenticate_password_imp;
3139 doc->super.has_permission = pdf_has_permission_imp;
3140 doc->super.outline_iterator = pdf_new_outline_iterator_imp;
3141 doc->super.resolve_link_dest = pdf_resolve_link_imp;
3142 doc->super.format_link_uri = pdf_format_link_uri;
3143 doc->super.count_pages = pdf_count_pages_imp;
3144 doc->super.load_page = pdf_load_page_imp;
3145 doc->super.page_label = pdf_page_label_imp;
3146 doc->super.lookup_metadata = pdf_lookup_metadata_imp;
3147 doc->super.set_metadata = pdf_set_metadata_imp;
3148 doc->super.run_structure = pdf_run_document_structure_imp;
3149 doc->super.as_pdf = as_pdf;
3150
3151 pdf_lexbuf_init(ctx, &doc->lexbuf.base, PDF_LEXBUF_LARGE);
3152 doc->file = fz_keep_stream(ctx, file);
3153
3154 /* Default to PDF-1.7 if the version header is missing and for new documents */
3155 doc->version = 17;
3156
3157 return doc;
3158 }
3159
3160 pdf_document *
3161 pdf_open_document_with_stream(fz_context *ctx, fz_stream *file)
3162 {
3163 pdf_document *doc = pdf_new_document(ctx, file);
3164 fz_try(ctx)
3165 {
3166 pdf_init_document(ctx, doc);
3167 }
3168 fz_catch(ctx)
3169 {
3170 /* fz_drop_document may clobber our error code/message so we have to stash them temporarily. */
3171 char message[256];
3172 int code;
3173 fz_strlcpy(message, fz_convert_error(ctx, &code), sizeof message);
3174 fz_drop_document(ctx, &doc->super);
3175 fz_throw(ctx, code, "%s", message);
3176 }
3177 return doc;
3178 }
3179
3180 /* Uncomment the following to test progressive loading. */
3181 /* #define TEST_PROGRESSIVE_HACK */
3182
3183 pdf_document *
3184 pdf_open_document(fz_context *ctx, const char *filename)
3185 {
3186 fz_stream *file = NULL;
3187 pdf_document *doc = NULL;
3188
3189 fz_var(file);
3190 fz_var(doc);
3191
3192 fz_try(ctx)
3193 {
3194 file = fz_open_file(ctx, filename);
3195 #ifdef TEST_PROGRESSIVE_HACK
3196 file->progressive = 1;
3197 #endif
3198 doc = pdf_new_document(ctx, file);
3199 pdf_init_document(ctx, doc);
3200 }
3201 fz_always(ctx)
3202 {
3203 fz_drop_stream(ctx, file);
3204 }
3205 fz_catch(ctx)
3206 {
3207 /* fz_drop_document may clobber our error code/message so we have to stash them temporarily. */
3208 char message[256];
3209 int code;
3210 fz_strlcpy(message, fz_convert_error(ctx, &code), sizeof message);
3211 fz_drop_document(ctx, &doc->super);
3212 fz_throw(ctx, code, "%s", message);
3213 }
3214
3215 #ifdef TEST_PROGRESSIVE_HACK
3216 if (doc->file_reading_linearly)
3217 {
3218 fz_try(ctx)
3219 pdf_progressive_advance(ctx, doc, doc->linear_page_count-1);
3220 fz_catch(ctx)
3221 {
3222 doc->file_reading_linearly = 0;
3223 /* swallow the error */
3224 }
3225 }
3226 #endif
3227
3228 return doc;
3229 }
3230
3231 static void
3232 pdf_load_hints(fz_context *ctx, pdf_document *doc, int objnum)
3233 {
3234 fz_stream *stream = NULL;
3235 pdf_obj *dict;
3236
3237 fz_var(stream);
3238 fz_var(dict);
3239
3240 fz_try(ctx)
3241 {
3242 int i, j, least_num_page_objs, page_obj_num_bits;
3243 int least_page_len, page_len_num_bits, shared_hint_offset;
3244 /* int least_page_offset, page_offset_num_bits; */
3245 /* int least_content_stream_len, content_stream_len_num_bits; */
3246 int num_shared_obj_num_bits, shared_obj_num_bits;
3247 /* int numerator_bits, denominator_bits; */
3248 int shared;
3249 int shared_obj_num, shared_obj_offset, shared_obj_count_page1;
3250 int shared_obj_count_total;
3251 int least_shared_group_len, shared_group_len_num_bits;
3252 int max_object_num = pdf_xref_len(ctx, doc);
3253
3254 stream = pdf_open_stream_number(ctx, doc, objnum);
3255 dict = pdf_get_xref_entry_no_null(ctx, doc, objnum)->obj;
3256 if (dict == NULL || !pdf_is_dict(ctx, dict))
3257 fz_throw(ctx, FZ_ERROR_FORMAT, "malformed hint object");
3258
3259 shared_hint_offset = pdf_dict_get_int(ctx, dict, PDF_NAME(S));
3260
3261 /* Malloc the structures (use realloc to cope with the fact we
3262 * may try this several times before enough data is loaded) */
3263 doc->hint_page = fz_realloc_array(ctx, doc->hint_page, doc->linear_page_count+1, pdf_hint_page);
3264 memset(doc->hint_page, 0, sizeof(*doc->hint_page) * (doc->linear_page_count+1));
3265 doc->hint_obj_offsets = fz_realloc_array(ctx, doc->hint_obj_offsets, max_object_num, int64_t);
3266 memset(doc->hint_obj_offsets, 0, sizeof(*doc->hint_obj_offsets) * max_object_num);
3267 doc->hint_obj_offsets_max = max_object_num;
3268
3269 /* Read the page object hints table: Header first */
3270 least_num_page_objs = fz_read_bits(ctx, stream, 32);
3271 /* The following is sometimes a lie, but we read this version,
3272 * as other table values are built from it. In
3273 * pdf_reference17.pdf, this points to 2 objects before the
3274 * first pages page object. */
3275 doc->hint_page[0].offset = fz_read_bits(ctx, stream, 32);
3276 if (doc->hint_page[0].offset > doc->hint_object_offset)
3277 doc->hint_page[0].offset += doc->hint_object_length;
3278 page_obj_num_bits = fz_read_bits(ctx, stream, 16);
3279 least_page_len = fz_read_bits(ctx, stream, 32);
3280 page_len_num_bits = fz_read_bits(ctx, stream, 16);
3281 /* least_page_offset = */ (void) fz_read_bits(ctx, stream, 32);
3282 /* page_offset_num_bits = */ (void) fz_read_bits(ctx, stream, 16);
3283 /* least_content_stream_len = */ (void) fz_read_bits(ctx, stream, 32);
3284 /* content_stream_len_num_bits = */ (void) fz_read_bits(ctx, stream, 16);
3285 num_shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
3286 shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
3287 /* numerator_bits = */ (void) fz_read_bits(ctx, stream, 16);
3288 /* denominator_bits = */ (void) fz_read_bits(ctx, stream, 16);
3289
3290 /* Item 1: Page object numbers */
3291 doc->hint_page[0].number = doc->linear_page1_obj_num;
3292 /* We don't care about the number of objects in the first page */
3293 (void)fz_read_bits(ctx, stream, page_obj_num_bits);
3294 j = 1;
3295 for (i = 1; i < doc->linear_page_count; i++)
3296 {
3297 int delta_page_objs = fz_read_bits(ctx, stream, page_obj_num_bits);
3298
3299 doc->hint_page[i].number = j;
3300 j += least_num_page_objs + delta_page_objs;
3301 }
3302 doc->hint_page[i].number = j; /* Not a real page object */
3303 fz_sync_bits(ctx, stream);
3304 /* Item 2: Page lengths */
3305 j = doc->hint_page[0].offset;
3306 for (i = 0; i < doc->linear_page_count; i++)
3307 {
3308 int delta_page_len = fz_read_bits(ctx, stream, page_len_num_bits);
3309 int old = j;
3310
3311 doc->hint_page[i].offset = j;
3312 j += least_page_len + delta_page_len;
3313 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
3314 j += doc->hint_object_length;
3315 }
3316 doc->hint_page[i].offset = j;
3317 fz_sync_bits(ctx, stream);
3318 /* Item 3: Shared references */
3319 shared = 0;
3320 for (i = 0; i < doc->linear_page_count; i++)
3321 {
3322 int num_shared_objs = fz_read_bits(ctx, stream, num_shared_obj_num_bits);
3323 doc->hint_page[i].index = shared;
3324 shared += num_shared_objs;
3325 }
3326 doc->hint_page[i].index = shared;
3327 doc->hint_shared_ref = fz_realloc_array(ctx, doc->hint_shared_ref, shared, int);
3328 memset(doc->hint_shared_ref, 0, sizeof(*doc->hint_shared_ref) * shared);
3329 fz_sync_bits(ctx, stream);
3330 /* Item 4: Shared references */
3331 for (i = 0; i < shared; i++)
3332 {
3333 int ref = fz_read_bits(ctx, stream, shared_obj_num_bits);
3334 doc->hint_shared_ref[i] = ref;
3335 }
3336 /* Skip items 5,6,7 as we don't use them */
3337
3338 fz_seek(ctx, stream, doc->bias + shared_hint_offset, SEEK_SET);
3339
3340 /* Read the shared object hints table: Header first */
3341 shared_obj_num = fz_read_bits(ctx, stream, 32);
3342 shared_obj_offset = fz_read_bits(ctx, stream, 32);
3343 if (shared_obj_offset > doc->hint_object_offset)
3344 shared_obj_offset += doc->hint_object_length;
3345 shared_obj_count_page1 = fz_read_bits(ctx, stream, 32);
3346 shared_obj_count_total = fz_read_bits(ctx, stream, 32);
3347 shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
3348 least_shared_group_len = fz_read_bits(ctx, stream, 32);
3349 shared_group_len_num_bits = fz_read_bits(ctx, stream, 16);
3350
3351 /* Sanity check the references in Item 4 above to ensure we
3352 * don't access out of range with malicious files. */
3353 for (i = 0; i < shared; i++)
3354 {
3355 if (doc->hint_shared_ref[i] >= shared_obj_count_total)
3356 {
3357 fz_throw(ctx, FZ_ERROR_FORMAT, "malformed hint stream (shared refs)");
3358 }
3359 }
3360
3361 doc->hint_shared = fz_realloc_array(ctx, doc->hint_shared, shared_obj_count_total+1, pdf_hint_shared);
3362 memset(doc->hint_shared, 0, sizeof(*doc->hint_shared) * (shared_obj_count_total+1));
3363
3364 /* Item 1: Shared references */
3365 j = doc->hint_page[0].offset;
3366 for (i = 0; i < shared_obj_count_page1; i++)
3367 {
3368 int off = fz_read_bits(ctx, stream, shared_group_len_num_bits);
3369 int old = j;
3370 doc->hint_shared[i].offset = j;
3371 j += off + least_shared_group_len;
3372 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
3373 j += doc->hint_object_length;
3374 }
3375 /* FIXME: We would have problems recreating the length of the
3376 * last page 1 shared reference group. But we'll never need
3377 * to, so ignore it. */
3378 j = shared_obj_offset;
3379 for (; i < shared_obj_count_total; i++)
3380 {
3381 int off = fz_read_bits(ctx, stream, shared_group_len_num_bits);
3382 int old = j;
3383 doc->hint_shared[i].offset = j;
3384 j += off + least_shared_group_len;
3385 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
3386 j += doc->hint_object_length;
3387 }
3388 doc->hint_shared[i].offset = j;
3389 fz_sync_bits(ctx, stream);
3390 /* Item 2: Signature flags: read these just so we can skip */
3391 for (i = 0; i < shared_obj_count_total; i++)
3392 {
3393 doc->hint_shared[i].number = fz_read_bits(ctx, stream, 1);
3394 }
3395 fz_sync_bits(ctx, stream);
3396 /* Item 3: Signatures: just skip */
3397 for (i = 0; i < shared_obj_count_total; i++)
3398 {
3399 if (doc->hint_shared[i].number)
3400 {
3401 (void) fz_read_bits(ctx, stream, 128);
3402 }
3403 }
3404 fz_sync_bits(ctx, stream);
3405 /* Item 4: Shared object object numbers */
3406 j = doc->linear_page1_obj_num; /* FIXME: This is a lie! */
3407 for (i = 0; i < shared_obj_count_page1; i++)
3408 {
3409 doc->hint_shared[i].number = j;
3410 j += fz_read_bits(ctx, stream, shared_obj_num_bits) + 1;
3411 }
3412 j = shared_obj_num;
3413 for (; i < shared_obj_count_total; i++)
3414 {
3415 doc->hint_shared[i].number = j;
3416 j += fz_read_bits(ctx, stream, shared_obj_num_bits) + 1;
3417 }
3418 doc->hint_shared[i].number = j;
3419
3420 /* Now, actually use the data we have gathered. */
3421 for (i = 0 /*shared_obj_count_page1*/; i < shared_obj_count_total; i++)
3422 {
3423 if (doc->hint_shared[i].number >= 0 && doc->hint_shared[i].number < max_object_num)
3424 doc->hint_obj_offsets[doc->hint_shared[i].number] = doc->hint_shared[i].offset;
3425 }
3426 for (i = 0; i < doc->linear_page_count; i++)
3427 {
3428 if (doc->hint_page[i].number >= 0 && doc->hint_page[i].number < max_object_num)
3429 doc->hint_obj_offsets[doc->hint_page[i].number] = doc->hint_page[i].offset;
3430 }
3431 }
3432 fz_always(ctx)
3433 {
3434 fz_drop_stream(ctx, stream);
3435 }
3436 fz_catch(ctx)
3437 {
3438 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
3439 /* Don't try to load hints again */
3440 doc->hints_loaded = 1;
3441 /* We won't use the linearized object anymore. */
3442 doc->file_reading_linearly = 0;
3443 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
3444 /* Any other error becomes a TRYLATER */
3445 fz_report_error(ctx);
3446 fz_throw(ctx, FZ_ERROR_TRYLATER, "malformed hints object");
3447 }
3448 doc->hints_loaded = 1;
3449 }
3450
3451 static void
3452 pdf_load_hint_object(fz_context *ctx, pdf_document *doc)
3453 {
3454 pdf_lexbuf *buf = &doc->lexbuf.base;
3455 int64_t curr_pos;
3456
3457 curr_pos = fz_tell(ctx, doc->file);
3458 fz_seek(ctx, doc->file, doc->bias + doc->hint_object_offset, SEEK_SET);
3459 fz_try(ctx)
3460 {
3461 while (1)
3462 {
3463 pdf_obj *page = NULL;
3464 int num, tok;
3465
3466 tok = pdf_lex(ctx, doc->file, buf);
3467 if (tok != PDF_TOK_INT)
3468 break;
3469 num = buf->i;
3470 tok = pdf_lex(ctx, doc->file, buf);
3471 if (tok != PDF_TOK_INT)
3472 break;
3473 /* Ignore gen = buf->i */
3474 tok = pdf_lex(ctx, doc->file, buf);
3475 if (tok != PDF_TOK_OBJ)
3476 break;
3477 (void)pdf_repair_obj(ctx, doc, buf, NULL, NULL, NULL, NULL, &page, NULL, NULL);
3478 pdf_load_hints(ctx, doc, num);
3479 }
3480 }
3481 fz_always(ctx)
3482 {
3483 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
3484 }
3485 fz_catch(ctx)
3486 {
3487 fz_rethrow(ctx);
3488 }
3489 }
3490
3491 pdf_obj *pdf_progressive_advance(fz_context *ctx, pdf_document *doc, int pagenum)
3492 {
3493 int curr_pos;
3494 pdf_obj *page = NULL;
3495
3496 pdf_load_hinted_page(ctx, doc, pagenum);
3497
3498 if (pagenum < 0 || pagenum >= doc->linear_page_count)
3499 fz_throw(ctx, FZ_ERROR_ARGUMENT, "page load out of range (%d of %d)", pagenum, doc->linear_page_count);
3500
3501 if (doc->linear_pos == doc->file_length)
3502 return doc->linear_page_refs[pagenum];
3503
3504 /* Only load hints once, and then only after we have got page 0 */
3505 if (pagenum > 0 && !doc->hints_loaded && doc->hint_object_offset > 0 && doc->linear_pos >= doc->hint_object_offset)
3506 {
3507 /* Found hint object */
3508 pdf_load_hint_object(ctx, doc);
3509 }
3510
3511 DEBUGMESS((ctx, "continuing to try to advance from %d", doc->linear_pos));
3512 curr_pos = fz_tell(ctx, doc->file);
3513
3514 fz_var(page);
3515
3516 fz_try(ctx)
3517 {
3518 int eof;
3519 do
3520 {
3521 int num;
3522 eof = pdf_obj_read(ctx, doc, &doc->linear_pos, &num, &page);
3523 pdf_drop_obj(ctx, page);
3524 page = NULL;
3525 }
3526 while (!eof);
3527
3528 {
3529 pdf_obj *catalog;
3530 pdf_obj *pages;
3531 doc->linear_pos = doc->file_length;
3532 pdf_load_xref(ctx, doc);
3533 catalog = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
3534 pages = pdf_dict_get(ctx, catalog, PDF_NAME(Pages));
3535
3536 if (!pdf_is_dict(ctx, pages))
3537 fz_throw(ctx, FZ_ERROR_FORMAT, "missing page tree");
3538 break;
3539 }
3540 }
3541 fz_always(ctx)
3542 {
3543 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
3544 }
3545 fz_catch(ctx)
3546 {
3547 pdf_drop_obj(ctx, page);
3548 if (fz_caught(ctx) == FZ_ERROR_TRYLATER)
3549 {
3550 if (doc->linear_page_refs[pagenum] == NULL)
3551 {
3552 /* Still not got a page */
3553 fz_rethrow(ctx);
3554 }
3555 // TODO: should we really swallow this error?
3556 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
3557 fz_report_error(ctx);
3558 }
3559 else
3560 fz_rethrow(ctx);
3561 }
3562
3563 return doc->linear_page_refs[pagenum];
3564 }
3565
3566 pdf_document *fz_new_pdf_document_from_fz_document(fz_context *ctx, fz_document *ptr)
3567 {
3568 if (!ptr || !ptr->as_pdf)
3569 return NULL;
3570 return (pdf_document *)fz_keep_document(ctx, ptr->as_pdf(ctx, ptr));
3571 }
3572
3573 pdf_document *pdf_document_from_fz_document(fz_context *ctx, fz_document *ptr)
3574 {
3575 return (pdf_document *)((ptr && ptr->count_pages == pdf_count_pages_imp) ? ptr : NULL);
3576 }
3577
3578 pdf_page *pdf_page_from_fz_page(fz_context *ctx, fz_page *page)
3579 {
3580 if (pdf_document_from_fz_document(ctx, page->doc))
3581 return (pdf_page*) page;
3582 return NULL;
3583 }
3584
3585 pdf_document *pdf_specifics(fz_context *ctx, fz_document *doc)
3586 {
3587 return pdf_document_from_fz_document(ctx, doc);
3588 }
3589
3590 pdf_obj *
3591 pdf_add_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
3592 {
3593 pdf_document *orig_doc;
3594 int num;
3595
3596 orig_doc = pdf_get_bound_document(ctx, obj);
3597 if (orig_doc && orig_doc != doc)
3598 fz_throw(ctx, FZ_ERROR_ARGUMENT, "tried to add an object belonging to a different document");
3599 if (pdf_is_indirect(ctx, obj))
3600 return pdf_keep_obj(ctx, obj);
3601 num = pdf_create_object(ctx, doc);
3602 pdf_update_object(ctx, doc, num, obj);
3603 return pdf_new_indirect(ctx, doc, num, 0);
3604 }
3605
3606 pdf_obj *
3607 pdf_add_object_drop(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
3608 {
3609 pdf_obj *ind = NULL;
3610 fz_try(ctx)
3611 ind = pdf_add_object(ctx, doc, obj);
3612 fz_always(ctx)
3613 pdf_drop_obj(ctx, obj);
3614 fz_catch(ctx)
3615 fz_rethrow(ctx);
3616 return ind;
3617 }
3618
3619 pdf_obj *
3620 pdf_add_new_dict(fz_context *ctx, pdf_document *doc, int initial)
3621 {
3622 return pdf_add_object_drop(ctx, doc, pdf_new_dict(ctx, doc, initial));
3623 }
3624
3625 pdf_obj *
3626 pdf_add_new_array(fz_context *ctx, pdf_document *doc, int initial)
3627 {
3628 return pdf_add_object_drop(ctx, doc, pdf_new_array(ctx, doc, initial));
3629 }
3630
3631 pdf_obj *
3632 pdf_add_stream(fz_context *ctx, pdf_document *doc, fz_buffer *buf, pdf_obj *obj, int compressed)
3633 {
3634 pdf_obj *ind;
3635 if (!obj)
3636 ind = pdf_add_new_dict(ctx, doc, 4);
3637 else
3638 ind = pdf_add_object(ctx, doc, obj);
3639 fz_try(ctx)
3640 pdf_update_stream(ctx, doc, ind, buf, compressed);
3641 fz_catch(ctx)
3642 {
3643 pdf_drop_obj(ctx, ind);
3644 fz_rethrow(ctx);
3645 }
3646 return ind;
3647 }
3648
3649 pdf_document *pdf_create_document(fz_context *ctx)
3650 {
3651 pdf_document *doc;
3652 pdf_obj *root;
3653 pdf_obj *pages;
3654 pdf_obj *trailer = NULL;
3655
3656 fz_var(trailer);
3657
3658 doc = pdf_new_document(ctx, NULL);
3659 fz_try(ctx)
3660 {
3661 doc->file_size = 0;
3662 doc->startxref = 0;
3663 doc->num_xref_sections = 0;
3664 doc->num_incremental_sections = 0;
3665 doc->xref_base = 0;
3666 doc->disallow_new_increments = 0;
3667 pdf_get_populating_xref_entry(ctx, doc, 0);
3668
3669 trailer = pdf_new_dict(ctx, doc, 2);
3670 pdf_dict_put_int(ctx, trailer, PDF_NAME(Size), 3);
3671 pdf_dict_put_drop(ctx, trailer, PDF_NAME(Root), root = pdf_add_new_dict(ctx, doc, 2));
3672 pdf_dict_put(ctx, root, PDF_NAME(Type), PDF_NAME(Catalog));
3673 pdf_dict_put_drop(ctx, root, PDF_NAME(Pages), pages = pdf_add_new_dict(ctx, doc, 3));
3674 pdf_dict_put(ctx, pages, PDF_NAME(Type), PDF_NAME(Pages));
3675 pdf_dict_put_int(ctx, pages, PDF_NAME(Count), 0);
3676 pdf_dict_put_array(ctx, pages, PDF_NAME(Kids), 1);
3677
3678 /* Set the trailer of the final xref section. */
3679 doc->xref_sections[0].trailer = trailer;
3680 }
3681 fz_catch(ctx)
3682 {
3683 pdf_drop_obj(ctx, trailer);
3684 fz_drop_document(ctx, &doc->super);
3685 fz_rethrow(ctx);
3686 }
3687 return doc;
3688 }
3689
/* File name extensions listed in pdf_document_handler; NULL terminated. */
static const char *pdf_extensions[] =
{
	"pdf", "fdf", "pclm", "ai",
	NULL
};
3698
/* MIME types listed in pdf_document_handler; NULL terminated. */
static const char *pdf_mimetypes[] =
{
	"application/pdf", "application/PCLm",
	NULL
};
3705
3706 static int
3707 pdf_recognize_doc_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
3708 {
3709 const char *match = "%PDF-";
3710 const char *match2 = "%FDF-";
3711 int pos = 0;
3712 int n = 4096+5;
3713 int c;
3714
3715 if (state)
3716 *state = NULL;
3717 if (free_state)
3718 *free_state = NULL;
3719
3720 if (stream == NULL)
3721 return 0;
3722
3723 do
3724 {
3725 c = fz_read_byte(ctx, stream);
3726 if (c == EOF)
3727 return 0;
3728 if (c == match[pos] || c == match2[pos])
3729 {
3730 pos++;
3731 if (pos == 5)
3732 return 100;
3733 }
3734 else
3735 {
3736 /* Restart matching, but recheck c against the start. */
3737 pos = (c == match[0]);
3738 }
3739 }
3740 while (--n > 0);
3741
3742 return 0;
3743 }
3744
3745 static fz_document *
3746 open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
3747 {
3748 if (file == NULL)
3749 return NULL;
3750 return (fz_document *)pdf_open_document_with_stream(ctx, file);
3751 }
3752
3753 fz_document_handler pdf_document_handler =
3754 {
3755 NULL,
3756 open_document,
3757 pdf_extensions,
3758 pdf_mimetypes,
3759 pdf_recognize_doc_content
3760 };
3761
3762 void pdf_mark_xref(fz_context *ctx, pdf_document *doc)
3763 {
3764 int x, e;
3765
3766 for (x = 0; x < doc->num_xref_sections; x++)
3767 {
3768 pdf_xref *xref = &doc->xref_sections[x];
3769 pdf_xref_subsec *sub;
3770
3771 for (sub = xref->subsec; sub != NULL; sub = sub->next)
3772 {
3773 for (e = 0; e < sub->len; e++)
3774 {
3775 pdf_xref_entry *entry = &sub->table[e];
3776 if (entry->obj)
3777 {
3778 entry->marked = 1;
3779 }
3780 }
3781 }
3782 }
3783 }
3784
3785 void pdf_clear_xref(fz_context *ctx, pdf_document *doc)
3786 {
3787 int x, e;
3788
3789 for (x = 0; x < doc->num_xref_sections; x++)
3790 {
3791 pdf_xref *xref = &doc->xref_sections[x];
3792 pdf_xref_subsec *sub;
3793
3794 for (sub = xref->subsec; sub != NULL; sub = sub->next)
3795 {
3796 for (e = 0; e < sub->len; e++)
3797 {
3798 pdf_xref_entry *entry = &sub->table[e];
3799 /* We cannot drop objects if the stream
3800 * buffer has been updated */
3801 if (entry->obj != NULL && entry->stm_buf == NULL)
3802 {
3803 if (pdf_obj_refs(ctx, entry->obj) == 1)
3804 {
3805 pdf_drop_obj(ctx, entry->obj);
3806 entry->obj = NULL;
3807 }
3808 }
3809 }
3810 }
3811 }
3812 }
3813
3814 void pdf_clear_xref_to_mark(fz_context *ctx, pdf_document *doc)
3815 {
3816 int x, e;
3817
3818 for (x = 0; x < doc->num_xref_sections; x++)
3819 {
3820 pdf_xref *xref = &doc->xref_sections[x];
3821 pdf_xref_subsec *sub;
3822
3823 for (sub = xref->subsec; sub != NULL; sub = sub->next)
3824 {
3825 for (e = 0; e < sub->len; e++)
3826 {
3827 pdf_xref_entry *entry = &sub->table[e];
3828
3829 /* We cannot drop objects if the stream buffer has
3830 * been updated */
3831 if (entry->obj != NULL && entry->stm_buf == NULL)
3832 {
3833 if (!entry->marked && pdf_obj_refs(ctx, entry->obj) == 1)
3834 {
3835 pdf_drop_obj(ctx, entry->obj);
3836 entry->obj = NULL;
3837 }
3838 }
3839 }
3840 }
3841 }
3842 }
3843
3844 int
3845 pdf_count_versions(fz_context *ctx, pdf_document *doc)
3846 {
3847 return doc->num_xref_sections-doc->num_incremental_sections-doc->has_linearization_object;
3848 }
3849
3850 int
3851 pdf_count_unsaved_versions(fz_context *ctx, pdf_document *doc)
3852 {
3853 return doc->num_incremental_sections;
3854 }
3855
3856 int
3857 pdf_doc_was_linearized(fz_context *ctx, pdf_document *doc)
3858 {
3859 return doc->has_linearization_object;
3860 }
3861
3862 static int pdf_obj_exists(fz_context *ctx, pdf_document *doc, int i)
3863 {
3864 pdf_xref_subsec *sub;
3865 int j;
3866
3867 if (i < 0)
3868 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Negative object number requested");
3869
3870 if (i <= doc->max_xref_len)
3871 j = doc->xref_index[i];
3872 else
3873 j = 0;
3874
3875 /* We may be accessing an earlier version of the document using xref_base
3876 * and j may be an index into a later xref section */
3877 if (doc->xref_base > j)
3878 j = doc->xref_base;
3879
3880 /* Find the first xref section where the entry is defined. */
3881 for (; j < doc->num_xref_sections; j++)
3882 {
3883 pdf_xref *xref = &doc->xref_sections[j];
3884
3885 if (i < xref->num_objects)
3886 {
3887 for (sub = xref->subsec; sub != NULL; sub = sub->next)
3888 {
3889 if (i < sub->start || i >= sub->start + sub->len)
3890 continue;
3891
3892 if (sub->table[i - sub->start].type)
3893 return 1;
3894 }
3895 }
3896 }
3897
3898 return 0;
3899 }
3900
/* Bit flags OR-ed together in the entries of pdf_changes.obj_changes. */
enum {
	FIELD_CHANGED = 1 << 0,
	FIELD_CHANGE_VALID = 1 << 1,
	FIELD_CHANGE_INVALID = 1 << 2
};
3906
3907 typedef struct
3908 {
3909 int num_obj;
3910 int obj_changes[FZ_FLEXIBLE_ARRAY];
3911 } pdf_changes;
3912
3913 static int
3914 check_unchanged_between(fz_context *ctx, pdf_document *doc, pdf_changes *changes, pdf_obj *nobj, pdf_obj *oobj)
3915 {
3916 int marked = 0;
3917 int changed = 0;
3918
3919 /* Trivially identical => trivially unchanged. */
3920 if (nobj == oobj)
3921 return 0;
3922
3923 /* Strictly speaking we shouldn't need to call fz_var,
3924 * but I suspect static analysis tools are not smart
3925 * enough to figure that out. */
3926 fz_var(marked);
3927
3928 if (pdf_is_indirect(ctx, nobj))
3929 {
3930 int o_xref_base = doc->xref_base;
3931
3932 /* Both must be indirect if one is. */
3933 if (!pdf_is_indirect(ctx, oobj))
3934 {
3935 changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
3936 return 1;
3937 }
3938
3939 /* Handle recursing back into ourselves. */
3940 if (pdf_obj_marked(ctx, nobj))
3941 {
3942 if (pdf_obj_marked(ctx, oobj))
3943 return 0;
3944 changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
3945 return 1;
3946 }
3947 else if (pdf_obj_marked(ctx, oobj))
3948 {
3949 changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
3950 return 1;
3951 }
3952
3953 nobj = pdf_resolve_indirect_chain(ctx, nobj);
3954 doc->xref_base = o_xref_base+1;
3955 fz_try(ctx)
3956 {
3957 oobj = pdf_resolve_indirect_chain(ctx, oobj);
3958 if (oobj != nobj)
3959 {
3960 /* Different objects, so lock them */
3961 if (!pdf_obj_marked(ctx, nobj) && !pdf_obj_marked(ctx, oobj))
3962 {
3963 (void)pdf_mark_obj(ctx, nobj);
3964 (void)pdf_mark_obj(ctx, oobj);
3965 marked = 1;
3966 }
3967 }
3968 }
3969 fz_always(ctx)
3970 doc->xref_base = o_xref_base;
3971 fz_catch(ctx)
3972 fz_rethrow(ctx);
3973
3974 if (nobj == oobj)
3975 return 0; /* Trivially identical */
3976 }
3977
3978 fz_var(changed);
3979
3980 fz_try(ctx)
3981 {
3982 if (pdf_is_dict(ctx, nobj))
3983 {
3984 int i, n = pdf_dict_len(ctx, nobj);
3985
3986 if (!pdf_is_dict(ctx, oobj) || n != pdf_dict_len(ctx, oobj))
3987 {
3988 change_found:
3989 changes->obj_changes[pdf_to_num(ctx, nobj)] |= FIELD_CHANGE_INVALID;
3990 changed = 1;
3991 break;
3992 }
3993
3994 for (i = 0; i < n; i++)
3995 {
3996 pdf_obj *key = pdf_dict_get_key(ctx, nobj, i);
3997 pdf_obj *nval = pdf_dict_get(ctx, nobj, key);
3998 pdf_obj *oval = pdf_dict_get(ctx, oobj, key);
3999
4000 changed |= check_unchanged_between(ctx, doc, changes, nval, oval);
4001 }
4002 }
4003 else if (pdf_is_array(ctx, nobj))
4004 {
4005 int i, n = pdf_array_len(ctx, nobj);
4006
4007 if (!pdf_is_array(ctx, oobj) || n != pdf_array_len(ctx, oobj))
4008 goto change_found;
4009
4010 for (i = 0; i < n; i++)
4011 {
4012 pdf_obj *nval = pdf_array_get(ctx, nobj, i);
4013 pdf_obj *oval = pdf_array_get(ctx, oobj, i);
4014
4015 changed |= check_unchanged_between(ctx, doc, changes, nval, oval);
4016 }
4017 }
4018 else if (pdf_objcmp(ctx, nobj, oobj))
4019 goto change_found;
4020 }
4021 fz_always(ctx)
4022 {
4023 if (marked)
4024 {
4025 pdf_unmark_obj(ctx, nobj);
4026 pdf_unmark_obj(ctx, oobj);
4027 }
4028 }
4029 fz_catch(ctx)
4030 fz_rethrow(ctx);
4031
4032 return changed;
4033 }
4034
/* Growable list of heap-allocated strings. */
typedef struct
{
	int max;	/* number of slots allocated in list */
	int len;	/* number of slots in use */
	char **list;	/* the strings; each one is owned by the list */
} char_list;
4041
4042 /* This structure is used to hold the definition of which fields
4043 * are locked. */
4044 struct pdf_locked_fields
4045 {
4046 int p;
4047 int all;
4048 char_list includes;
4049 char_list excludes;
4050 };
4051
4052 static void
4053 free_char_list(fz_context *ctx, char_list *c)
4054 {
4055 int i;
4056
4057 if (c == NULL)
4058 return;
4059
4060 for (i = c->len-1; i >= 0; i--)
4061 fz_free(ctx, c->list[i]);
4062 fz_free(ctx, c->list);
4063 c->len = 0;
4064 c->max = 0;
4065 }
4066
4067 void
4068 pdf_drop_locked_fields(fz_context *ctx, pdf_locked_fields *fl)
4069 {
4070 if (fl == NULL)
4071 return;
4072
4073 free_char_list(ctx, &fl->includes);
4074 free_char_list(ctx, &fl->excludes);
4075 fz_free(ctx, fl);
4076 }
4077
4078 static void
4079 char_list_append(fz_context *ctx, char_list *list, const char *s)
4080 {
4081 if (list->len == list->max)
4082 {
4083 int n = list->max * 2;
4084 if (n == 0) n = 4;
4085
4086 list->list = fz_realloc_array(ctx, list->list, n, char *);
4087 list->max = n;
4088 }
4089 list->list[list->len] = fz_strdup(ctx, s);
4090 list->len++;
4091 }
4092
4093 int
4094 pdf_is_field_locked(fz_context *ctx, pdf_locked_fields *locked, const char *name)
4095 {
4096 int i;
4097
4098 if (locked->p == 1)
4099 {
4100 /* Permissions were set, and say that field changes are not to be allowed. */
4101 return 1; /* Locked */
4102 }
4103
4104 if(locked->all)
4105 {
4106 /* The only way we might not be unlocked is if
4107 * we are listed in the excludes. */
4108 for (i = 0; i < locked->excludes.len; i++)
4109 if (!strcmp(locked->excludes.list[i], name))
4110 return 0;
4111 return 1;
4112 }
4113
4114 /* The only way we can be locked is for us to be in the includes. */
4115 for (i = 0; i < locked->includes.len; i++)
4116 if (strcmp(locked->includes.list[i], name) == 0)
4117 return 1;
4118
4119 /* Anything else is unlocked */
4120 return 0;
4121 }
4122
4123 /* Unfortunately, in C, there is no legal way to define a function
4124 * type that returns itself. We therefore have to use a struct
4125 * wrapper. */
4126 typedef struct filter_wrap
4127 {
4128 struct filter_wrap (*func)(fz_context *ctx, pdf_obj *dict, pdf_obj *key);
4129 } filter_wrap;
4130
4131 typedef struct filter_wrap (*filter_fn)(fz_context *ctx, pdf_obj *dict, pdf_obj *key);
4132
4133 #define RETURN_FILTER(f) { filter_wrap rf; rf.func = (f); return rf; }
4134
4135 static filter_wrap filter_simple(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4136 {
4137 RETURN_FILTER(NULL);
4138 }
4139
4140 static filter_wrap filter_transformparams(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4141 {
4142 if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
4143 pdf_name_eq(ctx, key, PDF_NAME(P)) ||
4144 pdf_name_eq(ctx, key, PDF_NAME(V)) ||
4145 pdf_name_eq(ctx, key, PDF_NAME(Document)) ||
4146 pdf_name_eq(ctx, key, PDF_NAME(Msg)) ||
4147 pdf_name_eq(ctx, key, PDF_NAME(V)) ||
4148 pdf_name_eq(ctx, key, PDF_NAME(Annots)) ||
4149 pdf_name_eq(ctx, key, PDF_NAME(Form)) ||
4150 pdf_name_eq(ctx, key, PDF_NAME(FormEx)) ||
4151 pdf_name_eq(ctx, key, PDF_NAME(EF)) ||
4152 pdf_name_eq(ctx, key, PDF_NAME(P)) ||
4153 pdf_name_eq(ctx, key, PDF_NAME(Action)) ||
4154 pdf_name_eq(ctx, key, PDF_NAME(Fields)))
4155 RETURN_FILTER(&filter_simple);
4156 RETURN_FILTER(NULL);
4157 }
4158
4159 static filter_wrap filter_reference(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4160 {
4161 if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
4162 pdf_name_eq(ctx, key, PDF_NAME(TransformMethod)) ||
4163 pdf_name_eq(ctx, key, PDF_NAME(DigestMethod)) ||
4164 pdf_name_eq(ctx, key, PDF_NAME(DigestValue)) ||
4165 pdf_name_eq(ctx, key, PDF_NAME(DigestLocation)))
4166 RETURN_FILTER(&filter_simple);
4167 if (pdf_name_eq(ctx, key, PDF_NAME(TransformParams)))
4168 RETURN_FILTER(&filter_transformparams);
4169 RETURN_FILTER(NULL);
4170 }
4171
4172 static filter_wrap filter_prop_build_sub(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4173 {
4174 if (pdf_name_eq(ctx, key, PDF_NAME(Name)) ||
4175 pdf_name_eq(ctx, key, PDF_NAME(Date)) ||
4176 pdf_name_eq(ctx, key, PDF_NAME(R)) ||
4177 pdf_name_eq(ctx, key, PDF_NAME(PreRelease)) ||
4178 pdf_name_eq(ctx, key, PDF_NAME(OS)) ||
4179 pdf_name_eq(ctx, key, PDF_NAME(NonEFontNoWarn)) ||
4180 pdf_name_eq(ctx, key, PDF_NAME(TrustedMode)) ||
4181 pdf_name_eq(ctx, key, PDF_NAME(V)) ||
4182 pdf_name_eq(ctx, key, PDF_NAME(REx)) ||
4183 pdf_name_eq(ctx, key, PDF_NAME(Preview)))
4184 RETURN_FILTER(&filter_simple);
4185 RETURN_FILTER(NULL);
4186 }
4187
4188 static filter_wrap filter_prop_build(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4189 {
4190 if (pdf_name_eq(ctx, key, PDF_NAME(Filter)) ||
4191 pdf_name_eq(ctx, key, PDF_NAME(PubSec)) ||
4192 pdf_name_eq(ctx, key, PDF_NAME(App)) ||
4193 pdf_name_eq(ctx, key, PDF_NAME(SigQ)))
4194 RETURN_FILTER(&filter_prop_build_sub);
4195 RETURN_FILTER(NULL);
4196 }
4197
4198 static filter_wrap filter_v(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4199 {
4200 /* Text can point to a stream object */
4201 if (pdf_name_eq(ctx, key, PDF_NAME(Length)) && pdf_is_stream(ctx, dict))
4202 RETURN_FILTER(&filter_simple);
4203 /* Sigs point to a dict. */
4204 if (pdf_name_eq(ctx, key, PDF_NAME(Type)) ||
4205 pdf_name_eq(ctx, key, PDF_NAME(Filter)) ||
4206 pdf_name_eq(ctx, key, PDF_NAME(SubFilter)) ||
4207 pdf_name_eq(ctx, key, PDF_NAME(Contents)) ||
4208 pdf_name_eq(ctx, key, PDF_NAME(Cert)) ||
4209 pdf_name_eq(ctx, key, PDF_NAME(ByteRange)) ||
4210 pdf_name_eq(ctx, key, PDF_NAME(Changes)) ||
4211 pdf_name_eq(ctx, key, PDF_NAME(Name)) ||
4212 pdf_name_eq(ctx, key, PDF_NAME(M)) ||
4213 pdf_name_eq(ctx, key, PDF_NAME(Location)) ||
4214 pdf_name_eq(ctx, key, PDF_NAME(Reason)) ||
4215 pdf_name_eq(ctx, key, PDF_NAME(ContactInfo)) ||
4216 pdf_name_eq(ctx, key, PDF_NAME(R)) ||
4217 pdf_name_eq(ctx, key, PDF_NAME(V)) ||
4218 pdf_name_eq(ctx, key, PDF_NAME(Prop_AuthTime)) ||
4219 pdf_name_eq(ctx, key, PDF_NAME(Prop_AuthType)))
4220 RETURN_FILTER(&filter_simple);
4221 if (pdf_name_eq(ctx, key, PDF_NAME(Reference)))
4222 RETURN_FILTER(filter_reference);
4223 if (pdf_name_eq(ctx, key, PDF_NAME(Prop_Build)))
4224 RETURN_FILTER(filter_prop_build);
4225 RETURN_FILTER(NULL);
4226 }
4227
4228 static filter_wrap filter_appearance(fz_context *ctx, pdf_obj *dict, pdf_obj *key);
4229
4230 static filter_wrap filter_xobject_list(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4231 {
4232 /* FIXME: Infinite recursion possible here? */
4233 RETURN_FILTER(&filter_appearance);
4234 }
4235
4236 static filter_wrap filter_font(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4237 {
4238 /* In the example I've seen the /Name field was dropped, so we'll allow
4239 * local changes, but none that follow an indirection. */
4240 RETURN_FILTER(NULL);
4241 }
4242
4243 /* FIXME: One idea here is to make filter_font_list and filter_xobject_list
4244 * only accept NEW objects as changes. Will think about this. */
4245 static filter_wrap filter_font_list(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4246 {
4247 RETURN_FILTER(&filter_font);
4248 }
4249
4250 static filter_wrap filter_resources(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4251 {
4252 if (pdf_name_eq(ctx, key, PDF_NAME(XObject)))
4253 RETURN_FILTER(&filter_xobject_list);
4254 if (pdf_name_eq(ctx, key, PDF_NAME(Font)))
4255 RETURN_FILTER(&filter_font_list);
4256 RETURN_FILTER(NULL);
4257 }
4258
4259 static filter_wrap filter_appearance(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4260 {
4261 if (pdf_name_eq(ctx, key, PDF_NAME(Resources)))
4262 RETURN_FILTER(&filter_resources);
4263 RETURN_FILTER(NULL);
4264 }
4265
4266 static filter_wrap filter_ap(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4267 {
4268 /* Just the /N entry for now. May need to add more later. */
4269 if (pdf_name_eq(ctx, key, PDF_NAME(N)) && pdf_is_stream(ctx, pdf_dict_get(ctx, dict, key)))
4270 RETURN_FILTER(&filter_appearance);
4271 RETURN_FILTER(NULL);
4272 }
4273
4274 static filter_wrap filter_xfa(fz_context *ctx, pdf_obj *dict, pdf_obj *key)
4275 {
4276 /* Text can point to a stream object */
4277 if (pdf_is_stream(ctx, dict))
4278 RETURN_FILTER(&filter_simple);
4279 RETURN_FILTER(NULL);
4280 }
4281
4282 static void
4283 filter_changes_accepted(fz_context *ctx, pdf_changes *changes, pdf_obj *obj, filter_fn filter)
4284 {
4285 int obj_num;
4286
4287 if (obj == NULL || pdf_obj_marked(ctx, obj))
4288 return;
4289
4290 obj_num = pdf_to_num(ctx, obj);
4291
4292 fz_try(ctx)
4293 {
4294 if (obj_num != 0)
4295 {
4296 (void)pdf_mark_obj(ctx, obj);
4297 changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
4298 }
4299 if (filter == NULL)
4300 break;
4301 if (pdf_is_dict(ctx, obj))
4302 {
4303 int i, n = pdf_dict_len(ctx, obj);
4304
4305 for (i = 0; i < n; i++)
4306 {
4307 pdf_obj *key = pdf_dict_get_key(ctx, obj, i);
4308 pdf_obj *val = pdf_dict_get_val(ctx, obj, i);
4309 filter_fn f = (filter(ctx, obj, key)).func;
4310 if (f != NULL)
4311 filter_changes_accepted(ctx, changes, val, f);
4312 }
4313 }
4314 else if (pdf_is_array(ctx, obj))
4315 {
4316 int i, n = pdf_array_len(ctx, obj);
4317
4318 for (i = 0; i < n; i++)
4319 {
4320 pdf_obj *val = pdf_array_get(ctx, obj, i);
4321 filter_changes_accepted(ctx, changes, val, filter);
4322 }
4323 }
4324 }
4325 fz_always(ctx)
4326 if (obj_num != 0)
4327 pdf_unmark_obj(ctx, obj);
4328 fz_catch(ctx)
4329 fz_rethrow(ctx);
4330 }
4331
4332 static void
4333 check_field(fz_context *ctx, pdf_document *doc, pdf_changes *changes, pdf_obj *obj, pdf_locked_fields *locked, const char *name_prefix, pdf_obj *new_v, pdf_obj *old_v)
4334 {
4335 pdf_obj *old_obj, *new_obj, *n_v, *o_v;
4336 int o_xref_base;
4337 int obj_num;
4338 char *field_name = NULL;
4339
4340 /* All fields MUST be indirections, either in the Fields array
4341 * or AcroForms, or in the Kids array of other Fields. */
4342 if (!pdf_is_indirect(ctx, obj))
4343 return;
4344
4345 obj_num = pdf_to_num(ctx, obj);
4346 o_xref_base = doc->xref_base;
4347 new_obj = pdf_resolve_indirect_chain(ctx, obj);
4348
4349 /* Similarly, all fields must be dicts */
4350 if (!pdf_is_dict(ctx, new_obj))
4351 return;
4352
4353 if (pdf_obj_marked(ctx, obj))
4354 return;
4355
4356 fz_var(field_name);
4357
4358 fz_try(ctx)
4359 {
4360 int i, len;
4361 const char *name;
4362 size_t n;
4363 pdf_obj *t;
4364 int is_locked;
4365
4366 (void)pdf_mark_obj(ctx, obj);
4367
4368 /* Do this within the try, so we can catch any problems */
4369 doc->xref_base = o_xref_base+1;
4370 old_obj = pdf_resolve_indirect_chain(ctx, obj);
4371
4372 t = pdf_dict_get(ctx, old_obj, PDF_NAME(T));
4373 if (t != NULL)
4374 {
4375 name = pdf_dict_get_text_string(ctx, old_obj, PDF_NAME(T));
4376 n = strlen(name)+1;
4377 if (*name_prefix)
4378 n += 1 + strlen(name_prefix);
4379 field_name = fz_malloc(ctx, n);
4380 if (*name_prefix)
4381 {
4382 strcpy(field_name, name_prefix);
4383 strcat(field_name, ".");
4384 }
4385 else
4386 *field_name = 0;
4387 strcat(field_name, name);
4388 name_prefix = field_name;
4389 }
4390
4391 doc->xref_base = o_xref_base;
4392
4393 if (!pdf_is_dict(ctx, old_obj))
4394 break;
4395
4396 /* Check V explicitly, allowing for it being inherited. */
4397 n_v = pdf_dict_get(ctx, new_obj, PDF_NAME(V));
4398 if (n_v == NULL)
4399 n_v = new_v;
4400 o_v = pdf_dict_get(ctx, old_obj, PDF_NAME(V));
4401 if (o_v == NULL)
4402 o_v = old_v;
4403
4404 is_locked = pdf_is_field_locked(ctx, locked, name_prefix);
4405 if (pdf_name_eq(ctx, pdf_dict_get(ctx, new_obj, PDF_NAME(Type)), PDF_NAME(Annot)) &&
4406 pdf_name_eq(ctx, pdf_dict_get(ctx, new_obj, PDF_NAME(Subtype)), PDF_NAME(Widget)))
4407 {
4408 if (is_locked)
4409 {
4410 /* If locked, V must not change! */
4411 if (check_unchanged_between(ctx, doc, changes, n_v, o_v))
4412 changes->obj_changes[obj_num] |= FIELD_CHANGE_INVALID;
4413 }
4414 else
4415 {
4416 /* If not locked, V can change to be filled in! */
4417 filter_changes_accepted(ctx, changes, n_v, &filter_v);
4418 changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
4419 }
4420 }
4421
4422 /* Check all the fields in the new object are
4423 * either the same as the old object, or are
4424 * expected changes. */
4425 len = pdf_dict_len(ctx, new_obj);
4426 for (i = 0; i < len; i++)
4427 {
4428 pdf_obj *key = pdf_dict_get_key(ctx, new_obj, i);
4429 pdf_obj *nval = pdf_dict_get(ctx, new_obj, key);
4430 pdf_obj *oval = pdf_dict_get(ctx, old_obj, key);
4431
4432 /* Kids arrays shouldn't change. */
4433 if (pdf_name_eq(ctx, key, PDF_NAME(Kids)))
4434 {
4435 int j, m;
4436
4437 /* Kids must be an array. If it's not, count it as a difference. */
4438 if (!pdf_is_array(ctx, nval) || !pdf_is_array(ctx, oval))
4439 {
4440 change_found:
4441 changes->obj_changes[obj_num] |= FIELD_CHANGE_INVALID;
4442 break;
4443 }
4444 m = pdf_array_len(ctx, nval);
4445 /* Any change in length counts as a difference */
4446 if (m != pdf_array_len(ctx, oval))
4447 goto change_found;
4448 for (j = 0; j < m; j++)
4449 {
4450 pdf_obj *nkid = pdf_array_get(ctx, nval, j);
4451 pdf_obj *okid = pdf_array_get(ctx, oval, j);
4452 /* Kids arrays are supposed to all be indirect. If they aren't,
4453 * count it as a difference. */
4454 if (!pdf_is_indirect(ctx, nkid) || !pdf_is_indirect(ctx, okid))
4455 goto change_found;
4456 /* For now at least, we'll count any change in number as a difference. */
4457 if (pdf_to_num(ctx, nkid) != pdf_to_num(ctx, okid))
4458 goto change_found;
4459 check_field(ctx, doc, changes, nkid, locked, name_prefix, n_v, o_v);
4460 }
4461 }
4462 else if (pdf_name_eq(ctx, key, PDF_NAME(V)))
4463 {
4464 /* V is checked above */
4465 }
4466 else if (pdf_name_eq(ctx, key, PDF_NAME(AP)))
4467 {
4468 /* If we're locked, then nothing can change. If not,
4469 * we can change to be filled in. */
4470 if (is_locked)
4471 check_unchanged_between(ctx, doc, changes, nval, oval);
4472 else
4473 filter_changes_accepted(ctx, changes, nval, &filter_ap);
4474 }
4475 /* All other fields can't change */
4476 else
4477 check_unchanged_between(ctx, doc, changes, nval, oval);
4478 }
4479
4480 /* Now check all the fields in the old object to
4481 * make sure none were dropped. */
4482 len = pdf_dict_len(ctx, old_obj);
4483 for (i = 0; i < len; i++)
4484 {
4485 pdf_obj *key = pdf_dict_get_key(ctx, old_obj, i);
4486 pdf_obj *nval, *oval;
4487
4488 /* V is checked above */
4489 if (pdf_name_eq(ctx, key, PDF_NAME(V)))
4490 continue;
4491
4492 nval = pdf_dict_get(ctx, new_obj, key);
4493 oval = pdf_dict_get(ctx, old_obj, key);
4494
4495 if (nval == NULL && oval != NULL)
4496 changes->obj_changes[pdf_to_num(ctx, nval)] |= FIELD_CHANGE_INVALID;
4497 }
4498 changes->obj_changes[obj_num] |= FIELD_CHANGE_VALID;
4499
4500 }
4501 fz_always(ctx)
4502 {
4503 pdf_unmark_obj(ctx, obj);
4504 fz_free(ctx, field_name);
4505 doc->xref_base = o_xref_base;
4506 }
4507 fz_catch(ctx)
4508 fz_rethrow(ctx);
4509 }
4510
4511 static int
4512 pdf_obj_changed_in_version(fz_context *ctx, pdf_document *doc, int num, int version)
4513 {
4514 if (num < 0 || num > doc->max_xref_len)
4515 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Invalid object number requested");
4516
4517 return version == doc->xref_index[num];
4518 }
4519
4520 static void
4521 merge_lock_specification(fz_context *ctx, pdf_locked_fields *fields, pdf_obj *lock)
4522 {
4523 pdf_obj *action;
4524 int i, r, w;
4525
4526 if (lock == NULL)
4527 return;
4528
4529 action = pdf_dict_get(ctx, lock, PDF_NAME(Action));
4530
4531 if (pdf_name_eq(ctx, action, PDF_NAME(All)))
4532 {
4533 /* All fields locked means we don't need any stored
4534 * includes/excludes. */
4535 fields->all = 1;
4536 free_char_list(ctx, &fields->includes);
4537 free_char_list(ctx, &fields->excludes);
4538 }
4539 else
4540 {
4541 pdf_obj *f = pdf_dict_get(ctx, lock, PDF_NAME(Fields));
4542 int len = pdf_array_len(ctx, f);
4543
4544 if (pdf_name_eq(ctx, action, PDF_NAME(Include)))
4545 {
4546 if (fields->all)
4547 {
4548 /* Current state = "All except <excludes> are locked".
4549 * We need to remove <Fields> from <excludes>. */
4550 for (i = 0; i < len; i++)
4551 {
4552 const char *s = pdf_array_get_text_string(ctx, f, i);
4553
4554 for (r = w = 0; r < fields->excludes.len; r++)
4555 {
4556 if (strcmp(s, fields->excludes.list[r]))
4557 fields->excludes.list[w++] = fields->excludes.list[r];
4558 }
4559 fields->excludes.len = w;
4560 }
4561 }
4562 else
4563 {
4564 /* Current state = <includes> are locked.
4565 * We need to add <Fields> to <include> (avoiding repetition). */
4566 for (i = 0; i < len; i++)
4567 {
4568 const char *s = pdf_array_get_text_string(ctx, f, i);
4569
4570 for (r = 0; r < fields->includes.len; r++)
4571 {
4572 if (!strcmp(s, fields->includes.list[r]))
4573 break;
4574 }
4575 if (r == fields->includes.len)
4576 char_list_append(ctx, &fields->includes, s);
4577 }
4578 }
4579 }
4580 else if (pdf_name_eq(ctx, action, PDF_NAME(Exclude)))
4581 {
4582 if (fields->all)
4583 {
4584 /* Current state = "All except <excludes> are locked.
4585 * We need to remove anything from <excludes> that isn't in <Fields>. */
4586 for (r = w = 0; r < fields->excludes.len; r++)
4587 {
4588 for (i = 0; i < len; i++)
4589 {
4590 const char *s = pdf_array_get_text_string(ctx, f, i);
4591 if (!strcmp(s, fields->excludes.list[r]))
4592 break;
4593 }
4594 if (i != len) /* we found a match */
4595 fields->excludes.list[w++] = fields->excludes.list[r];
4596 }
4597 fields->excludes.len = w;
4598 }
4599 else
4600 {
4601 /* Current state = <includes> are locked.
4602 * Set all. <excludes> becomes <Fields> less <includes>. Remove <includes>. */
4603 fields->all = 1;
4604 for (i = 0; i < len; i++)
4605 {
4606 const char *s = pdf_array_get_text_string(ctx, f, i);
4607 for (r = 0; r < fields->includes.len; r++)
4608 {
4609 if (!strcmp(s, fields->includes.list[r]))
4610 break;
4611 }
4612 if (r == fields->includes.len)
4613 char_list_append(ctx, &fields->excludes, s);
4614 }
4615 free_char_list(ctx, &fields->includes);
4616 }
4617 }
4618 }
4619 }
4620
4621 static void
4622 find_locked_fields_value(fz_context *ctx, pdf_locked_fields *fields, pdf_obj *v)
4623 {
4624 pdf_obj *ref = pdf_dict_get(ctx, v, PDF_NAME(Reference));
4625 int i, n;
4626
4627 if (!ref)
4628 return;
4629
4630 n = pdf_array_len(ctx, ref);
4631 for (i = 0; i < n; i++)
4632 {
4633 pdf_obj *sr = pdf_array_get(ctx, ref, i);
4634 pdf_obj *tm, *tp, *type;
4635
4636 /* Type is optional, but if it exists, it'd better be SigRef. */
4637 type = pdf_dict_get(ctx, sr, PDF_NAME(Type));
4638 if (type != NULL && !pdf_name_eq(ctx, type, PDF_NAME(SigRef)))
4639 continue;
4640 tm = pdf_dict_get(ctx, sr, PDF_NAME(TransformMethod));
4641 tp = pdf_dict_get(ctx, sr, PDF_NAME(TransformParams));
4642 if (pdf_name_eq(ctx, tm, PDF_NAME(DocMDP)))
4643 {
4644 int p = pdf_dict_get_int(ctx, tp, PDF_NAME(P));
4645
4646 if (p == 0)
4647 p = 2;
4648 if (fields->p == 0)
4649 fields->p = p;
4650 else
4651 fields->p = fz_mini(fields->p, p);
4652 }
4653 else if (pdf_name_eq(ctx, tm, PDF_NAME(FieldMDP)))
4654 merge_lock_specification(ctx, fields, tp);
4655 }
4656 }
4657
4658 static void
4659 find_locked_fields_aux(fz_context *ctx, pdf_obj *field, pdf_locked_fields *fields, pdf_obj *inherit_v, pdf_obj *inherit_ft)
4660 {
4661 int i, n;
4662
4663 if (!pdf_name_eq(ctx, pdf_dict_get(ctx, field, PDF_NAME(Type)), PDF_NAME(Annot)))
4664 return;
4665
4666 if (pdf_obj_marked(ctx, field))
4667 return;
4668
4669 fz_try(ctx)
4670 {
4671 pdf_obj *kids, *v, *ft;
4672
4673 (void)pdf_mark_obj(ctx, field);
4674
4675 v = pdf_dict_get(ctx, field, PDF_NAME(V));
4676 if (v == NULL)
4677 v = inherit_v;
4678 ft = pdf_dict_get(ctx, field, PDF_NAME(FT));
4679 if (ft == NULL)
4680 ft = inherit_ft;
4681
4682 /* We are looking for Widget annotations of type Sig that are
4683 * signed (i.e. have a 'V' field). */
4684 if (pdf_name_eq(ctx, pdf_dict_get(ctx, field, PDF_NAME(Subtype)), PDF_NAME(Widget)) &&
4685 pdf_name_eq(ctx, ft, PDF_NAME(Sig)) &&
4686 pdf_name_eq(ctx, pdf_dict_get(ctx, v, PDF_NAME(Type)), PDF_NAME(Sig)))
4687 {
4688 /* Signed Sig Widgets (i.e. ones with a 'V' field) need
4689 * to have their lock field respected. */
4690 merge_lock_specification(ctx, fields, pdf_dict_get(ctx, field, PDF_NAME(Lock)));
4691
4692 /* Look for DocMDP and FieldMDP entries to see what
4693 * flavours of alterations are allowed. */
4694 find_locked_fields_value(ctx, fields, v);
4695 }
4696
4697 /* Recurse as required */
4698 kids = pdf_dict_get(ctx, field, PDF_NAME(Kids));
4699 if (kids)
4700 {
4701 n = pdf_array_len(ctx, kids);
4702 for (i = 0; i < n; i++)
4703 find_locked_fields_aux(ctx, pdf_array_get(ctx, kids, i), fields, v, ft);
4704 }
4705 }
4706 fz_always(ctx)
4707 pdf_unmark_obj(ctx, field);
4708 fz_catch(ctx)
4709 fz_rethrow(ctx);
4710 }
4711
4712 pdf_locked_fields *
4713 pdf_find_locked_fields(fz_context *ctx, pdf_document *doc, int version)
4714 {
4715 pdf_locked_fields *fields = fz_malloc_struct(ctx, pdf_locked_fields);
4716 int o_xref_base = doc->xref_base;
4717 doc->xref_base = version;
4718
4719 fz_var(fields);
4720
4721 fz_try(ctx)
4722 {
4723 pdf_obj *fobj = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm/Fields");
4724 int i, len = pdf_array_len(ctx, fobj);
4725
4726 if (len == 0)
4727 break;
4728
4729 for (i = 0; i < len; i++)
4730 find_locked_fields_aux(ctx, pdf_array_get(ctx, fobj, i), fields, NULL, NULL);
4731
4732 /* Add in any DocMDP referenced directly from the Perms dict. */
4733 find_locked_fields_value(ctx, fields, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Perms/DocMDP"));
4734 }
4735 fz_always(ctx)
4736 doc->xref_base = o_xref_base;
4737 fz_catch(ctx)
4738 {
4739 pdf_drop_locked_fields(ctx, fields);
4740 fz_rethrow(ctx);
4741 }
4742
4743 return fields;
4744 }
4745
4746 pdf_locked_fields *
4747 pdf_find_locked_fields_for_sig(fz_context *ctx, pdf_document *doc, pdf_obj *sig)
4748 {
4749 pdf_locked_fields *fields = fz_malloc_struct(ctx, pdf_locked_fields);
4750
4751 fz_var(fields);
4752
4753 fz_try(ctx)
4754 {
4755 pdf_obj *ref;
4756 int i, len;
4757
4758 /* Ensure it really is a sig */
4759 if (!pdf_name_eq(ctx, pdf_dict_get(ctx, sig, PDF_NAME(Subtype)), PDF_NAME(Widget)) ||
4760 !pdf_name_eq(ctx, pdf_dict_get_inheritable(ctx, sig, PDF_NAME(FT)), PDF_NAME(Sig)))
4761 break;
4762
4763 /* Check the locking details given in the V (i.e. what the signature value
4764 * claims to lock). */
4765 ref = pdf_dict_getp(ctx, sig, "V/Reference");
4766 len = pdf_array_len(ctx, ref);
4767 for (i = 0; i < len; i++)
4768 {
4769 pdf_obj *tp = pdf_dict_get(ctx, pdf_array_get(ctx, ref, i), PDF_NAME(TransformParams));
4770 merge_lock_specification(ctx, fields, tp);
4771 }
4772
4773 /* Also, check the locking details given in the Signature definition. This may
4774 * not strictly be necessary as it's supposed to be "what the form author told
4775 * the signature that it should lock". A well-formed signature should lock
4776 * at least that much (possibly with extra fields locked from the XFA). If the
4777 * signature doesn't lock as much as it was told to, we should be suspicious
4778 * of the signing application. It is not clear that this test is actually
4779 * necessary, or in keeping with what Acrobat does. */
4780 merge_lock_specification(ctx, fields, pdf_dict_get(ctx, sig, PDF_NAME(Lock)));
4781 }
4782 fz_catch(ctx)
4783 {
4784 pdf_drop_locked_fields(ctx, fields);
4785 fz_rethrow(ctx);
4786 }
4787
4788 return fields;
4789 }
4790
4791 static int
4792 validate_locked_fields(fz_context *ctx, pdf_document *doc, int version, pdf_locked_fields *locked)
4793 {
4794 int o_xref_base = doc->xref_base;
4795 pdf_changes *changes;
4796 int num_objs;
4797 int i, n;
4798 int all_indirects = 1;
4799
4800 num_objs = doc->max_xref_len;
4801 changes = fz_malloc_flexible(ctx, pdf_changes, obj_changes, num_objs);
4802 changes->num_obj = num_objs;
4803
4804 fz_try(ctx)
4805 {
4806 pdf_obj *acroform, *new_acroform, *old_acroform;
4807 int len, acroform_num;
4808
4809 doc->xref_base = version;
4810
4811 /* Detect every object that has changed */
4812 for (i = 1; i < num_objs; i++)
4813 {
4814 if (pdf_obj_changed_in_version(ctx, doc, i, version))
4815 changes->obj_changes[i] = FIELD_CHANGED;
4816 }
4817
4818 /* FIXME: Compare PageTrees and NumberTrees (just to allow for them being regenerated
4819 * and having produced stuff that represents the same stuff). */
4820
4821 /* The metadata of a document may be regenerated. Allow for that. */
4822 filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Metadata"), &filter_simple);
4823
4824 /* The ModDate of document info may be regenerated. Allow for that. */
4825 /* FIXME: We accept all changes in document info, when maybe we ought to just
4826 * accept ModDate? */
4827 filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Info"), &filter_simple);
4828
4829 /* The Encryption dict may be rewritten for the new Xref. */
4830 filter_changes_accepted(ctx, changes, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Encrypt"), &filter_simple);
4831
4832 /* We have to accept certain changes in the top level AcroForms dict,
4833 * so get the 2 versions... */
4834 acroform = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm");
4835 acroform_num = pdf_to_num(ctx, acroform);
4836 new_acroform = pdf_resolve_indirect_chain(ctx, acroform);
4837 doc->xref_base = version+1;
4838 old_acroform = pdf_resolve_indirect_chain(ctx, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm"));
4839 doc->xref_base = version;
4840 n = pdf_dict_len(ctx, new_acroform);
4841 for (i = 0; i < n; i++)
4842 {
4843 pdf_obj *key = pdf_dict_get_key(ctx, new_acroform, i);
4844 pdf_obj *nval = pdf_dict_get(ctx, new_acroform, key);
4845 pdf_obj *oval = pdf_dict_get(ctx, old_acroform, key);
4846
4847 if (pdf_name_eq(ctx, key, PDF_NAME(Fields)))
4848 {
4849 int j;
4850
4851 len = pdf_array_len(ctx, nval);
4852 for (j = 0; j < len; j++)
4853 {
4854 pdf_obj *field = pdf_array_get(ctx, nval, j);
4855 if (!pdf_is_indirect(ctx, field))
4856 all_indirects = 0;
4857 check_field(ctx, doc, changes, field, locked, "", NULL, NULL);
4858 }
4859 }
4860 else if (pdf_name_eq(ctx, key, PDF_NAME(SigFlags)))
4861 {
4862 /* Accept this */
4863 changes->obj_changes[acroform_num] |= FIELD_CHANGE_VALID;
4864 }
4865 else if (pdf_name_eq(ctx, key, PDF_NAME(DR)))
4866 {
4867 /* Accept any changes from within the Document Resources */
4868 filter_changes_accepted(ctx, changes, nval, &filter_resources);
4869 }
4870 else if (pdf_name_eq(ctx, key, PDF_NAME(XFA)))
4871 {
4872 /* Allow any changes within the XFA streams. */
4873 filter_changes_accepted(ctx, changes, nval, &filter_xfa);
4874 }
4875 else if (pdf_objcmp(ctx, nval, oval))
4876 {
4877 changes->obj_changes[acroform_num] |= FIELD_CHANGE_INVALID;
4878 }
4879 }
4880
4881 /* Allow for any object streams/XRefs to be changed. */
4882 doc->xref_base = version+1;
4883 for (i = 1; i < num_objs; i++)
4884 {
4885 pdf_obj *oobj, *otype;
4886 if (changes->obj_changes[i] != FIELD_CHANGED)
4887 continue;
4888 if (!pdf_obj_exists(ctx, doc, i))
4889 {
4890 /* Not present this version - must be newly created, can't be a change. */
4891 changes->obj_changes[i] |= FIELD_CHANGE_VALID;
4892 continue;
4893 }
4894 oobj = pdf_load_object(ctx, doc, i);
4895 otype = pdf_dict_get(ctx, oobj, PDF_NAME(Type));
4896 if (pdf_name_eq(ctx, otype, PDF_NAME(ObjStm)) ||
4897 pdf_name_eq(ctx, otype, PDF_NAME(XRef)))
4898 {
4899 changes->obj_changes[i] |= FIELD_CHANGE_VALID;
4900 }
4901 pdf_drop_obj(ctx, oobj);
4902 }
4903 }
4904 fz_always(ctx)
4905 doc->xref_base = o_xref_base;
4906 fz_catch(ctx)
4907 {
4908 fz_free(ctx, changes);
4909 fz_rethrow(ctx);
4910 }
4911
4912 for (i = 1; i < num_objs; i++)
4913 {
4914 if (changes->obj_changes[i] == FIELD_CHANGED)
4915 /* Change with no reason */
4916 break;
4917 if (changes->obj_changes[i] & FIELD_CHANGE_INVALID)
4918 /* Illegal Change */
4919 break;
4920 }
4921
4922 fz_free(ctx, changes);
4923
4924 return (i == num_objs) && all_indirects;
4925 }
4926
4927 int
4928 pdf_validate_changes(fz_context *ctx, pdf_document *doc, int version)
4929 {
4930 int unsaved_versions = pdf_count_unsaved_versions(ctx, doc);
4931 int n = pdf_count_versions(ctx, doc);
4932 pdf_locked_fields *locked = NULL;
4933 int result;
4934
4935 if (version < 0 || version >= n)
4936 fz_throw(ctx, FZ_ERROR_ARGUMENT, "There aren't that many changes to find in this document!");
4937
4938 /* We are wanting to compare version+1 with version to make sure
4939 * that the only changes made in going to version are conformant
4940 * with what was allowed in version+1. The production of version
4941 * might have involved signing a signature field and locking down
4942 * more fields - this means that taking the list of locked things
4943 * from version rather than version+1 will give us bad results! */
4944 locked = pdf_find_locked_fields(ctx, doc, unsaved_versions+version+1);
4945
4946 fz_try(ctx)
4947 {
4948 if (!locked->all && locked->includes.len == 0 && locked->p == 0)
4949 {
4950 /* If nothing is locked at all, then all changes are permissible. */
4951 result = 1;
4952 }
4953 else
4954 result = validate_locked_fields(ctx, doc, unsaved_versions+version, locked);
4955 }
4956 fz_always(ctx)
4957 pdf_drop_locked_fields(ctx, locked);
4958 fz_catch(ctx)
4959 fz_rethrow(ctx);
4960
4961 return result;
4962 }
4963
4964 int
4965 pdf_validate_change_history(fz_context *ctx, pdf_document *doc)
4966 {
4967 int num_versions = pdf_count_versions(ctx, doc);
4968 int v;
4969
4970 if (num_versions < 2)
4971 return 0; /* Unless there are at least 2 versions, there have been no updates. */
4972
4973 for(v = num_versions - 2; v >= 0; v--)
4974 {
4975 if (!pdf_validate_changes(ctx, doc, v))
4976 return v+1;
4977 }
4978 return 0;
4979 }
4980
4981 /* Return the version that obj appears in, or -1 for not found. */
4982 static int
4983 pdf_find_incremental_update_num_for_obj(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
4984 {
4985 pdf_xref *xref = NULL;
4986 pdf_xref_subsec *sub;
4987 int i, j;
4988
4989 if (obj == NULL)
4990 return -1;
4991
4992 /* obj needs to be indirect for us to get a num out of it. */
4993 i = pdf_to_num(ctx, obj);
4994 if (i <= 0)
4995 return -1;
4996
4997 /* obj can't be indirect below, so resolve it here. */
4998 obj = pdf_resolve_indirect_chain(ctx, obj);
4999
5000 /* Find the first xref section where the entry is defined. */
5001 for (j = 0; j < doc->num_xref_sections; j++)
5002 {
5003 xref = &doc->xref_sections[j];
5004
5005 if (i < xref->num_objects)
5006 {
5007 for (sub = xref->subsec; sub != NULL; sub = sub->next)
5008 {
5009 pdf_xref_entry *entry;
5010
5011 if (i < sub->start || i >= sub->start + sub->len)
5012 continue;
5013
5014 entry = &sub->table[i - sub->start];
5015 if (entry->obj == obj)
5016 return j;
5017 }
5018 }
5019 }
5020 return -1;
5021 }
5022
5023 int pdf_find_version_for_obj(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
5024 {
5025 int v = pdf_find_incremental_update_num_for_obj(ctx, doc, obj);
5026 int n;
5027
5028 if (v == -1)
5029 return -1;
5030
5031 n = pdf_count_versions(ctx, doc) + pdf_count_unsaved_versions(ctx, doc);
5032 if (v > n)
5033 return n;
5034
5035 return v;
5036 }
5037
5038 int pdf_validate_signature(fz_context *ctx, pdf_annot *widget)
5039 {
5040 pdf_document *doc;
5041 int unsaved_versions, num_versions, version, i;
5042 pdf_locked_fields *locked = NULL;
5043 int o_xref_base;
5044
5045 if (!widget->page)
5046 fz_throw(ctx, FZ_ERROR_ARGUMENT, "annotation not bound to any page");
5047
5048 doc = widget->page->doc;
5049 unsaved_versions = pdf_count_unsaved_versions(ctx, doc);
5050 num_versions = pdf_count_versions(ctx, doc) + unsaved_versions;
5051 version = pdf_find_version_for_obj(ctx, doc, widget->obj);
5052
5053 if (version > num_versions-1)
5054 version = num_versions-1;
5055
5056 /* Get the locked definition from the object when it was signed. */
5057 o_xref_base = doc->xref_base;
5058 doc->xref_base = version;
5059
5060 fz_var(locked); /* Not really needed, but it stops warnings */
5061
5062 fz_try(ctx)
5063 {
5064 locked = pdf_find_locked_fields_for_sig(ctx, doc, widget->obj);
5065 for (i = version-1; i >= unsaved_versions; i--)
5066 {
5067 doc->xref_base = i;
5068 if (!validate_locked_fields(ctx, doc, i, locked))
5069 break;
5070 }
5071 }
5072 fz_always(ctx)
5073 {
5074 doc->xref_base = o_xref_base;
5075 pdf_drop_locked_fields(ctx, locked);
5076 }
5077 fz_catch(ctx)
5078 fz_rethrow(ctx);
5079
5080 return i+1-unsaved_versions;
5081 }
5082
5083 int pdf_was_pure_xfa(fz_context *ctx, pdf_document *doc)
5084 {
5085 int num_unsaved_versions = pdf_count_unsaved_versions(ctx, doc);
5086 int num_versions = pdf_count_versions(ctx, doc);
5087 int v;
5088 int o_xref_base = doc->xref_base;
5089 int pure_xfa = 0;
5090
5091 fz_var(pure_xfa);
5092
5093 fz_try(ctx)
5094 {
5095 for(v = num_versions + num_unsaved_versions; !pure_xfa && v >= num_unsaved_versions; v--)
5096 {
5097 pdf_obj *o;
5098 doc->xref_base = v;
5099 o = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/AcroForm");
5100 /* If we find a version that had an empty Root/AcroForm/Fields, but had a
5101 * Root/AcroForm/XFA entry, then we deduce that this was at one time a
5102 * pure XFA form. */
5103 if (pdf_array_len(ctx, pdf_dict_get(ctx, o, PDF_NAME(Fields))) == 0 &&
5104 pdf_dict_get(ctx, o, PDF_NAME(XFA)) != NULL)
5105 pure_xfa = 1;
5106 }
5107 }
5108 fz_always(ctx)
5109 doc->xref_base = o_xref_base;
5110 fz_catch(ctx)
5111 fz_rethrow(ctx);
5112
5113 return pure_xfa;
5114 }
5115
5116 pdf_xref *pdf_new_local_xref(fz_context *ctx, pdf_document *doc)
5117 {
5118 int n = pdf_xref_len(ctx, doc);
5119 pdf_xref *xref = fz_malloc_struct(ctx, pdf_xref);
5120
5121 xref->subsec = NULL;
5122 xref->num_objects = n;
5123 xref->trailer = NULL;
5124 xref->pre_repair_trailer = NULL;
5125 xref->unsaved_sigs = NULL;
5126 xref->unsaved_sigs_end = NULL;
5127
5128 fz_try(ctx)
5129 {
5130 xref->subsec = fz_malloc_struct(ctx, pdf_xref_subsec);
5131 xref->subsec->len = n;
5132 xref->subsec->start = 0;
5133 xref->subsec->table = fz_malloc_struct_array(ctx, n, pdf_xref_entry);
5134 xref->subsec->next = NULL;
5135 }
5136 fz_catch(ctx)
5137 {
5138 fz_free(ctx, xref->subsec);
5139 fz_free(ctx, xref);
5140 fz_rethrow(ctx);
5141 }
5142
5143 return xref;
5144 }
5145
5146 void pdf_drop_local_xref(fz_context *ctx, pdf_xref *xref)
5147 {
5148 if (xref == NULL)
5149 return;
5150
5151 pdf_drop_xref_subsec(ctx, xref);
5152
5153 fz_free(ctx, xref);
5154 }
5155
5156 void pdf_drop_local_xref_and_resources(fz_context *ctx, pdf_document *doc)
5157 {
5158 pdf_purge_local_resources(ctx, doc);
5159 pdf_purge_locals_from_store(ctx, doc);
5160 pdf_drop_local_xref(ctx, doc->local_xref);
5161 doc->local_xref = NULL;
5162 doc->resynth_required = 1;
5163 }
5164
5165 void
5166 pdf_debug_doc_changes(fz_context *ctx, pdf_document *doc)
5167 {
5168 int i, j;
5169
5170 if (doc->num_incremental_sections == 0)
5171 fz_write_printf(ctx, fz_stddbg(ctx), "No incremental xrefs");
5172 else
5173 {
5174 for (i = 0; i < doc->num_incremental_sections; i++)
5175 {
5176 pdf_xref *xref = &doc->xref_sections[i];
5177 pdf_xref_subsec *sub;
5178
5179 fz_write_printf(ctx, fz_stddbg(ctx), "Incremental xref:\n");
5180 for (sub = xref->subsec; sub != NULL; sub = sub->next)
5181 {
5182 fz_write_printf(ctx, fz_stddbg(ctx), " Objects %d->%d\n", sub->start, sub->start + sub->len - 1);
5183 for (j = 0; j < sub->len; j++)
5184 {
5185 pdf_xref_entry *e = &sub->table[j];
5186 if (e->type == 0)
5187 continue;
5188 fz_write_printf(ctx, fz_stddbg(ctx), "%d %d obj (%c)\n", j + sub->start, e->gen, e->type);
5189 pdf_debug_obj(ctx, e->obj);
5190 fz_write_printf(ctx, fz_stddbg(ctx), "\nendobj\n");
5191 }
5192 }
5193 }
5194 }
5195
5196 if (doc->local_xref == NULL)
5197 fz_write_printf(ctx, fz_stddbg(ctx), "No local xref");
5198 else
5199 {
5200 for (i = 0; i < doc->num_incremental_sections; i++)
5201 {
5202 pdf_xref *xref = doc->local_xref;
5203 pdf_xref_subsec *sub;
5204
5205 fz_write_printf(ctx, fz_stddbg(ctx), "Local xref (%sin force):\n", doc->local_xref_nesting == 0 ? "not " : "");
5206 for (sub = xref->subsec; sub != NULL; sub = sub->next)
5207 {
5208 fz_write_printf(ctx, fz_stddbg(ctx), " Objects %d->%d\n", sub->start, sub->start + sub->len - 1);
5209 for (j = 0; j < sub->len; j++)
5210 {
5211 pdf_xref_entry *e = &sub->table[j];
5212 if (e->type == 0)
5213 continue;
5214 fz_write_printf(ctx, fz_stddbg(ctx), "%d %d obj (%c)\n", j + sub->start, e->gen, e->type);
5215 pdf_debug_obj(ctx, e->obj);
5216 fz_write_printf(ctx, fz_stddbg(ctx), "\nendobj\n");
5217 }
5218 }
5219 }
5220 }
5221
5222 }
5223
5224 pdf_obj *
5225 pdf_metadata(fz_context *ctx, pdf_document *doc)
5226 {
5227 int initial = doc->xref_base;
5228 pdf_obj *obj = NULL;
5229
5230 fz_var(obj);
5231
5232 fz_try(ctx)
5233 {
5234 do
5235 {
5236 pdf_obj *root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
5237 obj = pdf_dict_get(ctx, root, PDF_NAME(Metadata));
5238 if (obj)
5239 break;
5240 doc->xref_base++;
5241 }
5242 while (doc->xref_base < doc->num_xref_sections);
5243 }
5244 fz_always(ctx)
5245 doc->xref_base = initial;
5246 fz_catch(ctx)
5247 fz_rethrow(ctx);
5248
5249 return obj;
5250 }
5251
5252 int pdf_obj_is_incremental(fz_context *ctx, pdf_obj *obj)
5253 {
5254 pdf_document *doc = pdf_get_bound_document(ctx, obj);
5255 int v;
5256
5257 if (doc == NULL || doc->num_incremental_sections == 0)
5258 return 0;
5259
5260 v = pdf_find_incremental_update_num_for_obj(ctx, doc, obj);
5261
5262 return (v == 0);
5263 }
5264
5265 void pdf_minimize_document(fz_context *ctx, pdf_document *doc)
5266 {
5267 int i;
5268
5269 /* Don't throw anything away if we've done a repair! */
5270 if (doc == NULL || doc->repair_attempted)
5271 return;
5272
5273 /* Don't throw anything away in the incremental section, as that's where
5274 * all our changes will be. */
5275 for (i = doc->num_incremental_sections; i < doc->num_xref_sections; i++)
5276 {
5277 pdf_xref *xref = &doc->xref_sections[i];
5278 pdf_xref_subsec *sub;
5279
5280 for (sub = xref->subsec; sub; sub = sub->next)
5281 {
5282 int len = sub->len;
5283 int j;
5284 for (j = 0; j < len; j++)
5285 {
5286 pdf_xref_entry *e = &sub->table[j];
5287 if (e->obj == NULL)
5288 continue;
5289 e->obj = pdf_drop_singleton_obj(ctx, e->obj);
5290 }
5291 }
5292 }
5293 }
5294
5295 void pdf_repair_xref(fz_context *ctx, pdf_document *doc)
5296 {
5297 pdf_repair_xref_aux(ctx, doc, pdf_prime_xref_index);
5298 }