Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/pdf/pdf-clean-file.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 #include "mupdf/pdf.h" | |
| 25 | |
| 26 #include <string.h> | |
| 27 | |
| 28 static int | |
| 29 string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list) | |
| 30 { | |
| 31 int n = pdf_array_len(ctx, names_list); | |
| 32 int i; | |
| 33 char *str = pdf_to_str_buf(ctx, p); | |
| 34 | |
| 35 for (i = 0; i < n ; i += 2) | |
| 36 { | |
| 37 if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str)) | |
| 38 return 1; | |
| 39 } | |
| 40 return 0; | |
| 41 } | |
| 42 | |
| 43 /* | |
| 44 * Recreate page tree to only retain specified pages. | |
| 45 */ | |
| 46 | |
| 47 static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parentobj, pdf_obj *kids, int page, pdf_obj *structparents, pdf_obj *ostructparents) | |
| 48 { | |
| 49 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page); | |
| 50 | |
| 51 pdf_flatten_inheritable_page_items(ctx, pageref); | |
| 52 | |
| 53 pdf_dict_put(ctx, pageref, PDF_NAME(Parent), parentobj); | |
| 54 | |
| 55 /* Store page object in new kids array */ | |
| 56 pdf_array_push(ctx, kids, pageref); | |
| 57 | |
| 58 if (structparents) | |
| 59 { | |
| 60 int parentnum = pdf_dict_get_int(ctx, pageref, PDF_NAME(StructParents)); | |
| 61 pdf_obj *parent = pdf_lookup_number(ctx, ostructparents, parentnum); | |
| 62 pdf_obj *nums = pdf_dict_get(ctx, structparents, PDF_NAME(Nums)); | |
| 63 pdf_obj *limits = pdf_dict_get(ctx, structparents, PDF_NAME(Limits)); | |
| 64 int min, max; | |
| 65 pdf_array_push_int(ctx, nums, parentnum); | |
| 66 pdf_array_push(ctx, nums, parent); | |
| 67 if (limits == NULL) | |
| 68 { | |
| 69 min = max = parentnum; | |
| 70 limits = pdf_new_array(ctx, doc, 2); | |
| 71 pdf_dict_put_drop(ctx, structparents, PDF_NAME(Limits), limits); | |
| 72 } | |
| 73 else | |
| 74 { | |
| 75 min = pdf_array_get_int(ctx, limits, 0); | |
| 76 max = pdf_array_get_int(ctx, limits, 1); | |
| 77 if (min > parentnum) | |
| 78 min = parentnum; | |
| 79 if (max < parentnum) | |
| 80 max = parentnum; | |
| 81 } | |
| 82 pdf_array_put_int(ctx, limits, 0, min); | |
| 83 pdf_array_put_int(ctx, limits, 1, max); | |
| 84 } | |
| 85 } | |
| 86 | |
| 87 static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount) | |
| 88 { | |
| 89 int i; | |
| 90 int num = pdf_to_num(ctx, obj); | |
| 91 | |
| 92 if (num == 0) | |
| 93 return 0; | |
| 94 for (i = 0; i < pagecount; i++) | |
| 95 { | |
| 96 if (page_object_nums[i] == num) | |
| 97 return 1; | |
| 98 } | |
| 99 return 0; | |
| 100 } | |
| 101 | |
| 102 static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list) | |
| 103 { | |
| 104 pdf_obj *p; | |
| 105 | |
| 106 p = pdf_dict_get(ctx, o, PDF_NAME(A)); | |
| 107 if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME(S)), PDF_NAME(GoTo))) | |
| 108 { | |
| 109 pdf_obj *d = pdf_dict_get(ctx, p, PDF_NAME(D)); | |
| 110 if (pdf_is_array(ctx, d) && !dest_is_valid_page(ctx, pdf_array_get(ctx, d, 0), page_object_nums, page_count)) | |
| 111 return 0; | |
| 112 else if (pdf_is_string(ctx, d) && !string_in_names_list(ctx, d, names_list)) | |
| 113 return 0; | |
| 114 } | |
| 115 | |
| 116 p = pdf_dict_get(ctx, o, PDF_NAME(Dest)); | |
| 117 if (p == NULL) | |
| 118 return 1; /* A name with no dest counts as valid. */ | |
| 119 else if (pdf_is_string(ctx, p)) | |
| 120 return string_in_names_list(ctx, p, names_list); | |
| 121 else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count)) | |
| 122 return 0; | |
| 123 | |
| 124 return 1; | |
| 125 } | |
| 126 | |
| 127 static int strip_stale_annot_refs(fz_context *ctx, pdf_obj *field, int page_count, int *page_object_nums) | |
| 128 { | |
| 129 pdf_obj *kids = pdf_dict_get(ctx, field, PDF_NAME(Kids)); | |
| 130 int len = pdf_array_len(ctx, kids); | |
| 131 int j; | |
| 132 | |
| 133 if (kids) | |
| 134 { | |
| 135 for (j = 0; j < len; j++) | |
| 136 { | |
| 137 if (strip_stale_annot_refs(ctx, pdf_array_get(ctx, kids, j), page_count, page_object_nums)) | |
| 138 { | |
| 139 pdf_array_delete(ctx, kids, j); | |
| 140 len--; | |
| 141 j--; | |
| 142 } | |
| 143 } | |
| 144 | |
| 145 return pdf_array_len(ctx, kids) == 0; | |
| 146 } | |
| 147 else | |
| 148 { | |
| 149 pdf_obj *page = pdf_dict_get(ctx, field, PDF_NAME(P)); | |
| 150 int page_num = pdf_to_num(ctx, page); | |
| 151 | |
| 152 for (j = 0; j < page_count; j++) | |
| 153 if (page_num == page_object_nums[j]) | |
| 154 return 0; | |
| 155 | |
| 156 return 1; | |
| 157 } | |
| 158 } | |
| 159 | |
| 160 static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_mark_bits *marks); | |
| 161 | |
| 162 static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast, pdf_mark_bits *marks) | |
| 163 { | |
| 164 pdf_obj *prev = NULL; | |
| 165 pdf_obj *first = NULL; | |
| 166 pdf_obj *current; | |
| 167 int count = 0; | |
| 168 | |
| 169 for (current = outlines; current != NULL; ) | |
| 170 { | |
| 171 int nc; | |
| 172 | |
| 173 /* Strip any children to start with. This takes care of | |
| 174 * First/Last/Count for us. */ | |
| 175 nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list, marks); | |
| 176 | |
| 177 if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list)) | |
| 178 { | |
| 179 if (nc == 0) | |
| 180 { | |
| 181 /* Outline with invalid dest and no children. Drop it by | |
| 182 * pulling the next one in here. */ | |
| 183 pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME(Next)); | |
| 184 if (!pdf_is_dict(ctx, next)) | |
| 185 { | |
| 186 /* There is no next one to pull in */ | |
| 187 if (prev != NULL) | |
| 188 pdf_dict_del(ctx, prev, PDF_NAME(Next)); | |
| 189 } | |
| 190 else if (prev != NULL) | |
| 191 { | |
| 192 pdf_dict_put(ctx, prev, PDF_NAME(Next), next); | |
| 193 pdf_dict_put(ctx, next, PDF_NAME(Prev), prev); | |
| 194 } | |
| 195 else | |
| 196 { | |
| 197 pdf_dict_del(ctx, next, PDF_NAME(Prev)); | |
| 198 } | |
| 199 current = next; | |
| 200 } | |
| 201 else | |
| 202 { | |
| 203 /* Outline with invalid dest, but children. Just drop the dest. */ | |
| 204 pdf_dict_del(ctx, current, PDF_NAME(Dest)); | |
| 205 pdf_dict_del(ctx, current, PDF_NAME(A)); | |
| 206 current = pdf_dict_get(ctx, current, PDF_NAME(Next)); | |
| 207 } | |
| 208 } | |
| 209 else | |
| 210 { | |
| 211 /* Keep this one */ | |
| 212 if (first == NULL) | |
| 213 first = current; | |
| 214 prev = current; | |
| 215 current = pdf_dict_get(ctx, current, PDF_NAME(Next)); | |
| 216 count++; | |
| 217 } | |
| 218 } | |
| 219 | |
| 220 *pfirst = first; | |
| 221 *plast = prev; | |
| 222 | |
| 223 return count; | |
| 224 } | |
| 225 | |
| 226 static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_mark_bits *marks) | |
| 227 { | |
| 228 int nc; | |
| 229 pdf_obj *first; | |
| 230 pdf_obj *last; | |
| 231 | |
| 232 if (!pdf_is_dict(ctx, outlines)) | |
| 233 return 0; | |
| 234 | |
| 235 if (pdf_mark_bits_set(ctx, marks, outlines)) | |
| 236 fz_throw(ctx, FZ_ERROR_FORMAT, "Cycle detected in outlines"); | |
| 237 | |
| 238 first = pdf_dict_get(ctx, outlines, PDF_NAME(First)); | |
| 239 if (!pdf_is_dict(ctx, first)) | |
| 240 nc = 0; | |
| 241 else | |
| 242 nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last, marks); | |
| 243 | |
| 244 if (nc == 0) | |
| 245 { | |
| 246 pdf_dict_del(ctx, outlines, PDF_NAME(First)); | |
| 247 pdf_dict_del(ctx, outlines, PDF_NAME(Last)); | |
| 248 pdf_dict_del(ctx, outlines, PDF_NAME(Count)); | |
| 249 } | |
| 250 else | |
| 251 { | |
| 252 int old_count = pdf_dict_get_int(ctx, outlines, PDF_NAME(Count)); | |
| 253 pdf_dict_put(ctx, outlines, PDF_NAME(First), first); | |
| 254 pdf_dict_put(ctx, outlines, PDF_NAME(Last), last); | |
| 255 pdf_dict_put_int(ctx, outlines, PDF_NAME(Count), old_count > 0 ? nc : -nc); | |
| 256 } | |
| 257 | |
| 258 return nc; | |
| 259 } | |
| 260 | |
| 261 static void pdf_rearrange_pages_imp(fz_context *ctx, pdf_document *doc, int count, const int *new_page_list, pdf_clean_options_structure structure) | |
| 262 { | |
| 263 pdf_obj *oldroot, *pages, *kids, *olddests; | |
| 264 pdf_obj *root = NULL; | |
| 265 pdf_obj *names_list = NULL; | |
| 266 pdf_obj *outlines; | |
| 267 pdf_obj *ocproperties; | |
| 268 pdf_obj *allfields = NULL; | |
| 269 int pagecount, i; | |
| 270 int *page_object_nums = NULL; | |
| 271 pdf_obj *structtreeroot = NULL; | |
| 272 pdf_obj *ostructparents = NULL; | |
| 273 pdf_obj *structparents = NULL; | |
| 274 pdf_mark_bits *marks = NULL; | |
| 275 | |
| 276 /* Keep only pages/type and (reduced) dest entries to avoid | |
| 277 * references to unretained pages */ | |
| 278 oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)); | |
| 279 pages = pdf_dict_get(ctx, oldroot, PDF_NAME(Pages)); | |
| 280 olddests = pdf_load_name_tree(ctx, doc, PDF_NAME(Dests)); | |
| 281 outlines = pdf_dict_get(ctx, oldroot, PDF_NAME(Outlines)); | |
| 282 ocproperties = pdf_dict_get(ctx, oldroot, PDF_NAME(OCProperties)); | |
| 283 if (structure == PDF_CLEAN_STRUCTURE_KEEP) | |
| 284 { | |
| 285 structtreeroot = pdf_dict_get(ctx, oldroot, PDF_NAME(StructTreeRoot)); | |
| 286 ostructparents = pdf_dict_get(ctx, structtreeroot, PDF_NAME(ParentTree)); | |
| 287 if (structtreeroot) | |
| 288 structparents = pdf_new_dict(ctx, doc, 3); | |
| 289 } | |
| 290 | |
| 291 fz_var(root); | |
| 292 fz_var(names_list); | |
| 293 fz_var(allfields); | |
| 294 fz_var(page_object_nums); | |
| 295 fz_var(kids); | |
| 296 fz_var(marks); | |
| 297 | |
| 298 fz_try(ctx) | |
| 299 { | |
| 300 root = pdf_new_dict(ctx, doc, 3); | |
| 301 pdf_dict_put(ctx, root, PDF_NAME(Type), pdf_dict_get(ctx, oldroot, PDF_NAME(Type))); | |
| 302 pdf_dict_put(ctx, root, PDF_NAME(Pages), pdf_dict_get(ctx, oldroot, PDF_NAME(Pages))); | |
| 303 if (structtreeroot) | |
| 304 { | |
| 305 pdf_dict_put(ctx, root, PDF_NAME(StructTreeRoot), structtreeroot); | |
| 306 pdf_dict_put(ctx, structtreeroot, PDF_NAME(ParentTree), structparents); | |
| 307 pdf_dict_put_array(ctx, structparents, PDF_NAME(Nums), 2); | |
| 308 } | |
| 309 if (outlines) | |
| 310 pdf_dict_put(ctx, root, PDF_NAME(Outlines), outlines); | |
| 311 if (ocproperties) | |
| 312 pdf_dict_put(ctx, root, PDF_NAME(OCProperties), ocproperties); | |
| 313 | |
| 314 pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); | |
| 315 | |
| 316 /* Create a new kids array with only the pages we want to keep */ | |
| 317 kids = pdf_new_array(ctx, doc, 1); | |
| 318 | |
| 319 /* Retain pages specified */ | |
| 320 for (i = 0; i < count; ++i) | |
| 321 retainpage(ctx, doc, pages, kids, new_page_list[i], structparents, ostructparents); | |
| 322 | |
| 323 /* Update page count */ | |
| 324 pdf_dict_put_int(ctx, pages, PDF_NAME(Count), pdf_array_len(ctx, kids)); | |
| 325 pdf_dict_put(ctx, pages, PDF_NAME(Kids), kids); | |
| 326 | |
| 327 pagecount = pdf_count_pages(ctx, doc); | |
| 328 page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums)); | |
| 329 for (i = 0; i < pagecount; i++) | |
| 330 { | |
| 331 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); | |
| 332 page_object_nums[i] = pdf_to_num(ctx, pageref); | |
| 333 } | |
| 334 | |
| 335 /* If we had an old Dests tree (now reformed as an olddests | |
| 336 * dictionary), keep any entries in there that point to | |
| 337 * valid pages. This may mean we keep more than we need, but | |
| 338 * it's safe at least. */ | |
| 339 if (olddests) | |
| 340 { | |
| 341 pdf_obj *names, *dests; | |
| 342 int len = pdf_dict_len(ctx, olddests); | |
| 343 | |
| 344 names = pdf_dict_put_dict(ctx, root, PDF_NAME(Names), 1); | |
| 345 dests = pdf_dict_put_dict(ctx, names, PDF_NAME(Dests), 1); | |
| 346 names_list = pdf_dict_put_array(ctx, dests, PDF_NAME(Names), 32); | |
| 347 | |
| 348 for (i = 0; i < len; i++) | |
| 349 { | |
| 350 pdf_obj *key = pdf_dict_get_key(ctx, olddests, i); | |
| 351 pdf_obj *val = pdf_dict_get_val(ctx, olddests, i); | |
| 352 pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME(D)); | |
| 353 | |
| 354 dest = pdf_array_get(ctx, dest ? dest : val, 0); | |
| 355 if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount)) | |
| 356 { | |
| 357 pdf_array_push_string(ctx, names_list, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); | |
| 358 pdf_array_push(ctx, names_list, val); | |
| 359 } | |
| 360 } | |
| 361 | |
| 362 pdf_drop_obj(ctx, olddests); | |
| 363 } | |
| 364 | |
| 365 /* Edit each pages /Annot list to remove any links that point to nowhere. */ | |
| 366 for (i = 0; i < pagecount; i++) | |
| 367 { | |
| 368 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); | |
| 369 | |
| 370 pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots)); | |
| 371 | |
| 372 int len = pdf_array_len(ctx, annots); | |
| 373 int j; | |
| 374 | |
| 375 for (j = 0; j < len; j++) | |
| 376 { | |
| 377 pdf_obj *o = pdf_array_get(ctx, annots, j); | |
| 378 | |
| 379 if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME(Subtype)), PDF_NAME(Link))) | |
| 380 continue; | |
| 381 | |
| 382 if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list)) | |
| 383 { | |
| 384 /* Remove this annotation */ | |
| 385 pdf_array_delete(ctx, annots, j); | |
| 386 len--; | |
| 387 j--; | |
| 388 } | |
| 389 } | |
| 390 } | |
| 391 | |
| 392 /* Locate all fields on retained pages */ | |
| 393 allfields = pdf_new_array(ctx, doc, 1); | |
| 394 for (i = 0; i < pagecount; i++) | |
| 395 { | |
| 396 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); | |
| 397 | |
| 398 pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots)); | |
| 399 | |
| 400 int len = pdf_array_len(ctx, annots); | |
| 401 int j; | |
| 402 | |
| 403 for (j = 0; j < len; j++) | |
| 404 { | |
| 405 pdf_obj *f = pdf_array_get(ctx, annots, j); | |
| 406 | |
| 407 if (pdf_dict_get(ctx, f, PDF_NAME(Subtype)) == PDF_NAME(Widget)) | |
| 408 pdf_array_push(ctx, allfields, f); | |
| 409 } | |
| 410 } | |
| 411 | |
| 412 /* From non-terminal widget fields, strip out annot references not | |
| 413 * belonging to any retained page. */ | |
| 414 for (i = 0; i < pdf_array_len(ctx, allfields); i++) | |
| 415 { | |
| 416 pdf_obj *f = pdf_array_get(ctx, allfields, i); | |
| 417 | |
| 418 while (pdf_dict_get(ctx, f, PDF_NAME(Parent))) | |
| 419 f = pdf_dict_get(ctx, f, PDF_NAME(Parent)); | |
| 420 | |
| 421 strip_stale_annot_refs(ctx, f, pagecount, page_object_nums); | |
| 422 } | |
| 423 | |
| 424 /* For terminal fields, if action destination is not valid, | |
| 425 * remove the action */ | |
| 426 for (i = 0; i < pdf_array_len(ctx, allfields); i++) | |
| 427 { | |
| 428 pdf_obj *f = pdf_array_get(ctx, allfields, i); | |
| 429 | |
| 430 if (!dest_is_valid(ctx, f, pagecount, page_object_nums, names_list)) | |
| 431 pdf_dict_del(ctx, f, PDF_NAME(A)); | |
| 432 } | |
| 433 | |
| 434 marks = pdf_new_mark_bits(ctx, doc); | |
| 435 if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list, marks) == 0) | |
| 436 { | |
| 437 pdf_dict_del(ctx, root, PDF_NAME(Outlines)); | |
| 438 } | |
| 439 } | |
| 440 fz_always(ctx) | |
| 441 { | |
| 442 pdf_drop_mark_bits(ctx, marks); | |
| 443 fz_free(ctx, page_object_nums); | |
| 444 pdf_drop_obj(ctx, allfields); | |
| 445 pdf_drop_obj(ctx, root); | |
| 446 pdf_drop_obj(ctx, kids); | |
| 447 pdf_drop_obj(ctx, structparents); | |
| 448 } | |
| 449 fz_catch(ctx) | |
| 450 { | |
| 451 fz_rethrow(ctx); | |
| 452 } | |
| 453 } | |
| 454 | |
| 455 void pdf_rearrange_pages(fz_context *ctx, pdf_document *doc, int count, const int *new_page_list, pdf_clean_options_structure structure) | |
| 456 { | |
| 457 if (structure < PDF_CLEAN_STRUCTURE_DROP || structure > PDF_CLEAN_STRUCTURE_KEEP) | |
| 458 fz_throw(ctx, FZ_ERROR_ARGUMENT, "Invalid structure argument"); | |
| 459 | |
| 460 pdf_begin_operation(ctx, doc, "Rearrange pages"); | |
| 461 fz_try(ctx) | |
| 462 { | |
| 463 pdf_rearrange_pages_imp(ctx, doc, count, new_page_list, structure); | |
| 464 pdf_end_operation(ctx, doc); | |
| 465 } | |
| 466 fz_catch(ctx) | |
| 467 { | |
| 468 pdf_abandon_operation(ctx, doc); | |
| 469 pdf_sync_open_pages(ctx, doc); | |
| 470 fz_rethrow(ctx); | |
| 471 } | |
| 472 pdf_sync_open_pages(ctx, doc); | |
| 473 } | |
| 474 | |
| 475 void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_clean_options *opts, int argc, char *argv[]) | |
| 476 { | |
| 477 pdf_clean_options default_opts = { 0 }; | |
| 478 pdf_document *pdf = NULL; | |
| 479 int *pages = NULL; | |
| 480 int cap, len, page; | |
| 481 | |
| 482 fz_var(pdf); | |
| 483 fz_var(pages); | |
| 484 | |
| 485 if (opts == NULL) | |
| 486 opts = &default_opts; | |
| 487 if (argc > 0 && argv == NULL) | |
| 488 fz_throw(ctx, FZ_ERROR_ARGUMENT, "arguments array must be set if arguments exist"); | |
| 489 | |
| 490 fz_try(ctx) | |
| 491 { | |
| 492 pdf = pdf_open_document(ctx, infile); | |
| 493 if (pdf_needs_password(ctx, pdf)) | |
| 494 if (!pdf_authenticate_password(ctx, pdf, password)) | |
| 495 fz_throw(ctx, FZ_ERROR_ARGUMENT, "cannot authenticate password: %s", infile); | |
| 496 | |
| 497 len = cap = 0; | |
| 498 | |
| 499 /* Only retain the specified subset of the pages */ | |
| 500 if (argc) | |
| 501 { | |
| 502 int pagecount = pdf_count_pages(ctx, pdf); | |
| 503 int argidx = 0; | |
| 504 | |
| 505 while (argc - argidx) | |
| 506 { | |
| 507 int spage, epage; | |
| 508 const char *pagelist = argv[argidx]; | |
| 509 | |
| 510 while ((pagelist = fz_parse_page_range(ctx, pagelist, &spage, &epage, pagecount))) | |
| 511 { | |
| 512 if (len + (epage - spage + 1) >= cap) | |
| 513 { | |
| 514 int n = cap ? cap * 2 : 8; | |
| 515 while (len + (epage - spage + 1) >= n) | |
| 516 n *= 2; | |
| 517 pages = fz_realloc_array(ctx, pages, n, int); | |
| 518 cap = n; | |
| 519 } | |
| 520 | |
| 521 if (spage < epage) | |
| 522 for (page = spage; page <= epage; ++page) | |
| 523 pages[len++] = page - 1; | |
| 524 else | |
| 525 for (page = spage; page >= epage; --page) | |
| 526 pages[len++] = page - 1; | |
| 527 } | |
| 528 | |
| 529 argidx++; | |
| 530 } | |
| 531 | |
| 532 pdf_rearrange_pages(ctx, pdf, len, pages, opts->structure); | |
| 533 } | |
| 534 | |
| 535 pdf_rewrite_images(ctx, pdf, &opts->image); | |
| 536 | |
| 537 if (opts->subset_fonts) | |
| 538 pdf_subset_fonts(ctx, pdf, len, pages); | |
| 539 | |
| 540 pdf_save_document(ctx, pdf, outfile, &opts->write); | |
| 541 } | |
| 542 fz_always(ctx) | |
| 543 { | |
| 544 fz_free(ctx, pages); | |
| 545 pdf_drop_document(ctx, pdf); | |
| 546 } | |
| 547 fz_catch(ctx) | |
| 548 { | |
| 549 fz_rethrow(ctx); | |
| 550 } | |
| 551 } |
