Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/pdf/pdf-subset.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 | |
| 24 #include "mupdf/fitz.h" | |
| 25 #include "mupdf/pdf.h" | |
| 26 | |
| 27 /* Define the following for some debugging output. */ | |
| 28 #undef DEBUG_SUBSETTING | |
| 29 | |
| 30 typedef struct gstate | |
| 31 { | |
| 32 struct gstate *next; | |
| 33 int current_font; | |
| 34 pdf_font_desc *font; | |
| 35 } gstate; | |
| 36 | |
| 37 typedef struct resources_stack | |
| 38 { | |
| 39 struct resources_stack *next; | |
| 40 pdf_obj *res; | |
| 41 } resources_stack; | |
| 42 | |
| 43 typedef struct | |
| 44 { | |
| 45 int num; | |
| 46 int gen; | |
| 47 int is_ttf; | |
| 48 int is_cidfont; | |
| 49 pdf_obj *fontfile; | |
| 50 unsigned char digest[16]; | |
| 51 | |
| 52 fz_int_heap gids; | |
| 53 fz_int_heap cids; | |
| 54 | |
| 55 /* Pointers back to the top level fonts that refer to this. */ | |
| 56 int max; | |
| 57 int len; | |
| 58 pdf_obj **font; | |
| 59 } font_usage_t; | |
| 60 | |
| 61 typedef struct | |
| 62 { | |
| 63 int max; | |
| 64 int len; | |
| 65 font_usage_t *font; | |
| 66 } fonts_usage_t; | |
| 67 | |
| 68 typedef struct | |
| 69 { | |
| 70 pdf_processor super; | |
| 71 resources_stack *rstack; | |
| 72 fonts_usage_t *usage; | |
| 73 gstate *gs; | |
| 74 } pdf_font_analysis_processor; | |
| 75 | |
| 76 static void | |
| 77 pop_gstate(fz_context *ctx, pdf_font_analysis_processor *p) | |
| 78 { | |
| 79 gstate *gs = p->gs; | |
| 80 gstate *old; | |
| 81 | |
| 82 if (gs == NULL) | |
| 83 return; | |
| 84 | |
| 85 old = gs->next; | |
| 86 pdf_drop_font(ctx, gs->font); | |
| 87 fz_free(ctx, gs); | |
| 88 p->gs = old; | |
| 89 } | |
| 90 | |
| 91 static void | |
| 92 drop_processor(fz_context *ctx, pdf_processor *proc) | |
| 93 { | |
| 94 pdf_font_analysis_processor *p = (pdf_font_analysis_processor*)proc; | |
| 95 | |
| 96 while (p->rstack) | |
| 97 { | |
| 98 resources_stack *stk = p->rstack; | |
| 99 p->rstack = stk->next; | |
| 100 pdf_drop_obj(ctx, stk->res); | |
| 101 fz_free(ctx, stk); | |
| 102 } | |
| 103 | |
| 104 while (p->gs) | |
| 105 pop_gstate(ctx, p); | |
| 106 } | |
| 107 | |
| 108 static void | |
| 109 push_resources(fz_context *ctx, pdf_processor *proc, pdf_obj *res) | |
| 110 { | |
| 111 pdf_font_analysis_processor *p = (pdf_font_analysis_processor *)proc; | |
| 112 resources_stack *stk = fz_malloc_struct(ctx, resources_stack); | |
| 113 | |
| 114 stk->next = p->rstack; | |
| 115 p->rstack = stk; | |
| 116 fz_try(ctx) | |
| 117 { | |
| 118 stk->res = pdf_keep_obj(ctx, res); | |
| 119 } | |
| 120 fz_catch(ctx) | |
| 121 { | |
| 122 pdf_drop_obj(ctx, stk->res); | |
| 123 p->rstack = stk->next; | |
| 124 fz_free(ctx, stk); | |
| 125 fz_rethrow(ctx); | |
| 126 } | |
| 127 } | |
| 128 | |
| 129 static pdf_obj * | |
| 130 pop_resources(fz_context *ctx, pdf_processor *proc) | |
| 131 { | |
| 132 pdf_font_analysis_processor *p = (pdf_font_analysis_processor *)proc; | |
| 133 resources_stack *stk = p->rstack; | |
| 134 pdf_obj *res = p->rstack->res; | |
| 135 | |
| 136 p->rstack = stk->next; | |
| 137 fz_free(ctx, stk); | |
| 138 | |
| 139 return res; | |
| 140 } | |
| 141 | |
| 142 static void | |
| 143 font_analysis_Q(fz_context *ctx, pdf_processor *proc) | |
| 144 { | |
| 145 pdf_font_analysis_processor *p = (pdf_font_analysis_processor*)proc; | |
| 146 | |
| 147 pop_gstate(ctx, p); | |
| 148 } | |
| 149 | |
| 150 static void | |
| 151 font_analysis_q(fz_context *ctx, pdf_processor *proc) | |
| 152 { | |
| 153 pdf_font_analysis_processor *p = (pdf_font_analysis_processor*)proc; | |
| 154 gstate *gs = p->gs; | |
| 155 gstate *new_gs = fz_malloc_struct(ctx, gstate); | |
| 156 p->gs = new_gs; | |
| 157 | |
| 158 if (gs) | |
| 159 { | |
| 160 *new_gs = *gs; | |
| 161 new_gs->next = gs; | |
| 162 } | |
| 163 | |
| 164 pdf_keep_font(ctx, new_gs->font); | |
| 165 | |
| 166 } | |
| 167 | |
| 168 static void | |
| 169 font_analysis_Tf(fz_context *ctx, pdf_processor *proc, const char *name, pdf_font_desc *font, float size) | |
| 170 { | |
| 171 pdf_font_analysis_processor *p = (pdf_font_analysis_processor*)proc; | |
| 172 pdf_obj *dict = pdf_dict_gets(ctx, pdf_dict_get(ctx, p->rstack->res, PDF_NAME(Font)), name); | |
| 173 pdf_obj *subtype, *fontdesc; | |
| 174 pdf_obj *fontfile = NULL; | |
| 175 pdf_obj *key; | |
| 176 int num, gen, i; | |
| 177 int is_cidfont = 0; | |
| 178 int is_ttf = 0; | |
| 179 unsigned char digest[16]; | |
| 180 | |
| 181 p->gs->current_font = -1; /* unknown font! */ | |
| 182 | |
| 183 if (dict == NULL) | |
| 184 return; | |
| 185 | |
| 186 /* We can have multiple fonts that rely on the same underlying fontfile | |
| 187 * object. Therefore, resolve down to that. */ | |
| 188 subtype = pdf_dict_get(ctx, dict, PDF_NAME(Subtype)); | |
| 189 | |
| 190 if (subtype == PDF_NAME(Type1) || subtype == PDF_NAME(MMType1)) | |
| 191 { | |
| 192 // fontfile subtype should be Type1C for us to be able to subset it | |
| 193 key = PDF_NAME(FontFile); | |
| 194 fontdesc = pdf_dict_get(ctx, dict, PDF_NAME(FontDescriptor)); | |
| 195 fontfile = pdf_dict_get(ctx, fontdesc, PDF_NAME(FontFile)); | |
| 196 is_cidfont = 0; | |
| 197 is_ttf = 0; | |
| 198 } | |
| 199 else if (subtype == PDF_NAME(TrueType)) | |
| 200 { | |
| 201 key = PDF_NAME(FontFile2); | |
| 202 fontdesc = pdf_dict_get(ctx, dict, PDF_NAME(FontDescriptor)); | |
| 203 fontfile = pdf_dict_get(ctx, fontdesc, PDF_NAME(FontFile2)); | |
| 204 is_cidfont = 0; | |
| 205 is_ttf = 1; | |
| 206 } | |
| 207 else if (pdf_name_eq(ctx, subtype, PDF_NAME(Type0))) | |
| 208 { | |
| 209 dict = pdf_array_get(ctx, pdf_dict_get(ctx, dict, PDF_NAME(DescendantFonts)), 0); | |
| 210 subtype = pdf_dict_get(ctx, dict, PDF_NAME(Subtype)); | |
| 211 fontdesc = pdf_dict_get(ctx, dict, PDF_NAME(FontDescriptor)); | |
| 212 if (subtype == PDF_NAME(CIDFontType0)) | |
| 213 { | |
| 214 // fontfile subtype is either CIDFontType0C or OpenType | |
| 215 key = PDF_NAME(FontFile3); | |
| 216 fontfile = pdf_dict_get(ctx, fontdesc, PDF_NAME(FontFile3)); | |
| 217 subtype = pdf_dict_get(ctx, fontfile, PDF_NAME(Subtype)); | |
| 218 if (subtype == PDF_NAME(CIDFontType0C)) | |
| 219 { | |
| 220 is_cidfont = 1; | |
| 221 is_ttf = 0; | |
| 222 } | |
| 223 else if (subtype == PDF_NAME(OpenType)) | |
| 224 { | |
| 225 is_cidfont = 1; | |
| 226 is_ttf = 1; | |
| 227 } | |
| 228 else | |
| 229 { | |
| 230 fontfile = NULL; | |
| 231 } | |
| 232 } | |
| 233 else if (subtype == PDF_NAME(CIDFontType2)) | |
| 234 { | |
| 235 key = PDF_NAME(FontFile2); | |
| 236 fontfile = pdf_dict_get(ctx, fontdesc, PDF_NAME(FontFile2)); | |
| 237 is_cidfont = 1; | |
| 238 is_ttf = 1; | |
| 239 } | |
| 240 } | |
| 241 | |
| 242 if (!fontfile) | |
| 243 { | |
| 244 #ifdef DEBUG_SUBSETTING | |
| 245 fz_write_printf(ctx, fz_stddbg(ctx), "No embedded file found for font of subtype %s\n", pdf_to_name(ctx, subtype)); | |
| 246 #endif | |
| 247 return; | |
| 248 } | |
| 249 | |
| 250 num = pdf_to_num(ctx, fontfile); | |
| 251 gen = pdf_to_gen(ctx, fontfile); | |
| 252 | |
| 253 for (i = 0; i < p->usage->len; i++) | |
| 254 { | |
| 255 if (p->usage->font[i].num == num && | |
| 256 p->usage->font[i].gen == gen) | |
| 257 break; | |
| 258 } | |
| 259 | |
| 260 fz_font_digest(ctx, font->font, digest); | |
| 261 | |
| 262 /* Check for duplicate fonts. (Fonts in the document that have | |
| 263 * the font stream included multiple times as different objects). | |
| 264 * This can happen with naive insertion routines. */ | |
| 265 if (i == p->usage->len) | |
| 266 { | |
| 267 for (i = 0; i < p->usage->len; i++) | |
| 268 { | |
| 269 if (memcmp(digest, p->usage->font[i].digest, 16) == 0) | |
| 270 { | |
| 271 pdf_dict_put(ctx, fontdesc, key, p->usage->font[i].fontfile); | |
| 272 break; | |
| 273 } | |
| 274 } | |
| 275 } | |
| 276 | |
| 277 pdf_drop_font(ctx, p->gs->font); | |
| 278 p->gs->font = pdf_keep_font(ctx, font); | |
| 279 p->gs->current_font = i; | |
| 280 if (i < p->usage->len) | |
| 281 { | |
| 282 int j; | |
| 283 | |
| 284 for (j = 0; j < p->usage->font[i].len; j++) | |
| 285 { | |
| 286 if (pdf_objcmp(ctx, p->usage->font[i].font[j], dict) == 0) | |
| 287 return; | |
| 288 } | |
| 289 | |
| 290 if (p->usage->font[i].len == p->usage->font[i].max) | |
| 291 { | |
| 292 int newmax = p->usage->font[i].max * 2; | |
| 293 p->usage->font[i].font = fz_realloc(ctx, p->usage->font[i].font, sizeof(*p->usage->font[i].font) * newmax); | |
| 294 p->usage->font[i].max = newmax; | |
| 295 } | |
| 296 p->usage->font[i].font[j] = pdf_keep_obj(ctx, dict); | |
| 297 p->usage->font[i].len++; | |
| 298 | |
| 299 return; | |
| 300 } | |
| 301 | |
| 302 if (p->usage->max == p->usage->len) | |
| 303 { | |
| 304 int n = p->usage->max * 2; | |
| 305 | |
| 306 if (n == 0) | |
| 307 n = 32; | |
| 308 p->usage->font = (font_usage_t *)fz_realloc(ctx, p->usage->font, sizeof(*p->usage->font) * n); | |
| 309 p->usage->max = n; | |
| 310 } | |
| 311 | |
| 312 p->usage->font[i].is_ttf = is_ttf; | |
| 313 p->usage->font[i].is_cidfont = is_cidfont; | |
| 314 p->usage->font[i].fontfile = pdf_keep_obj(ctx, fontfile); | |
| 315 p->usage->font[i].num = num; | |
| 316 p->usage->font[i].gen = gen; | |
| 317 p->usage->font[i].cids.len = 0; | |
| 318 p->usage->font[i].cids.max = 0; | |
| 319 p->usage->font[i].cids.heap = NULL; | |
| 320 p->usage->font[i].gids.len = 0; | |
| 321 p->usage->font[i].gids.max = 0; | |
| 322 p->usage->font[i].gids.heap = NULL; | |
| 323 p->usage->font[i].len = 0; | |
| 324 p->usage->font[i].max = 0; | |
| 325 p->usage->font[i].font = NULL; | |
| 326 memcpy(p->usage->font[i].digest, digest, 16); | |
| 327 p->usage->len++; | |
| 328 | |
| 329 p->usage->font[i].font = fz_malloc(ctx, sizeof(*p->usage->font[i].font) * 4); | |
| 330 p->usage->font[i].len = 1; | |
| 331 p->usage->font[i].max = 4; | |
| 332 p->usage->font[i].font[0] = pdf_keep_obj(ctx, dict); | |
| 333 } | |
| 334 | |
| 335 static void | |
| 336 show_char(fz_context *ctx, font_usage_t *font, int cid, int gid) | |
| 337 { | |
| 338 fz_int_heap_insert(ctx, &font->cids, cid); | |
| 339 fz_int_heap_insert(ctx, &font->gids, gid); | |
| 340 } | |
| 341 | |
| 342 static void | |
| 343 show_string(fz_context *ctx, pdf_font_analysis_processor *p, unsigned char *buf, size_t len) | |
| 344 { | |
| 345 gstate *gs = p->gs; | |
| 346 pdf_font_desc *fontdesc = gs->font; | |
| 347 size_t pos = 0; | |
| 348 font_usage_t *font; | |
| 349 | |
| 350 // Not an embedded font! | |
| 351 if (gs->current_font < 0 || fontdesc == NULL) | |
| 352 return; | |
| 353 | |
| 354 font = &p->usage->font[gs->current_font]; | |
| 355 | |
| 356 while (pos < len) | |
| 357 { | |
| 358 unsigned int cpt; | |
| 359 int inc = pdf_decode_cmap(fontdesc->encoding, &buf[pos], &buf[len], &cpt); | |
| 360 | |
| 361 int cid = pdf_lookup_cmap(fontdesc->encoding, cpt); | |
| 362 if (cid >= 0) | |
| 363 { | |
| 364 int gid = pdf_font_cid_to_gid(ctx, fontdesc, cid); | |
| 365 show_char(ctx, font, cid, gid); | |
| 366 } | |
| 367 | |
| 368 pos += inc; | |
| 369 } | |
| 370 } | |
| 371 | |
| 372 static void | |
| 373 show_text(fz_context *ctx, pdf_font_analysis_processor *p, pdf_obj *text) | |
| 374 { | |
| 375 gstate *gs = p->gs; | |
| 376 pdf_font_desc *fontdesc; | |
| 377 int i, n; | |
| 378 | |
| 379 if (!gs) | |
| 380 return; | |
| 381 fontdesc = gs->font; | |
| 382 if (!fontdesc) | |
| 383 return; | |
| 384 | |
| 385 if (pdf_is_string(ctx, text)) | |
| 386 { | |
| 387 show_string(ctx, p, (unsigned char *)pdf_to_str_buf(ctx, text), pdf_to_str_len(ctx, text)); | |
| 388 } | |
| 389 else if (pdf_is_array(ctx, text)) | |
| 390 { | |
| 391 n = pdf_array_len(ctx, text); | |
| 392 for (i = 0; i < n; i++) | |
| 393 { | |
| 394 pdf_obj *item = pdf_array_get(ctx, text, i); | |
| 395 if (pdf_is_string(ctx, item)) | |
| 396 { | |
| 397 show_string(ctx, p, (unsigned char *)pdf_to_str_buf(ctx, item), pdf_to_str_len(ctx, item)); | |
| 398 } | |
| 399 } | |
| 400 } | |
| 401 } | |
| 402 | |
| 403 static void | |
| 404 font_analysis_TJ(fz_context *ctx, pdf_processor *proc, pdf_obj *array) | |
| 405 { | |
| 406 pdf_font_analysis_processor *p = (pdf_font_analysis_processor*)proc; | |
| 407 | |
| 408 show_text(ctx, p, array); | |
| 409 } | |
| 410 | |
| 411 static void | |
| 412 font_analysis_Tj(fz_context *ctx, pdf_processor *proc, char *str, size_t len) | |
| 413 { | |
| 414 pdf_font_analysis_processor *p = (pdf_font_analysis_processor*)proc; | |
| 415 | |
| 416 show_string(ctx, p, (unsigned char *)str, len); | |
| 417 } | |
| 418 | |
| 419 static void | |
| 420 font_analysis_squote(fz_context *ctx, pdf_processor *proc, char *str, size_t len) | |
| 421 { | |
| 422 /* Note, we convert all T' operators to (maybe) a T* and a Tj */ | |
| 423 pdf_font_analysis_processor *p = (pdf_font_analysis_processor*)proc; | |
| 424 | |
| 425 show_string(ctx, p, (unsigned char *)str, len); | |
| 426 } | |
| 427 | |
| 428 static void | |
| 429 font_analysis_dquote(fz_context *ctx, pdf_processor *proc, float aw, float ac, char *str, size_t len) | |
| 430 { | |
| 431 /* Note, we convert all T" operators to (maybe) a T*, | |
| 432 * (maybe) Tc, (maybe) Tw and a Tj. */ | |
| 433 pdf_font_analysis_processor *p = (pdf_font_analysis_processor*)proc; | |
| 434 | |
| 435 show_string(ctx, p, (unsigned char*)str, len); | |
| 436 } | |
| 437 | |
| 438 static void | |
| 439 font_analysis_Do_form(fz_context *ctx, pdf_processor *proc, const char *name, pdf_obj *xobj) | |
| 440 { | |
| 441 pdf_font_analysis_processor *pr = (pdf_font_analysis_processor *)proc; | |
| 442 pdf_document *doc = pdf_get_bound_document(ctx, xobj); | |
| 443 pdf_obj *resources = pdf_xobject_resources(ctx, xobj); | |
| 444 | |
| 445 if (!resources) | |
| 446 resources = pr->rstack->res; | |
| 447 | |
| 448 pdf_process_contents(ctx, (pdf_processor*)pr, doc, resources, xobj, NULL, NULL); | |
| 449 } | |
| 450 | |
| 451 static pdf_processor * | |
| 452 pdf_new_font_analysis_processor(fz_context *ctx, fonts_usage_t *usage) | |
| 453 { | |
| 454 pdf_font_analysis_processor *proc = (pdf_font_analysis_processor *)pdf_new_processor(ctx, sizeof *proc); | |
| 455 | |
| 456 proc->super.drop_processor = drop_processor; | |
| 457 proc->super.push_resources = push_resources; | |
| 458 proc->super.pop_resources = pop_resources; | |
| 459 | |
| 460 proc->super.op_Do_form = font_analysis_Do_form; | |
| 461 | |
| 462 proc->super.op_Tf = font_analysis_Tf; | |
| 463 proc->super.op_Tj = font_analysis_Tj; | |
| 464 proc->super.op_TJ = font_analysis_TJ; | |
| 465 proc->super.op_squote = font_analysis_squote; | |
| 466 proc->super.op_dquote = font_analysis_dquote; | |
| 467 | |
| 468 proc->super.op_q = font_analysis_q; | |
| 469 proc->super.op_Q = font_analysis_Q; | |
| 470 | |
| 471 fz_try(ctx) | |
| 472 proc->gs = fz_malloc_struct(ctx, gstate); | |
| 473 fz_catch(ctx) | |
| 474 { | |
| 475 fz_free(ctx, proc); | |
| 476 fz_rethrow(ctx); | |
| 477 } | |
| 478 | |
| 479 proc->gs->current_font = -1; // no font set yet | |
| 480 | |
| 481 proc->usage = usage; | |
| 482 | |
| 483 return &proc->super; | |
| 484 } | |
| 485 | |
| 486 static void | |
| 487 examine_page(fz_context *ctx, pdf_document *doc, pdf_page *page, fonts_usage_t *usage) | |
| 488 { | |
| 489 pdf_processor *proc = pdf_new_font_analysis_processor(ctx, usage); | |
| 490 pdf_obj *contents = pdf_page_contents(ctx, page); | |
| 491 pdf_obj *resources = pdf_page_resources(ctx, page); | |
| 492 pdf_annot *annot, *widget; | |
| 493 | |
| 494 fz_try(ctx) | |
| 495 { | |
| 496 pdf_process_contents(ctx, proc, doc, resources, contents, NULL, NULL); | |
| 497 | |
| 498 pdf_processor_push_resources(ctx, proc, resources); | |
| 499 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot)) | |
| 500 pdf_process_annot(ctx, proc, annot, NULL); | |
| 501 for (widget = pdf_first_widget(ctx, page); widget; widget = pdf_next_widget(ctx, widget)) | |
| 502 pdf_process_annot(ctx, proc, widget, NULL); | |
| 503 pdf_close_processor(ctx, proc); | |
| 504 } | |
| 505 fz_always(ctx) | |
| 506 { | |
| 507 pdf_drop_processor(ctx, proc); | |
| 508 } | |
| 509 fz_catch(ctx) | |
| 510 fz_rethrow(ctx); | |
| 511 } | |
| 512 | |
| 513 static void | |
| 514 subset_ttf(fz_context *ctx, pdf_document *doc, font_usage_t *font, pdf_obj *fontfile, int symbolic, int cidfont) | |
| 515 { | |
| 516 fz_buffer *buf = pdf_load_stream(ctx, fontfile); | |
| 517 fz_buffer *newbuf = NULL; | |
| 518 | |
| 519 if (buf->len == 0) | |
| 520 { | |
| 521 fz_drop_buffer(ctx, buf); | |
| 522 return; | |
| 523 } | |
| 524 | |
| 525 fz_var(newbuf); | |
| 526 | |
| 527 fz_try(ctx) | |
| 528 { | |
| 529 newbuf = fz_subset_ttf_for_gids(ctx, buf, font->gids.heap, font->gids.len, symbolic, cidfont); | |
| 530 | |
| 531 pdf_update_stream(ctx, doc, fontfile, newbuf, 0); | |
| 532 pdf_dict_put_int(ctx, fontfile, PDF_NAME(Length1), newbuf->len); | |
| 533 } | |
| 534 fz_always(ctx) | |
| 535 { | |
| 536 fz_drop_buffer(ctx, newbuf); | |
| 537 fz_drop_buffer(ctx, buf); | |
| 538 } | |
| 539 fz_catch(ctx) | |
| 540 { | |
| 541 fz_rethrow(ctx); | |
| 542 } | |
| 543 } | |
| 544 | |
| 545 static void | |
| 546 subset_cff(fz_context *ctx, pdf_document *doc, font_usage_t *font, pdf_obj *fontfile, int symbolic, int cidfont) | |
| 547 { | |
| 548 fz_buffer *buf = pdf_load_stream(ctx, fontfile); | |
| 549 fz_buffer *newbuf = NULL; | |
| 550 | |
| 551 if (buf->len == 0) | |
| 552 { | |
| 553 fz_drop_buffer(ctx, buf); | |
| 554 return; | |
| 555 } | |
| 556 | |
| 557 fz_var(newbuf); | |
| 558 | |
| 559 fz_try(ctx) | |
| 560 { | |
| 561 newbuf = fz_subset_cff_for_gids(ctx, buf, font->gids.heap, font->gids.len, symbolic, cidfont); | |
| 562 | |
| 563 pdf_update_stream(ctx, doc, fontfile, newbuf, 0); | |
| 564 pdf_dict_put_int(ctx, fontfile, PDF_NAME(Length1), newbuf->len); | |
| 565 } | |
| 566 fz_always(ctx) | |
| 567 { | |
| 568 fz_drop_buffer(ctx, newbuf); | |
| 569 fz_drop_buffer(ctx, buf); | |
| 570 } | |
| 571 fz_catch(ctx) | |
| 572 { | |
| 573 fz_rethrow(ctx); | |
| 574 } | |
| 575 } | |
| 576 | |
| 577 static void | |
| 578 do_adjust_simple_font(fz_context *ctx, pdf_document *doc, font_usage_t *font, int n) | |
| 579 { | |
| 580 pdf_obj *obj = font->font[n]; | |
| 581 int old_firstchar = pdf_dict_get_int(ctx, obj, PDF_NAME(FirstChar)); | |
| 582 pdf_obj *old_widths = pdf_dict_get(ctx, obj, PDF_NAME(Widths)); | |
| 583 int new_firstchar = font->cids.heap[0]; | |
| 584 int new_lastchar = font->cids.heap[font->cids.len-1]; | |
| 585 pdf_obj *widths; | |
| 586 int i; | |
| 587 | |
| 588 pdf_dict_put_int(ctx, obj, PDF_NAME(FirstChar), new_firstchar); | |
| 589 pdf_dict_put_int(ctx, obj, PDF_NAME(LastChar), new_lastchar); | |
| 590 if (old_widths) | |
| 591 { | |
| 592 int j = 0; | |
| 593 widths = pdf_new_array(ctx, doc, new_lastchar - new_firstchar + 1); | |
| 594 for (i = new_firstchar; i <= new_lastchar; i++) | |
| 595 { | |
| 596 if (font->cids.heap[j] == i) | |
| 597 { | |
| 598 pdf_array_push_int(ctx, widths, pdf_array_get_int(ctx, old_widths, i - old_firstchar)); | |
| 599 j++; | |
| 600 } | |
| 601 else | |
| 602 pdf_array_push_int(ctx, widths, 0); | |
| 603 } | |
| 604 pdf_dict_put_drop(ctx, obj, PDF_NAME(Widths), widths); | |
| 605 } | |
| 606 } | |
| 607 | |
| 608 static void | |
| 609 adjust_simple_font(fz_context *ctx, pdf_document *doc, font_usage_t *font) | |
| 610 { | |
| 611 int i; | |
| 612 | |
| 613 for (i = 0; i < font->len; i++) | |
| 614 do_adjust_simple_font(ctx, doc, font, i); | |
| 615 } | |
| 616 | |
| 617 | |
| 618 static pdf_obj * | |
| 619 get_fontdesc(fz_context *ctx, pdf_obj *font) | |
| 620 { | |
| 621 pdf_obj *fontdesc = pdf_dict_get(ctx, font, PDF_NAME(FontDescriptor)); | |
| 622 | |
| 623 if (fontdesc) | |
| 624 return fontdesc; | |
| 625 | |
| 626 return pdf_dict_get(ctx, pdf_array_get(ctx, pdf_dict_get(ctx, font, PDF_NAME(DescendantFonts)), 0), PDF_NAME(FontDescriptor)); | |
| 627 } | |
| 628 | |
| 629 static void | |
| 630 prefix_font_name(fz_context *ctx, pdf_document *doc, pdf_obj *font, pdf_obj *file) | |
| 631 { | |
| 632 fz_buffer *buf; | |
| 633 uint32_t digest[4], v; | |
| 634 pdf_obj *fontdesc = get_fontdesc(ctx, font); | |
| 635 const char *name = pdf_dict_get_name(ctx, fontdesc, PDF_NAME(FontName)); | |
| 636 char new_name[256]; | |
| 637 size_t len; | |
| 638 | |
| 639 /* If there is no name, just exit. Possibly should throw here. */ | |
| 640 if (name == NULL) | |
| 641 return; | |
| 642 | |
| 643 len = strlen(name); | |
| 644 if (len > 6 && name[6] == '+') | |
| 645 return; /* Already a subset name */ | |
| 646 | |
| 647 buf = pdf_load_stream(ctx, file); | |
| 648 fz_md5_buffer(ctx, buf, (uint8_t *)digest); | |
| 649 fz_drop_buffer(ctx, buf); | |
| 650 | |
| 651 v = digest[0] ^ digest[1] ^ digest[2] ^ digest[3]; | |
| 652 new_name[0] = 'A' + (v % 26); | |
| 653 v /= 26; | |
| 654 new_name[1] = 'A' + (v % 26); | |
| 655 v /= 26; | |
| 656 new_name[2] = 'A' + (v % 26); | |
| 657 v /= 26; | |
| 658 new_name[3] = 'A' + (v % 26); | |
| 659 v /= 26; | |
| 660 new_name[4] = 'A' + (v % 26); | |
| 661 v /= 26; | |
| 662 new_name[5] = 'A' + (v % 26); | |
| 663 new_name[6] = '+'; | |
| 664 | |
| 665 memcpy(new_name+7, name, len > sizeof(new_name)-8 ? sizeof(new_name)-8 : len+1); | |
| 666 new_name[sizeof(new_name)-1] = 0; | |
| 667 | |
| 668 pdf_dict_put_name(ctx, fontdesc, PDF_NAME(FontName), new_name); | |
| 669 } | |
| 670 | |
| 671 static int | |
| 672 get_symbolic(fz_context *ctx, font_usage_t *font) | |
| 673 { | |
| 674 int i, flags, symbolic, symbolic2; | |
| 675 pdf_obj *fontdesc; | |
| 676 | |
| 677 if (!font || font->len == 0) | |
| 678 return 0; | |
| 679 | |
| 680 fontdesc = pdf_dict_get(ctx, font->font[0], PDF_NAME(FontDescriptor)); | |
| 681 flags = pdf_dict_get_int(ctx, fontdesc, PDF_NAME(Flags)); | |
| 682 symbolic = (!!(flags & 4)) | ((flags & 32) == 0); | |
| 683 | |
| 684 for (i = 1; i < font->len; i++) | |
| 685 { | |
| 686 fontdesc = pdf_dict_get(ctx, font->font[i], PDF_NAME(FontDescriptor)); | |
| 687 flags = pdf_dict_get_int(ctx, fontdesc, PDF_NAME(Flags)); | |
| 688 symbolic2 = (!!(flags & 4)) | ((flags & 32) == 0); | |
| 689 | |
| 690 if (symbolic != symbolic2) | |
| 691 { | |
| 692 fz_warn(ctx, "Font cannot be both symbolic and non-symbolic. Skipping subsetting."); | |
| 693 return -1; | |
| 694 } | |
| 695 } | |
| 696 | |
| 697 return symbolic; | |
| 698 } | |
| 699 | |
| 700 static pdf_obj *get_subtype(fz_context *ctx, font_usage_t *font) | |
| 701 { | |
| 702 /* If we can get the subtype from the fontfile, great. Use that. */ | |
| 703 pdf_obj *subtype = pdf_dict_get(ctx, font->fontfile, PDF_NAME(Subtype)); | |
| 704 int i; | |
| 705 | |
| 706 if (subtype != NULL) | |
| 707 return subtype; | |
| 708 | |
| 709 /* Otherwise we'll have to get it from the font objects, and they'd | |
| 710 * all better agree. */ | |
| 711 if (font->len == 0) | |
| 712 return NULL; | |
| 713 | |
| 714 subtype = pdf_dict_get(ctx, font->font[0], PDF_NAME(Subtype)); | |
| 715 | |
| 716 for (i = 1; i < font->len; i++) | |
| 717 { | |
| 718 pdf_obj *subtype2 = pdf_dict_get(ctx, font->font[i], PDF_NAME(Subtype)); | |
| 719 | |
| 720 if (pdf_objcmp(ctx, subtype, subtype2)) | |
| 721 return NULL; | |
| 722 } | |
| 723 return subtype; | |
| 724 } | |
| 725 | |
| 726 void | |
| 727 pdf_subset_fonts(fz_context *ctx, pdf_document *doc, int len, const int *pages) | |
| 728 { | |
| 729 int i, j; | |
| 730 pdf_page *page = NULL; | |
| 731 fonts_usage_t usage = { 0 }; | |
| 732 | |
| 733 fz_var(page); | |
| 734 | |
| 735 fz_try(ctx) | |
| 736 { | |
| 737 if (len == 0) | |
| 738 { | |
| 739 /* Process every page. */ | |
| 740 len = pdf_count_pages(ctx, doc); | |
| 741 for (i = 0; i < len; i++) | |
| 742 { | |
| 743 page = pdf_load_page(ctx, doc, i); | |
| 744 | |
| 745 examine_page(ctx, doc, page, &usage); | |
| 746 | |
| 747 fz_drop_page(ctx, (fz_page *)page); | |
| 748 page = NULL; | |
| 749 } | |
| 750 } | |
| 751 else | |
| 752 { | |
| 753 /* Process just the pages we are given. */ | |
| 754 for (i = 0; i < len; i++) | |
| 755 { | |
| 756 page = pdf_load_page(ctx, doc, pages[i]); | |
| 757 | |
| 758 examine_page(ctx, doc, page, &usage); | |
| 759 | |
| 760 fz_drop_page(ctx, (fz_page *)page); | |
| 761 page = NULL; | |
| 762 } | |
| 763 } | |
| 764 | |
| 765 /* All our font usage data is in heaps. Sort the heaps. */ | |
| 766 for (i = 0; i < usage.len; i++) | |
| 767 { | |
| 768 font_usage_t *font = &usage.font[i]; | |
| 769 | |
| 770 fz_int_heap_sort(ctx, &font->cids); | |
| 771 fz_int_heap_uniq(ctx, &font->cids); | |
| 772 fz_int_heap_sort(ctx, &font->gids); | |
| 773 fz_int_heap_uniq(ctx, &font->gids); | |
| 774 } | |
| 775 | |
| 776 /* Now, actually subset the fonts. */ | |
| 777 for (i = 0; i < usage.len; i++) | |
| 778 { | |
| 779 font_usage_t *font = &usage.font[i]; | |
| 780 pdf_obj *subtype = get_subtype(ctx, font); | |
| 781 int symbolic = get_symbolic(ctx, font); | |
| 782 if (symbolic < 0) | |
| 783 continue; | |
| 784 | |
| 785 /* Not sure this can ever happen, and if it does this is not a great | |
| 786 * way to handle it, but it'll do for now. */ | |
| 787 if (font->gids.len == 0 || font->cids.len == 0 || subtype == NULL) | |
| 788 continue; | |
| 789 | |
| 790 #ifdef DEBUG_SUBSETTING | |
| 791 fz_write_printf(ctx, fz_stddbg(ctx), "font->obj=%d subtype=", pdf_to_num(ctx, font->fontfile)); | |
| 792 pdf_debug_obj(ctx, subtype); | |
| 793 fz_write_printf(ctx, fz_stddbg(ctx), "\n"); | |
| 794 pdf_debug_obj(ctx, pdf_dict_get(ctx, font->font[0], PDF_NAME(FontDescriptor))); | |
| 795 #endif | |
| 796 | |
| 797 /* If we hit a (non-SYSTEM) problem subsetting a font, give up for this font alone. | |
| 798 * This will leave this font alone. */ | |
| 799 fz_try(ctx) | |
| 800 { | |
| 801 if (font->is_ttf) | |
| 802 subset_ttf(ctx, doc, font, font->fontfile, symbolic, font->is_cidfont); | |
| 803 else if (font->is_cidfont) | |
| 804 subset_cff(ctx, doc, font, font->fontfile, symbolic, font->is_cidfont); | |
| 805 } | |
| 806 fz_catch(ctx) | |
| 807 { | |
| 808 fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); | |
| 809 fz_report_error(ctx); | |
| 810 continue; | |
| 811 } | |
| 812 | |
| 813 /* Any problems changing these parts of the fonts are really fatal though. */ | |
| 814 if (pdf_name_eq(ctx, subtype, PDF_NAME(TrueType)) || | |
| 815 pdf_name_eq(ctx, subtype, PDF_NAME(Type1))) | |
| 816 { | |
| 817 adjust_simple_font(ctx, doc, font); | |
| 818 } | |
| 819 | |
| 820 /* And prefix the name */ | |
| 821 for (j = 0; j < font->len; j++) | |
| 822 prefix_font_name(ctx, doc, font->font[j], font->fontfile); | |
| 823 } | |
| 824 } | |
| 825 fz_always(ctx) | |
| 826 { | |
| 827 fz_drop_page(ctx, (fz_page *)page); | |
| 828 | |
| 829 for (i = 0; i < usage.len; i++) | |
| 830 { | |
| 831 pdf_drop_obj(ctx, usage.font[i].fontfile); | |
| 832 fz_free(ctx, usage.font[i].cids.heap); | |
| 833 fz_free(ctx, usage.font[i].gids.heap); | |
| 834 for (j = 0; j < usage.font[i].len; j++) | |
| 835 pdf_drop_obj(ctx, usage.font[i].font[j]); | |
| 836 fz_free(ctx, usage.font[i].font); | |
| 837 } | |
| 838 fz_free(ctx, usage.font); | |
| 839 } | |
| 840 fz_catch(ctx) | |
| 841 fz_rethrow(ctx); | |
| 842 } |
