Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/pdf/pdf-parse.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2021 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 #include "mupdf/pdf.h" | |
| 25 | |
| 26 #include <string.h> | |
| 27 #include <time.h> | |
| 28 | |
| 29 #ifdef _WIN32 | |
| 30 #define timegm _mkgmtime | |
| 31 #endif | |
| 32 | |
/*
	ASCII-only digit test used by the date parser.
	The argument is parenthesised so that compound expressions (for example
	a conditional expression) expand with the intended precedence.
	Note: the argument is evaluated twice; do not pass expressions with side effects.
*/
#define isdigit(c) ((c) >= '0' && (c) <= '9')
| 34 | |
| 35 fz_rect | |
| 36 pdf_to_rect(fz_context *ctx, pdf_obj *array) | |
| 37 { | |
| 38 if (!pdf_is_array(ctx, array)) | |
| 39 return fz_empty_rect; | |
| 40 else | |
| 41 { | |
| 42 float a = pdf_array_get_real(ctx, array, 0); | |
| 43 float b = pdf_array_get_real(ctx, array, 1); | |
| 44 float c = pdf_array_get_real(ctx, array, 2); | |
| 45 float d = pdf_array_get_real(ctx, array, 3); | |
| 46 fz_rect r; | |
| 47 r.x0 = fz_min(a, c); | |
| 48 r.y0 = fz_min(b, d); | |
| 49 r.x1 = fz_max(a, c); | |
| 50 r.y1 = fz_max(b, d); | |
| 51 return r; | |
| 52 } | |
| 53 } | |
| 54 | |
| 55 fz_quad | |
| 56 pdf_to_quad(fz_context *ctx, pdf_obj *array, int offset) | |
| 57 { | |
| 58 fz_quad q; | |
| 59 q.ul.x = pdf_array_get_real(ctx, array, offset+0); | |
| 60 q.ul.y = pdf_array_get_real(ctx, array, offset+1); | |
| 61 q.ur.x = pdf_array_get_real(ctx, array, offset+2); | |
| 62 q.ur.y = pdf_array_get_real(ctx, array, offset+3); | |
| 63 q.ll.x = pdf_array_get_real(ctx, array, offset+4); | |
| 64 q.ll.y = pdf_array_get_real(ctx, array, offset+5); | |
| 65 q.lr.x = pdf_array_get_real(ctx, array, offset+6); | |
| 66 q.lr.y = pdf_array_get_real(ctx, array, offset+7); | |
| 67 return q; | |
| 68 } | |
| 69 | |
| 70 fz_point | |
| 71 pdf_to_point(fz_context *ctx, pdf_obj *array, int offset) | |
| 72 { | |
| 73 fz_point p; | |
| 74 p.x = pdf_array_get_real(ctx, array, offset+0); | |
| 75 p.y = pdf_array_get_real(ctx, array, offset+1); | |
| 76 return p; | |
| 77 } | |
| 78 | |
| 79 fz_matrix | |
| 80 pdf_to_matrix(fz_context *ctx, pdf_obj *array) | |
| 81 { | |
| 82 if (!pdf_is_array(ctx, array)) | |
| 83 return fz_identity; | |
| 84 else | |
| 85 { | |
| 86 fz_matrix m; | |
| 87 m.a = pdf_array_get_real(ctx, array, 0); | |
| 88 m.b = pdf_array_get_real(ctx, array, 1); | |
| 89 m.c = pdf_array_get_real(ctx, array, 2); | |
| 90 m.d = pdf_array_get_real(ctx, array, 3); | |
| 91 m.e = pdf_array_get_real(ctx, array, 4); | |
| 92 m.f = pdf_array_get_real(ctx, array, 5); | |
| 93 return m; | |
| 94 } | |
| 95 } | |
| 96 | |
| 97 char * | |
| 98 pdf_format_date(fz_context *ctx, int64_t time, char *s, size_t n) | |
| 99 { | |
| 100 time_t secs = time; | |
| 101 #ifdef _POSIX_SOURCE | |
| 102 struct tm tmbuf, *tm = gmtime_r(&secs, &tmbuf); | |
| 103 #else | |
| 104 struct tm *tm = gmtime(&secs); | |
| 105 #endif | |
| 106 if (time < 0 || !tm || !strftime(s, n, "D:%Y%m%d%H%M%SZ", tm)) | |
| 107 return NULL; | |
| 108 return s; | |
| 109 } | |
| 110 | |
| 111 int64_t | |
| 112 pdf_parse_date(fz_context *ctx, const char *s) | |
| 113 { | |
| 114 int tz_sign, tz_hour, tz_min, tz_adj; | |
| 115 struct tm tm; | |
| 116 time_t utc; | |
| 117 | |
| 118 if (!s[0]) | |
| 119 return -1; | |
| 120 | |
| 121 memset(&tm, 0, sizeof tm); | |
| 122 tm.tm_mday = 1; | |
| 123 | |
| 124 tz_sign = 1; | |
| 125 tz_hour = 0; | |
| 126 tz_min = 0; | |
| 127 | |
| 128 if (s[0] == 'D' && s[1] == ':') | |
| 129 s += 2; | |
| 130 | |
| 131 if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2]) || !isdigit(s[3])) | |
| 132 { | |
| 133 fz_warn(ctx, "invalid date format (missing year)"); | |
| 134 return -1; | |
| 135 } | |
| 136 tm.tm_year = (s[0]-'0')*1000 + (s[1]-'0')*100 + (s[2]-'0')*10 + (s[3]-'0') - 1900; | |
| 137 s += 4; | |
| 138 | |
| 139 if (tm.tm_year < 70) | |
| 140 { | |
| 141 fz_warn(ctx, "invalid date (year out of range)"); | |
| 142 return -1; | |
| 143 } | |
| 144 | |
| 145 if (isdigit(s[0]) && isdigit(s[1])) | |
| 146 { | |
| 147 tm.tm_mon = (s[0]-'0')*10 + (s[1]-'0') - 1; /* month is 0-11 in struct tm */ | |
| 148 s += 2; | |
| 149 if (isdigit(s[0]) && isdigit(s[1])) | |
| 150 { | |
| 151 tm.tm_mday = (s[0]-'0')*10 + (s[1]-'0'); | |
| 152 s += 2; | |
| 153 if (isdigit(s[0]) && isdigit(s[1])) | |
| 154 { | |
| 155 tm.tm_hour = (s[0]-'0')*10 + (s[1]-'0'); | |
| 156 s += 2; | |
| 157 if (isdigit(s[0]) && isdigit(s[1])) | |
| 158 { | |
| 159 tm.tm_min = (s[0]-'0')*10 + (s[1]-'0'); | |
| 160 s += 2; | |
| 161 if (isdigit(s[0]) && isdigit(s[1])) | |
| 162 { | |
| 163 tm.tm_sec = (s[0]-'0')*10 + (s[1]-'0'); | |
| 164 s += 2; | |
| 165 } | |
| 166 } | |
| 167 } | |
| 168 } | |
| 169 } | |
| 170 | |
| 171 if (tm.tm_sec > 60 || tm.tm_min > 59 || tm.tm_hour > 23 || tm.tm_mday > 31 || tm.tm_mon > 11) | |
| 172 { | |
| 173 fz_warn(ctx, "invalid date (a field is out of range)"); | |
| 174 return -1; | |
| 175 } | |
| 176 | |
| 177 if (s[0] == 'Z') | |
| 178 { | |
| 179 if (s[1] == '0' && s[2] == '0') | |
| 180 { | |
| 181 s += 3; | |
| 182 if (s[0] == '\'' && s[1] == '0' && s[2] == '0') | |
| 183 { | |
| 184 s += 3; | |
| 185 if (s[0] == '\'') | |
| 186 s += 1; | |
| 187 } | |
| 188 } | |
| 189 else | |
| 190 { | |
| 191 s += 1; | |
| 192 } | |
| 193 } | |
| 194 else if ((s[0] == '-' || s[0] == '+') && isdigit(s[1]) && isdigit(s[2])) | |
| 195 { | |
| 196 tz_sign = (s[0] == '-') ? -1 : 1; | |
| 197 tz_hour = (s[1]-'0')*10 + (s[2]-'0'); | |
| 198 s += 3; | |
| 199 if (s[0] == '\'' && isdigit(s[1]) && isdigit(s[2])) | |
| 200 { | |
| 201 tz_min = (s[1]-'0')*10 + (s[2]-'0'); | |
| 202 s += 3; | |
| 203 if (s[0] == '\'') | |
| 204 s += 1; | |
| 205 } | |
| 206 } | |
| 207 | |
| 208 /* PDF is based on ISO/IEC 8824 which limits time zones from -15 to +16. */ | |
| 209 if (tz_sign < 0 && (tz_hour > 15 || (tz_hour == 15 && tz_min > 0))) | |
| 210 { | |
| 211 fz_warn(ctx, "invalid date format (time zone out of range)"); | |
| 212 return -1; | |
| 213 } | |
| 214 if (tz_sign > 0 && (tz_hour > 16 || (tz_hour == 16 && tz_min > 0))) | |
| 215 { | |
| 216 fz_warn(ctx, "invalid date format (time zone out of range)"); | |
| 217 return -1; | |
| 218 } | |
| 219 | |
| 220 if (s[0] != 0) | |
| 221 fz_warn(ctx, "invalid date format (garbage at end)"); | |
| 222 | |
| 223 utc = timegm(&tm); | |
| 224 if (utc == (time_t)-1) | |
| 225 { | |
| 226 fz_warn(ctx, "date overflow error"); | |
| 227 return -1; | |
| 228 } | |
| 229 | |
| 230 tz_adj = tz_sign * (tz_hour * 3600 + tz_min * 60); | |
| 231 return utc - tz_adj; | |
| 232 } | |
| 233 | |
| 234 int64_t | |
| 235 pdf_to_date(fz_context *ctx, pdf_obj *time) | |
| 236 { | |
| 237 return pdf_parse_date(ctx, pdf_to_str_buf(ctx, time)); | |
| 238 } | |
| 239 | |
| 240 static int | |
| 241 rune_from_utf16be(int *out, const unsigned char *s, const unsigned char *end) | |
| 242 { | |
| 243 if (s + 2 <= end) | |
| 244 { | |
| 245 int a = s[0] << 8 | s[1]; | |
| 246 if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end) | |
| 247 { | |
| 248 int b = s[2] << 8 | s[3]; | |
| 249 *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000; | |
| 250 return 4; | |
| 251 } | |
| 252 *out = a; | |
| 253 return 2; | |
| 254 } | |
| 255 *out = FZ_REPLACEMENT_CHARACTER; | |
| 256 return 1; | |
| 257 } | |
| 258 | |
| 259 static int | |
| 260 rune_from_utf16le(int *out, const unsigned char *s, const unsigned char *end) | |
| 261 { | |
| 262 if (s + 2 <= end) | |
| 263 { | |
| 264 int a = s[1] << 8 | s[0]; | |
| 265 if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end) | |
| 266 { | |
| 267 int b = s[3] << 8 | s[2]; | |
| 268 *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000; | |
| 269 return 4; | |
| 270 } | |
| 271 *out = a; | |
| 272 return 2; | |
| 273 } | |
| 274 *out = FZ_REPLACEMENT_CHARACTER; | |
| 275 return 1; | |
| 276 } | |
| 277 | |
/*
	Detect a language escape sequence at byte offset 'i' of the
	little-endian UTF-16 data 's' of length 'n'.

	Returns 6 when an escape unit (27, 0) at 'i' is matched by another at
	'i + 4', 8 when it is matched by one at 'i + 6', and 0 otherwise.
*/
static size_t
skip_language_code_utf16le(const unsigned char *s, size_t n, size_t i)
{
	/* Even the short form does not fit: nothing to skip. */
	if (i + 6 > n)
		return 0;

	/* Opening escape unit, low byte first. */
	if (s[i] != 27 || s[i+1] != 0)
		return 0;

	if (s[i+4] == 27 && s[i+5] == 0)
		return 6;

	if (i + 8 <= n && s[i+6] == 27 && s[i+7] == 0)
		return 8;

	return 0;
}
| 288 | |
/*
	Detect a language escape sequence at byte offset 'i' of the
	big-endian UTF-16 data 's' of length 'n'.

	Returns 6 when an escape unit (0, 27) at 'i' is matched by another at
	'i + 4', 8 when it is matched by one at 'i + 6', and 0 otherwise.
*/
static size_t
skip_language_code_utf16be(const unsigned char *s, size_t n, size_t i)
{
	/* Even the short form does not fit: nothing to skip. */
	if (i + 6 > n)
		return 0;

	/* Opening escape unit, high byte first. */
	if (s[i] != 0 || s[i+1] != 27)
		return 0;

	if (s[i+4] == 0 && s[i+5] == 27)
		return 6;

	if (i + 8 <= n && s[i+6] == 0 && s[i+7] == 27)
		return 8;

	return 0;
}
| 299 | |
/*
	Detect a language escape sequence at byte offset 'i' of the UTF-8
	data 's' of length 'n'.

	An escape sequence is ESC (27), a two byte language code, an optional
	two byte country code, and a closing ESC. Returns the length of the
	whole sequence including both ESC bytes (4 or 6), or 0 if there is no
	complete sequence at 'i'.

	Every byte that is inspected lies strictly below 'n', and the closing
	ESC is consumed as part of the sequence so that it cannot be mistaken
	for the start of another one.
*/
static size_t
skip_language_code_utf8(const unsigned char *s, size_t n, size_t i)
{
	/* ESC + language + ESC */
	if (i + 4 <= n && s[i] == 27 && s[i+3] == 27)
		return 4;
	/* ESC + language + country + ESC */
	else if (i + 6 <= n && s[i] == 27 && s[i+5] == 27)
		return 6;
	return 0;
}
| 310 | |
/*
	Structural UTF-8 check over the bytes in [s, end).

	Each lead byte selects how many continuation bytes (10xxxxxx) must
	follow: none below 0x80, one below 0xE0, two below 0xF0, three below
	0xF5. Bytes 0x80-0xBF in lead position and bytes 0xF5 and above are
	rejected. Returns 1 if every sequence is complete, 0 otherwise.
	An empty range yields 1.
*/
static int
is_valid_utf8(const unsigned char *s, const unsigned char *end)
{
	while (s < end)
	{
		unsigned char lead = *s++;
		int trail;

		if (lead < 0x80)
			trail = 0;
		else if (lead < 0xC0)
			return 0; /* stray continuation byte */
		else if (lead < 0xE0)
			trail = 1;
		else if (lead < 0xF0)
			trail = 2;
		else if (lead < 0xF5)
			trail = 3;
		else
			return 0;

		for (; trail > 0; --trail)
		{
			if (s >= end || (*s & 0xC0) != 0x80)
				return 0;
			++s;
		}
	}
	return 1;
}
| 325 | |
| 326 char * | |
| 327 pdf_new_utf8_from_pdf_string(fz_context *ctx, const char *ssrcptr, size_t srclen) | |
| 328 { | |
| 329 const unsigned char *srcptr = (const unsigned char*)ssrcptr; | |
| 330 char *dstptr, *dst; | |
| 331 size_t dstlen = 0; | |
| 332 int ucs; | |
| 333 size_t i, n; | |
| 334 | |
| 335 /* UTF-16BE */ | |
| 336 if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) | |
| 337 { | |
| 338 i = 2; | |
| 339 while (i + 2 <= srclen) | |
| 340 { | |
| 341 n = skip_language_code_utf16be(srcptr, srclen, i); | |
| 342 if (n) | |
| 343 i += n; | |
| 344 else | |
| 345 { | |
| 346 i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen); | |
| 347 dstlen += fz_runelen(ucs); | |
| 348 } | |
| 349 } | |
| 350 | |
| 351 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16be"); | |
| 352 | |
| 353 i = 2; | |
| 354 while (i + 2 <= srclen) | |
| 355 { | |
| 356 n = skip_language_code_utf16be(srcptr, srclen, i); | |
| 357 if (n) | |
| 358 i += n; | |
| 359 else | |
| 360 { | |
| 361 i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen); | |
| 362 dstptr += fz_runetochar(dstptr, ucs); | |
| 363 } | |
| 364 } | |
| 365 } | |
| 366 | |
| 367 /* UTF-16LE */ | |
| 368 else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254) | |
| 369 { | |
| 370 i = 2; | |
| 371 while (i + 2 <= srclen) | |
| 372 { | |
| 373 n = skip_language_code_utf16le(srcptr, srclen, i); | |
| 374 if (n) | |
| 375 i += n; | |
| 376 else | |
| 377 { | |
| 378 i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen); | |
| 379 dstlen += fz_runelen(ucs); | |
| 380 } | |
| 381 } | |
| 382 | |
| 383 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf16le"); | |
| 384 | |
| 385 i = 2; | |
| 386 while (i + 2 <= srclen) | |
| 387 { | |
| 388 n = skip_language_code_utf16le(srcptr, srclen, i); | |
| 389 if (n) | |
| 390 i += n; | |
| 391 else | |
| 392 { | |
| 393 i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen); | |
| 394 dstptr += fz_runetochar(dstptr, ucs); | |
| 395 } | |
| 396 } | |
| 397 } | |
| 398 | |
| 399 /* UTF-8 */ | |
| 400 else if (srclen >= 3 && srcptr[0] == 239 && srcptr[1] == 187 && srcptr[2] == 191) | |
| 401 { | |
| 402 i = 3; | |
| 403 while (i < srclen) | |
| 404 { | |
| 405 n = skip_language_code_utf8(srcptr, srclen, i); | |
| 406 if (n) | |
| 407 i += n; | |
| 408 else | |
| 409 { | |
| 410 i += 1; | |
| 411 dstlen += 1; | |
| 412 } | |
| 413 } | |
| 414 | |
| 415 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_utf8"); | |
| 416 | |
| 417 i = 3; | |
| 418 while (i < srclen) | |
| 419 { | |
| 420 n = skip_language_code_utf8(srcptr, srclen, i); | |
| 421 if (n) | |
| 422 i += n; | |
| 423 else | |
| 424 *dstptr++ = srcptr[i++]; | |
| 425 } | |
| 426 } | |
| 427 | |
| 428 /* Detect UTF-8 strings that aren't marked with a BOM */ | |
| 429 else if (is_valid_utf8(srcptr, srcptr + srclen)) | |
| 430 { | |
| 431 dst = Memento_label(fz_malloc(ctx, srclen + 1), "utf8_from_guess"); | |
| 432 memcpy(dst, srcptr, srclen); | |
| 433 dstptr = dst + srclen; | |
| 434 } | |
| 435 | |
| 436 /* PDFDocEncoding */ | |
| 437 else | |
| 438 { | |
| 439 for (i = 0; i < srclen; i++) | |
| 440 dstlen += fz_runelen(fz_unicode_from_pdf_doc_encoding[srcptr[i]]); | |
| 441 | |
| 442 dstptr = dst = Memento_label(fz_malloc(ctx, dstlen + 1), "utf8_from_pdfdocenc"); | |
| 443 | |
| 444 for (i = 0; i < srclen; i++) | |
| 445 { | |
| 446 ucs = fz_unicode_from_pdf_doc_encoding[srcptr[i]]; | |
| 447 dstptr += fz_runetochar(dstptr, ucs); | |
| 448 } | |
| 449 } | |
| 450 | |
| 451 *dstptr = 0; | |
| 452 return dst; | |
| 453 } | |
| 454 | |
| 455 char * | |
| 456 pdf_new_utf8_from_pdf_string_obj(fz_context *ctx, pdf_obj *src) | |
| 457 { | |
| 458 const char *srcptr; | |
| 459 size_t srclen; | |
| 460 srcptr = pdf_to_string(ctx, src, &srclen); | |
| 461 return pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen); | |
| 462 } | |
| 463 | |
| 464 char * | |
| 465 pdf_new_utf8_from_pdf_stream_obj(fz_context *ctx, pdf_obj *src) | |
| 466 { | |
| 467 fz_buffer *stmbuf; | |
| 468 char *srcptr; | |
| 469 size_t srclen; | |
| 470 char *dst = NULL; | |
| 471 | |
| 472 stmbuf = pdf_load_stream(ctx, src); | |
| 473 srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr); | |
| 474 fz_try(ctx) | |
| 475 dst = pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen); | |
| 476 fz_always(ctx) | |
| 477 fz_drop_buffer(ctx, stmbuf); | |
| 478 fz_catch(ctx) | |
| 479 fz_rethrow(ctx); | |
| 480 return dst; | |
| 481 } | |
| 482 | |
| 483 char * | |
| 484 pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src) | |
| 485 { | |
| 486 if (pdf_is_stream(ctx, src)) | |
| 487 return pdf_new_utf8_from_pdf_stream_obj(ctx, src); | |
| 488 return pdf_new_utf8_from_pdf_string_obj(ctx, src); | |
| 489 } | |
| 490 | |
| 491 static pdf_obj * | |
| 492 pdf_new_text_string_utf16be(fz_context *ctx, const char *s) | |
| 493 { | |
| 494 const char *ss; | |
| 495 int c, i, n, a, b; | |
| 496 unsigned char *p; | |
| 497 pdf_obj *obj; | |
| 498 | |
| 499 ss = s; | |
| 500 n = 0; | |
| 501 while (*ss) | |
| 502 { | |
| 503 ss += fz_chartorune(&c, ss); | |
| 504 n += (c >= 0x10000) ? 2 : 1; | |
| 505 } | |
| 506 | |
| 507 p = fz_malloc(ctx, n * 2 + 2); | |
| 508 i = 0; | |
| 509 p[i++] = 254; | |
| 510 p[i++] = 255; | |
| 511 while (*s) | |
| 512 { | |
| 513 s += fz_chartorune(&c, s); | |
| 514 if (c >= 0x10000) | |
| 515 { | |
| 516 a = (((c - 0x10000) >> 10) & 0x3ff) + 0xD800; | |
| 517 p[i++] = (a>>8) & 0xff; | |
| 518 p[i++] = (a) & 0xff; | |
| 519 b = (((c - 0x10000)) & 0x3ff) + 0xDC00; | |
| 520 p[i++] = (b>>8) & 0xff; | |
| 521 p[i++] = (b) & 0xff; | |
| 522 } | |
| 523 else | |
| 524 { | |
| 525 p[i++] = (c>>8) & 0xff; | |
| 526 p[i++] = (c) & 0xff; | |
| 527 } | |
| 528 } | |
| 529 | |
| 530 fz_try(ctx) | |
| 531 obj = pdf_new_string(ctx, (char*)p, i); | |
| 532 fz_always(ctx) | |
| 533 fz_free(ctx, p); | |
| 534 fz_catch(ctx) | |
| 535 fz_rethrow(ctx); | |
| 536 return obj; | |
| 537 } | |
| 538 | |
| 539 pdf_obj * | |
| 540 pdf_new_text_string(fz_context *ctx, const char *s) | |
| 541 { | |
| 542 int i = 0; | |
| 543 while (s[i] != 0) | |
| 544 { | |
| 545 if (((unsigned char)s[i]) >= 128) | |
| 546 return pdf_new_text_string_utf16be(ctx, s); | |
| 547 ++i; | |
| 548 } | |
| 549 return pdf_new_string(ctx, s, i); | |
| 550 } | |
| 551 | |
| 552 pdf_obj * | |
| 553 pdf_parse_array(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf) | |
| 554 { | |
| 555 pdf_obj *ary = NULL; | |
| 556 pdf_obj *obj = NULL; | |
| 557 int64_t a = 0, b = 0, n = 0; | |
| 558 pdf_token tok; | |
| 559 pdf_obj *op = NULL; | |
| 560 | |
| 561 fz_var(obj); | |
| 562 | |
| 563 ary = pdf_new_array(ctx, doc, 4); | |
| 564 | |
| 565 fz_try(ctx) | |
| 566 { | |
| 567 while (1) | |
| 568 { | |
| 569 tok = pdf_lex(ctx, file, buf); | |
| 570 | |
| 571 if (tok != PDF_TOK_INT && tok != PDF_TOK_R) | |
| 572 { | |
| 573 if (n > 0) | |
| 574 pdf_array_push_int(ctx, ary, a); | |
| 575 if (n > 1) | |
| 576 pdf_array_push_int(ctx, ary, b); | |
| 577 n = 0; | |
| 578 } | |
| 579 | |
| 580 if (tok == PDF_TOK_INT && n == 2) | |
| 581 { | |
| 582 pdf_array_push_int(ctx, ary, a); | |
| 583 a = b; | |
| 584 n --; | |
| 585 } | |
| 586 | |
| 587 switch (tok) | |
| 588 { | |
| 589 case PDF_TOK_EOF: | |
| 590 fz_throw(ctx, FZ_ERROR_SYNTAX, "array not closed before end of file"); | |
| 591 | |
| 592 case PDF_TOK_CLOSE_ARRAY: | |
| 593 op = ary; | |
| 594 goto end; | |
| 595 | |
| 596 case PDF_TOK_INT: | |
| 597 if (n == 0) | |
| 598 a = buf->i; | |
| 599 if (n == 1) | |
| 600 b = buf->i; | |
| 601 n ++; | |
| 602 break; | |
| 603 | |
| 604 case PDF_TOK_R: | |
| 605 if (n != 2) | |
| 606 fz_throw(ctx, FZ_ERROR_SYNTAX, "cannot parse indirect reference in array"); | |
| 607 pdf_array_push_drop(ctx, ary, pdf_new_indirect(ctx, doc, a, b)); | |
| 608 n = 0; | |
| 609 break; | |
| 610 | |
| 611 case PDF_TOK_OPEN_ARRAY: | |
| 612 obj = pdf_parse_array(ctx, doc, file, buf); | |
| 613 pdf_array_push_drop(ctx, ary, obj); | |
| 614 break; | |
| 615 | |
| 616 case PDF_TOK_OPEN_DICT: | |
| 617 obj = pdf_parse_dict(ctx, doc, file, buf); | |
| 618 pdf_array_push_drop(ctx, ary, obj); | |
| 619 break; | |
| 620 | |
| 621 case PDF_TOK_NAME: | |
| 622 pdf_array_push_name(ctx, ary, buf->scratch); | |
| 623 break; | |
| 624 case PDF_TOK_REAL: | |
| 625 pdf_array_push_real(ctx, ary, buf->f); | |
| 626 break; | |
| 627 case PDF_TOK_STRING: | |
| 628 pdf_array_push_string(ctx, ary, buf->scratch, buf->len); | |
| 629 break; | |
| 630 case PDF_TOK_TRUE: | |
| 631 pdf_array_push_bool(ctx, ary, 1); | |
| 632 break; | |
| 633 case PDF_TOK_FALSE: | |
| 634 pdf_array_push_bool(ctx, ary, 0); | |
| 635 break; | |
| 636 case PDF_TOK_NULL: | |
| 637 pdf_array_push(ctx, ary, PDF_NULL); | |
| 638 break; | |
| 639 | |
| 640 default: | |
| 641 pdf_array_push(ctx, ary, PDF_NULL); | |
| 642 break; | |
| 643 } | |
| 644 } | |
| 645 end: | |
| 646 {} | |
| 647 } | |
| 648 fz_catch(ctx) | |
| 649 { | |
| 650 pdf_drop_obj(ctx, ary); | |
| 651 fz_rethrow(ctx); | |
| 652 } | |
| 653 return op; | |
| 654 } | |
| 655 | |
| 656 pdf_obj * | |
| 657 pdf_parse_dict(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf) | |
| 658 { | |
| 659 pdf_obj *dict; | |
| 660 pdf_obj *key = NULL; | |
| 661 pdf_obj *val = NULL; | |
| 662 pdf_token tok; | |
| 663 int64_t a, b; | |
| 664 | |
| 665 dict = pdf_new_dict(ctx, doc, 8); | |
| 666 | |
| 667 fz_var(key); | |
| 668 fz_var(val); | |
| 669 | |
| 670 fz_try(ctx) | |
| 671 { | |
| 672 while (1) | |
| 673 { | |
| 674 tok = pdf_lex(ctx, file, buf); | |
| 675 skip: | |
| 676 if (tok == PDF_TOK_CLOSE_DICT) | |
| 677 break; | |
| 678 | |
| 679 /* for BI .. ID .. EI in content streams */ | |
| 680 if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")) | |
| 681 break; | |
| 682 | |
| 683 if (tok != PDF_TOK_NAME) | |
| 684 fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid key in dict"); | |
| 685 | |
| 686 key = pdf_new_name(ctx, buf->scratch); | |
| 687 | |
| 688 tok = pdf_lex(ctx, file, buf); | |
| 689 | |
| 690 switch (tok) | |
| 691 { | |
| 692 case PDF_TOK_OPEN_ARRAY: | |
| 693 val = pdf_parse_array(ctx, doc, file, buf); | |
| 694 break; | |
| 695 | |
| 696 case PDF_TOK_OPEN_DICT: | |
| 697 val = pdf_parse_dict(ctx, doc, file, buf); | |
| 698 break; | |
| 699 | |
| 700 case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break; | |
| 701 case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break; | |
| 702 case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break; | |
| 703 case PDF_TOK_TRUE: val = PDF_TRUE; break; | |
| 704 case PDF_TOK_FALSE: val = PDF_FALSE; break; | |
| 705 case PDF_TOK_NULL: val = PDF_NULL; break; | |
| 706 | |
| 707 case PDF_TOK_INT: | |
| 708 /* 64-bit to allow for numbers > INT_MAX and overflow */ | |
| 709 a = buf->i; | |
| 710 tok = pdf_lex(ctx, file, buf); | |
| 711 if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME || | |
| 712 (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))) | |
| 713 { | |
| 714 pdf_dict_put_int(ctx, dict, key, a); | |
| 715 pdf_drop_obj(ctx, key); | |
| 716 key = NULL; | |
| 717 goto skip; | |
| 718 } | |
| 719 if (tok == PDF_TOK_INT) | |
| 720 { | |
| 721 b = buf->i; | |
| 722 tok = pdf_lex(ctx, file, buf); | |
| 723 if (tok == PDF_TOK_R) | |
| 724 { | |
| 725 val = pdf_new_indirect(ctx, doc, a, b); | |
| 726 break; | |
| 727 } | |
| 728 } | |
| 729 fz_warn(ctx, "invalid indirect reference in dict"); | |
| 730 val = PDF_NULL; | |
| 731 break; | |
| 732 | |
| 733 default: | |
| 734 val = PDF_NULL; | |
| 735 break; | |
| 736 } | |
| 737 | |
| 738 pdf_dict_put(ctx, dict, key, val); | |
| 739 pdf_drop_obj(ctx, val); | |
| 740 val = NULL; | |
| 741 pdf_drop_obj(ctx, key); | |
| 742 key = NULL; | |
| 743 } | |
| 744 } | |
| 745 fz_catch(ctx) | |
| 746 { | |
| 747 pdf_drop_obj(ctx, dict); | |
| 748 pdf_drop_obj(ctx, key); | |
| 749 pdf_drop_obj(ctx, val); | |
| 750 fz_rethrow(ctx); | |
| 751 } | |
| 752 return dict; | |
| 753 } | |
| 754 | |
| 755 pdf_obj * | |
| 756 pdf_parse_stm_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf) | |
| 757 { | |
| 758 pdf_token tok; | |
| 759 | |
| 760 tok = pdf_lex(ctx, file, buf); | |
| 761 | |
| 762 switch (tok) | |
| 763 { | |
| 764 case PDF_TOK_OPEN_ARRAY: | |
| 765 return pdf_parse_array(ctx, doc, file, buf); | |
| 766 case PDF_TOK_OPEN_DICT: | |
| 767 return pdf_parse_dict(ctx, doc, file, buf); | |
| 768 case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch); | |
| 769 case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f); | |
| 770 case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len); | |
| 771 case PDF_TOK_TRUE: return PDF_TRUE; | |
| 772 case PDF_TOK_FALSE: return PDF_FALSE; | |
| 773 case PDF_TOK_NULL: return PDF_NULL; | |
| 774 case PDF_TOK_INT: return pdf_new_int(ctx, buf->i); | |
| 775 default: fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown token in object stream"); | |
| 776 } | |
| 777 } | |
| 778 | |
| 779 pdf_obj * | |
| 780 pdf_parse_ind_obj_or_newobj(fz_context *ctx, pdf_document *doc, fz_stream *file, | |
| 781 int *onum, int *ogen, int64_t *ostmofs, int *try_repair, int *newobj) | |
| 782 { | |
| 783 pdf_obj *obj = NULL; | |
| 784 int num = 0, gen = 0; | |
| 785 int64_t stm_ofs; | |
| 786 pdf_token tok; | |
| 787 pdf_lexbuf *buf = &doc->lexbuf.base; | |
| 788 int64_t a, b; | |
| 789 int read_next_token = 1; | |
| 790 | |
| 791 fz_var(obj); | |
| 792 | |
| 793 tok = pdf_lex(ctx, file, buf); | |
| 794 if (tok != PDF_TOK_INT) | |
| 795 { | |
| 796 if (try_repair) | |
| 797 *try_repair = 1; | |
| 798 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected object number"); | |
| 799 } | |
| 800 num = buf->i; | |
| 801 if (num < 0 || num > PDF_MAX_OBJECT_NUMBER) | |
| 802 fz_throw(ctx, FZ_ERROR_SYNTAX, "object number out of range"); | |
| 803 | |
| 804 tok = pdf_lex(ctx, file, buf); | |
| 805 if (tok != PDF_TOK_INT) | |
| 806 { | |
| 807 if (try_repair) | |
| 808 *try_repair = 1; | |
| 809 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected generation number (%d ? obj)", num); | |
| 810 } | |
| 811 gen = buf->i; | |
| 812 if (gen < 0 || gen >= 65536) | |
| 813 { | |
| 814 if (try_repair) | |
| 815 *try_repair = 1; | |
| 816 fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid generation number (%d)", gen); | |
| 817 } | |
| 818 | |
| 819 tok = pdf_lex(ctx, file, buf); | |
| 820 if (tok == PDF_TOK_NEWOBJ && newobj) | |
| 821 { | |
| 822 *newobj = 1; | |
| 823 if (onum) *onum = num; | |
| 824 if (ogen) *ogen = gen; | |
| 825 if (ostmofs) *ostmofs = 0; | |
| 826 return NULL; | |
| 827 } | |
| 828 if (tok != PDF_TOK_OBJ) | |
| 829 { | |
| 830 if (try_repair) | |
| 831 *try_repair = 1; | |
| 832 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'obj' keyword (%d %d ?)", num, gen); | |
| 833 } | |
| 834 | |
| 835 tok = pdf_lex(ctx, file, buf); | |
| 836 | |
| 837 switch (tok) | |
| 838 { | |
| 839 case PDF_TOK_OPEN_ARRAY: | |
| 840 obj = pdf_parse_array(ctx, doc, file, buf); | |
| 841 break; | |
| 842 | |
| 843 case PDF_TOK_OPEN_DICT: | |
| 844 obj = pdf_parse_dict(ctx, doc, file, buf); | |
| 845 break; | |
| 846 | |
| 847 case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break; | |
| 848 case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break; | |
| 849 case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break; | |
| 850 case PDF_TOK_TRUE: obj = PDF_TRUE; break; | |
| 851 case PDF_TOK_FALSE: obj = PDF_FALSE; break; | |
| 852 case PDF_TOK_NULL: obj = PDF_NULL; break; | |
| 853 | |
| 854 case PDF_TOK_INT: | |
| 855 a = buf->i; | |
| 856 tok = pdf_lex(ctx, file, buf); | |
| 857 | |
| 858 if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ) | |
| 859 { | |
| 860 obj = pdf_new_int(ctx, a); | |
| 861 read_next_token = 0; | |
| 862 break; | |
| 863 } | |
| 864 else if (tok == PDF_TOK_INT) | |
| 865 { | |
| 866 b = buf->i; | |
| 867 tok = pdf_lex(ctx, file, buf); | |
| 868 if (tok == PDF_TOK_R) | |
| 869 { | |
| 870 obj = pdf_new_indirect(ctx, doc, a, b); | |
| 871 break; | |
| 872 } | |
| 873 } | |
| 874 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'R' keyword (%d %d R)", num, gen); | |
| 875 | |
| 876 case PDF_TOK_ENDOBJ: | |
| 877 obj = PDF_NULL; | |
| 878 read_next_token = 0; | |
| 879 break; | |
| 880 | |
| 881 default: | |
| 882 fz_throw(ctx, FZ_ERROR_SYNTAX, "syntax error in object (%d %d R)", num, gen); | |
| 883 } | |
| 884 | |
| 885 fz_try(ctx) | |
| 886 { | |
| 887 if (read_next_token) | |
| 888 tok = pdf_lex(ctx, file, buf); | |
| 889 | |
| 890 if (tok == PDF_TOK_STREAM) | |
| 891 { | |
| 892 int c = fz_read_byte(ctx, file); | |
| 893 while (c == ' ') | |
| 894 c = fz_read_byte(ctx, file); | |
| 895 if (c == '\r') | |
| 896 { | |
| 897 c = fz_peek_byte(ctx, file); | |
| 898 if (c != '\n') | |
| 899 fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen); | |
| 900 else | |
| 901 fz_read_byte(ctx, file); | |
| 902 } | |
| 903 stm_ofs = fz_tell(ctx, file); | |
| 904 } | |
| 905 else if (tok == PDF_TOK_ENDOBJ) | |
| 906 { | |
| 907 stm_ofs = 0; | |
| 908 } | |
| 909 else | |
| 910 { | |
| 911 fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen); | |
| 912 stm_ofs = 0; | |
| 913 } | |
| 914 } | |
| 915 fz_catch(ctx) | |
| 916 { | |
| 917 pdf_drop_obj(ctx, obj); | |
| 918 fz_rethrow(ctx); | |
| 919 } | |
| 920 | |
| 921 if (onum) *onum = num; | |
| 922 if (ogen) *ogen = gen; | |
| 923 if (ostmofs) *ostmofs = stm_ofs; | |
| 924 | |
| 925 return obj; | |
| 926 } | |
| 927 | |
| 928 pdf_obj * | |
| 929 pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, | |
| 930 int *onum, int *ogen, int64_t *ostmofs, int *try_repair) | |
| 931 { | |
| 932 return pdf_parse_ind_obj_or_newobj(ctx, doc, file, onum, ogen, ostmofs, try_repair, NULL); | |
| 933 } | |
| 934 | |
| 935 pdf_obj * | |
| 936 pdf_parse_journal_obj(fz_context *ctx, pdf_document *doc, fz_stream *stm, | |
| 937 int *onum, fz_buffer **ostm, int *newobj) | |
| 938 { | |
| 939 pdf_obj *obj = NULL; | |
| 940 pdf_token tok; | |
| 941 pdf_lexbuf *buf = &doc->lexbuf.base; | |
| 942 int64_t stmofs; | |
| 943 | |
| 944 *newobj = 0; | |
| 945 obj = pdf_parse_ind_obj_or_newobj(ctx, doc, stm, onum, NULL, &stmofs, NULL, newobj); | |
| 946 /* This will have consumed either the stream or the endobj keywords. */ | |
| 947 | |
| 948 *ostm = NULL; | |
| 949 if (stmofs) | |
| 950 { | |
| 951 fz_stream *stream = NULL; | |
| 952 | |
| 953 fz_var(stream); | |
| 954 | |
| 955 fz_try(ctx) | |
| 956 { | |
| 957 stream = fz_open_endstream_filter(ctx, stm, 0, stmofs); | |
| 958 *ostm = fz_read_all(ctx, stream, 32); | |
| 959 fz_drop_stream(ctx, stream); | |
| 960 stream = NULL; | |
| 961 fz_seek(ctx, stm, stmofs + (*ostm ? (*ostm)->len : 0), SEEK_SET); | |
| 962 tok = pdf_lex(ctx, stm, buf); | |
| 963 if (tok != PDF_TOK_ENDSTREAM) | |
| 964 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'endstream' keyword"); | |
| 965 tok = pdf_lex(ctx, stm, buf); | |
| 966 if (tok != PDF_TOK_ENDOBJ) | |
| 967 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'endobj' keyword"); | |
| 968 } | |
| 969 fz_always(ctx) | |
| 970 fz_drop_stream(ctx, stream); | |
| 971 fz_catch(ctx) | |
| 972 { | |
| 973 pdf_drop_obj(ctx, obj); | |
| 974 fz_rethrow(ctx); | |
| 975 } | |
| 976 } | |
| 977 | |
| 978 return obj; | |
| 979 } |
