Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/zlib/examples/gznorm.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /* gznorm.c -- normalize a gzip stream | |
| 2 * Copyright (C) 2018 Mark Adler | |
| 3 * For conditions of distribution and use, see copyright notice in zlib.h | |
| 4 * Version 1.0 7 Oct 2018 Mark Adler */ | |
| 5 | |
| 6 // gznorm takes a gzip stream, potentially containing multiple members, and | |
| 7 // converts it to a gzip stream with a single member. In addition the gzip | |
| 8 // header is normalized, removing the file name and time stamp, and setting the | |
| 9 // other header contents (XFL, OS) to fixed values. gznorm does not recompress | |
| 10 // the data, so it is fast, but no advantage is gained from the history that | |
| 11 // could be available across member boundaries. | |
| 12 | |
| 13 #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf, | |
| 14 // vsnprintf, stdout, stderr, NULL, FILE | |
| 15 #include <stdlib.h> // malloc, free | |
| 16 #include <string.h> // strerror | |
| 17 #include <errno.h> // errno | |
| 18 #include <stdarg.h> // va_list, va_start, va_end | |
| 19 #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd, | |
| 20 // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK, | |
| 21 // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR, | |
| 22 // Z_MEM_ERROR | |
| 23 | |
// SET_BINARY_MODE(file) switches a stdio stream to binary mode on platforms
// that distinguish text from binary streams, and does nothing elsewhere. Both
// variants expand to a single void expression, so the macro can be used
// anywhere an expression statement is allowed (including as the sole body of
// an if/else) without leaving an empty statement behind.
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
#  include <fcntl.h>
#  include <io.h>
#  define SET_BINARY_MODE(file) ((void)setmode(fileno(file), O_BINARY))
#else
#  define SET_BINARY_MODE(file) ((void)0)
#endif

// File-scope linkage marker used for the helper functions in this file.
#define local static
| 33 | |
// printf to an allocated string. Returns the newly allocated, NUL-terminated
// string, which the caller must free(), or NULL if formatting or allocation
// fails.
//
// fmt is a printf-style format string; the remaining arguments are the values
// it consumes. The arguments are walked twice (once to measure, once to
// format), so each pass gets its own va_start()/va_end() pair.
static char *aprintf(const char *fmt, ...) {
    // Get the length of the result of the printf.
    va_list args;
    va_start(args, fmt);
    int len = vsnprintf(NULL, 0, fmt, args);
    va_end(args);
    if (len < 0)
        return NULL;

    // Allocate the required space. The addition is done in size_t so that a
    // length of INT_MAX cannot overflow a signed int.
    size_t size = (size_t)len + 1;
    char *str = malloc(size);
    if (str == NULL)
        return NULL;

    // Format into the allocated space, and verify that the result fit.
    va_start(args, fmt);
    int wrote = vsnprintf(str, size, fmt, args);
    va_end(args);
    if (wrote < 0 || (size_t)wrote >= size) {
        free(str);
        return NULL;
    }
    return str;
}
| 54 | |
// Error exit for gzip_normalize(). Expands in a scope that has a z_stream
// named strm and a char ** named err: the inflate state is released, *err
// receives a message allocated by aprintf() (NULL if that allocation fails),
// and the enclosing function returns 1. Calling inflateEnd() on a state that
// has already been ended, or whose state is Z_NULL, is permitted, so BYE() may
// be used both before and after the normal inflateEnd().
#define BYE(...) \
    do { \
        (void)inflateEnd(&strm); \
        *err = aprintf(__VA_ARGS__); \
        return 1; \
    } while (0)

// Number of bytes per buffered read and per inflate output buffer.
// gzip_normalize() keeps two buffers of this size on the stack. The value
// must fit in an unsigned.
#define CHUNK 16384
| 68 | |
| 69 // Read a gzip stream from in and write an equivalent normalized gzip stream to | |
| 70 // out. If given no input, an empty gzip stream will be written. If successful, | |
| 71 // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the | |
| 72 // details of the error are returned in *err, a pointer to an allocated string. | |
| 73 // | |
| 74 // The input may be a stream with multiple gzip members, which is converted to | |
| 75 // a single gzip member on the output. Each gzip member is decompressed at the | |
| 76 // level of deflate blocks. This enables clearing the last-block bit, shifting | |
| 77 // the compressed data to concatenate to the previous member's compressed data, | |
| 78 // which can end at an arbitrary bit boundary, and identifying stored blocks in | |
| 79 // order to resynchronize those to byte boundaries. The deflate compressed data | |
| 80 // is terminated with a 10-bit empty fixed block. If any members on the input | |
| 81 // end with a 10-bit empty fixed block, then that block is excised from the | |
| 82 // stream. This avoids appending empty fixed blocks for every normalization, | |
| 83 // and assures that gzip_normalize applied a second time will not change the | |
| 84 // input. The pad bits after stored block headers and after the final deflate | |
| 85 // block are all forced to zeros. | |
| 86 local int gzip_normalize(FILE *in, FILE *out, char **err) { | |
| 87 // initialize the inflate engine to process a gzip member | |
| 88 z_stream strm; | |
| 89 strm.zalloc = Z_NULL; | |
| 90 strm.zfree = Z_NULL; | |
| 91 strm.opaque = Z_NULL; | |
| 92 strm.avail_in = 0; | |
| 93 strm.next_in = Z_NULL; | |
| 94 if (inflateInit2(&strm, 15 + 16) != Z_OK) | |
| 95 BYE("out of memory"); | |
| 96 | |
| 97 // State while processing the input gzip stream. | |
| 98 enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ... | |
| 99 BETWEEN, // between gzip members (must end in this state) | |
| 100 HEAD, // reading a gzip header | |
| 101 BLOCK, // reading deflate blocks | |
| 102 TAIL // reading a gzip trailer | |
| 103 } state = BETWEEN; // current component being processed | |
| 104 unsigned long crc = 0; // accumulated CRC of uncompressed data | |
| 105 unsigned long len = 0; // accumulated length of uncompressed data | |
| 106 unsigned long buf = 0; // deflate stream bit buffer of num bits | |
| 107 int num = 0; // number of bits in buf (at bottom) | |
| 108 | |
| 109 // Write a canonical gzip header (no mod time, file name, comment, extra | |
| 110 // block, or extra flags, and OS is marked as unknown). | |
| 111 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); | |
| 112 | |
| 113 // Process the gzip stream from in until reaching the end of the input, | |
| 114 // encountering invalid input, or experiencing an i/o error. | |
| 115 int more; // true if not at the end of the input | |
| 116 do { | |
| 117 // State inside this loop. | |
| 118 unsigned char *put; // next input buffer location to process | |
| 119 int prev; // number of bits from previous block in | |
| 120 // the bit buffer, or -1 if not at the | |
| 121 // start of a block | |
| 122 unsigned long long memb; // uncompressed length of member | |
| 123 size_t tail; // number of trailer bytes read (0..8) | |
| 124 unsigned long part; // accumulated trailer component | |
| 125 | |
| 126 // Get the next chunk of input from in. | |
| 127 unsigned char dat[CHUNK]; | |
| 128 strm.avail_in = fread(dat, 1, CHUNK, in); | |
| 129 if (strm.avail_in == 0) | |
| 130 break; | |
| 131 more = strm.avail_in == CHUNK; | |
| 132 strm.next_in = put = dat; | |
| 133 | |
| 134 // Run that chunk of input through the inflate engine to exhaustion. | |
| 135 do { | |
| 136 // At this point it is assured that strm.avail_in > 0. | |
| 137 | |
| 138 // Inflate until the end of a gzip component (header, deflate | |
| 139 // block, trailer) is reached, or until all of the chunk is | |
| 140 // consumed. The resulting decompressed data is discarded, though | |
| 141 // the total size of the decompressed data in each member is | |
| 142 // tracked, for the calculation of the total CRC. | |
| 143 do { | |
| 144 // inflate and handle any errors | |
| 145 unsigned char scrap[CHUNK]; | |
| 146 strm.avail_out = CHUNK; | |
| 147 strm.next_out = scrap; | |
| 148 int ret = inflate(&strm, Z_BLOCK); | |
| 149 if (ret == Z_MEM_ERROR) | |
| 150 BYE("out of memory"); | |
| 151 if (ret == Z_DATA_ERROR) | |
| 152 BYE("input invalid: %s", strm.msg); | |
| 153 if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END) | |
| 154 BYE("internal error"); | |
| 155 | |
| 156 // Update the number of uncompressed bytes generated in this | |
| 157 // member. The actual count (not modulo 2^32) is required to | |
| 158 // correctly compute the total CRC. | |
| 159 unsigned got = CHUNK - strm.avail_out; | |
| 160 memb += got; | |
| 161 if (memb < got) | |
| 162 BYE("overflow error"); | |
| 163 | |
| 164 // Continue to process this chunk until it is consumed, or | |
| 165 // until the end of a component (header, deflate block, or | |
| 166 // trailer) is reached. | |
| 167 } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0); | |
| 168 | |
| 169 // Since strm.avail_in was > 0 for the inflate call, some input was | |
| 170 // just consumed. It is therefore assured that put < strm.next_in. | |
| 171 | |
| 172 // Disposition the consumed component or part of a component. | |
| 173 switch (state) { | |
| 174 case BETWEEN: | |
| 175 state = HEAD; | |
| 176 // Fall through to HEAD when some or all of the header is | |
| 177 // processed. | |
| 178 | |
| 179 case HEAD: | |
| 180 // Discard the header. | |
| 181 if (strm.data_type & 0x80) { | |
| 182 // End of header reached -- deflate blocks follow. | |
| 183 put = strm.next_in; | |
| 184 prev = num; | |
| 185 memb = 0; | |
| 186 state = BLOCK; | |
| 187 } | |
| 188 break; | |
| 189 | |
| 190 case BLOCK: | |
| 191 // Copy the deflate stream to the output, but with the | |
| 192 // last-block-bit cleared. Re-synchronize stored block | |
| 193 // headers to the output byte boundaries. The bytes at | |
| 194 // put..strm.next_in-1 is the compressed data that has been | |
| 195 // processed and is ready to be copied to the output. | |
| 196 | |
| 197 // At this point, it is assured that new compressed data is | |
| 198 // available, i.e., put < strm.next_in. If prev is -1, then | |
| 199 // that compressed data starts in the middle of a deflate | |
| 200 // block. If prev is not -1, then the bits in the bit | |
| 201 // buffer, possibly combined with the bits in *put, contain | |
| 202 // the three-bit header of the new deflate block. In that | |
| 203 // case, prev is the number of bits from the previous block | |
| 204 // that remain in the bit buffer. Since num is the number | |
| 205 // of bits in the bit buffer, we have that num - prev is | |
| 206 // the number of bits from the new block currently in the | |
| 207 // bit buffer. | |
| 208 | |
| 209 // If strm.data_type & 0xc0 is 0x80, then the last byte of | |
| 210 // the available compressed data includes the last bits of | |
| 211 // the end of a deflate block. In that case, that last byte | |
| 212 // also has strm.data_type & 0x1f bits of the next deflate | |
| 213 // block, in the range 0..7. If strm.data_type & 0xc0 is | |
| 214 // 0xc0, then the last byte of the compressed data is the | |
| 215 // end of the deflate stream, followed by strm.data_type & | |
| 216 // 0x1f pad bits, also in the range 0..7. | |
| 217 | |
| 218 // Set bits to the number of bits not yet consumed from the | |
| 219 // last byte. If we are at the end of the block, bits is | |
| 220 // either the number of bits in the last byte belonging to | |
| 221 // the next block, or the number of pad bits after the | |
| 222 // final block. In either of those cases, bits is in the | |
| 223 // range 0..7. | |
| 224 ; // (required due to C syntax oddity) | |
| 225 int bits = strm.data_type & 0x1f; | |
| 226 | |
| 227 if (prev != -1) { | |
| 228 // We are at the start of a new block. Clear the last | |
| 229 // block bit, and check for special cases. If it is a | |
| 230 // stored block, then emit the header and pad to the | |
| 231 // next byte boundary. If it is a final, empty fixed | |
| 232 // block, then excise it. | |
| 233 | |
| 234 // Some or all of the three header bits for this block | |
| 235 // may already be in the bit buffer. Load any remaining | |
| 236 // header bits into the bit buffer. | |
| 237 if (num - prev < 3) { | |
| 238 buf += (unsigned long)*put++ << num; | |
| 239 num += 8; | |
| 240 } | |
| 241 | |
| 242 // Set last to have a 1 in the position of the last | |
| 243 // block bit in the bit buffer. | |
| 244 unsigned long last = (unsigned long)1 << prev; | |
| 245 | |
| 246 if (((buf >> prev) & 7) == 3) { | |
| 247 // This is a final fixed block. Load at least ten | |
| 248 // bits from this block, including the header, into | |
| 249 // the bit buffer. We already have at least three, | |
| 250 // so at most one more byte needs to be loaded. | |
| 251 if (num - prev < 10) { | |
| 252 if (put == strm.next_in) | |
| 253 // Need to go get and process more input. | |
| 254 // We'll end up back here to finish this. | |
| 255 break; | |
| 256 buf += (unsigned long)*put++ << num; | |
| 257 num += 8; | |
| 258 } | |
| 259 if (((buf >> prev) & 0x3ff) == 3) { | |
| 260 // That final fixed block is empty. Delete it | |
| 261 // to avoid adding an empty block every time a | |
| 262 // gzip stream is normalized. | |
| 263 num = prev; | |
| 264 buf &= last - 1; // zero the pad bits | |
| 265 } | |
| 266 } | |
| 267 else if (((buf >> prev) & 6) == 0) { | |
| 268 // This is a stored block. Flush to the next | |
| 269 // byte boundary after the three-bit header. | |
| 270 num = (prev + 10) & ~7; | |
| 271 buf &= last - 1; // zero the pad bits | |
| 272 } | |
| 273 | |
| 274 // Clear the last block bit. | |
| 275 buf &= ~last; | |
| 276 | |
| 277 // Write out complete bytes in the bit buffer. | |
| 278 while (num >= 8) { | |
| 279 putc(buf, out); | |
| 280 buf >>= 8; | |
| 281 num -= 8; | |
| 282 } | |
| 283 | |
| 284 // If no more bytes left to process, then we have | |
| 285 // consumed the byte that had bits from the next block. | |
| 286 if (put == strm.next_in) | |
| 287 bits = 0; | |
| 288 } | |
| 289 | |
| 290 // We are done handling the deflate block header. Now copy | |
| 291 // all or almost all of the remaining compressed data that | |
| 292 // has been processed so far. Don't copy one byte at the | |
| 293 // end if it contains bits from the next deflate block or | |
| 294 // pad bits at the end of a deflate block. | |
| 295 | |
| 296 // mix is 1 if we are at the end of a deflate block, and if | |
| 297 // some of the bits in the last byte follow this block. mix | |
| 298 // is 0 if we are in the middle of a deflate block, if the | |
| 299 // deflate block ended on a byte boundary, or if all of the | |
| 300 // compressed data processed so far has been consumed. | |
| 301 int mix = (strm.data_type & 0x80) && bits; | |
| 302 | |
| 303 // Copy all of the processed compressed data to the output, | |
| 304 // except for the last byte if it contains bits from the | |
| 305 // next deflate block or pad bits at the end of the deflate | |
| 306 // stream. Copy the data after shifting in num bits from | |
| 307 // buf in front of it, leaving num bits from the end of the | |
| 308 // compressed data in buf when done. | |
| 309 unsigned char *end = strm.next_in - mix; | |
| 310 if (put < end) { | |
| 311 if (num) | |
| 312 // Insert num bits from buf before the data being | |
| 313 // copied. | |
| 314 do { | |
| 315 buf += (unsigned)(*put++) << num; | |
| 316 putc(buf, out); | |
| 317 buf >>= 8; | |
| 318 } while (put < end); | |
| 319 else { | |
| 320 // No shifting needed -- write directly. | |
| 321 fwrite(put, 1, end - put, out); | |
| 322 put = end; | |
| 323 } | |
| 324 } | |
| 325 | |
| 326 // Process the last processed byte if it wasn't written. | |
| 327 if (mix) { | |
| 328 // Load the last byte into the bit buffer. | |
| 329 buf += (unsigned)(*put++) << num; | |
| 330 num += 8; | |
| 331 | |
| 332 if (strm.data_type & 0x40) { | |
| 333 // We are at the end of the deflate stream and | |
| 334 // there are bits pad bits. Discard the pad bits | |
| 335 // and write a byte to the output, if available. | |
| 336 // Leave the num bits left over in buf to prepend | |
| 337 // to the next deflate stream. | |
| 338 num -= bits; | |
| 339 if (num >= 8) { | |
| 340 putc(buf, out); | |
| 341 num -= 8; | |
| 342 buf >>= 8; | |
| 343 } | |
| 344 | |
| 345 // Force the pad bits in the bit buffer to zeros. | |
| 346 buf &= ((unsigned long)1 << num) - 1; | |
| 347 | |
| 348 // Don't need to set prev here since going to TAIL. | |
| 349 } | |
| 350 else | |
| 351 // At the end of an internal deflate block. Leave | |
| 352 // the last byte in the bit buffer to examine on | |
| 353 // the next entry to BLOCK, when more bits from the | |
| 354 // next block will be available. | |
| 355 prev = num - bits; // number of bits in buffer | |
| 356 // from current block | |
| 357 } | |
| 358 | |
| 359 // Don't have a byte left over, so we are in the middle of | |
| 360 // a deflate block, or the deflate block ended on a byte | |
| 361 // boundary. Set prev appropriately for the next entry into | |
| 362 // BLOCK. | |
| 363 else if (strm.data_type & 0x80) | |
| 364 // The block ended on a byte boundary, so no header | |
| 365 // bits are in the bit buffer. | |
| 366 prev = num; | |
| 367 else | |
| 368 // In the middle of a deflate block, so no header here. | |
| 369 prev = -1; | |
| 370 | |
| 371 // Check for the end of the deflate stream. | |
| 372 if ((strm.data_type & 0xc0) == 0xc0) { | |
| 373 // That ends the deflate stream on the input side, the | |
| 374 // pad bits were discarded, and any remaining bits from | |
| 375 // the last block in the stream are saved in the bit | |
| 376 // buffer to prepend to the next stream. Process the | |
| 377 // gzip trailer next. | |
| 378 tail = 0; | |
| 379 part = 0; | |
| 380 state = TAIL; | |
| 381 } | |
| 382 break; | |
| 383 | |
| 384 case TAIL: | |
| 385 // Accumulate available trailer bytes to update the total | |
| 386 // CRC and the total uncompressed length. | |
| 387 do { | |
| 388 part = (part >> 8) + ((unsigned long)(*put++) << 24); | |
| 389 tail++; | |
| 390 if (tail == 4) { | |
| 391 // Update the total CRC. | |
| 392 z_off_t len2 = memb; | |
| 393 if (len2 < 0 || (unsigned long long)len2 != memb) | |
| 394 BYE("overflow error"); | |
| 395 crc = crc ? crc32_combine(crc, part, len2) : part; | |
| 396 part = 0; | |
| 397 } | |
| 398 else if (tail == 8) { | |
| 399 // Update the total uncompressed length. (It's ok | |
| 400 // if this sum is done modulo 2^32.) | |
| 401 len += part; | |
| 402 | |
| 403 // At the end of a member. Set up to inflate an | |
| 404 // immediately following gzip member. (If we made | |
| 405 // it this far, then the trailer was valid.) | |
| 406 if (inflateReset(&strm) != Z_OK) | |
| 407 BYE("internal error"); | |
| 408 state = BETWEEN; | |
| 409 break; | |
| 410 } | |
| 411 } while (put < strm.next_in); | |
| 412 break; | |
| 413 } | |
| 414 | |
| 415 // Process the input buffer until completely consumed. | |
| 416 } while (strm.avail_in > 0); | |
| 417 | |
| 418 // Process input until end of file, invalid input, or i/o error. | |
| 419 } while (more); | |
| 420 | |
| 421 // Done with the inflate engine. | |
| 422 inflateEnd(&strm); | |
| 423 | |
| 424 // Verify the validity of the input. | |
| 425 if (state != BETWEEN) | |
| 426 BYE("input invalid: incomplete gzip stream"); | |
| 427 | |
| 428 // Write the remaining deflate stream bits, followed by a terminating | |
| 429 // deflate fixed block. | |
| 430 buf += (unsigned long)3 << num; | |
| 431 putc(buf, out); | |
| 432 putc(buf >> 8, out); | |
| 433 if (num > 6) | |
| 434 putc(0, out); | |
| 435 | |
| 436 // Write the gzip trailer, which is the CRC and the uncompressed length | |
| 437 // modulo 2^32, both in little-endian order. | |
| 438 putc(crc, out); | |
| 439 putc(crc >> 8, out); | |
| 440 putc(crc >> 16, out); | |
| 441 putc(crc >> 24, out); | |
| 442 putc(len, out); | |
| 443 putc(len >> 8, out); | |
| 444 putc(len >> 16, out); | |
| 445 putc(len >> 24, out); | |
| 446 fflush(out); | |
| 447 | |
| 448 // Check for any i/o errors. | |
| 449 if (ferror(in) || ferror(out)) | |
| 450 BYE("i/o error: %s", strerror(errno)); | |
| 451 | |
| 452 // All good! | |
| 453 *err = NULL; | |
| 454 return 0; | |
| 455 } | |
| 456 | |
// Normalize the gzip stream on stdin, writing the result to stdout. The exit
// status is 0 on success and 1 on error, in which case a diagnostic is written
// to stderr.
int main(void) {
    // Avoid end-of-line conversions on evil operating systems.
    SET_BINARY_MODE(stdin);
    SET_BINARY_MODE(stdout);

    // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
    char *err = NULL;
    int ret = gzip_normalize(stdin, stdout, &err);
    if (ret)
        // The message itself is allocated, so it can be NULL if that
        // allocation failed. Passing NULL for %s is undefined, so substitute
        // a fixed string in that case.
        fprintf(stderr, "gznorm error: %s\n",
                err != NULL ? err : "unknown error (out of memory)");
    free(err);
    return ret;
}
