Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/leptonica/src/pdfio2.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /*====================================================================* | |
| 2 - Copyright (C) 2001 Leptonica. All rights reserved. | |
| 3 - | |
| 4 - Redistribution and use in source and binary forms, with or without | |
| 5 - modification, are permitted provided that the following conditions | |
| 6 - are met: | |
| 7 - 1. Redistributions of source code must retain the above copyright | |
| 8 - notice, this list of conditions and the following disclaimer. | |
| 9 - 2. Redistributions in binary form must reproduce the above | |
| 10 - copyright notice, this list of conditions and the following | |
| 11 - disclaimer in the documentation and/or other materials | |
| 12 - provided with the distribution. | |
| 13 - | |
| 14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY | |
| 18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| 19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
| 20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
| 21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
| 22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
| 23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| 24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 25 *====================================================================*/ | |
| 26 | |
| 27 /*! | |
| 28 * \file pdfio2.c | |
| 29 * <pre> | |
| 30 * | |
| 31 * Lower-level operations for generating pdf. | |
| 32 * | |
| 33 * Intermediate function for single page, multi-image conversion | |
| 34 * l_int32 pixConvertToPdfData() | |
| 35 * | |
| 36 * Intermediate function for generating multipage pdf output | |
| 37 * l_int32 ptraConcatenatePdfToData() | |
| 38 * | |
| 39 * Convert tiff multipage to pdf file | |
| 40 * l_int32 convertTiffMultipageToPdf() | |
| 41 * | |
| 42 * Generates the CID, transcoding under some conditions | |
| 43 * l_int32 l_generateCIDataForPdf() | |
| 44 * l_int32 l_generateCIData() | |
| 45 * | |
| 46 * Lower-level CID generation without transcoding | |
| 47 * L_COMP_DATA *l_generateFlateDataPdf() | |
| 48 * L_COMP_DATA *l_generateJpegData() | |
| 49 * L_COMP_DATA *l_generateJpegDataMem() | |
| 50 * static L_COMP_DATA *l_generateJp2kData() | |
| 51 * L_COMP_DATA *l_generateG4Data() | |
| 52 * | |
| 53 * Lower-level CID generation with transcoding | |
| 54 * l_int32 pixGenerateCIData() | |
| 55 * L_COMP_DATA *l_generateFlateData() | |
| 56 * static L_COMP_DATA *pixGenerateFlateData() | |
| 57 * static L_COMP_DATA *pixGenerateJpegData() | |
| 58 * static L_COMP_DATA *pixGenerateJp2kData() | |
| 59 * static L_COMP_DATA *pixGenerateG4Data() | |
| 60 * | |
| 61 * Other CID operations | |
| 62 * l_int32 cidConvertToPdfData() | |
| 63 * void l_CIDataDestroy() | |
| 64 * | |
| 65 * Helper functions for generating the output pdf string | |
| 66 * static l_int32 l_generatePdf() | |
| 67 * static void generateFixedStringsPdf() | |
| 68 * static char *generateEscapeString() | |
| 69 * static void generateMediaboxPdf() | |
| 70 * static l_int32 generatePageStringPdf() | |
| 71 * static l_int32 generateContentStringPdf() | |
| 72 * static l_int32 generatePreXStringsPdf() | |
| 73 * static l_int32 generateColormapStringsPdf() | |
| 74 * static void generateTrailerPdf() | |
| 75 * static char *makeTrailerStringPdf() | |
| 76 * static l_int32 generateOutputDataPdf() | |
| 77 * | |
| 78 * Helper functions for generating multipage pdf output | |
| 79 * static l_int32 parseTrailerPdf() | |
| 80 * static char *generatePagesObjStringPdf() | |
| 81 * static L_BYTEA *substituteObjectNumbers() | |
| 82 * | |
| 83 * Create/destroy/access pdf data | |
| 84 * static L_PDF_DATA *pdfdataCreate() | |
| 85 * static void pdfdataDestroy() | |
| 86 * static L_COMP_DATA *pdfdataGetCid() | |
| 87 * | |
| 88 * Find number of pages in a pdf | |
| 89 * l_int32 getPdfPageCount() | |
| 90 * | |
| 91 * Find widths and heights of pages and media boxes in a pdf | |
| 92 * l_int32 getPdfPageSizes() | |
| 93 * l_int32 getPdfMediaBoxSizes() | |
| 94 * | |
| 95 * Find effective resolution of images rendered from a pdf | |
| 96 * l_int32 getPdfRendererResolution() | |
| 97 * | |
| 98 * Set flags for special modes | |
| 99 * void l_pdfSetG4ImageMask() | |
| 100 * void l_pdfSetDateAndVersion() | |
| 101 * | |
| 102 * </pre> | |
| 103 */ | |
| 104 | |
| 105 #ifdef HAVE_CONFIG_H | |
| 106 #include <config_auto.h> | |
| 107 #endif /* HAVE_CONFIG_H */ | |
| 108 | |
| 109 #include <string.h> | |
| 110 #include <math.h> | |
| 111 #include "allheaders.h" | |
| 112 | |
| 113 /* --------------------------------------------*/ | |
| 114 #if USE_PDFIO /* defined in environ.h */ | |
| 115 /* --------------------------------------------*/ | |
| 116 | |
| 117 /* Typical scan resolution in ppi (pixels/inch) */ | |
| 118 static const l_int32 DefaultInputRes = 300; | |
| 119 | |
| 120 /* Static helpers */ | |
| 121 static L_COMP_DATA *l_generateJp2kData(const char *fname); | |
| 122 static L_COMP_DATA *pixGenerateFlateData(PIX *pixs, l_int32 ascii85flag); | |
| 123 static L_COMP_DATA *pixGenerateJpegData(PIX *pixs, l_int32 ascii85flag, | |
| 124 l_int32 quality); | |
| 125 static L_COMP_DATA *pixGenerateJp2kData(PIX *pixs, l_int32 quality); | |
| 126 static L_COMP_DATA *pixGenerateG4Data(PIX *pixs, l_int32 ascii85flag); | |
| 127 | |
| 128 static l_int32 l_generatePdf(l_uint8 **pdata, size_t *pnbytes, | |
| 129 L_PDF_DATA *lpd); | |
| 130 static void generateFixedStringsPdf(L_PDF_DATA *lpd); | |
| 131 static char *generateEscapeString(const char *str); | |
| 132 static void generateMediaboxPdf(L_PDF_DATA *lpd); | |
| 133 static l_int32 generatePageStringPdf(L_PDF_DATA *lpd); | |
| 134 static l_int32 generateContentStringPdf(L_PDF_DATA *lpd); | |
| 135 static l_int32 generatePreXStringsPdf(L_PDF_DATA *lpd); | |
| 136 static l_int32 generateColormapStringsPdf(L_PDF_DATA *lpd); | |
| 137 static void generateTrailerPdf(L_PDF_DATA *lpd); | |
| 138 static char *makeTrailerStringPdf(L_DNA *daloc); | |
| 139 static l_int32 generateOutputDataPdf(l_uint8 **pdata, size_t *pnbytes, | |
| 140 L_PDF_DATA *lpd); | |
| 141 | |
| 142 static l_int32 parseTrailerPdf(L_BYTEA *bas, L_DNA **pda); | |
| 143 static char *generatePagesObjStringPdf(NUMA *napage); | |
| 144 static L_BYTEA *substituteObjectNumbers(L_BYTEA *bas, NUMA *na_objs); | |
| 145 | |
| 146 static L_PDF_DATA *pdfdataCreate(const char *title); | |
| 147 static void pdfdataDestroy(L_PDF_DATA **plpd); | |
| 148 static L_COMP_DATA *pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index); | |
| 149 | |
| 150 | |
| 151 /* ---------------- Defaults for rendering options ----------------- */ | |
| 152 /* Output G4 as writing through image mask; this is the default */ | |
| 153 static l_int32 var_WRITE_G4_IMAGE_MASK = 1; | |
| 154 /* Write date/time and lib version into pdf; this is the default */ | |
| 155 static l_int32 var_WRITE_DATE_AND_VERSION = 1; | |
| 156 | |
| 157 #define L_SMALLBUF 256 | |
| 158 #define L_BIGBUF 2048 /* must be able to hold hex colormap */ | |
| 159 | |
| 160 | |
| 161 #ifndef NO_CONSOLE_IO | |
| 162 #define DEBUG_MULTIPAGE 0 | |
| 163 #endif /* ~NO_CONSOLE_IO */ | |
| 164 | |
| 165 | |
| 166 /*---------------------------------------------------------------------* | |
| 167 * Intermediate function for single page, multi-image conversion * | |
| 168 *---------------------------------------------------------------------*/ | |
| 169 /*! | |
| 170 * \brief pixConvertToPdfData() | |
| 171 * | |
| 172 * \param[in] pix all depths; cmap OK | |
| 173 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE, | |
| 174 * L_JP2K_ENCODE | |
| 175 * \param[in] quality for jpeg: 1-100; 0 for default (75) | |
| 176 * for jp2k: 27-45; 0 for default (34) | |
| 177 * \param[out] pdata pdf array | |
| 178 * \param[out] pnbytes number of bytes in pdf array | |
| 179 * \param[in] x, y location of lower-left corner of image, in pixels, | |
| 180 * relative to the PostScript origin (0,0) at | |
| 181 * the lower-left corner of the page) | |
| 182 * \param[in] res override the resolution of the input image, in ppi; | |
| 183 * use 0 to respect resolution embedded in the input | |
| 184 * \param[in] title [optional] pdf title; can be null | |
| 185 * \param[in,out] plpd ptr to lpd; created on the first invocation and | |
| 186 * returned until last image is processed | |
| 187 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, | |
| 188 * L_LAST_IMAGE | |
| 189 * \return 0 if OK, 1 on error | |
| 190 * | |
| 191 * <pre> | |
| 192 * Notes: | |
| 193 * (1) If %res == 0 and the input resolution field from the pix is 0, | |
| 194 * this will use DefaultInputRes. | |
| 195 * (2) This only writes %data if it is the last image to be | |
| 196 * written on the page. | |
| 197 * (3) See comments in convertToPdf(). | |
| 198 * </pre> | |
| 199 */ | |
| 200 l_ok | |
| 201 pixConvertToPdfData(PIX *pix, | |
| 202 l_int32 type, | |
| 203 l_int32 quality, | |
| 204 l_uint8 **pdata, | |
| 205 size_t *pnbytes, | |
| 206 l_int32 x, | |
| 207 l_int32 y, | |
| 208 l_int32 res, | |
| 209 const char *title, | |
| 210 L_PDF_DATA **plpd, | |
| 211 l_int32 position) | |
| 212 { | |
| 213 l_int32 pixres, w, h, ret; | |
| 214 l_float32 xpt, ypt, wpt, hpt; | |
| 215 L_COMP_DATA *cid = NULL; | |
| 216 L_PDF_DATA *lpd = NULL; | |
| 217 | |
| 218 if (!pdata) | |
| 219 return ERROR_INT("&data not defined", __func__, 1); | |
| 220 *pdata = NULL; | |
| 221 if (!pnbytes) | |
| 222 return ERROR_INT("&nbytes not defined", __func__, 1); | |
| 223 *pnbytes = 0; | |
| 224 if (!pix) | |
| 225 return ERROR_INT("pix not defined", __func__, 1); | |
| 226 if (type != L_JPEG_ENCODE && type != L_G4_ENCODE && | |
| 227 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) { | |
| 228 selectDefaultPdfEncoding(pix, &type); | |
| 229 } | |
| 230 if (quality < 0 || quality > 100) | |
| 231 return ERROR_INT("invalid quality", __func__, 1); | |
| 232 | |
| 233 if (plpd) { /* part of multi-page invocation */ | |
| 234 if (position == L_FIRST_IMAGE) | |
| 235 *plpd = NULL; | |
| 236 } | |
| 237 | |
| 238 /* Generate the compressed image data. It must NOT | |
| 239 * be ascii85 encoded. */ | |
| 240 pixGenerateCIData(pix, type, quality, 0, &cid); | |
| 241 if (!cid) | |
| 242 return ERROR_INT("cid not made", __func__, 1); | |
| 243 | |
| 244 /* Get media box in pts. Guess the input image resolution | |
| 245 * based on the input parameter %res, the resolution data in | |
| 246 * the pix, and the size of the image. */ | |
| 247 pixres = cid->res; | |
| 248 w = cid->w; | |
| 249 h = cid->h; | |
| 250 if (res <= 0.0) | |
| 251 res = (pixres > 0) ? pixres : DefaultInputRes; | |
| 252 xpt = x * 72.f / res; | |
| 253 ypt = y * 72.f / res; | |
| 254 wpt = w * 72.f / res; | |
| 255 hpt = h * 72.f / res; | |
| 256 | |
| 257 /* Set up lpd */ | |
| 258 if (!plpd) { /* single image */ | |
| 259 if ((lpd = pdfdataCreate(title)) == NULL) | |
| 260 return ERROR_INT("lpd not made", __func__, 1); | |
| 261 } else if (position == L_FIRST_IMAGE) { /* first of multiple images */ | |
| 262 if ((lpd = pdfdataCreate(title)) == NULL) | |
| 263 return ERROR_INT("lpd not made", __func__, 1); | |
| 264 *plpd = lpd; | |
| 265 } else { /* not the first of multiple images */ | |
| 266 lpd = *plpd; | |
| 267 } | |
| 268 | |
| 269 /* Add the data to the lpd */ | |
| 270 ptraAdd(lpd->cida, cid); | |
| 271 lpd->n++; | |
| 272 ptaAddPt(lpd->xy, xpt, ypt); | |
| 273 ptaAddPt(lpd->wh, wpt, hpt); | |
| 274 | |
| 275 /* If a single image or the last of multiple images, | |
| 276 * generate the pdf and destroy the lpd */ | |
| 277 if (!plpd || (position == L_LAST_IMAGE)) { | |
| 278 ret = l_generatePdf(pdata, pnbytes, lpd); | |
| 279 pdfdataDestroy(&lpd); | |
| 280 if (plpd) *plpd = NULL; | |
| 281 if (ret) | |
| 282 return ERROR_INT("pdf output not made", __func__, 1); | |
| 283 } | |
| 284 | |
| 285 return 0; | |
| 286 } | |
| 287 | |
| 288 | |
| 289 /*---------------------------------------------------------------------* | |
| 290 * Intermediate function for generating multipage pdf output * | |
| 291 *---------------------------------------------------------------------*/ | |
| 292 /*! | |
| 293 * \brief ptraConcatenatePdfToData() | |
| 294 * | |
| 295 * \param[in] pa_data ptra array of pdf strings, each for a | |
| 296 * single-page pdf file | |
| 297 * \param[in] sa [optional] string array of pathnames for | |
| 298 * input pdf files; can be null | |
| 299 * \param[out] pdata concatenated pdf data in memory | |
| 300 * \param[out] pnbytes number of bytes in pdf data | |
| 301 * \return 0 if OK, 1 on error | |
| 302 * | |
| 303 * <pre> | |
| 304 * Notes: | |
| 305 * (1) This only works with leptonica-formatted single-page pdf files. | |
| 306 * pdf files generated by other programs will have unpredictable | |
| 307 * (and usually bad) results. The requirements for each pdf file: | |
| 308 * (a) The Catalog and Info objects are the first two. | |
| 309 * (b) Object 3 is Pages | |
| 310 * (c) Object 4 is Page | |
| 311 * (d) The remaining objects are Contents, XObjects, and ColorSpace | |
| 312 * (2) We remove trailers from each page, and append the full trailer | |
| 313 * for all pages at the end. | |
| 314 * (3) For all but the first file, remove the ID and the first 3 | |
| 315 * objects (catalog, info, pages), so that each subsequent | |
| 316 * file has only objects of these classes: | |
| 317 * Page, Contents, XObject, ColorSpace (Indexed RGB). | |
| 318 * For those objects, we substitute these refs to objects | |
| 319 * in the local file: | |
| 320 * Page: Parent(object 3), Contents, XObject(typically multiple) | |
| 321 * XObject: [ColorSpace if indexed] | |
| 322 * The Pages object on the first page (object 3) has a Kids array | |
| 323 * of references to all the Page objects, with a Count equal | |
| 324 * to the number of pages. Each Page object refers back to | |
| 325 * this parent. | |
| 326 * </pre> | |
| 327 */ | |
| 328 l_ok | |
| 329 ptraConcatenatePdfToData(L_PTRA *pa_data, | |
| 330 SARRAY *sa, | |
| 331 l_uint8 **pdata, | |
| 332 size_t *pnbytes) | |
| 333 { | |
| 334 char *fname, *str_pages, *str_trailer; | |
| 335 l_uint8 *pdfdata, *data; | |
| 336 l_int32 i, j, index, nobj, npages; | |
| 337 l_int32 *sizes, *locs; | |
| 338 size_t size; | |
| 339 L_BYTEA *bas, *bad, *bat1, *bat2; | |
| 340 L_DNA *da_locs, *da_sizes, *da_outlocs, *da; | |
| 341 L_DNAA *daa_locs; /* object locations on each page */ | |
| 342 NUMA *na_objs, *napage; | |
| 343 NUMAA *naa_objs; /* object mapping numbers to new values */ | |
| 344 | |
| 345 if (!pdata) | |
| 346 return ERROR_INT("&data not defined", __func__, 1); | |
| 347 *pdata = NULL; | |
| 348 if (!pnbytes) | |
| 349 return ERROR_INT("&nbytes not defined", __func__, 1); | |
| 350 *pnbytes = 0; | |
| 351 if (!pa_data) | |
| 352 return ERROR_INT("pa_data not defined", __func__, 1); | |
| 353 | |
| 354 /* Parse the files and find the object locations. | |
| 355 * Remove file data that cannot be parsed. */ | |
| 356 ptraGetActualCount(pa_data, &npages); | |
| 357 daa_locs = l_dnaaCreate(npages); | |
| 358 for (i = 0; i < npages; i++) { | |
| 359 bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i); | |
| 360 if (parseTrailerPdf(bas, &da_locs) != 0) { | |
| 361 bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION); | |
| 362 l_byteaDestroy(&bas); | |
| 363 if (sa) { | |
| 364 fname = sarrayGetString(sa, i, L_NOCOPY); | |
| 365 L_ERROR("can't parse file %s; skipping\n", __func__, fname); | |
| 366 } else { | |
| 367 L_ERROR("can't parse file %d; skipping\n", __func__, i); | |
| 368 } | |
| 369 } else { | |
| 370 l_dnaaAddDna(daa_locs, da_locs, L_INSERT); | |
| 371 } | |
| 372 } | |
| 373 | |
| 374 /* Recompute npages in case some of the files were not pdf */ | |
| 375 ptraCompactArray(pa_data); | |
| 376 ptraGetActualCount(pa_data, &npages); | |
| 377 if (npages == 0) { | |
| 378 l_dnaaDestroy(&daa_locs); | |
| 379 return ERROR_INT("no parsable pdf files found", __func__, 1); | |
| 380 } | |
| 381 | |
| 382 /* Find the mapping from initial to final object numbers */ | |
| 383 naa_objs = numaaCreate(npages); /* stores final object numbers */ | |
| 384 napage = numaCreate(npages); /* stores "Page" object numbers */ | |
| 385 index = 0; | |
| 386 for (i = 0; i < npages; i++) { | |
| 387 da = l_dnaaGetDna(daa_locs, i, L_CLONE); | |
| 388 nobj = l_dnaGetCount(da); | |
| 389 if (i == 0) { | |
| 390 numaAddNumber(napage, 4); /* object 4 on first page */ | |
| 391 na_objs = numaMakeSequence(0.0, 1.0, nobj - 1); | |
| 392 index = nobj - 1; | |
| 393 } else { /* skip the first 3 objects in each file */ | |
| 394 numaAddNumber(napage, index); /* Page object is first we add */ | |
| 395 na_objs = numaMakeConstant(0.0, nobj - 1); | |
| 396 numaReplaceNumber(na_objs, 3, 3); /* refers to parent of all */ | |
| 397 for (j = 4; j < nobj - 1; j++) | |
| 398 numaSetValue(na_objs, j, index++); | |
| 399 } | |
| 400 numaaAddNuma(naa_objs, na_objs, L_INSERT); | |
| 401 l_dnaDestroy(&da); | |
| 402 } | |
| 403 | |
| 404 /* Make the Pages object (#3) */ | |
| 405 str_pages = generatePagesObjStringPdf(napage); | |
| 406 | |
| 407 /* Build the output */ | |
| 408 bad = l_byteaCreate(5000); | |
| 409 da_outlocs = l_dnaCreate(0); /* locations of all output objects */ | |
| 410 for (i = 0; i < npages; i++) { | |
| 411 bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i); | |
| 412 pdfdata = l_byteaGetData(bas, &size); | |
| 413 da_locs = l_dnaaGetDna(daa_locs, i, L_CLONE); /* locs on this page */ | |
| 414 na_objs = numaaGetNuma(naa_objs, i, L_CLONE); /* obj # on this page */ | |
| 415 nobj = l_dnaGetCount(da_locs) - 1; | |
| 416 da_sizes = l_dnaDiffAdjValues(da_locs); /* object sizes on this page */ | |
| 417 sizes = l_dnaGetIArray(da_sizes); | |
| 418 locs = l_dnaGetIArray(da_locs); | |
| 419 if (i == 0) { | |
| 420 l_byteaAppendData(bad, pdfdata, sizes[0]); | |
| 421 l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]); | |
| 422 l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]); | |
| 423 l_byteaAppendString(bad, str_pages); | |
| 424 for (j = 0; j < 4; j++) | |
| 425 l_dnaAddNumber(da_outlocs, locs[j]); | |
| 426 } | |
| 427 for (j = 4; j < nobj; j++) { | |
| 428 l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad)); | |
| 429 bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]); | |
| 430 bat2 = substituteObjectNumbers(bat1, na_objs); | |
| 431 data = l_byteaGetData(bat2, &size); | |
| 432 l_byteaAppendData(bad, data, size); | |
| 433 l_byteaDestroy(&bat1); | |
| 434 l_byteaDestroy(&bat2); | |
| 435 } | |
| 436 if (i == npages - 1) /* last one */ | |
| 437 l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad)); | |
| 438 LEPT_FREE(sizes); | |
| 439 LEPT_FREE(locs); | |
| 440 l_dnaDestroy(&da_locs); | |
| 441 numaDestroy(&na_objs); | |
| 442 l_dnaDestroy(&da_sizes); | |
| 443 } | |
| 444 | |
| 445 /* Add the trailer */ | |
| 446 str_trailer = makeTrailerStringPdf(da_outlocs); | |
| 447 l_byteaAppendString(bad, str_trailer); | |
| 448 | |
| 449 /* Transfer the output data */ | |
| 450 *pdata = l_byteaCopyData(bad, pnbytes); | |
| 451 l_byteaDestroy(&bad); | |
| 452 | |
| 453 #if DEBUG_MULTIPAGE | |
| 454 lept_stderr("******** object mapper **********"); | |
| 455 numaaWriteStream(stderr, naa_objs); | |
| 456 | |
| 457 lept_stderr("******** Page object numbers ***********"); | |
| 458 numaWriteStderr(napage); | |
| 459 | |
| 460 lept_stderr("******** Pages object ***********\n"); | |
| 461 lept_stderr("%s\n", str_pages); | |
| 462 #endif /* DEBUG_MULTIPAGE */ | |
| 463 | |
| 464 numaDestroy(&napage); | |
| 465 numaaDestroy(&naa_objs); | |
| 466 l_dnaDestroy(&da_outlocs); | |
| 467 l_dnaaDestroy(&daa_locs); | |
| 468 LEPT_FREE(str_pages); | |
| 469 LEPT_FREE(str_trailer); | |
| 470 return 0; | |
| 471 } | |
| 472 | |
| 473 | |
| 474 /*---------------------------------------------------------------------* | |
| 475 * Convert tiff multipage to pdf file * | |
| 476 *---------------------------------------------------------------------*/ | |
| 477 /*! | |
| 478 * \brief convertTiffMultipageToPdf() | |
| 479 * | |
| 480 * \param[in] filein (tiff) | |
| 481 * \param[in] fileout (pdf) | |
| 482 * \return 0 if OK, 1 on error | |
| 483 * | |
| 484 * <pre> | |
| 485 * Notes: | |
| 486 * (1) A multipage tiff file can also be converted to PS, using | |
| 487 * convertTiffMultipageToPS() | |
| 488 * </pre> | |
| 489 */ | |
| 490 l_ok | |
| 491 convertTiffMultipageToPdf(const char *filein, | |
| 492 const char *fileout) | |
| 493 { | |
| 494 l_int32 istiff; | |
| 495 PIXA *pixa; | |
| 496 FILE *fp; | |
| 497 | |
| 498 if ((fp = fopenReadStream(filein)) == NULL) | |
| 499 return ERROR_INT_1("file not found", filein, __func__, 1); | |
| 500 istiff = fileFormatIsTiff(fp); | |
| 501 fclose(fp); | |
| 502 if (!istiff) | |
| 503 return ERROR_INT_1("file not tiff format", filein, __func__, 1); | |
| 504 | |
| 505 pixa = pixaReadMultipageTiff(filein); | |
| 506 pixaConvertToPdf(pixa, 0, 1.0, 0, 0, "weasel2", fileout); | |
| 507 pixaDestroy(&pixa); | |
| 508 return 0; | |
| 509 } | |
| 510 | |
| 511 | |
| 512 /*---------------------------------------------------------------------* | |
| 513 * CID-based operations * | |
| 514 *---------------------------------------------------------------------*/ | |
| 515 /*! | |
| 516 * \brief l_generateCIDataForPdf() | |
| 517 * | |
| 518 * \param[in] fname [optional] can be null | |
| 519 * \param[in] pix [optional] can be null | |
| 520 * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75) | |
| 521 * for jp2k if transcoded: 27-45; 0 for default (34) | |
| 522 * \param[out] pcid compressed data | |
| 523 * \return 0 if OK, 1 on error | |
| 524 * | |
| 525 * <pre> | |
| 526 * Notes: | |
| 527 * (1) You must set either filename or pix. | |
| 528 * (2) Given an image file and optionally a pix raster of that data, | |
| 529 * this provides a CID that is compatible with PDF, preferably | |
| 530 * without transcoding. | |
| 531 * (3) The pix is included for efficiency, in case transcoding | |
| 532 * is required and the pix is available to the caller. | |
| 533 * (4) We don't try to open files named "stdin" or "-" for Tesseract | |
| 534 * compatibility reasons. We may remove this restriction | |
| 535 * in the future. | |
| 536 * (5) Note that tiff-g4 must be transcoded to properly handle byte | |
| 537 * order and perhaps photometry (e.g., min-is-black). For a | |
| 538 * multipage tiff file, data will only be extracted from the | |
| 539 * first page, so this should not be invoked. | |
| 540 * </pre> | |
| 541 */ | |
| 542 l_ok | |
| 543 l_generateCIDataForPdf(const char *fname, | |
| 544 PIX *pix, | |
| 545 l_int32 quality, | |
| 546 L_COMP_DATA **pcid) | |
| 547 { | |
| 548 l_int32 format, type; | |
| 549 L_COMP_DATA *cid; | |
| 550 PIX *pixt; | |
| 551 | |
| 552 if (!pcid) | |
| 553 return ERROR_INT("&cid not defined", __func__, 1); | |
| 554 *pcid = cid = NULL; | |
| 555 if (!fname && !pix) | |
| 556 return ERROR_INT("neither fname nor pix are defined", __func__, 1); | |
| 557 | |
| 558 /* If a compressed file is given that is not 'stdin', see if we | |
| 559 * can generate the pdf output without transcoding. */ | |
| 560 if (fname && strcmp(fname, "-") != 0 && strcmp(fname, "stdin") != 0) { | |
| 561 findFileFormat(fname, &format); | |
| 562 if (format == IFF_UNKNOWN) | |
| 563 L_WARNING("file %s format is unknown\n", __func__, fname); | |
| 564 if (format == IFF_PS || format == IFF_LPDF) { | |
| 565 L_ERROR("file %s is unsupported format %d\n", | |
| 566 __func__, fname, format); | |
| 567 return 1; | |
| 568 } | |
| 569 if (format == IFF_JFIF_JPEG) { | |
| 570 cid = l_generateJpegData(fname, 0); | |
| 571 } else if (format == IFF_JP2) { | |
| 572 cid = l_generateJp2kData(fname); | |
| 573 } else if (format == IFF_PNG) { | |
| 574 cid = l_generateFlateDataPdf(fname, pix); | |
| 575 } | |
| 576 } | |
| 577 | |
| 578 /* Otherwise, use the pix to generate the pdf output */ | |
| 579 if (!cid) { | |
| 580 if (!pix) | |
| 581 pixt = pixRead(fname); | |
| 582 else | |
| 583 pixt = pixClone(pix); | |
| 584 if (!pixt) | |
| 585 return ERROR_INT("pixt not made", __func__, 1); | |
| 586 if (selectDefaultPdfEncoding(pixt, &type)) { | |
| 587 pixDestroy(&pixt); | |
| 588 return 1; | |
| 589 } | |
| 590 pixGenerateCIData(pixt, type, quality, 0, &cid); | |
| 591 pixDestroy(&pixt); | |
| 592 if (!cid) | |
| 593 return ERROR_INT("cid not made from pix", __func__, 1); | |
| 594 } | |
| 595 *pcid = cid; | |
| 596 return 0; | |
| 597 } | |
| 598 | |
| 599 | |
| 600 /*! | |
| 601 * \brief l_generateCIData() | |
| 602 * | |
| 603 * \param[in] fname | |
| 604 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE, | |
| 605 * L_JP2K_ENCODE | |
| 606 * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75) | |
| 607 * for jp2k if transcoded: 27-45; 0 for default (34) | |
| 608 * \param[in] ascii85 0 for binary; 1 for ascii85-encoded | |
| 609 * \param[out] pcid compressed data | |
| 610 * \return 0 if OK, 1 on error | |
| 611 * | |
| 612 * <pre> | |
| 613 * Notes: | |
| 614 * (1) This can be used for both PostScript and pdf. | |
| 615 * (1) Set ascii85: | |
| 616 * ~ 0 for binary data (PDF only) | |
| 617 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) | |
| 618 * (2) This attempts to compress according to the requested type. | |
| 619 * If this can't be done, it falls back to ordinary flate encoding. | |
| 620 * (3) This differs from l_generateCIDataForPdf(), which determines | |
| 621 * the file format and only works for pdf. | |
| 622 * </pre> | |
| 623 */ | |
| 624 l_ok | |
| 625 l_generateCIData(const char *fname, | |
| 626 l_int32 type, | |
| 627 l_int32 quality, | |
| 628 l_int32 ascii85, | |
| 629 L_COMP_DATA **pcid) | |
| 630 { | |
| 631 l_int32 format, d, bps, spp, iscmap; | |
| 632 L_COMP_DATA *cid; | |
| 633 PIX *pix; | |
| 634 | |
| 635 if (!pcid) | |
| 636 return ERROR_INT("&cid not defined", __func__, 1); | |
| 637 *pcid = NULL; | |
| 638 if (!fname) | |
| 639 return ERROR_INT("fname not defined", __func__, 1); | |
| 640 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && | |
| 641 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) | |
| 642 return ERROR_INT("invalid conversion type", __func__, 1); | |
| 643 if (ascii85 != 0 && ascii85 != 1) | |
| 644 return ERROR_INT("invalid ascii85", __func__, 1); | |
| 645 | |
| 646 /* Sanity check on requested encoding */ | |
| 647 pixReadHeader(fname, &format, NULL, NULL, &bps, &spp, &iscmap); | |
| 648 d = bps * spp; | |
| 649 if (d == 24) d = 32; | |
| 650 if (iscmap && type != L_FLATE_ENCODE) { | |
| 651 L_WARNING("pixs has cmap; using flate encoding\n", __func__); | |
| 652 type = L_FLATE_ENCODE; | |
| 653 } else if (d < 8 && type == L_JPEG_ENCODE) { | |
| 654 L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__); | |
| 655 type = L_FLATE_ENCODE; | |
| 656 } else if (d < 8 && type == L_JP2K_ENCODE) { | |
| 657 L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__); | |
| 658 type = L_FLATE_ENCODE; | |
| 659 } else if (d > 1 && type == L_G4_ENCODE) { | |
| 660 L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__); | |
| 661 type = L_FLATE_ENCODE; | |
| 662 } | |
| 663 | |
| 664 if (type == L_JPEG_ENCODE) { | |
| 665 if (format == IFF_JFIF_JPEG) { /* do not transcode */ | |
| 666 cid = l_generateJpegData(fname, ascii85); | |
| 667 } else { | |
| 668 if ((pix = pixRead(fname)) == NULL) | |
| 669 return ERROR_INT("pix not returned for JPEG", __func__, 1); | |
| 670 cid = pixGenerateJpegData(pix, ascii85, quality); | |
| 671 pixDestroy(&pix); | |
| 672 } | |
| 673 if (!cid) | |
| 674 return ERROR_INT("jpeg data not made", __func__, 1); | |
| 675 } else if (type == L_JP2K_ENCODE) { | |
| 676 if (format == IFF_JP2) { /* do not transcode */ | |
| 677 cid = l_generateJp2kData(fname); | |
| 678 } else { | |
| 679 if ((pix = pixRead(fname)) == NULL) | |
| 680 return ERROR_INT("pix not returned for JP2K", __func__, 1); | |
| 681 cid = pixGenerateJp2kData(pix, quality); | |
| 682 pixDestroy(&pix); | |
| 683 } | |
| 684 if (!cid) | |
| 685 return ERROR_INT("jp2k data not made", __func__, 1); | |
| 686 } else if (type == L_G4_ENCODE) { | |
| 687 if ((pix = pixRead(fname)) == NULL) | |
| 688 return ERROR_INT("pix not returned for G4", __func__, 1); | |
| 689 cid = pixGenerateG4Data(pix, ascii85); | |
| 690 pixDestroy(&pix); | |
| 691 if (!cid) | |
| 692 return ERROR_INT("g4 data not made", __func__, 1); | |
| 693 } else if (type == L_FLATE_ENCODE) { | |
| 694 if ((cid = l_generateFlateData(fname, ascii85)) == NULL) | |
| 695 return ERROR_INT("flate data not made", __func__, 1); | |
| 696 } else { | |
| 697 return ERROR_INT("invalid conversion type", __func__, 1); | |
| 698 } | |
| 699 *pcid = cid; | |
| 700 | |
| 701 return 0; | |
| 702 } | |
| 703 | |
| 704 | |
| 705 /*---------------------------------------------------------------------* | |
| 706 * Low-level CID-based operations * | |
| 707 *---------------------------------------------------------------------*/ | |
| 708 /*! | |
| 709 * \brief l_generateFlateDataPdf() | |
| 710 * | |
| 711 * \param[in] fname preferably png | |
| 712 * \param[in] pixs [optional] can be null | |
| 713 * \return cid containing png data, or NULL on error | |
| 714 * | |
| 715 * <pre> | |
| 716 * Notes: | |
| 717 * (1) If you hand this a png file, you are going to get | |
| 718 * png predictors embedded in the flate data. So it has | |
| 719 * come to this. http://xkcd.com/1022/ | |
| 720 * (2) Exception: if the png is interlaced or if it is RGBA, | |
| 721 * it will be transcoded. | |
| 722 * (3) If transcoding is required, this will not have to read from | |
| 723 * file if a pix is input. | |
| 724 * </pre> | |
| 725 */ | |
| 726 L_COMP_DATA * | |
| 727 l_generateFlateDataPdf(const char *fname, | |
| 728 PIX *pixs) | |
| 729 { | |
| 730 l_uint8 *pngcomp = NULL; /* entire PNG compressed file */ | |
| 731 l_uint8 *datacomp = NULL; /* gzipped raster data */ | |
| 732 l_uint8 *cmapdata = NULL; /* uncompressed colormap */ | |
| 733 char *cmapdatahex = NULL; /* hex ascii uncompressed colormap */ | |
| 734 l_uint32 i, j, n; | |
| 735 l_int32 format, interlaced; | |
| 736 l_int32 ncolors; /* in colormap */ | |
| 737 l_int32 bps; /* bits/sample: usually 8 */ | |
| 738 l_int32 spp; /* samples/pixel: 1-grayscale/cmap); 3-rgb; 4-rgba */ | |
| 739 l_int32 w, h, cmapflag; | |
| 740 l_int32 xres, yres; | |
| 741 size_t nbytescomp = 0, nbytespng = 0; | |
| 742 FILE *fp; | |
| 743 L_COMP_DATA *cid; | |
| 744 PIX *pix; | |
| 745 PIXCMAP *cmap = NULL; | |
| 746 | |
| 747 if (!fname) | |
| 748 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); | |
| 749 | |
| 750 findFileFormat(fname, &format); | |
| 751 spp = 0; /* init to spp != 4 if not png */ | |
| 752 interlaced = 0; /* initialize to no interlacing */ | |
| 753 bps = 0; /* initialize to a nonsense value */ | |
| 754 if (format == IFF_PNG) { | |
| 755 isPngInterlaced(fname, &interlaced); | |
| 756 if (readHeaderPng(fname, NULL, NULL, &bps, &spp, NULL)) | |
| 757 return (L_COMP_DATA *)ERROR_PTR("bad png input", __func__, NULL); | |
| 758 } | |
| 759 | |
| 760 /* PDF is capable of inlining some types of PNG files, but not all | |
| 761 of them. We need to transcode anything with interlacing, an | |
| 762 alpha channel, or 1 bpp (which would otherwise be photo-inverted). | |
| 763 | |
| 764 Note: any PNG image file with an alpha channel is converted on | |
| 765 reading to RGBA (spp == 4). This includes the (gray + alpha) format | |
| 766 with spp == 2. Because of the conversion, readHeaderPng() gives | |
| 767 spp = 2, whereas pixGetSpp() gives spp = 4 on the converted pix. */ | |
| 768 if (format != IFF_PNG || | |
| 769 (format == IFF_PNG && (interlaced || bps == 1 || spp == 4 || spp == 2))) | |
| 770 { /* lgtm+ analyzer needed the logic expanded */ | |
| 771 if (!pixs) | |
| 772 pix = pixRead(fname); | |
| 773 else | |
| 774 pix = pixClone(pixs); | |
| 775 if (!pix) | |
| 776 return (L_COMP_DATA *)ERROR_PTR("pix not made", __func__, NULL); | |
| 777 cid = pixGenerateFlateData(pix, 0); | |
| 778 pixDestroy(&pix); | |
| 779 return cid; | |
| 780 } | |
| 781 | |
| 782 /* It's png. Generate the pdf data without transcoding. | |
| 783 * Implementation by Jeff Breidenbach. | |
| 784 * First, read the metadata */ | |
| 785 if ((fp = fopenReadStream(fname)) == NULL) | |
| 786 return (L_COMP_DATA *)ERROR_PTR_1("stream not opened", | |
| 787 fname, __func__, NULL); | |
| 788 freadHeaderPng(fp, &w, &h, &bps, &spp, &cmapflag); | |
| 789 fgetPngResolution(fp, &xres, &yres); | |
| 790 fclose(fp); | |
| 791 | |
| 792 /* We get pdf corruption when inlining the data from 16 bpp png. */ | |
| 793 if (bps == 16) | |
| 794 return l_generateFlateData(fname, 0); | |
| 795 | |
| 796 /* Read the entire png file */ | |
| 797 if ((pngcomp = l_binaryRead(fname, &nbytespng)) == NULL) | |
| 798 return (L_COMP_DATA *)ERROR_PTR_1("unable to read file", | |
| 799 fname, __func__, NULL); | |
| 800 | |
| 801 /* Extract flate data, copying portions of it to memory, including | |
| 802 * the predictor information in a byte at the beginning of each | |
| 803 * raster line. The flate data makes up the vast majority of | |
| 804 * the png file, so after extraction we expect datacomp to | |
| 805 * be nearly full (i.e., nbytescomp will be only slightly less | |
| 806 * than nbytespng). Also extract the colormap if present. */ | |
| 807 if ((datacomp = (l_uint8 *)LEPT_CALLOC(1, nbytespng)) == NULL) { | |
| 808 LEPT_FREE(pngcomp); | |
| 809 return (L_COMP_DATA *)ERROR_PTR("unable to allocate memory", | |
| 810 __func__, NULL); | |
| 811 } | |
| 812 | |
| 813 /* Parse the png file. Each chunk consists of: | |
| 814 * length: 4 bytes | |
| 815 * name: 4 bytes (e.g., "IDAT") | |
| 816 * data: n bytes | |
| 817 * CRC: 4 bytes | |
| 818 * Start at the beginning of the data section of the first chunk, | |
| 819 * byte 16, because the png file begins with 8 bytes of header, | |
| 820 * followed by the first 8 bytes of the first chunk | |
| 821 * (length and name). On each loop, increment by 12 bytes to | |
| 822 * skip over the CRC, length and name of the next chunk. */ | |
| 823 for (i = 16; i < nbytespng; i += 12) { /* do each successive chunk */ | |
| 824 /* Get the chunk length */ | |
| 825 n = pngcomp[i - 8] << 24; | |
| 826 n += pngcomp[i - 7] << 16; | |
| 827 n += pngcomp[i - 6] << 8; | |
| 828 n += pngcomp[i - 5] << 0; | |
| 829 if (n >= nbytespng - i) { /* "n + i" can overflow */ | |
| 830 LEPT_FREE(pngcomp); | |
| 831 LEPT_FREE(datacomp); | |
| 832 pixcmapDestroy(&cmap); | |
| 833 L_ERROR("invalid png: i = %d, n = %d, nbytes = %zu\n", __func__, | |
| 834 i, n, nbytespng); | |
| 835 return NULL; | |
| 836 } | |
| 837 | |
| 838 /* Is it a data chunk? */ | |
| 839 if (memcmp(pngcomp + i - 4, "IDAT", 4) == 0) { | |
| 840 memcpy(datacomp + nbytescomp, pngcomp + i, n); | |
| 841 nbytescomp += n; | |
| 842 } | |
| 843 | |
| 844 /* Is it a palette chunk? */ | |
| 845 if (cmapflag && !cmap && | |
| 846 memcmp(pngcomp + i - 4, "PLTE", 4) == 0) { | |
| 847 if ((n / 3) > (1 << bps)) { | |
| 848 LEPT_FREE(pngcomp); | |
| 849 LEPT_FREE(datacomp); | |
| 850 pixcmapDestroy(&cmap); | |
| 851 L_ERROR("invalid png: i = %d, n = %d, cmapsize = %d\n", | |
| 852 __func__, i, n, (1 << bps)); | |
| 853 return NULL; | |
| 854 } | |
| 855 cmap = pixcmapCreate(bps); | |
| 856 for (j = i; j < i + n; j += 3) { | |
| 857 pixcmapAddColor(cmap, pngcomp[j], pngcomp[j + 1], | |
| 858 pngcomp[j + 2]); | |
| 859 } | |
| 860 } | |
| 861 i += n; /* move to the end of the data chunk */ | |
| 862 } | |
| 863 LEPT_FREE(pngcomp); | |
| 864 | |
| 865 if (nbytescomp == 0) { | |
| 866 LEPT_FREE(datacomp); | |
| 867 pixcmapDestroy(&cmap); | |
| 868 return (L_COMP_DATA *)ERROR_PTR("invalid PNG file", __func__, NULL); | |
| 869 } | |
| 870 | |
| 871 /* Extract and encode the colormap data as hexascii */ | |
| 872 ncolors = 0; | |
| 873 if (cmap) { | |
| 874 pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata); | |
| 875 pixcmapDestroy(&cmap); | |
| 876 if (!cmapdata) { | |
| 877 LEPT_FREE(datacomp); | |
| 878 return (L_COMP_DATA *)ERROR_PTR("cmapdata not made", | |
| 879 __func__, NULL); | |
| 880 } | |
| 881 cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors); | |
| 882 LEPT_FREE(cmapdata); | |
| 883 } | |
| 884 | |
| 885 /* Note that this is the only situation where the predictor | |
| 886 * field of the CID is set to 1. Adobe's predictor values on | |
| 887 * p. 76 of pdf_reference_1-7.pdf give 1 for no predictor and | |
| 888 * 10-14 for inline predictors, the specifics of which are | |
| 889 * ignored by the pdf interpreter, which just needs to know that | |
| 890 * the first byte on each compressed scanline is some predictor | |
| 891 * whose type can be inferred from the byte itself. */ | |
| 892 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); | |
| 893 cid->datacomp = datacomp; | |
| 894 cid->type = L_FLATE_ENCODE; | |
| 895 cid->cmapdatahex = cmapdatahex; | |
| 896 cid->nbytescomp = nbytescomp; | |
| 897 cid->ncolors = ncolors; | |
| 898 cid->predictor = TRUE; | |
| 899 cid->w = w; | |
| 900 cid->h = h; | |
| 901 cid->bps = bps; | |
| 902 cid->spp = spp; | |
| 903 cid->res = xres; | |
| 904 return cid; | |
| 905 } | |
| 906 | |
| 907 | |
| 908 /*! | |
| 909 * \brief l_generateJpegData() | |
| 910 * | |
| 911 * \param[in] fname of jpeg file | |
| 912 * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg | |
| 913 * \return cid containing jpeg data, or NULL on error | |
| 914 * | |
| 915 * <pre> | |
| 916 * Notes: | |
| 917 * (1) Set ascii85flag: | |
| 918 * ~ 0 for binary data (PDF only) | |
| 919 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) | |
| 920 * (2) Most of this function is repeated in l_generateJpegMemData(), | |
| 921 * which is required in pixacompFastConvertToPdfData(). | |
| 922 * </pre> | |
| 923 */ | |
| 924 L_COMP_DATA * | |
| 925 l_generateJpegData(const char *fname, | |
| 926 l_int32 ascii85flag) | |
| 927 { | |
| 928 char *data85 = NULL; /* ascii85 encoded jpeg compressed file */ | |
| 929 l_uint8 *data = NULL; | |
| 930 l_int32 w, h, xres, yres, bps, spp; | |
| 931 size_t nbytes, nbytes85; | |
| 932 L_COMP_DATA *cid; | |
| 933 FILE *fp; | |
| 934 | |
| 935 if (!fname) | |
| 936 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); | |
| 937 | |
| 938 if (ascii85flag != 0 && ascii85flag != 1) | |
| 939 return (L_COMP_DATA *)ERROR_PTR("wrong ascii85flags", __func__, NULL); | |
| 940 | |
| 941 /* Read the metadata */ | |
| 942 if (readHeaderJpeg(fname, &w, &h, &spp, NULL, NULL)) | |
| 943 return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL); | |
| 944 bps = 8; | |
| 945 if ((fp = fopenReadStream(fname)) == NULL) | |
| 946 return (L_COMP_DATA *)ERROR_PTR_1("stream not opened", | |
| 947 fname, __func__, NULL); | |
| 948 fgetJpegResolution(fp, &xres, &yres); | |
| 949 fclose(fp); | |
| 950 | |
| 951 /* Read the entire jpeg file. The returned jpeg data in memory | |
| 952 * starts with ffd8 and ends with ffd9 */ | |
| 953 if ((data = l_binaryRead(fname, &nbytes)) == NULL) | |
| 954 return (L_COMP_DATA *)ERROR_PTR_1("data not extracted", | |
| 955 fname, __func__, NULL); | |
| 956 | |
| 957 /* Optionally, encode the compressed data */ | |
| 958 if (ascii85flag == 1) { | |
| 959 data85 = encodeAscii85(data, nbytes, &nbytes85); | |
| 960 LEPT_FREE(data); | |
| 961 if (!data85) | |
| 962 return (L_COMP_DATA *)ERROR_PTR_1("data85 not made", | |
| 963 fname, __func__, NULL); | |
| 964 else | |
| 965 data85[nbytes85 - 1] = '\0'; /* remove the newline */ | |
| 966 } | |
| 967 | |
| 968 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); | |
| 969 if (ascii85flag == 0) { | |
| 970 cid->datacomp = data; | |
| 971 } else { /* ascii85 */ | |
| 972 cid->data85 = data85; | |
| 973 cid->nbytes85 = nbytes85; | |
| 974 } | |
| 975 cid->type = L_JPEG_ENCODE; | |
| 976 cid->nbytescomp = nbytes; | |
| 977 cid->w = w; | |
| 978 cid->h = h; | |
| 979 cid->bps = bps; | |
| 980 cid->spp = spp; | |
| 981 cid->res = xres; | |
| 982 return cid; | |
| 983 } | |
| 984 | |
| 985 | |
| 986 /*! | |
| 987 * \brief l_generateJpegDataMem() | |
| 988 * | |
| 989 * \param[in] data of jpeg-encoded file | |
| 990 * \param[in] nbytes size of jpeg-encoded file | |
| 991 * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg | |
| 992 * \return cid containing jpeg data, or NULL on error | |
| 993 * | |
| 994 * <pre> | |
| 995 * Notes: | |
| 996 * (1) Set ascii85flag: | |
| 997 * ~ 0 for binary data (PDF only) | |
| 998 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) | |
| 999 * </pre> | |
| 1000 */ | |
| 1001 L_COMP_DATA * | |
| 1002 l_generateJpegDataMem(l_uint8 *data, | |
| 1003 size_t nbytes, | |
| 1004 l_int32 ascii85flag) | |
| 1005 { | |
| 1006 char *data85 = NULL; /* ascii85 encoded jpeg compressed file */ | |
| 1007 l_int32 w, h, xres, yres, bps, spp; | |
| 1008 size_t nbytes85; | |
| 1009 L_COMP_DATA *cid; | |
| 1010 | |
| 1011 if (!data) | |
| 1012 return (L_COMP_DATA *)ERROR_PTR("data not defined", __func__, NULL); | |
| 1013 | |
| 1014 /* Read the metadata */ | |
| 1015 if (readHeaderMemJpeg(data, nbytes, &w, &h, &spp, NULL, NULL)) { | |
| 1016 LEPT_FREE(data); | |
| 1017 return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL); | |
| 1018 } | |
| 1019 bps = 8; | |
| 1020 readResolutionMemJpeg(data, nbytes, &xres, &yres); | |
| 1021 | |
| 1022 /* Optionally, encode the compressed data */ | |
| 1023 if (ascii85flag == 1) { | |
| 1024 data85 = encodeAscii85(data, nbytes, &nbytes85); | |
| 1025 LEPT_FREE(data); | |
| 1026 if (!data85) | |
| 1027 return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL); | |
| 1028 else | |
| 1029 data85[nbytes85 - 1] = '\0'; /* remove the newline */ | |
| 1030 } | |
| 1031 | |
| 1032 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); | |
| 1033 if (ascii85flag == 0) { | |
| 1034 cid->datacomp = data; | |
| 1035 } else { /* ascii85 */ | |
| 1036 cid->data85 = data85; | |
| 1037 cid->nbytes85 = nbytes85; | |
| 1038 } | |
| 1039 cid->type = L_JPEG_ENCODE; | |
| 1040 cid->nbytescomp = nbytes; | |
| 1041 cid->w = w; | |
| 1042 cid->h = h; | |
| 1043 cid->bps = bps; | |
| 1044 cid->spp = spp; | |
| 1045 cid->res = xres; | |
| 1046 return cid; | |
| 1047 } | |
| 1048 | |
| 1049 | |
| 1050 /*! | |
| 1051 * \brief l_generateJp2kData() | |
| 1052 * | |
| 1053 * \param[in] fname of jp2k file | |
| 1054 * \return cid containing jp2k data, or NULL on error | |
| 1055 * | |
| 1056 * <pre> | |
| 1057 * Notes: | |
| 1058 * (1) This is only called after the file is verified to be jp2k. | |
| 1059 * </pre> | |
| 1060 */ | |
| 1061 static L_COMP_DATA * | |
| 1062 l_generateJp2kData(const char *fname) | |
| 1063 { | |
| 1064 l_int32 w, h, bps, spp, xres, yres; | |
| 1065 size_t nbytes; | |
| 1066 L_COMP_DATA *cid; | |
| 1067 FILE *fp; | |
| 1068 | |
| 1069 if (!fname) | |
| 1070 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); | |
| 1071 | |
| 1072 if (readHeaderJp2k(fname, &w, &h, &bps, &spp, NULL)) | |
| 1073 return (L_COMP_DATA *)ERROR_PTR("bad jp2k metadata", __func__, NULL); | |
| 1074 | |
| 1075 /* The returned jp2k data in memory is the entire jp2k file */ | |
| 1076 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); | |
| 1077 if ((cid->datacomp = l_binaryRead(fname, &nbytes)) == NULL) { | |
| 1078 l_CIDataDestroy(&cid); | |
| 1079 return (L_COMP_DATA *)ERROR_PTR("data not extracted", __func__, NULL); | |
| 1080 } | |
| 1081 | |
| 1082 xres = yres = 0; | |
| 1083 if ((fp = fopenReadStream(fname)) != NULL) { | |
| 1084 fgetJp2kResolution(fp, &xres, &yres); | |
| 1085 fclose(fp); | |
| 1086 } | |
| 1087 cid->type = L_JP2K_ENCODE; | |
| 1088 cid->nbytescomp = nbytes; | |
| 1089 cid->w = w; | |
| 1090 cid->h = h; | |
| 1091 cid->bps = bps; | |
| 1092 cid->spp = spp; | |
| 1093 cid->res = xres; | |
| 1094 return cid; | |
| 1095 } | |
| 1096 | |
| 1097 | |
| 1098 /*! | |
| 1099 * \brief l_generateG4Data() | |
| 1100 * | |
| 1101 * \param[in] fname of g4 compressed file | |
| 1102 * \param[in] ascii85flag 0 for g4 compressed; 1 for ascii85-encoded g4 | |
| 1103 * \return cid g4 compressed image data, or NULL on error | |
| 1104 * | |
| 1105 * <pre> | |
| 1106 * Notes: | |
| 1107 * (1) Set ascii85flag: | |
| 1108 * ~ 0 for binary data (PDF only) | |
| 1109 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) | |
| 1110 * (2) This does not work for multipage tiff files. | |
| 1111 * </pre> | |
| 1112 */ | |
| 1113 L_COMP_DATA * | |
| 1114 l_generateG4Data(const char *fname, | |
| 1115 l_int32 ascii85flag) | |
| 1116 { | |
| 1117 l_uint8 *datacomp = NULL; /* g4 compressed raster data */ | |
| 1118 char *data85 = NULL; /* ascii85 encoded g4 compressed data */ | |
| 1119 l_int32 w, h, xres, yres, npages; | |
| 1120 l_int32 minisblack; /* TRUE or FALSE */ | |
| 1121 size_t nbytes85, nbytescomp; | |
| 1122 L_COMP_DATA *cid; | |
| 1123 FILE *fp; | |
| 1124 | |
| 1125 if (!fname) | |
| 1126 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); | |
| 1127 | |
| 1128 /* Make sure this is a single page tiff file */ | |
| 1129 if ((fp = fopenReadStream(fname)) == NULL) | |
| 1130 return (L_COMP_DATA *)ERROR_PTR_1("stream not opened", | |
| 1131 fname, __func__, NULL); | |
| 1132 tiffGetCount(fp, &npages); | |
| 1133 fclose(fp); | |
| 1134 if (npages != 1) { | |
| 1135 L_ERROR(" %d page tiff; only works with 1 page (file: %s)\n", __func__, npages, fname); | |
| 1136 return NULL; | |
| 1137 } | |
| 1138 | |
| 1139 /* Read the resolution */ | |
| 1140 if ((fp = fopenReadStream(fname)) == NULL) | |
| 1141 return (L_COMP_DATA *)ERROR_PTR_1("stream not opened", | |
| 1142 fname, __func__, NULL); | |
| 1143 getTiffResolution(fp, &xres, &yres); | |
| 1144 fclose(fp); | |
| 1145 | |
| 1146 /* The returned ccitt g4 data in memory is the block of | |
| 1147 * bytes in the tiff file, starting after 8 bytes and | |
| 1148 * ending before the directory. */ | |
| 1149 if (extractG4DataFromFile(fname, &datacomp, &nbytescomp, | |
| 1150 &w, &h, &minisblack)) { | |
| 1151 return (L_COMP_DATA *)ERROR_PTR_1("datacomp not extracted", | |
| 1152 fname, __func__, NULL); | |
| 1153 } | |
| 1154 | |
| 1155 /* Optionally, encode the compressed data */ | |
| 1156 if (ascii85flag == 1) { | |
| 1157 data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85); | |
| 1158 LEPT_FREE(datacomp); | |
| 1159 if (!data85) | |
| 1160 return (L_COMP_DATA *)ERROR_PTR_1("data85 not made", | |
| 1161 fname, __func__, NULL); | |
| 1162 else | |
| 1163 data85[nbytes85 - 1] = '\0'; /* remove the newline */ | |
| 1164 } | |
| 1165 | |
| 1166 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); | |
| 1167 if (ascii85flag == 0) { | |
| 1168 cid->datacomp = datacomp; | |
| 1169 } else { /* ascii85 */ | |
| 1170 cid->data85 = data85; | |
| 1171 cid->nbytes85 = nbytes85; | |
| 1172 } | |
| 1173 cid->type = L_G4_ENCODE; | |
| 1174 cid->nbytescomp = nbytescomp; | |
| 1175 cid->w = w; | |
| 1176 cid->h = h; | |
| 1177 cid->bps = 1; | |
| 1178 cid->spp = 1; | |
| 1179 cid->minisblack = minisblack; | |
| 1180 cid->res = xres; | |
| 1181 return cid; | |
| 1182 } | |
| 1183 | |
| 1184 | |
| 1185 /*! | |
| 1186 * \brief pixGenerateCIData() | |
| 1187 * | |
| 1188 * \param[in] pixs 8 or 32 bpp, no colormap | |
| 1189 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE or | |
| 1190 * L_JP2K_ENCODE | |
| 1191 * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75) | |
| 1192 * for jp2k if transcoded: 27-45; 0 for default (34) | |
| 1193 * \param[in] ascii85 0 for binary; 1 for ascii85-encoded | |
| 1194 * \param[out] pcid compressed data | |
| 1195 * \return 0 if OK, 1 on error | |
| 1196 * | |
| 1197 * <pre> | |
| 1198 * Notes: | |
| 1199 * (1) Set ascii85: | |
| 1200 * ~ 0 for binary data (PDF only) | |
| 1201 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) | |
| 1202 * (2) Do not accept images with an asperity ratio greater than 10. | |
| 1203 * </pre> | |
| 1204 */ | |
| 1205 l_ok | |
| 1206 pixGenerateCIData(PIX *pixs, | |
| 1207 l_int32 type, | |
| 1208 l_int32 quality, | |
| 1209 l_int32 ascii85, | |
| 1210 L_COMP_DATA **pcid) | |
| 1211 { | |
| 1212 l_int32 w, h, d, maxAsp; | |
| 1213 PIXCMAP *cmap; | |
| 1214 | |
| 1215 if (!pcid) | |
| 1216 return ERROR_INT("&cid not defined", __func__, 1); | |
| 1217 *pcid = NULL; | |
| 1218 if (!pixs) | |
| 1219 return ERROR_INT("pixs not defined", __func__, 1); | |
| 1220 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && | |
| 1221 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) { | |
| 1222 selectDefaultPdfEncoding(pixs, &type); | |
| 1223 } | |
| 1224 if (ascii85 != 0 && ascii85 != 1) | |
| 1225 return ERROR_INT("invalid ascii85", __func__, 1); | |
| 1226 pixGetDimensions(pixs, &w, &h, NULL); | |
| 1227 if (w == 0 || h == 0) | |
| 1228 return ERROR_INT("invalid w or h", __func__, 1); | |
| 1229 maxAsp = L_MAX(w / h, h / w); | |
| 1230 if (maxAsp > 10) | |
| 1231 return ERROR_INT("max asperity > 10", __func__, 1); | |
| 1232 | |
| 1233 /* Conditionally modify the encoding type if libz is | |
| 1234 * available and the requested library is missing. */ | |
| 1235 #if defined(HAVE_LIBZ) | |
| 1236 # if !defined(HAVE_LIBJPEG) | |
| 1237 if (type == L_JPEG_ENCODE) { | |
| 1238 L_WARNING("no libjpeg; using flate encoding\n", __func__); | |
| 1239 type = L_FLATE_ENCODE; | |
| 1240 } | |
| 1241 # endif /* !defined(HAVE_LIBJPEG) */ | |
| 1242 # if !defined(HAVE_LIBJP2K) | |
| 1243 if (type == L_JP2K_ENCODE) { | |
| 1244 L_WARNING("no libjp2k; using flate encoding\n", __func__); | |
| 1245 type = L_FLATE_ENCODE; | |
| 1246 } | |
| 1247 # endif /* !defined(HAVE_LIBJP2K) */ | |
| 1248 # if !defined(HAVE_LIBTIFF) | |
| 1249 if (type == L_G4_ENCODE) { | |
| 1250 L_WARNING("no libtiff; using flate encoding\n", __func__); | |
| 1251 type = L_FLATE_ENCODE; | |
| 1252 } | |
| 1253 # endif /* !defined(HAVE_LIBTIFF) */ | |
| 1254 #endif /* defined(HAVE_LIBZ) */ | |
| 1255 | |
| 1256 /* Sanity check on requested encoding */ | |
| 1257 d = pixGetDepth(pixs); | |
| 1258 cmap = pixGetColormap(pixs); | |
| 1259 if (cmap && type != L_FLATE_ENCODE) { | |
| 1260 L_WARNING("pixs has cmap; using flate encoding\n", __func__); | |
| 1261 type = L_FLATE_ENCODE; | |
| 1262 } else if (d < 8 && (type == L_JPEG_ENCODE || type == L_JP2K_ENCODE)) { | |
| 1263 L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__); | |
| 1264 type = L_FLATE_ENCODE; | |
| 1265 } else if (d > 1 && type == L_G4_ENCODE) { | |
| 1266 L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__); | |
| 1267 type = L_FLATE_ENCODE; | |
| 1268 } | |
| 1269 | |
| 1270 if (type == L_JPEG_ENCODE) { | |
| 1271 if ((*pcid = pixGenerateJpegData(pixs, ascii85, quality)) == NULL) | |
| 1272 return ERROR_INT("jpeg data not made", __func__, 1); | |
| 1273 } else if (type == L_JP2K_ENCODE) { | |
| 1274 if ((*pcid = pixGenerateJp2kData(pixs, quality)) == NULL) | |
| 1275 return ERROR_INT("jp2k data not made", __func__, 1); | |
| 1276 } else if (type == L_G4_ENCODE) { | |
| 1277 if ((*pcid = pixGenerateG4Data(pixs, ascii85)) == NULL) | |
| 1278 return ERROR_INT("g4 data not made", __func__, 1); | |
| 1279 } else { /* type == L_FLATE_ENCODE */ | |
| 1280 if ((*pcid = pixGenerateFlateData(pixs, ascii85)) == NULL) | |
| 1281 return ERROR_INT("flate data not made", __func__, 1); | |
| 1282 } | |
| 1283 return 0; | |
| 1284 } | |
| 1285 | |
| 1286 | |
| 1287 /*! | |
| 1288 * \brief l_generateFlateData() | |
| 1289 * | |
| 1290 * \param[in] fname | |
| 1291 * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped | |
| 1292 * \return cid flate compressed image data, or NULL on error | |
| 1293 * | |
| 1294 * <pre> | |
| 1295 * Notes: | |
| 1296 * (1) The input image is converted to one of these 4 types: | |
| 1297 * ~ 1 bpp | |
| 1298 * ~ 8 bpp, no colormap | |
| 1299 * ~ 8 bpp, colormap | |
| 1300 * ~ 32 bpp rgb | |
| 1301 * (2) Set ascii85flag: | |
| 1302 * ~ 0 for binary data (PDF only) | |
| 1303 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) | |
| 1304 * (3) Always transcodes (i.e., first decodes the png file) | |
| 1305 * </pre> | |
| 1306 */ | |
| 1307 L_COMP_DATA * | |
| 1308 l_generateFlateData(const char *fname, | |
| 1309 l_int32 ascii85flag) | |
| 1310 { | |
| 1311 L_COMP_DATA *cid; | |
| 1312 PIX *pixs; | |
| 1313 | |
| 1314 if (!fname) | |
| 1315 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); | |
| 1316 | |
| 1317 if ((pixs = pixRead(fname)) == NULL) | |
| 1318 return (L_COMP_DATA *)ERROR_PTR("pixs not made", __func__, NULL); | |
| 1319 cid = pixGenerateFlateData(pixs, ascii85flag); | |
| 1320 pixDestroy(&pixs); | |
| 1321 return cid; | |
| 1322 } | |
| 1323 | |
| 1324 | |
| 1325 /*! | |
| 1326 * \brief pixGenerateFlateData() | |
| 1327 * | |
| 1328 * \param[in] pixs | |
| 1329 * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped | |
| 1330 * \return cid flate compressed image data, or NULL on error | |
| 1331 * | |
| 1332 * <pre> | |
| 1333 * Notes: | |
| 1334 * (1) If called with an RGBA pix (spp == 4), the alpha channel | |
| 1335 * will be removed, projecting a white backgrouond through | |
| 1336 * any transparency. | |
| 1337 * (2) If called with a colormapped pix, any transparency in the | |
| 1338 * alpha component in the colormap will be ignored, as it is | |
| 1339 * for all leptonica operations on colormapped pix. | |
| 1340 * </pre> | |
| 1341 */ | |
| 1342 static L_COMP_DATA * | |
| 1343 pixGenerateFlateData(PIX *pixs, | |
| 1344 l_int32 ascii85flag) | |
| 1345 { | |
| 1346 l_uint8 *data = NULL; /* uncompressed raster data in required format */ | |
| 1347 l_uint8 *datacomp = NULL; /* gzipped raster data */ | |
| 1348 char *data85 = NULL; /* ascii85 encoded gzipped raster data */ | |
| 1349 l_uint8 *cmapdata = NULL; /* uncompressed colormap */ | |
| 1350 char *cmapdata85 = NULL; /* ascii85 encoded uncompressed colormap */ | |
| 1351 char *cmapdatahex = NULL; /* hex ascii uncompressed colormap */ | |
| 1352 l_int32 ncolors; /* in colormap; not used if cmapdata85 is null */ | |
| 1353 l_int32 bps; /* bits/sample: usually 8 */ | |
| 1354 l_int32 spp; /* samples/pixel: 1-grayscale/cmap); 3-rgb */ | |
| 1355 l_int32 w, h, d, cmapflag; | |
| 1356 size_t ncmapbytes85 = 0; | |
| 1357 size_t nbytes85 = 0; | |
| 1358 size_t nbytes, nbytescomp; | |
| 1359 L_COMP_DATA *cid; | |
| 1360 PIX *pixt; | |
| 1361 PIXCMAP *cmap; | |
| 1362 | |
| 1363 if (!pixs) | |
| 1364 return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL); | |
| 1365 | |
| 1366 /* Convert the image to one of these 4 types: | |
| 1367 * 1 bpp | |
| 1368 * 8 bpp, no colormap | |
| 1369 * 8 bpp, colormap | |
| 1370 * 32 bpp rgb */ | |
| 1371 pixGetDimensions(pixs, &w, &h, &d); | |
| 1372 cmap = pixGetColormap(pixs); | |
| 1373 cmapflag = (cmap) ? 1 : 0; | |
| 1374 if (d == 2 || d == 4 || d == 16) { | |
| 1375 pixt = pixConvertTo8(pixs, cmapflag); | |
| 1376 cmap = pixGetColormap(pixt); | |
| 1377 d = pixGetDepth(pixt); | |
| 1378 } else if (d == 32 && pixGetSpp(pixs) == 4) { /* remove alpha */ | |
| 1379 pixt = pixAlphaBlendUniform(pixs, 0xffffff00); | |
| 1380 } else { | |
| 1381 pixt = pixClone(pixs); | |
| 1382 } | |
| 1383 if (!pixt) | |
| 1384 return (L_COMP_DATA *)ERROR_PTR("pixt not made", __func__, NULL); | |
| 1385 spp = (d == 32) ? 3 : 1; | |
| 1386 bps = (d == 32) ? 8 : d; | |
| 1387 | |
| 1388 /* Extract and encode the colormap data as both ascii85 and hexascii */ | |
| 1389 ncolors = 0; | |
| 1390 if (cmap) { | |
| 1391 pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata); | |
| 1392 if (!cmapdata) { | |
| 1393 pixDestroy(&pixt); | |
| 1394 return (L_COMP_DATA *)ERROR_PTR("cmapdata not made", | |
| 1395 __func__, NULL); | |
| 1396 } | |
| 1397 | |
| 1398 cmapdata85 = encodeAscii85(cmapdata, 3 * ncolors, &ncmapbytes85); | |
| 1399 cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors); | |
| 1400 LEPT_FREE(cmapdata); | |
| 1401 } | |
| 1402 | |
| 1403 /* Extract and compress the raster data */ | |
| 1404 pixGetRasterData(pixt, &data, &nbytes); | |
| 1405 pixDestroy(&pixt); | |
| 1406 if (!data) { | |
| 1407 LEPT_FREE(cmapdata85); | |
| 1408 LEPT_FREE(cmapdatahex); | |
| 1409 return (L_COMP_DATA *)ERROR_PTR("data not returned", __func__, NULL); | |
| 1410 } | |
| 1411 datacomp = zlibCompress(data, nbytes, &nbytescomp); | |
| 1412 LEPT_FREE(data); | |
| 1413 if (!datacomp) { | |
| 1414 LEPT_FREE(cmapdata85); | |
| 1415 LEPT_FREE(cmapdatahex); | |
| 1416 return (L_COMP_DATA *)ERROR_PTR("datacomp not made", __func__, NULL); | |
| 1417 } | |
| 1418 | |
| 1419 /* Optionally, encode the compressed data */ | |
| 1420 if (ascii85flag == 1) { | |
| 1421 data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85); | |
| 1422 LEPT_FREE(datacomp); | |
| 1423 if (!data85) { | |
| 1424 LEPT_FREE(cmapdata85); | |
| 1425 LEPT_FREE(cmapdatahex); | |
| 1426 return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL); | |
| 1427 } else { | |
| 1428 data85[nbytes85 - 1] = '\0'; /* remove the newline */ | |
| 1429 } | |
| 1430 } | |
| 1431 | |
| 1432 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); | |
| 1433 if (ascii85flag == 0) { | |
| 1434 cid->datacomp = datacomp; | |
| 1435 } else { /* ascii85 */ | |
| 1436 cid->data85 = data85; | |
| 1437 cid->nbytes85 = nbytes85; | |
| 1438 } | |
| 1439 cid->type = L_FLATE_ENCODE; | |
| 1440 cid->cmapdatahex = cmapdatahex; | |
| 1441 cid->cmapdata85 = cmapdata85; | |
| 1442 cid->nbytescomp = nbytescomp; | |
| 1443 cid->ncolors = ncolors; | |
| 1444 cid->w = w; | |
| 1445 cid->h = h; | |
| 1446 cid->bps = bps; | |
| 1447 cid->spp = spp; | |
| 1448 cid->res = pixGetXRes(pixs); | |
| 1449 cid->nbytes = nbytes; /* only for debugging */ | |
| 1450 return cid; | |
| 1451 } | |
| 1452 | |
| 1453 | |
| 1454 /*! | |
| 1455 * \brief pixGenerateJpegData() | |
| 1456 * | |
| 1457 * \param[in] pixs 8, 16 or 32 bpp, no colormap | |
| 1458 * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg | |
| 1459 * \param[in] quality 0 for default, which is 75 | |
| 1460 * \return cid jpeg compressed data, or NULL on error | |
| 1461 * | |
| 1462 * <pre> | |
| 1463 * Notes: | |
| 1464 * (1) Set ascii85flag: | |
| 1465 * ~ 0 for binary data (PDF only) | |
| 1466 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) | |
| 1467 * (2) If 16 bpp, convert first to 8 bpp, using the MSB | |
| 1468 * </pre> | |
| 1469 */ | |
| 1470 static L_COMP_DATA * | |
| 1471 pixGenerateJpegData(PIX *pixs, | |
| 1472 l_int32 ascii85flag, | |
| 1473 l_int32 quality) | |
| 1474 { | |
| 1475 l_int32 d; | |
| 1476 char *fname; | |
| 1477 L_COMP_DATA *cid; | |
| 1478 | |
| 1479 if (!pixs) | |
| 1480 return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL); | |
| 1481 if (pixGetColormap(pixs)) | |
| 1482 return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL); | |
| 1483 d = pixGetDepth(pixs); | |
| 1484 if (d != 8 && d != 16 && d != 32) | |
| 1485 return (L_COMP_DATA *)ERROR_PTR("pixs not 8, 16 or 32 bpp", | |
| 1486 __func__, NULL); | |
| 1487 | |
| 1488 /* Compress to a temp jpeg file */ | |
| 1489 fname = l_makeTempFilename(); | |
| 1490 if (pixWriteJpeg(fname, pixs, quality, 0)) { | |
| 1491 LEPT_FREE(fname); | |
| 1492 return NULL; | |
| 1493 } | |
| 1494 | |
| 1495 /* Generate the data */ | |
| 1496 cid = l_generateJpegData(fname, ascii85flag); | |
| 1497 if (lept_rmfile(fname) != 0) | |
| 1498 L_ERROR("temp file %s was not deleted\n", __func__, fname); | |
| 1499 LEPT_FREE(fname); | |
| 1500 return cid; | |
| 1501 } | |
| 1502 | |
| 1503 | |
| 1504 /*! | |
| 1505 * \brief pixGenerateJp2kData() | |
| 1506 * | |
| 1507 * \param[in] pixs 8 or 32 bpp, no colormap | |
| 1508 * \param[in] quality 0 for default, which is 34 | |
| 1509 * \return cid jp2k compressed data, or NULL on error | |
| 1510 * | |
| 1511 * <pre> | |
| 1512 * Notes: | |
| 1513 * (1) The quality can be set between 27 (very poor) and 45 | |
| 1514 * (nearly perfect). Use 0 for default (34). Use 100 for lossless, | |
| 1515 * but this is very expensive and not recommended. | |
| 1516 * </pre> | |
| 1517 */ | |
| 1518 static L_COMP_DATA * | |
| 1519 pixGenerateJp2kData(PIX *pixs, | |
| 1520 l_int32 quality) | |
| 1521 { | |
| 1522 l_int32 d; | |
| 1523 char *fname; | |
| 1524 L_COMP_DATA *cid; | |
| 1525 | |
| 1526 if (!pixs) | |
| 1527 return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL); | |
| 1528 if (pixGetColormap(pixs)) | |
| 1529 return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL); | |
| 1530 d = pixGetDepth(pixs); | |
| 1531 if (d != 8 && d != 32) | |
| 1532 return (L_COMP_DATA *)ERROR_PTR("pixs not 8 or 32 bpp", __func__, NULL); | |
| 1533 | |
| 1534 /* Compress to a temp jp2k file */ | |
| 1535 fname = l_makeTempFilename(); | |
| 1536 if (pixWriteJp2k(fname, pixs, quality, 5, 0, 0)) { | |
| 1537 LEPT_FREE(fname); | |
| 1538 return NULL; | |
| 1539 } | |
| 1540 | |
| 1541 /* Generate the data */ | |
| 1542 cid = l_generateJp2kData(fname); | |
| 1543 if (lept_rmfile(fname) != 0) | |
| 1544 L_ERROR("temp file %s was not deleted\n", __func__, fname); | |
| 1545 LEPT_FREE(fname); | |
| 1546 return cid; | |
| 1547 } | |
| 1548 | |
| 1549 | |
| 1550 /*! | |
| 1551 * \brief pixGenerateG4Data() | |
| 1552 * | |
| 1553 * \param[in] pixs 1 bpp, no colormap | |
| 1554 * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped | |
| 1555 * \return cid g4 compressed image data, or NULL on error | |
| 1556 * | |
| 1557 * <pre> | |
| 1558 * Notes: | |
| 1559 * (1) Set ascii85flag: | |
| 1560 * ~ 0 for binary data (PDF only) | |
| 1561 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) | |
| 1562 * </pre> | |
| 1563 */ | |
| 1564 static L_COMP_DATA * | |
| 1565 pixGenerateG4Data(PIX *pixs, | |
| 1566 l_int32 ascii85flag) | |
| 1567 { | |
| 1568 char *fname; | |
| 1569 L_COMP_DATA *cid; | |
| 1570 | |
| 1571 if (!pixs) | |
| 1572 return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL); | |
| 1573 if (pixGetDepth(pixs) != 1) | |
| 1574 return (L_COMP_DATA *)ERROR_PTR("pixs not 1 bpp", __func__, NULL); | |
| 1575 if (pixGetColormap(pixs)) | |
| 1576 return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL); | |
| 1577 | |
| 1578 /* Compress to a temp tiff g4 file */ | |
| 1579 fname = l_makeTempFilename(); | |
| 1580 if (pixWrite(fname, pixs, IFF_TIFF_G4)) { | |
| 1581 LEPT_FREE(fname); | |
| 1582 return NULL; | |
| 1583 } | |
| 1584 | |
| 1585 cid = l_generateG4Data(fname, ascii85flag); | |
| 1586 if (lept_rmfile(fname) != 0) | |
| 1587 L_ERROR("temp file %s was not deleted\n", __func__, fname); | |
| 1588 LEPT_FREE(fname); | |
| 1589 return cid; | |
| 1590 } | |
| 1591 | |
| 1592 | |
| 1593 /*! | |
| 1594 * \brief cidConvertToPdfData() | |
| 1595 * | |
| 1596 * \param[in] cid compressed image data | |
| 1597 * \param[in] title [optional] pdf title; can be null | |
| 1598 * \param[out] pdata output pdf data for image | |
| 1599 * \param[out] pnbytes size of output pdf data | |
| 1600 * \return 0 if OK, 1 on error | |
| 1601 * | |
| 1602 * <pre> | |
| 1603 * Notes: | |
| 1604 * (1) Caller must not destroy the cid. It is absorbed in the | |
| 1605 * lpd and destroyed by this function. | |
| 1606 * </pre> | |
| 1607 */ | |
| 1608 l_ok | |
| 1609 cidConvertToPdfData(L_COMP_DATA *cid, | |
| 1610 const char *title, | |
| 1611 l_uint8 **pdata, | |
| 1612 size_t *pnbytes) | |
| 1613 { | |
| 1614 l_int32 res, ret; | |
| 1615 l_float32 wpt, hpt; | |
| 1616 L_PDF_DATA *lpd = NULL; | |
| 1617 | |
| 1618 if (!pdata || !pnbytes) | |
| 1619 return ERROR_INT("&data and &nbytes not both defined", __func__, 1); | |
| 1620 *pdata = NULL; | |
| 1621 *pnbytes = 0; | |
| 1622 if (!cid) | |
| 1623 return ERROR_INT("cid not defined", __func__, 1); | |
| 1624 | |
| 1625 /* Get media box parameters, in pts */ | |
| 1626 res = cid->res; | |
| 1627 if (res <= 0) | |
| 1628 res = DefaultInputRes; | |
| 1629 wpt = cid->w * 72.f / res; | |
| 1630 hpt = cid->h * 72.f / res; | |
| 1631 | |
| 1632 /* Set up the pdf data struct (lpd) */ | |
| 1633 if ((lpd = pdfdataCreate(title)) == NULL) | |
| 1634 return ERROR_INT("lpd not made", __func__, 1); | |
| 1635 ptraAdd(lpd->cida, cid); | |
| 1636 lpd->n++; | |
| 1637 ptaAddPt(lpd->xy, 0, 0); /* xpt = ypt = 0 */ | |
| 1638 ptaAddPt(lpd->wh, wpt, hpt); | |
| 1639 | |
| 1640 /* Generate the pdf string and destroy the lpd */ | |
| 1641 ret = l_generatePdf(pdata, pnbytes, lpd); | |
| 1642 pdfdataDestroy(&lpd); | |
| 1643 if (ret) | |
| 1644 return ERROR_INT("pdf output not made", __func__, 1); | |
| 1645 return 0; | |
| 1646 } | |
| 1647 | |
| 1648 | |
| 1649 /*! | |
| 1650 * \brief l_CIDataDestroy() | |
| 1651 * | |
| 1652 * \param[in,out] pcid will be set to null before returning | |
| 1653 * \return void | |
| 1654 */ | |
| 1655 void | |
| 1656 l_CIDataDestroy(L_COMP_DATA **pcid) | |
| 1657 { | |
| 1658 L_COMP_DATA *cid; | |
| 1659 | |
| 1660 if (pcid == NULL) { | |
| 1661 L_WARNING("ptr address is null!\n", __func__); | |
| 1662 return; | |
| 1663 } | |
| 1664 if ((cid = *pcid) == NULL) | |
| 1665 return; | |
| 1666 | |
| 1667 if (cid->datacomp) LEPT_FREE(cid->datacomp); | |
| 1668 if (cid->data85) LEPT_FREE(cid->data85); | |
| 1669 if (cid->cmapdata85) LEPT_FREE(cid->cmapdata85); | |
| 1670 if (cid->cmapdatahex) LEPT_FREE(cid->cmapdatahex); | |
| 1671 LEPT_FREE(cid); | |
| 1672 *pcid = NULL; | |
| 1673 } | |
| 1674 | |
| 1675 | |
| 1676 /*---------------------------------------------------------------------* | |
| 1677 * Helper functions for generating the output pdf string * | |
| 1678 *---------------------------------------------------------------------*/ | |
| 1679 /*! | |
| 1680 * \brief l_generatePdf() | |
| 1681 * | |
| 1682 * \param[out] pdata pdf array | |
| 1683 * \param[out] pnbytes number of bytes in pdf array | |
| 1684 * \param[in] lpd all the required input image data | |
| 1685 * \return 0 if OK, 1 on error | |
| 1686 * | |
| 1687 * <pre> | |
| 1688 * Notes: | |
| 1689 * (1) On error, no data is returned. | |
| 1690 * (2) The objects are: | |
| 1691 * 1: Catalog | |
| 1692 * 2: Info | |
| 1693 * 3: Pages | |
| 1694 * 4: Page | |
| 1695 * 5: Contents (rendering command) | |
| 1696 * 6 to 6+n-1: n XObjects | |
| 1697 * 6+n to 6+n+m-1: m colormaps | |
| 1698 * </pre> | |
| 1699 */ | |
| 1700 static l_int32 | |
| 1701 l_generatePdf(l_uint8 **pdata, | |
| 1702 size_t *pnbytes, | |
| 1703 L_PDF_DATA *lpd) | |
| 1704 { | |
| 1705 if (!pdata) | |
| 1706 return ERROR_INT("&data not defined", __func__, 1); | |
| 1707 *pdata = NULL; | |
| 1708 if (!pnbytes) | |
| 1709 return ERROR_INT("&nbytes not defined", __func__, 1); | |
| 1710 *pnbytes = 0; | |
| 1711 if (!lpd) | |
| 1712 return ERROR_INT("lpd not defined", __func__, 1); | |
| 1713 | |
| 1714 generateFixedStringsPdf(lpd); | |
| 1715 generateMediaboxPdf(lpd); | |
| 1716 generatePageStringPdf(lpd); | |
| 1717 generateContentStringPdf(lpd); | |
| 1718 generatePreXStringsPdf(lpd); | |
| 1719 generateColormapStringsPdf(lpd); | |
| 1720 generateTrailerPdf(lpd); | |
| 1721 return generateOutputDataPdf(pdata, pnbytes, lpd); | |
| 1722 } | |
| 1723 | |
| 1724 | |
| 1725 static void | |
| 1726 generateFixedStringsPdf(L_PDF_DATA *lpd) | |
| 1727 { | |
| 1728 char buf[L_SMALLBUF]; | |
| 1729 char *version, *datestr; | |
| 1730 SARRAY *sa; | |
| 1731 | |
| 1732 /* Accumulate data for the header and objects 1-3 */ | |
| 1733 lpd->id = stringNew("%PDF-1.5\n"); | |
| 1734 l_dnaAddNumber(lpd->objsize, strlen(lpd->id)); | |
| 1735 | |
| 1736 lpd->obj1 = stringNew("1 0 obj\n" | |
| 1737 "<<\n" | |
| 1738 "/Type /Catalog\n" | |
| 1739 "/Pages 3 0 R\n" | |
| 1740 ">>\n" | |
| 1741 "endobj\n"); | |
| 1742 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj1)); | |
| 1743 | |
| 1744 sa = sarrayCreate(0); | |
| 1745 sarrayAddString(sa, "2 0 obj\n" | |
| 1746 "<<\n", L_COPY); | |
| 1747 if (var_WRITE_DATE_AND_VERSION) { | |
| 1748 datestr = l_getFormattedDate(); | |
| 1749 snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr); | |
| 1750 sarrayAddString(sa, buf, L_COPY); | |
| 1751 LEPT_FREE(datestr); | |
| 1752 version = getLeptonicaVersion(); | |
| 1753 snprintf(buf, sizeof(buf), | |
| 1754 "/Producer (leptonica: %s)\n", version); | |
| 1755 LEPT_FREE(version); | |
| 1756 } else { | |
| 1757 snprintf(buf, sizeof(buf), "/Producer (leptonica)\n"); | |
| 1758 } | |
| 1759 sarrayAddString(sa, buf, L_COPY); | |
| 1760 if (lpd->title) { | |
| 1761 char *hexstr; | |
| 1762 if ((hexstr = generateEscapeString(lpd->title)) != NULL) { | |
| 1763 snprintf(buf, sizeof(buf), "/Title %s\n", hexstr); | |
| 1764 sarrayAddString(sa, buf, L_COPY); | |
| 1765 } else { | |
| 1766 L_ERROR("title string is not ascii\n", __func__); | |
| 1767 } | |
| 1768 LEPT_FREE(hexstr); | |
| 1769 } | |
| 1770 sarrayAddString(sa, ">>\n" | |
| 1771 "endobj\n", L_COPY); | |
| 1772 lpd->obj2 = sarrayToString(sa, 0); | |
| 1773 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj2)); | |
| 1774 sarrayDestroy(&sa); | |
| 1775 | |
| 1776 lpd->obj3 = stringNew("3 0 obj\n" | |
| 1777 "<<\n" | |
| 1778 "/Type /Pages\n" | |
| 1779 "/Kids [ 4 0 R ]\n" | |
| 1780 "/Count 1\n" | |
| 1781 ">>\n"); | |
| 1782 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj3)); | |
| 1783 | |
| 1784 /* Do the post-datastream string */ | |
| 1785 lpd->poststream = stringNew("\n" | |
| 1786 "endstream\n" | |
| 1787 "endobj\n"); | |
| 1788 } | |
| 1789 | |
| 1790 | |
/*!
 * \brief   generateEscapeString()
 *
 * \param[in]    str   input string
 * \return  hex escape string, or null on error
 *
 * <pre>
 * Notes:
 *      (1) If the input string is not ascii, returns null.
 *      (2) This takes an input ascii string and generates a pdf hex
 *          string, delimited by '<' and '>', with 4 hex digits out for
 *          each byte in.  The leading "feff" is the byte order mark
 *          that tells the pdf interpreter the text is UTF-16,
 *          big-endian.  Each ascii character is therefore written as
 *          one 16-bit value whose high byte is 0 and whose low byte
 *          is less than 0x80.
 *      (3) The ascii test is made on the unsigned byte value, so it
 *          does not depend on whether plain char is signed.
 *      (4) The returned string is allocated here; the caller frees it.
 * </pre>
 */
static char  *
generateEscapeString(const char  *str)
{
    char    *buffer;
    size_t   i, nchar, buflen, pos;

    if (!str)
        return (char *)ERROR_PTR("str not defined", __func__, NULL);
    nchar = strlen(str);
    for (i = 0; i < nchar; i++) {
        if ((unsigned char)str[i] >= 0x80)
            return (char *)ERROR_PTR("str not all ascii", __func__, NULL);
    }

        /* Need 5 ("<feff") + 4 * nchar + 1 (">") + 1 (nul) bytes */
    buflen = 4 * nchar + 10;
    if ((buffer = (char *)LEPT_CALLOC(buflen, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("buffer not made", __func__, NULL);

        /* Write each piece at its known offset; this is linear in
         * the length of the input. */
    memcpy(buffer, "<feff", 5);
    pos = 5;
    for (i = 0; i < nchar; i++) {
        snprintf(buffer + pos, 5, "%04x",
                 (unsigned int)(unsigned char)str[i]);
        pos += 4;
    }
    buffer[pos] = '>';  /* the calloc'd buffer is already nul-terminated */
    return buffer;
}
| 1833 | |
| 1834 | |
| 1835 static void | |
| 1836 generateMediaboxPdf(L_PDF_DATA *lpd) | |
| 1837 { | |
| 1838 l_int32 i; | |
| 1839 l_float32 xpt, ypt, wpt, hpt, maxx, maxy; | |
| 1840 | |
| 1841 /* First get the full extent of all the images. | |
| 1842 * This is the mediabox, in pts. */ | |
| 1843 maxx = maxy = 0; | |
| 1844 for (i = 0; i < lpd->n; i++) { | |
| 1845 ptaGetPt(lpd->xy, i, &xpt, &ypt); | |
| 1846 ptaGetPt(lpd->wh, i, &wpt, &hpt); | |
| 1847 maxx = L_MAX(maxx, xpt + wpt); | |
| 1848 maxy = L_MAX(maxy, ypt + hpt); | |
| 1849 } | |
| 1850 | |
| 1851 lpd->mediabox = boxCreate(0, 0, (l_int32)(maxx + 0.5), | |
| 1852 (l_int32)(maxy + 0.5)); | |
| 1853 | |
| 1854 /* ypt is in standard image coordinates: the location of | |
| 1855 * the UL image corner with respect to the UL media box corner. | |
| 1856 * Rewrite each ypt for PostScript coordinates: the location of | |
| 1857 * the LL image corner with respect to the LL media box corner. */ | |
| 1858 for (i = 0; i < lpd->n; i++) { | |
| 1859 ptaGetPt(lpd->xy, i, &xpt, &ypt); | |
| 1860 ptaGetPt(lpd->wh, i, &wpt, &hpt); | |
| 1861 ptaSetPt(lpd->xy, i, xpt, maxy - ypt - hpt); | |
| 1862 } | |
| 1863 } | |
| 1864 | |
| 1865 | |
| 1866 static l_int32 | |
| 1867 generatePageStringPdf(L_PDF_DATA *lpd) | |
| 1868 { | |
| 1869 char *buf; | |
| 1870 char *xstr; | |
| 1871 l_int32 bufsize, i, wpt, hpt; | |
| 1872 SARRAY *sa; | |
| 1873 | |
| 1874 /* Allocate 1000 bytes for the boilerplate text, and | |
| 1875 * 50 bytes for each reference to an image in the | |
| 1876 * ProcSet array. */ | |
| 1877 bufsize = 1000 + 50 * lpd->n; | |
| 1878 if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL) | |
| 1879 return ERROR_INT("calloc fail for buf", __func__, 1); | |
| 1880 | |
| 1881 boxGetGeometry(lpd->mediabox, NULL, NULL, &wpt, &hpt); | |
| 1882 sa = sarrayCreate(lpd->n); | |
| 1883 for (i = 0; i < lpd->n; i++) { | |
| 1884 snprintf(buf, bufsize, "/Im%d %d 0 R ", i + 1, 6 + i); | |
| 1885 sarrayAddString(sa, buf, L_COPY); | |
| 1886 } | |
| 1887 xstr = sarrayToString(sa, 0); | |
| 1888 sarrayDestroy(&sa); | |
| 1889 if (!xstr) { | |
| 1890 LEPT_FREE(buf); | |
| 1891 return ERROR_INT("xstr not made", __func__, 1); | |
| 1892 } | |
| 1893 | |
| 1894 snprintf(buf, bufsize, "4 0 obj\n" | |
| 1895 "<<\n" | |
| 1896 "/Type /Page\n" | |
| 1897 "/Parent 3 0 R\n" | |
| 1898 "/MediaBox [%d %d %d %d]\n" | |
| 1899 "/Contents 5 0 R\n" | |
| 1900 "/Resources\n" | |
| 1901 "<<\n" | |
| 1902 "/XObject << %s >>\n" | |
| 1903 "/ProcSet [ /ImageB /ImageI /ImageC ]\n" | |
| 1904 ">>\n" | |
| 1905 ">>\n" | |
| 1906 "endobj\n", | |
| 1907 0, 0, wpt, hpt, xstr); | |
| 1908 | |
| 1909 lpd->obj4 = stringNew(buf); | |
| 1910 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj4)); | |
| 1911 sarrayDestroy(&sa); | |
| 1912 LEPT_FREE(buf); | |
| 1913 LEPT_FREE(xstr); | |
| 1914 return 0; | |
| 1915 } | |
| 1916 | |
| 1917 | |
| 1918 static l_int32 | |
| 1919 generateContentStringPdf(L_PDF_DATA *lpd) | |
| 1920 { | |
| 1921 char *buf; | |
| 1922 char *cstr; | |
| 1923 l_int32 i, bufsize; | |
| 1924 l_float32 xpt, ypt, wpt, hpt; | |
| 1925 SARRAY *sa; | |
| 1926 | |
| 1927 bufsize = 1000 + 200 * lpd->n; | |
| 1928 if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL) | |
| 1929 return ERROR_INT("calloc fail for buf", __func__, 1); | |
| 1930 | |
| 1931 sa = sarrayCreate(lpd->n); | |
| 1932 for (i = 0; i < lpd->n; i++) { | |
| 1933 ptaGetPt(lpd->xy, i, &xpt, &ypt); | |
| 1934 ptaGetPt(lpd->wh, i, &wpt, &hpt); | |
| 1935 snprintf(buf, bufsize, | |
| 1936 "q %.4f %.4f %.4f %.4f %.4f %.4f cm /Im%d Do Q\n", | |
| 1937 wpt, 0.0, 0.0, hpt, xpt, ypt, i + 1); | |
| 1938 sarrayAddString(sa, buf, L_COPY); | |
| 1939 } | |
| 1940 cstr = sarrayToString(sa, 0); | |
| 1941 sarrayDestroy(&sa); | |
| 1942 if (!cstr) { | |
| 1943 LEPT_FREE(buf); | |
| 1944 return ERROR_INT("cstr not made", __func__, 1); | |
| 1945 } | |
| 1946 | |
| 1947 snprintf(buf, bufsize, "5 0 obj\n" | |
| 1948 "<< /Length %d >>\n" | |
| 1949 "stream\n" | |
| 1950 "%s" | |
| 1951 "endstream\n" | |
| 1952 "endobj\n", | |
| 1953 (l_int32)strlen(cstr), cstr); | |
| 1954 | |
| 1955 lpd->obj5 = stringNew(buf); | |
| 1956 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj5)); | |
| 1957 sarrayDestroy(&sa); | |
| 1958 LEPT_FREE(buf); | |
| 1959 LEPT_FREE(cstr); | |
| 1960 return 0; | |
| 1961 } | |
| 1962 | |
| 1963 | |
| 1964 static l_int32 | |
| 1965 generatePreXStringsPdf(L_PDF_DATA *lpd) | |
| 1966 { | |
| 1967 char buff[256]; | |
| 1968 char buf[L_BIGBUF]; | |
| 1969 char *cstr, *bstr, *fstr, *pstr, *xstr, *photometry; | |
| 1970 l_int32 i, cmindex; | |
| 1971 L_COMP_DATA *cid; | |
| 1972 SARRAY *sa; | |
| 1973 | |
| 1974 sa = lpd->saprex; | |
| 1975 cmindex = 6 + lpd->n; /* starting value */ | |
| 1976 for (i = 0; i < lpd->n; i++) { | |
| 1977 pstr = cstr = NULL; | |
| 1978 if ((cid = pdfdataGetCid(lpd, i)) == NULL) | |
| 1979 return ERROR_INT("cid not found", __func__, 1); | |
| 1980 | |
| 1981 if (cid->type == L_G4_ENCODE) { | |
| 1982 if (var_WRITE_G4_IMAGE_MASK) { | |
| 1983 cstr = stringNew("/ImageMask true\n" | |
| 1984 "/ColorSpace /DeviceGray"); | |
| 1985 } else { | |
| 1986 cstr = stringNew("/ColorSpace /DeviceGray"); | |
| 1987 } | |
| 1988 bstr = stringNew("/BitsPerComponent 1\n" | |
| 1989 "/Interpolate true"); | |
| 1990 /* Note: the reversal is deliberate. The BlackIs1 flag | |
| 1991 * is misleadingly named: it says whether to invert the | |
| 1992 * image on decoding because the black pixels are 0, | |
| 1993 * not whether the black pixels are 1! The default for | |
| 1994 * BlackIs1 is "false", which means "don't invert because | |
| 1995 * black is 1." Yikes. */ | |
| 1996 photometry = (cid->minisblack) ? stringNew("true") | |
| 1997 : stringNew("false"); | |
| 1998 snprintf(buff, sizeof(buff), | |
| 1999 "/Filter /CCITTFaxDecode\n" | |
| 2000 "/DecodeParms\n" | |
| 2001 "<<\n" | |
| 2002 "/BlackIs1 %s\n" | |
| 2003 "/K -1\n" | |
| 2004 "/Columns %d\n" | |
| 2005 ">>", photometry, cid->w); | |
| 2006 fstr = stringNew(buff); | |
| 2007 LEPT_FREE(photometry); | |
| 2008 } else if (cid->type == L_JPEG_ENCODE) { | |
| 2009 if (cid->spp == 1) | |
| 2010 cstr = stringNew("/ColorSpace /DeviceGray"); | |
| 2011 else if (cid->spp == 3) | |
| 2012 cstr = stringNew("/ColorSpace /DeviceRGB"); | |
| 2013 else if (cid->spp == 4) /* pdf supports cmyk */ | |
| 2014 cstr = stringNew("/ColorSpace /DeviceCMYK"); | |
| 2015 else | |
| 2016 L_ERROR("in jpeg: spp != 1, 3 or 4\n", __func__); | |
| 2017 bstr = stringNew("/BitsPerComponent 8"); | |
| 2018 fstr = stringNew("/Filter /DCTDecode"); | |
| 2019 } else if (cid->type == L_JP2K_ENCODE) { | |
| 2020 if (cid->spp == 1) | |
| 2021 cstr = stringNew("/ColorSpace /DeviceGray"); | |
| 2022 else if (cid->spp == 3) | |
| 2023 cstr = stringNew("/ColorSpace /DeviceRGB"); | |
| 2024 else | |
| 2025 L_ERROR("in jp2k: spp != 1 && spp != 3\n", __func__); | |
| 2026 bstr = stringNew("/BitsPerComponent 8"); | |
| 2027 fstr = stringNew("/Filter /JPXDecode"); | |
| 2028 } else { /* type == L_FLATE_ENCODE */ | |
| 2029 if (cid->ncolors > 0) { /* cmapped */ | |
| 2030 snprintf(buff, sizeof(buff), "/ColorSpace %d 0 R", cmindex++); | |
| 2031 cstr = stringNew(buff); | |
| 2032 } else { | |
| 2033 if (cid->spp == 1 && cid->bps == 1) | |
| 2034 cstr = stringNew("/ColorSpace /DeviceGray\n" | |
| 2035 "/Decode [1 0]"); | |
| 2036 else if (cid->spp == 1) /* 8 bpp */ | |
| 2037 cstr = stringNew("/ColorSpace /DeviceGray"); | |
| 2038 else if (cid->spp == 3) | |
| 2039 cstr = stringNew("/ColorSpace /DeviceRGB"); | |
| 2040 else | |
| 2041 L_ERROR("unknown colorspace: spp = %d\n", | |
| 2042 __func__, cid->spp); | |
| 2043 } | |
| 2044 snprintf(buff, sizeof(buff), "/BitsPerComponent %d", cid->bps); | |
| 2045 bstr = stringNew(buff); | |
| 2046 fstr = stringNew("/Filter /FlateDecode"); | |
| 2047 if (cid->predictor == TRUE) { | |
| 2048 snprintf(buff, sizeof(buff), | |
| 2049 "/DecodeParms\n" | |
| 2050 "<<\n" | |
| 2051 " /Columns %d\n" | |
| 2052 " /Predictor 14\n" | |
| 2053 " /Colors %d\n" | |
| 2054 " /BitsPerComponent %d\n" | |
| 2055 ">>\n", cid->w, cid->spp, cid->bps); | |
| 2056 pstr = stringNew(buff); | |
| 2057 } | |
| 2058 } | |
| 2059 if (!pstr) /* no decode parameters */ | |
| 2060 pstr = stringNew(""); | |
| 2061 | |
| 2062 snprintf(buf, sizeof(buf), | |
| 2063 "%d 0 obj\n" | |
| 2064 "<<\n" | |
| 2065 "/Length %zu\n" | |
| 2066 "/Subtype /Image\n" | |
| 2067 "%s\n" /* colorspace */ | |
| 2068 "/Width %d\n" | |
| 2069 "/Height %d\n" | |
| 2070 "%s\n" /* bits/component */ | |
| 2071 "%s\n" /* filter */ | |
| 2072 "%s" /* decode parms; can be empty */ | |
| 2073 ">>\n" | |
| 2074 "stream\n", | |
| 2075 6 + i, cid->nbytescomp, cstr, | |
| 2076 cid->w, cid->h, bstr, fstr, pstr); | |
| 2077 xstr = stringNew(buf); | |
| 2078 sarrayAddString(sa, xstr, L_INSERT); | |
| 2079 l_dnaAddNumber(lpd->objsize, | |
| 2080 strlen(xstr) + cid->nbytescomp + strlen(lpd->poststream)); | |
| 2081 LEPT_FREE(cstr); | |
| 2082 LEPT_FREE(bstr); | |
| 2083 LEPT_FREE(fstr); | |
| 2084 LEPT_FREE(pstr); | |
| 2085 } | |
| 2086 | |
| 2087 return 0; | |
| 2088 } | |
| 2089 | |
| 2090 | |
| 2091 static l_int32 | |
| 2092 generateColormapStringsPdf(L_PDF_DATA *lpd) | |
| 2093 { | |
| 2094 char buf[L_BIGBUF]; | |
| 2095 char *cmstr; | |
| 2096 l_int32 i, cmindex, ncmap; | |
| 2097 L_COMP_DATA *cid; | |
| 2098 SARRAY *sa; | |
| 2099 | |
| 2100 /* In our canonical format, we have 5 objects, followed | |
| 2101 * by n XObjects, followed by m colormaps, so the index of | |
| 2102 * the first colormap object is 6 + n. */ | |
| 2103 sa = lpd->sacmap; | |
| 2104 cmindex = 6 + lpd->n; /* starting value */ | |
| 2105 ncmap = 0; | |
| 2106 for (i = 0; i < lpd->n; i++) { | |
| 2107 if ((cid = pdfdataGetCid(lpd, i)) == NULL) | |
| 2108 return ERROR_INT("cid not found", __func__, 1); | |
| 2109 if (cid->ncolors == 0) continue; | |
| 2110 | |
| 2111 ncmap++; | |
| 2112 snprintf(buf, sizeof(buf), "%d 0 obj\n" | |
| 2113 "[ /Indexed /DeviceRGB\n" | |
| 2114 "%d\n" | |
| 2115 "%s\n" | |
| 2116 "]\n" | |
| 2117 "endobj\n", | |
| 2118 cmindex, cid->ncolors - 1, cid->cmapdatahex); | |
| 2119 cmindex++; | |
| 2120 cmstr = stringNew(buf); | |
| 2121 l_dnaAddNumber(lpd->objsize, strlen(cmstr)); | |
| 2122 sarrayAddString(sa, cmstr, L_INSERT); | |
| 2123 } | |
| 2124 | |
| 2125 lpd->ncmap = ncmap; | |
| 2126 return 0; | |
| 2127 } | |
| 2128 | |
| 2129 | |
| 2130 static void | |
| 2131 generateTrailerPdf(L_PDF_DATA *lpd) | |
| 2132 { | |
| 2133 l_int32 i, n, size, linestart; | |
| 2134 L_DNA *daloc, *dasize; | |
| 2135 | |
| 2136 /* Let nobj be the number of numbered objects. These numbered | |
| 2137 * objects are indexed by their pdf number in arrays naloc[] | |
| 2138 * and nasize[]. The 0th object is the 9 byte header. Then | |
| 2139 * the number of objects in nasize, which includes the header, | |
| 2140 * is n = nobj + 1. The array naloc[] has n + 1 elements, | |
| 2141 * because it includes as the last element the starting | |
| 2142 * location of xref. The indexing of these objects, their | |
| 2143 * starting locations and sizes are: | |
| 2144 * | |
| 2145 * Object number Starting location Size | |
| 2146 * ------------- ----------------- -------------- | |
| 2147 * 0 daloc[0] = 0 dasize[0] = 9 | |
| 2148 * 1 daloc[1] = 9 dasize[1] = 49 | |
| 2149 * n daloc[n] dasize[n] | |
| 2150 * xref daloc[n+1] | |
| 2151 * | |
| 2152 * We first generate daloc. | |
| 2153 */ | |
| 2154 dasize = lpd->objsize; | |
| 2155 daloc = lpd->objloc; | |
| 2156 linestart = 0; | |
| 2157 l_dnaAddNumber(daloc, linestart); /* header */ | |
| 2158 n = l_dnaGetCount(dasize); | |
| 2159 for (i = 0; i < n; i++) { | |
| 2160 l_dnaGetIValue(dasize, i, &size); | |
| 2161 linestart += size; | |
| 2162 l_dnaAddNumber(daloc, linestart); | |
| 2163 } | |
| 2164 l_dnaGetIValue(daloc, n, &lpd->xrefloc); /* save it */ | |
| 2165 | |
| 2166 /* Now make the actual trailer string */ | |
| 2167 lpd->trailer = makeTrailerStringPdf(daloc); | |
| 2168 } | |
| 2169 | |
| 2170 | |
| 2171 static char * | |
| 2172 makeTrailerStringPdf(L_DNA *daloc) | |
| 2173 { | |
| 2174 char *outstr; | |
| 2175 char buf[L_BIGBUF]; | |
| 2176 l_int32 i, n, linestart, xrefloc; | |
| 2177 SARRAY *sa; | |
| 2178 | |
| 2179 if (!daloc) | |
| 2180 return (char *)ERROR_PTR("daloc not defined", __func__, NULL); | |
| 2181 n = l_dnaGetCount(daloc) - 1; /* numbered objects + 1 (yes, +1) */ | |
| 2182 | |
| 2183 sa = sarrayCreate(0); | |
| 2184 snprintf(buf, sizeof(buf), "xref\n" | |
| 2185 "0 %d\n" | |
| 2186 "0000000000 65535 f \n", n); | |
| 2187 sarrayAddString(sa, buf, L_COPY); | |
| 2188 for (i = 1; i < n; i++) { | |
| 2189 l_dnaGetIValue(daloc, i, &linestart); | |
| 2190 snprintf(buf, sizeof(buf), "%010d 00000 n \n", linestart); | |
| 2191 sarrayAddString(sa, buf, L_COPY); | |
| 2192 } | |
| 2193 | |
| 2194 l_dnaGetIValue(daloc, n, &xrefloc); | |
| 2195 snprintf(buf, sizeof(buf), "trailer\n" | |
| 2196 "<<\n" | |
| 2197 "/Size %d\n" | |
| 2198 "/Root 1 0 R\n" | |
| 2199 "/Info 2 0 R\n" | |
| 2200 ">>\n" | |
| 2201 "startxref\n" | |
| 2202 "%d\n" | |
| 2203 "%%%%EOF\n", n, xrefloc); | |
| 2204 sarrayAddString(sa, buf, L_COPY); | |
| 2205 outstr = sarrayToString(sa, 0); | |
| 2206 sarrayDestroy(&sa); | |
| 2207 return outstr; | |
| 2208 } | |
| 2209 | |
| 2210 | |
| 2211 /*! | |
| 2212 * \brief generateOutputDataPdf() | |
| 2213 * | |
| 2214 * \param[out] pdata pdf data array | |
| 2215 * \param[out] pnbytes size of pdf data array | |
| 2216 * \param[in] lpd input data used to make pdf | |
| 2217 * \return 0 if OK, 1 on error | |
| 2218 * | |
| 2219 * <pre> | |
| 2220 * Notes: | |
| 2221 * (1) Only called from l_generatePdf(). On error, no data is returned. | |
| 2222 * </pre> | |
| 2223 */ | |
| 2224 static l_int32 | |
| 2225 generateOutputDataPdf(l_uint8 **pdata, | |
| 2226 size_t *pnbytes, | |
| 2227 L_PDF_DATA *lpd) | |
| 2228 { | |
| 2229 char *str; | |
| 2230 l_uint8 *data; | |
| 2231 l_int32 nimages, i, len; | |
| 2232 l_int32 *sizes, *locs; | |
| 2233 size_t nbytes; | |
| 2234 L_COMP_DATA *cid; | |
| 2235 | |
| 2236 if (!pdata) | |
| 2237 return ERROR_INT("&data not defined", __func__, 1); | |
| 2238 *pdata = NULL; | |
| 2239 if (!pnbytes) | |
| 2240 return ERROR_INT("&nbytes not defined", __func__, 1); | |
| 2241 nbytes = lpd->xrefloc + strlen(lpd->trailer); | |
| 2242 *pnbytes = nbytes; | |
| 2243 if ((data = (l_uint8 *)LEPT_CALLOC(nbytes, sizeof(l_uint8))) == NULL) | |
| 2244 return ERROR_INT("calloc fail for data", __func__, 1); | |
| 2245 *pdata = data; | |
| 2246 | |
| 2247 sizes = l_dnaGetIArray(lpd->objsize); | |
| 2248 locs = l_dnaGetIArray(lpd->objloc); | |
| 2249 memcpy(data, lpd->id, sizes[0]); | |
| 2250 memcpy(data + locs[1], lpd->obj1, sizes[1]); | |
| 2251 memcpy(data + locs[2], lpd->obj2, sizes[2]); | |
| 2252 memcpy(data + locs[3], lpd->obj3, sizes[3]); | |
| 2253 memcpy(data + locs[4], lpd->obj4, sizes[4]); | |
| 2254 memcpy(data + locs[5], lpd->obj5, sizes[5]); | |
| 2255 | |
| 2256 /* Each image has 3 parts: variable preamble, the compressed | |
| 2257 * data stream, and the fixed poststream. */ | |
| 2258 nimages = lpd->n; | |
| 2259 for (i = 0; i < nimages; i++) { | |
| 2260 if ((cid = pdfdataGetCid(lpd, i)) == NULL) { /* should not happen */ | |
| 2261 LEPT_FREE(sizes); | |
| 2262 LEPT_FREE(locs); | |
| 2263 return ERROR_INT("cid not found", __func__, 1); | |
| 2264 } | |
| 2265 str = sarrayGetString(lpd->saprex, i, L_NOCOPY); | |
| 2266 len = strlen(str); | |
| 2267 memcpy(data + locs[6 + i], str, len); | |
| 2268 memcpy(data + locs[6 + i] + len, | |
| 2269 cid->datacomp, cid->nbytescomp); | |
| 2270 memcpy(data + locs[6 + i] + len + cid->nbytescomp, | |
| 2271 lpd->poststream, strlen(lpd->poststream)); | |
| 2272 } | |
| 2273 | |
| 2274 /* Each colormap is simply a stored string */ | |
| 2275 for (i = 0; i < lpd->ncmap; i++) { | |
| 2276 str = sarrayGetString(lpd->sacmap, i, L_NOCOPY); | |
| 2277 memcpy(data + locs[6 + nimages + i], str, strlen(str)); | |
| 2278 } | |
| 2279 | |
| 2280 /* And finally the trailer */ | |
| 2281 memcpy(data + lpd->xrefloc, lpd->trailer, strlen(lpd->trailer)); | |
| 2282 LEPT_FREE(sizes); | |
| 2283 LEPT_FREE(locs); | |
| 2284 return 0; | |
| 2285 } | |
| 2286 | |
| 2287 | |
| 2288 /*---------------------------------------------------------------------* | |
| 2289 * Helper functions for generating multipage pdf output * | |
| 2290 *---------------------------------------------------------------------*/ | |
| 2291 /*! | |
| 2292 * \brief parseTrailerPdf() | |
| 2293 * | |
| 2294 * \param[in] bas lba of a pdf file | |
| 2295 * \param[out] pda byte locations of the beginning of each object | |
| 2296 * \return 0 if OK, 1 on error | |
| 2297 */ | |
| 2298 static l_int32 | |
| 2299 parseTrailerPdf(L_BYTEA *bas, | |
| 2300 L_DNA **pda) | |
| 2301 { | |
| 2302 char *str; | |
| 2303 l_uint8 nl = '\n'; | |
| 2304 l_uint8 *data; | |
| 2305 l_int32 i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok; | |
| 2306 size_t size; | |
| 2307 L_DNA *da, *daobj, *daxref; | |
| 2308 SARRAY *sa; | |
| 2309 | |
| 2310 if (!pda) | |
| 2311 return ERROR_INT("&da not defined", __func__, 1); | |
| 2312 *pda = NULL; | |
| 2313 if (!bas) | |
| 2314 return ERROR_INT("bas not defined", __func__, 1); | |
| 2315 data = l_byteaGetData(bas, &size); | |
| 2316 if (memcmp(data, "%PDF-1.", 7) != 0) | |
| 2317 return ERROR_INT("PDF header signature not found", __func__, 1); | |
| 2318 | |
| 2319 /* Search for "startxref" starting 50 bytes from the EOF */ | |
| 2320 start = 0; | |
| 2321 if (size > 50) | |
| 2322 start = size - 50; | |
| 2323 arrayFindSequence(data + start, size - start, | |
| 2324 (l_uint8 *)"startxref\n", 10, &loc, &found); | |
| 2325 if (!found) | |
| 2326 return ERROR_INT("startxref not found!", __func__, 1); | |
| 2327 if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1) | |
| 2328 return ERROR_INT("xrefloc not found!", __func__, 1); | |
| 2329 if (xrefloc < 0 || xrefloc >= size) | |
| 2330 return ERROR_INT("invalid xrefloc!", __func__, 1); | |
| 2331 sa = sarrayCreateLinesFromString((char *)(data + xrefloc), 0); | |
| 2332 str = sarrayGetString(sa, 1, L_NOCOPY); | |
| 2333 if ((sscanf(str, "0 %d", &nobj)) != 1) { | |
| 2334 sarrayDestroy(&sa); | |
| 2335 return ERROR_INT("nobj not found", __func__, 1); | |
| 2336 } | |
| 2337 | |
| 2338 /* Get starting locations. The numa index is the | |
| 2339 * object number. loc[0] is the ID; loc[nobj + 1] is xrefloc. */ | |
| 2340 da = l_dnaCreate(nobj + 1); | |
| 2341 *pda = da; | |
| 2342 for (i = 0; i < nobj; i++) { | |
| 2343 str = sarrayGetString(sa, i + 2, L_NOCOPY); | |
| 2344 sscanf(str, "%d", &startloc); | |
| 2345 l_dnaAddNumber(da, startloc); | |
| 2346 } | |
| 2347 l_dnaAddNumber(da, xrefloc); | |
| 2348 | |
| 2349 #if DEBUG_MULTIPAGE | |
| 2350 lept_stderr("************** Trailer string ************\n"); | |
| 2351 lept_stderr("xrefloc = %d", xrefloc); | |
| 2352 sarrayWriteStderr(sa); | |
| 2353 | |
| 2354 lept_stderr("************** Object locations ************"); | |
| 2355 l_dnaWriteStderr(da); | |
| 2356 #endif /* DEBUG_MULTIPAGE */ | |
| 2357 sarrayDestroy(&sa); | |
| 2358 | |
| 2359 /* Verify correct parsing */ | |
| 2360 trailer_ok = TRUE; | |
| 2361 for (i = 1; i < nobj; i++) { | |
| 2362 l_dnaGetIValue(da, i, &startloc); | |
| 2363 if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) { | |
| 2364 L_ERROR("bad trailer for object %d\n", __func__, i); | |
| 2365 trailer_ok = FALSE; | |
| 2366 break; | |
| 2367 } | |
| 2368 } | |
| 2369 | |
| 2370 /* If the trailer is broken, reconstruct the correct obj locations */ | |
| 2371 if (!trailer_ok) { | |
| 2372 L_INFO("rebuilding pdf trailer\n", __func__); | |
| 2373 l_dnaEmpty(da); | |
| 2374 l_dnaAddNumber(da, 0); | |
| 2375 l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &daobj); | |
| 2376 nobj = l_dnaGetCount(daobj); | |
| 2377 for (i = 0; i < nobj; i++) { | |
| 2378 l_dnaGetIValue(daobj, i, &loc); | |
| 2379 for (j = loc - 1; j > 0; j--) { | |
| 2380 if (data[j] == nl) | |
| 2381 break; | |
| 2382 } | |
| 2383 l_dnaAddNumber(da, j + 1); | |
| 2384 } | |
| 2385 l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &daxref); | |
| 2386 l_dnaGetIValue(daxref, 0, &loc); | |
| 2387 l_dnaAddNumber(da, loc); | |
| 2388 l_dnaDestroy(&daobj); | |
| 2389 l_dnaDestroy(&daxref); | |
| 2390 } | |
| 2391 | |
| 2392 return 0; | |
| 2393 } | |
| 2394 | |
| 2395 | |
| 2396 static char * | |
| 2397 generatePagesObjStringPdf(NUMA *napage) | |
| 2398 { | |
| 2399 char *str; | |
| 2400 char *buf; | |
| 2401 l_int32 i, n, index, bufsize; | |
| 2402 SARRAY *sa; | |
| 2403 | |
| 2404 if (!napage) | |
| 2405 return (char *)ERROR_PTR("napage not defined", __func__, NULL); | |
| 2406 | |
| 2407 n = numaGetCount(napage); | |
| 2408 bufsize = 100 + 16 * n; /* large enough to hold the output string */ | |
| 2409 buf = (char *)LEPT_CALLOC(bufsize, sizeof(char)); | |
| 2410 sa = sarrayCreate(n); | |
| 2411 for (i = 0; i < n; i++) { | |
| 2412 numaGetIValue(napage, i, &index); | |
| 2413 snprintf(buf, bufsize, " %d 0 R ", index); | |
| 2414 sarrayAddString(sa, buf, L_COPY); | |
| 2415 } | |
| 2416 | |
| 2417 str = sarrayToString(sa, 0); | |
| 2418 snprintf(buf, bufsize - 1, "3 0 obj\n" | |
| 2419 "<<\n" | |
| 2420 "/Type /Pages\n" | |
| 2421 "/Kids [%s]\n" | |
| 2422 "/Count %d\n" | |
| 2423 ">>\n" | |
| 2424 "endobj\n", | |
| 2425 str, n); | |
| 2426 sarrayDestroy(&sa); | |
| 2427 LEPT_FREE(str); | |
| 2428 return buf; | |
| 2429 } | |
| 2430 | |
| 2431 | |
| 2432 /*! | |
| 2433 * \brief substituteObjectNumbers() | |
| 2434 * | |
| 2435 * \param[in] bas lba of a pdf object | |
| 2436 * \param[in] na_objs object number mapping array | |
| 2437 * \return bad lba of rewritten pdf for the object | |
| 2438 * | |
| 2439 * <pre> | |
| 2440 * Notes: | |
| 2441 * (1) Interpret the first set of bytes as the object number, | |
| 2442 * map to the new number, and write it out. | |
| 2443 * (2) Find all occurrences of this 4-byte sequence: " 0 R" | |
| 2444 * (3) Find the location and value of the integer preceding this, | |
| 2445 * and map it to the new value. | |
| 2446 * (4) Rewrite the object with new object numbers. | |
| 2447 * </pre> | |
| 2448 */ | |
| 2449 static L_BYTEA * | |
| 2450 substituteObjectNumbers(L_BYTEA *bas, | |
| 2451 NUMA *na_objs) | |
| 2452 { | |
| 2453 l_uint8 space = ' '; | |
| 2454 l_uint8 *datas; | |
| 2455 l_uint8 buf[32]; /* only needs to hold one integer in ascii format */ | |
| 2456 l_int32 start, nrepl, i, j, nobjs, objin, objout, found; | |
| 2457 l_int32 *objs, *matches; | |
| 2458 size_t size; | |
| 2459 L_BYTEA *bad; | |
| 2460 L_DNA *da_match; | |
| 2461 | |
| 2462 if (!bas) | |
| 2463 return (L_BYTEA *)ERROR_PTR("bas not defined", __func__, NULL); | |
| 2464 if (!na_objs) | |
| 2465 return (L_BYTEA *)ERROR_PTR("na_objs not defined", __func__, NULL); | |
| 2466 | |
| 2467 datas = l_byteaGetData(bas, &size); | |
| 2468 bad = l_byteaCreate(100); | |
| 2469 objs = numaGetIArray(na_objs); /* object number mapper */ | |
| 2470 nobjs = numaGetCount(na_objs); /* use for sanity checking */ | |
| 2471 | |
| 2472 /* Substitute the object number on the first line */ | |
| 2473 sscanf((char *)datas, "%d", &objin); | |
| 2474 if (objin < 0 || objin >= nobjs) { | |
| 2475 L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs); | |
| 2476 LEPT_FREE(objs); | |
| 2477 return bad; | |
| 2478 } | |
| 2479 objout = objs[objin]; | |
| 2480 snprintf((char *)buf, 32, "%d", objout); | |
| 2481 l_byteaAppendString(bad, (char *)buf); | |
| 2482 | |
| 2483 /* Find the set of matching locations for object references */ | |
| 2484 arrayFindSequence(datas, size, &space, 1, &start, &found); | |
| 2485 da_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4); | |
| 2486 if (!da_match) { | |
| 2487 l_byteaAppendData(bad, datas + start, size - start); | |
| 2488 LEPT_FREE(objs); | |
| 2489 return bad; | |
| 2490 } | |
| 2491 | |
| 2492 /* Substitute all the object reference numbers */ | |
| 2493 nrepl = l_dnaGetCount(da_match); | |
| 2494 matches = l_dnaGetIArray(da_match); | |
| 2495 for (i = 0; i < nrepl; i++) { | |
| 2496 /* Find the first space before the object number */ | |
| 2497 for (j = matches[i] - 1; j > 0; j--) { | |
| 2498 if (datas[j] == space) | |
| 2499 break; | |
| 2500 } | |
| 2501 /* Copy bytes from 'start' up to the object number */ | |
| 2502 l_byteaAppendData(bad, datas + start, j - start + 1); | |
| 2503 sscanf((char *)(datas + j + 1), "%d", &objin); | |
| 2504 if (objin < 0 || objin >= nobjs) { | |
| 2505 L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs); | |
| 2506 LEPT_FREE(objs); | |
| 2507 LEPT_FREE(matches); | |
| 2508 l_dnaDestroy(&da_match); | |
| 2509 return bad; | |
| 2510 } | |
| 2511 objout = objs[objin]; | |
| 2512 snprintf((char *)buf, 32, "%d", objout); | |
| 2513 l_byteaAppendString(bad, (char *)buf); | |
| 2514 start = matches[i]; | |
| 2515 } | |
| 2516 l_byteaAppendData(bad, datas + start, size - start); | |
| 2517 | |
| 2518 LEPT_FREE(objs); | |
| 2519 LEPT_FREE(matches); | |
| 2520 l_dnaDestroy(&da_match); | |
| 2521 return bad; | |
| 2522 } | |
| 2523 | |
| 2524 | |
| 2525 /*---------------------------------------------------------------------* | |
| 2526 * Create/destroy/access pdf data * | |
| 2527 *---------------------------------------------------------------------*/ | |
| 2528 static L_PDF_DATA * | |
| 2529 pdfdataCreate(const char *title) | |
| 2530 { | |
| 2531 L_PDF_DATA *lpd; | |
| 2532 | |
| 2533 lpd = (L_PDF_DATA *)LEPT_CALLOC(1, sizeof(L_PDF_DATA)); | |
| 2534 if (title) lpd->title = stringNew(title); | |
| 2535 lpd->cida = ptraCreate(10); | |
| 2536 lpd->xy = ptaCreate(10); | |
| 2537 lpd->wh = ptaCreate(10); | |
| 2538 lpd->saprex = sarrayCreate(10); | |
| 2539 lpd->sacmap = sarrayCreate(10); | |
| 2540 lpd->objsize = l_dnaCreate(20); | |
| 2541 lpd->objloc = l_dnaCreate(20); | |
| 2542 return lpd; | |
| 2543 } | |
| 2544 | |
| 2545 static void | |
| 2546 pdfdataDestroy(L_PDF_DATA **plpd) | |
| 2547 { | |
| 2548 l_int32 i; | |
| 2549 L_COMP_DATA *cid; | |
| 2550 L_PDF_DATA *lpd; | |
| 2551 | |
| 2552 if (plpd== NULL) { | |
| 2553 L_WARNING("ptr address is null!\n", __func__); | |
| 2554 return; | |
| 2555 } | |
| 2556 if ((lpd = *plpd) == NULL) | |
| 2557 return; | |
| 2558 | |
| 2559 if (lpd->title) LEPT_FREE(lpd->title); | |
| 2560 for (i = 0; i < lpd->n; i++) { | |
| 2561 cid = (L_COMP_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION); | |
| 2562 l_CIDataDestroy(&cid); | |
| 2563 } | |
| 2564 | |
| 2565 ptraDestroy(&lpd->cida, 0, 0); | |
| 2566 if (lpd->id) LEPT_FREE(lpd->id); | |
| 2567 if (lpd->obj1) LEPT_FREE(lpd->obj1); | |
| 2568 if (lpd->obj2) LEPT_FREE(lpd->obj2); | |
| 2569 if (lpd->obj3) LEPT_FREE(lpd->obj3); | |
| 2570 if (lpd->obj4) LEPT_FREE(lpd->obj4); | |
| 2571 if (lpd->obj5) LEPT_FREE(lpd->obj5); | |
| 2572 if (lpd->poststream) LEPT_FREE(lpd->poststream); | |
| 2573 if (lpd->trailer) LEPT_FREE(lpd->trailer); | |
| 2574 if (lpd->xy) ptaDestroy(&lpd->xy); | |
| 2575 if (lpd->wh) ptaDestroy(&lpd->wh); | |
| 2576 if (lpd->mediabox) boxDestroy(&lpd->mediabox); | |
| 2577 if (lpd->saprex) sarrayDestroy(&lpd->saprex); | |
| 2578 if (lpd->sacmap) sarrayDestroy(&lpd->sacmap); | |
| 2579 if (lpd->objsize) l_dnaDestroy(&lpd->objsize); | |
| 2580 if (lpd->objloc) l_dnaDestroy(&lpd->objloc); | |
| 2581 LEPT_FREE(lpd); | |
| 2582 *plpd = NULL; | |
| 2583 } | |
| 2584 | |
| 2585 | |
| 2586 static L_COMP_DATA * | |
| 2587 pdfdataGetCid(L_PDF_DATA *lpd, | |
| 2588 l_int32 index) | |
| 2589 { | |
| 2590 if (!lpd) | |
| 2591 return (L_COMP_DATA *)ERROR_PTR("lpd not defined", __func__, NULL); | |
| 2592 if (index < 0 || index >= lpd->n) | |
| 2593 return (L_COMP_DATA *)ERROR_PTR("invalid image index", __func__, NULL); | |
| 2594 | |
| 2595 return (L_COMP_DATA *)ptraGetPtrToItem(lpd->cida, index); | |
| 2596 } | |
| 2597 | |
| 2598 | |
| 2599 /*---------------------------------------------------------------------* | |
| 2600 * Find number of pages in a pdf * | |
| 2601 *---------------------------------------------------------------------*/ | |
| 2602 /*! | |
| 2603 * \brief getPdfPageCount() | |
| 2604 * | |
| 2605 * \param[in] fname filename | |
| 2606 * \param[out] pnpages number of pages | |
| 2607 * \return 0 if OK, 1 on error | |
| 2608 * | |
| 2609 * <pre> | |
| 2610 * Notes: | |
| 2611 * (1) Looks for the argument of the first instance of /Count in the file. | |
| 2612 * (2) This first reads 10000 bytes from the beginning of the file. | |
| 2613 * If "/Count" is not in that string, it reads the entire file | |
| 2614 * and looks for "/Count". | |
| 2615 * (3) This will not work on encrypted pdf files or on files where | |
| 2616 * the "/Count" field is binary compressed. Not finding the | |
| 2617 * "/Count" field is not an error, but a warning is given. | |
| 2618 * </pre> | |
| 2619 */ | |
| 2620 l_ok | |
| 2621 getPdfPageCount(const char *fname, | |
| 2622 l_int32 *pnpages) | |
| 2623 { | |
| 2624 l_uint8 *data; | |
| 2625 l_int32 format, loc, ret, npages, found; | |
| 2626 size_t nread; | |
| 2627 | |
| 2628 if (!pnpages) | |
| 2629 return ERROR_INT("&npages not defined", __func__, 1); | |
| 2630 *pnpages = 0; | |
| 2631 if (!fname) | |
| 2632 return ERROR_INT("fname not defined", __func__, 1); | |
| 2633 | |
| 2634 /* Make sure this a pdf file */ | |
| 2635 findFileFormat(fname, &format); | |
| 2636 if (format != IFF_LPDF) | |
| 2637 return ERROR_INT("file is not pdf", __func__, 1); | |
| 2638 | |
| 2639 /* Read 10000 bytes from the beginning of the file */ | |
| 2640 if ((data = l_binaryReadSelect(fname, 0, 10000, &nread)) | |
| 2641 == NULL) | |
| 2642 return ERROR_INT("partial data not read", __func__, 1); | |
| 2643 | |
| 2644 /* Find the location of the first instance of "/Count". | |
| 2645 * If it is not found, try reading the entire file and | |
| 2646 * looking again. */ | |
| 2647 arrayFindSequence(data, nread, (const l_uint8 *)"/Count", | |
| 2648 strlen("/Count"), &loc, &found); | |
| 2649 if (!found) { | |
| 2650 lept_stderr("Reading entire file looking for '/Count'\n"); | |
| 2651 LEPT_FREE(data); | |
| 2652 if ((data = l_binaryRead(fname, &nread)) == NULL) | |
| 2653 return ERROR_INT("full data not read", __func__, 1); | |
| 2654 arrayFindSequence(data, nread, (const l_uint8 *)"/Count", | |
| 2655 strlen("/Count"), &loc, &found); | |
| 2656 if (!found) { | |
| 2657 LEPT_FREE(data); | |
| 2658 L_WARNING("/Count not found\n", __func__); | |
| 2659 return 0; | |
| 2660 } | |
| 2661 } | |
| 2662 | |
| 2663 /* Unlikely: make sure we can read the count field */ | |
| 2664 if (nread - loc < 12) { /* haven't read enough to capture page count */ | |
| 2665 LEPT_FREE(data); | |
| 2666 return ERROR_INT("data may not include page count field", __func__, 1); | |
| 2667 } | |
| 2668 | |
| 2669 /* Read the page count; if not found, puts garbage in npages */ | |
| 2670 ret = sscanf((char *)&data[loc], "/Count %d", &npages); | |
| 2671 LEPT_FREE(data); | |
| 2672 if (ret != 1) | |
| 2673 return ERROR_INT("npages not found", __func__, 1); | |
| 2674 *pnpages = npages; | |
| 2675 /* lept_stderr("bytes read = %d, loc = %d, npages = %d\n", | |
| 2676 nread, loc, *pnpages); */ | |
| 2677 return 0; | |
| 2678 } | |
| 2679 | |
| 2680 | |
| 2681 /*---------------------------------------------------------------------* | |
| 2682 * Find widths and heights of pages and media boxes in a pdf * | |
| 2683 *---------------------------------------------------------------------*/ | |
| 2684 /*! | |
| 2685 * \brief getPdfPageSizes() | |
| 2686 * | |
| 2687 * \param[in] fname filename | |
| 2688 * \param[out] pnaw [optional] array of page widths | |
| 2689 * \param[out] pnah [optional] array of page heights | |
| 2690 * \param[out] pmedw [optional] median page width | |
| 2691 * \param[out] pmedh [optional] median page height | |
| 2692 * \return 0 if OK, 1 on error | |
| 2693 * | |
| 2694 * <pre> | |
| 2695 * Notes: | |
| 2696 * (1) Finds the arguments of each instance of '/Width' and '/Height' | |
| 2697 * in the file. | |
| 2698 * (2) This will not work on encrypted pdf files or on files where | |
| 2699 * the "/Width" and "/Height" fields are binary compressed. | |
| 2700 * Not finding the "/Width" and /Height" fields is not an error, | |
| 2701 * but a warning is given. | |
| 2702 * </pre> | |
| 2703 */ | |
| 2704 l_ok | |
| 2705 getPdfPageSizes(const char *fname, | |
| 2706 NUMA **pnaw, | |
| 2707 NUMA **pnah, | |
| 2708 l_int32 *pmedw, | |
| 2709 l_int32 *pmedh) | |
| 2710 { | |
| 2711 l_uint8 *data; | |
| 2712 l_int32 i, nw, nh, format, ret, loc, width, height; | |
| 2713 l_float32 fval; | |
| 2714 size_t nread; | |
| 2715 L_DNA *dnaw; /* width locations */ | |
| 2716 L_DNA *dnah; /* height locations */ | |
| 2717 NUMA *naw; /* widths */ | |
| 2718 NUMA *nah; /* heights */ | |
| 2719 | |
| 2720 if (pnaw) *pnaw = NULL; | |
| 2721 if (pnah) *pnah = NULL; | |
| 2722 if (pmedw) *pmedw = 0; | |
| 2723 if (pmedh) *pmedh = 0; | |
| 2724 if (!pnaw && !pnah && !pmedw && !pmedh) | |
| 2725 return ERROR_INT("no output requested", __func__, 1); | |
| 2726 if (!fname) | |
| 2727 return ERROR_INT("fname not defined", __func__, 1); | |
| 2728 | |
| 2729 /* Make sure this a pdf file */ | |
| 2730 findFileFormat(fname, &format); | |
| 2731 if (format != IFF_LPDF) | |
| 2732 return ERROR_INT("file is not pdf", __func__, 1); | |
| 2733 | |
| 2734 /* Read the file into memory and find all locations of | |
| 2735 * '/Width' and '/Height' */ | |
| 2736 if ((data = l_binaryRead(fname, &nread)) == NULL) | |
| 2737 return ERROR_INT("full data not read", __func__, 1); | |
| 2738 dnaw = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Width", | |
| 2739 strlen("/Width")); | |
| 2740 dnah = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Height", | |
| 2741 strlen("/Height")); | |
| 2742 if (!dnaw) | |
| 2743 L_WARNING("unable to find widths\n", __func__); | |
| 2744 if (!dnah) | |
| 2745 L_WARNING("unable to find heights\n", __func__); | |
| 2746 if (!dnaw && !dnah) { | |
| 2747 LEPT_FREE(data); | |
| 2748 L_WARNING("no fields found\n", __func__); | |
| 2749 return 0; | |
| 2750 } | |
| 2751 | |
| 2752 /* Find the page widths and heights */ | |
| 2753 nw = l_dnaGetCount(dnaw); | |
| 2754 naw = numaCreate(nw); | |
| 2755 for (i = 0; i < nw; i++) { | |
| 2756 l_dnaGetIValue(dnaw, i, &loc); | |
| 2757 ret = sscanf((char *)&data[loc], "/Width %d", &width); | |
| 2758 if (ret != 1) { | |
| 2759 L_ERROR("width not found for item %d at loc %d\n", | |
| 2760 __func__, i, loc); | |
| 2761 continue; | |
| 2762 } | |
| 2763 numaAddNumber(naw, width); | |
| 2764 } | |
| 2765 nh = l_dnaGetCount(dnah); | |
| 2766 nah = numaCreate(nh); | |
| 2767 for (i = 0; i < nh; i++) { | |
| 2768 l_dnaGetIValue(dnah, i, &loc); | |
| 2769 ret = sscanf((char *)&data[loc], "/Height %d", &height); | |
| 2770 if (ret != 1) { | |
| 2771 L_ERROR("height not found for item %d at loc %d\n", | |
| 2772 __func__, i, loc); | |
| 2773 continue; | |
| 2774 } | |
| 2775 numaAddNumber(nah, height); | |
| 2776 } | |
| 2777 | |
| 2778 LEPT_FREE(data); | |
| 2779 l_dnaDestroy(&dnaw); | |
| 2780 l_dnaDestroy(&dnah); | |
| 2781 if (pmedw) { | |
| 2782 numaGetMedian(naw, &fval); | |
| 2783 *pmedw = lept_roundftoi(fval); | |
| 2784 } | |
| 2785 if (pnaw) | |
| 2786 *pnaw = naw; | |
| 2787 else | |
| 2788 numaDestroy(&naw); | |
| 2789 if (pmedh) { | |
| 2790 numaGetMedian(nah, &fval); | |
| 2791 *pmedh = lept_roundftoi(fval); | |
| 2792 } | |
| 2793 if (pnah) | |
| 2794 *pnah = nah; | |
| 2795 else | |
| 2796 numaDestroy(&nah); | |
| 2797 return 0; | |
| 2798 } | |
| 2799 | |
| 2800 | |
| 2801 /*! | |
| 2802 * \brief getPdfMediaBoxSizes() | |
| 2803 * | |
| 2804 * \param[in] fname filename | |
| 2805 * \param[out] pnaw [optional] array of mediabox widths | |
| 2806 * \param[out] pnah [optional] array of mediabox heights | |
| 2807 * \param[out] pmedw [optional] median mediabox width | |
| 2808 * \param[out] pmedh [optional] median mediabox height | |
| 2809 * \return 0 if OK, 1 on error | |
| 2810 * | |
| 2811 * <pre> | |
| 2812 * Notes: | |
| 2813 * (1) Finds the arguments of each instance of '/MediaBox' in the file. | |
| 2814 * (2) This will not work on encrypted pdf files or on files where | |
| 2815 * the "/MediaBoxes" field is binary compressed. Not finding | |
| 2816 * the "/MediaBoxes" field is not an error, but a warning is given. | |
| 2817 * (3) This is useful for determining if the media boxes are | |
| 2818 * incorrectly assigned, such as assuming the resolution is 72 ppi. | |
| 2819 * If that happens and the input the the renderer assumes the | |
| 2820 * resolution is 300 ppi, the rendered images will be over 4x too | |
| 2821 * large in each dimension. | |
| 2822 * (4) An image dimension of 11 inches corresponds to a MediaBox | |
| 2823 * parameter of 792. We consider a value > 850 to be oversized | |
| 2824 * and not to be taken literally. | |
| 2825 * </pre> | |
| 2826 */ | |
| 2827 l_ok | |
| 2828 getPdfMediaBoxSizes(const char *fname, | |
| 2829 NUMA **pnaw, | |
| 2830 NUMA **pnah, | |
| 2831 l_int32 *pmedw, | |
| 2832 l_int32 *pmedh) | |
| 2833 { | |
| 2834 l_uint8 *data; | |
| 2835 l_int32 i, n, format, ret, loc; | |
| 2836 l_float32 fval, ignore1, ignore2, w, h; | |
| 2837 size_t nread; | |
| 2838 L_DNA *dna; /* mediabox locations */ | |
| 2839 NUMA *naw; /* mediabox widths */ | |
| 2840 NUMA *nah; /* mediabox heights */ | |
| 2841 | |
| 2842 if (pnaw) *pnaw = NULL; | |
| 2843 if (pnah) *pnah = NULL; | |
| 2844 if (pmedw) *pmedw = 0; | |
| 2845 if (pmedh) *pmedh = 0; | |
| 2846 if (!pnaw && !pnah && !pmedw && !pmedh) | |
| 2847 return ERROR_INT("no output requested", __func__, 1); | |
| 2848 if (!fname) | |
| 2849 return ERROR_INT("fname not defined", __func__, 1); | |
| 2850 | |
| 2851 /* Make sure this a pdf file */ | |
| 2852 findFileFormat(fname, &format); | |
| 2853 if (format != IFF_LPDF) | |
| 2854 return ERROR_INT("file is not pdf", __func__, 1); | |
| 2855 | |
| 2856 /* Read the file into memory and find all locations of '/MediaBox' */ | |
| 2857 if ((data = l_binaryRead(fname, &nread)) == NULL) | |
| 2858 return ERROR_INT("full data not read", __func__, 1); | |
| 2859 dna = arrayFindEachSequence(data, nread, (const l_uint8 *)"/MediaBox", | |
| 2860 strlen("/MediaBox")); | |
| 2861 if (!dna) { | |
| 2862 LEPT_FREE(data); | |
| 2863 L_WARNING("no mediaboxes found\n", __func__); | |
| 2864 return 1; | |
| 2865 } | |
| 2866 | |
| 2867 /* Find the mediabox widths and heights */ | |
| 2868 n = l_dnaGetCount(dna); | |
| 2869 naw = numaCreate(n); | |
| 2870 nah = numaCreate(n); | |
| 2871 for (i = 0; i < n; i++) { | |
| 2872 l_dnaGetIValue(dna, i, &loc); | |
| 2873 ret = sscanf((char *)&data[loc], "/MediaBox [ %f %f %f %f", | |
| 2874 &ignore1, &ignore2, &w, &h); | |
| 2875 if (ret != 4) { | |
| 2876 L_ERROR("mediabox sizes not found for item %d at loc %d\n", | |
| 2877 __func__, i, loc); | |
| 2878 continue; | |
| 2879 } | |
| 2880 numaAddNumber(naw, w); | |
| 2881 numaAddNumber(nah, h); | |
| 2882 } | |
| 2883 LEPT_FREE(data); | |
| 2884 l_dnaDestroy(&dna); | |
| 2885 | |
| 2886 if (pmedw) { | |
| 2887 numaGetMedian(naw, &fval); | |
| 2888 *pmedw = lept_roundftoi(fval); | |
| 2889 if (*pmedw > 850) lept_stderr("oversize width: %d\n", *pmedw); | |
| 2890 } | |
| 2891 if (pnaw) | |
| 2892 *pnaw = naw; | |
| 2893 else | |
| 2894 numaDestroy(&naw); | |
| 2895 if (pmedh) { | |
| 2896 numaGetMedian(nah, &fval); | |
| 2897 *pmedh = lept_roundftoi(fval); | |
| 2898 if (*pmedh > 850) lept_stderr("oversize height: %d\n", *pmedh); | |
| 2899 } | |
| 2900 if (pnah) | |
| 2901 *pnah = nah; | |
| 2902 else | |
| 2903 numaDestroy(&nah); | |
| 2904 return 0; | |
| 2905 } | |
| 2906 | |
| 2907 | |
| 2908 /*---------------------------------------------------------------------* | |
| 2909 * Find effective resolution of images rendered from a pdf * | |
| 2910 *---------------------------------------------------------------------*/ | |
| 2911 /*! | |
| 2912 * \brief getPdfRendererResolution() | |
| 2913 * | |
| 2914 * \param[in] infile filename of input pdf file | |
| 2915 * \param[in] outdir directory of rendered output images | |
| 2916 * \param[out] pres desired resolution to use with renderer | |
| 2917 * \return 0 if OK, 1 on error | |
| 2918 * | |
| 2919 * <pre> | |
| 2920 * Notes: | |
| 2921 * (1) Finds the input resolution to pdftoppm that will generate | |
| 2922 * images with a maximum dimension of about 3300 pixels, | |
| 2923 * representing a full page at 300 ppi. | |
| 2924 * (2) It is most important is to make sure the renderer does | |
| 2925 * not make huge images because of an error in /MediaBox. | |
| 2926 * An image dimension of 11 inches corresponds to a MediaBox | |
| 2927 * parameter of 792. We consider a value > 850 to be oversized | |
| 2928 * and not to be taken literally. If the mediaboxes are | |
| 2929 * oversized, choose an appropriate lower resolution. | |
| 2930 * (3) If the mediaboxes are not accessible, render an image at | |
| 2931 * a low known resolution (say, 72 ppi) and based on the image | |
| 2932 * size, determine the resolution necessary to make an image | |
| 2933 * with 3300 pixels in the largest dimension. | |
| 2934 * (4) Requires pdftoppm, so this is disabled on windows for now. | |
| 2935 * (5) Requires the ability to call an external program, so it is | |
| 2936 * necessary to call setLeptDebugOK(1) before this function. | |
| 2937 * </pre> | |
| 2938 */ | |
| 2939 l_ok | |
| 2940 getPdfRendererResolution(const char *infile, | |
| 2941 const char *outdir, | |
| 2942 l_int32 *pres) | |
| 2943 { | |
| 2944 char buf[256]; | |
| 2945 char *tail, *basename, *fname; | |
| 2946 l_int32 ret, res, medw, medh, medmax, npages, pageno, w, h; | |
| 2947 SARRAY *sa; | |
| 2948 | |
| 2949 if (!pres) | |
| 2950 return ERROR_INT("&res not defined", __func__, 1); | |
| 2951 *pres = 300; /* default */ | |
| 2952 | |
| 2953 #ifdef _WIN32 | |
| 2954 L_INFO("Requires pdftoppm, so this is disabled on windows.\n" | |
| 2955 "Returns default resolution 300 ppi", __func__); | |
| 2956 return 0; | |
| 2957 #endif /* _WIN32 */ | |
| 2958 | |
| 2959 if (!LeptDebugOK) { | |
| 2960 L_INFO("Running pdftoppm is disabled; " | |
| 2961 "use setLeptDebugOK(1) to enable\n" | |
| 2962 "returns default resolution 300 ppi\n", __func__); | |
| 2963 return 1; | |
| 2964 } | |
| 2965 | |
| 2966 if (!infile) | |
| 2967 return ERROR_INT("infile not defined", __func__, 1); | |
| 2968 if (!outdir) | |
| 2969 return ERROR_INT("outdir not defined", __func__, 1); | |
| 2970 | |
| 2971 res = 300; /* default value */ | |
| 2972 ret = getPdfMediaBoxSizes(infile, NULL, NULL, &medw, &medh); | |
| 2973 if (ret == 0) { /* Check for oversize mediaboxes */ | |
| 2974 lept_stderr("Media Box medians: medw = %d, medh = %d\n", medw, medh); | |
| 2975 medmax = L_MAX(medw, medh); | |
| 2976 if (medmax > 850) { | |
| 2977 res = 300 * ((l_float32)792 / (l_float32)medmax); | |
| 2978 lept_stderr(" Oversize media box; use resolution = %d\n", res); | |
| 2979 *pres = res; | |
| 2980 } | |
| 2981 return 0; | |
| 2982 } | |
| 2983 | |
| 2984 /* No mediaboxes; render one page and measure the max dimension */ | |
| 2985 lept_stderr("Media Box dimensions not found\n"); | |
| 2986 getPdfPageCount(infile, &npages); | |
| 2987 pageno = (npages > 0) ? (npages + 1) / 2 : 1; | |
| 2988 splitPathAtDirectory(infile, NULL, &tail); | |
| 2989 splitPathAtExtension(tail, &basename, NULL); | |
| 2990 snprintf(buf, sizeof(buf), "pdftoppm -f %d -l %d -r 72 %s %s/%s", | |
| 2991 pageno, pageno, infile, outdir, basename); | |
| 2992 LEPT_FREE(tail); | |
| 2993 LEPT_FREE(basename); | |
| 2994 callSystemDebug(buf); /* pdftoppm */ | |
| 2995 | |
| 2996 /* Get the page size */ | |
| 2997 sa = getSortedPathnamesInDirectory(outdir, NULL, 0, 0); | |
| 2998 fname = sarrayGetString(sa, 0, L_NOCOPY); | |
| 2999 pixReadHeader(fname, NULL, &w, &h, NULL, NULL, NULL); | |
| 3000 sarrayDestroy(&sa); | |
| 3001 if (w > 0 && h > 0) { | |
| 3002 res = L_MIN((72 * 3300 / L_MAX(w, h)), 600); | |
| 3003 *pres = res; | |
| 3004 lept_stderr("Use resolution = %d\n", res); | |
| 3005 } else { | |
| 3006 L_ERROR("page size not found; assuming res = 300\n", __func__); | |
| 3007 } | |
| 3008 | |
| 3009 return 0; | |
| 3010 } | |
| 3011 | |
| 3012 | |
| 3013 /*---------------------------------------------------------------------* | |
| 3014 * Set flags for special modes * | |
| 3015 *---------------------------------------------------------------------*/ | |
| 3016 /*! | |
| 3017 * \brief l_pdfSetG4ImageMask() | |
| 3018 * | |
| 3019 * \param[in] flag 1 for writing g4 data as fg only through a mask; | |
| 3020 * 0 for writing fg and bg | |
| 3021 * \return void | |
| 3022 * | |
| 3023 * <pre> | |
| 3024 * Notes: | |
| 3025 * (1) The default is for writing only the fg (through the mask). | |
| 3026 * That way when you write a 1 bpp image, the bg is transparent, | |
| 3027 * so any previously written image remains visible behind it. | |
| 3028 * </pre> | |
| 3029 */ | |
| 3030 void | |
| 3031 l_pdfSetG4ImageMask(l_int32 flag) | |
| 3032 { | |
| 3033 var_WRITE_G4_IMAGE_MASK = flag; | |
| 3034 } | |
| 3035 | |
| 3036 | |
| 3037 /*! | |
| 3038 * \brief l_pdfSetDateAndVersion() | |
| 3039 * | |
| 3040 * \param[in] flag 1 for writing date/time and leptonica version; | |
| 3041 * 0 for omitting this from the metadata | |
| 3042 * \return void | |
| 3043 * | |
| 3044 * <pre> | |
| 3045 * Notes: | |
| 3046 * (1) The default is for writing this data. For regression tests | |
| 3047 * that compare output against golden files, it is useful to omit. | |
| 3048 * </pre> | |
| 3049 */ | |
| 3050 void | |
| 3051 l_pdfSetDateAndVersion(l_int32 flag) | |
| 3052 { | |
| 3053 var_WRITE_DATE_AND_VERSION = flag; | |
| 3054 } | |
| 3055 | |
| 3056 /* --------------------------------------------*/ | |
| 3057 #endif /* USE_PDFIO */ | |
| 3058 /* --------------------------------------------*/ |
