comparison mupdf-source/thirdparty/leptonica/src/pdfio2.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file pdfio2.c
29 * <pre>
30 *
31 * Lower-level operations for generating pdf.
32 *
33 * Intermediate function for single page, multi-image conversion
34 * l_int32 pixConvertToPdfData()
35 *
36 * Intermediate function for generating multipage pdf output
37 * l_int32 ptraConcatenatePdfToData()
38 *
39 * Convert tiff multipage to pdf file
40 * l_int32 convertTiffMultipageToPdf()
41 *
42 * Generates the CID, transcoding under some conditions
43 * l_int32 l_generateCIDataForPdf()
44 * l_int32 l_generateCIData()
45 *
46 * Lower-level CID generation without transcoding
47 * L_COMP_DATA *l_generateFlateDataPdf()
48 * L_COMP_DATA *l_generateJpegData()
49 * L_COMP_DATA *l_generateJpegDataMem()
50 * static L_COMP_DATA *l_generateJp2kData()
51 * L_COMP_DATA *l_generateG4Data()
52 *
53 * Lower-level CID generation with transcoding
54 * l_int32 pixGenerateCIData()
55 * L_COMP_DATA *l_generateFlateData()
56 * static L_COMP_DATA *pixGenerateFlateData()
57 * static L_COMP_DATA *pixGenerateJpegData()
58 * static L_COMP_DATA *pixGenerateJp2kData()
59 * static L_COMP_DATA *pixGenerateG4Data()
60 *
61 * Other CID operations
62 * l_int32 cidConvertToPdfData()
63 * void l_CIDataDestroy()
64 *
65 * Helper functions for generating the output pdf string
66 * static l_int32 l_generatePdf()
67 * static void generateFixedStringsPdf()
68 * static char *generateEscapeString()
69 * static void generateMediaboxPdf()
70 * static l_int32 generatePageStringPdf()
71 * static l_int32 generateContentStringPdf()
72 * static l_int32 generatePreXStringsPdf()
73 * static l_int32 generateColormapStringsPdf()
74 * static void generateTrailerPdf()
75 * static char *makeTrailerStringPdf()
76 * static l_int32 generateOutputDataPdf()
77 *
78 * Helper functions for generating multipage pdf output
79 * static l_int32 parseTrailerPdf()
80 * static char *generatePagesObjStringPdf()
81 * static L_BYTEA *substituteObjectNumbers()
82 *
83 * Create/destroy/access pdf data
84 * static L_PDF_DATA *pdfdataCreate()
85 * static void pdfdataDestroy()
86 * static L_COMP_DATA *pdfdataGetCid()
87 *
88 * Find number of pages in a pdf
89 * l_int32 getPdfPageCount()
90 *
91 * Find widths and heights of pages and media boxes in a pdf
92 * l_int32 getPdfPageSizes()
93 * l_int32 getPdfMediaBoxSizes()
94 *
95 * Find effective resolution of images rendered from a pdf
96 * l_int32 getPdfRendererResolution()
97 *
98 * Set flags for special modes
99 * void l_pdfSetG4ImageMask()
100 * void l_pdfSetDateAndVersion()
101 *
102 * </pre>
103 */
104
105 #ifdef HAVE_CONFIG_H
106 #include <config_auto.h>
107 #endif /* HAVE_CONFIG_H */
108
109 #include <string.h>
110 #include <math.h>
111 #include "allheaders.h"
112
113 /* --------------------------------------------*/
114 #if USE_PDFIO /* defined in environ.h */
115 /* --------------------------------------------*/
116
117 /* Typical scan resolution in ppi (pixels/inch); pixConvertToPdfData() falls back to this when neither %res nor the image supplies a resolution */
118 static const l_int32 DefaultInputRes = 300;
119
120 /* Static helpers: generation of compressed image data (CID) */
121 static L_COMP_DATA *l_generateJp2kData(const char *fname);
122 static L_COMP_DATA *pixGenerateFlateData(PIX *pixs, l_int32 ascii85flag);
123 static L_COMP_DATA *pixGenerateJpegData(PIX *pixs, l_int32 ascii85flag,
124 l_int32 quality);
125 static L_COMP_DATA *pixGenerateJp2kData(PIX *pixs, l_int32 quality);
126 static L_COMP_DATA *pixGenerateG4Data(PIX *pixs, l_int32 ascii85flag);
127 /* Static helpers: generation of the output pdf string */
128 static l_int32 l_generatePdf(l_uint8 **pdata, size_t *pnbytes,
129 L_PDF_DATA *lpd);
130 static void generateFixedStringsPdf(L_PDF_DATA *lpd);
131 static char *generateEscapeString(const char *str);
132 static void generateMediaboxPdf(L_PDF_DATA *lpd);
133 static l_int32 generatePageStringPdf(L_PDF_DATA *lpd);
134 static l_int32 generateContentStringPdf(L_PDF_DATA *lpd);
135 static l_int32 generatePreXStringsPdf(L_PDF_DATA *lpd);
136 static l_int32 generateColormapStringsPdf(L_PDF_DATA *lpd);
137 static void generateTrailerPdf(L_PDF_DATA *lpd);
138 static char *makeTrailerStringPdf(L_DNA *daloc);
139 static l_int32 generateOutputDataPdf(l_uint8 **pdata, size_t *pnbytes,
140 L_PDF_DATA *lpd);
141 /* Static helpers: generation of multipage pdf output */
142 static l_int32 parseTrailerPdf(L_BYTEA *bas, L_DNA **pda);
143 static char *generatePagesObjStringPdf(NUMA *napage);
144 static L_BYTEA *substituteObjectNumbers(L_BYTEA *bas, NUMA *na_objs);
145 /* Static helpers: create/destroy/access pdf data */
146 static L_PDF_DATA *pdfdataCreate(const char *title);
147 static void pdfdataDestroy(L_PDF_DATA **plpd);
148 static L_COMP_DATA *pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index);
149
150
151 /* ---------------- Defaults for rendering options ----------------- */
152 /* Output G4 as writing through image mask; this is the default.
 * NOTE(review): presumably changed through l_pdfSetG4ImageMask(), which
 * is listed in the file header but defined outside this view -- confirm. */
153 static l_int32 var_WRITE_G4_IMAGE_MASK = 1;
154 /* Write date/time and lib version into pdf; this is the default.
 * NOTE(review): presumably changed through l_pdfSetDateAndVersion(), which
 * is listed in the file header but defined outside this view -- confirm. */
155 static l_int32 var_WRITE_DATE_AND_VERSION = 1;
156
157 #define L_SMALLBUF 256
158 #define L_BIGBUF 2048 /* must be able to hold hex colormap */
159
160
161 #ifndef NO_CONSOLE_IO
162 #define DEBUG_MULTIPAGE 0 /* set to 1 to dump the object mapping, Page object numbers and Pages object in ptraConcatenatePdfToData() */
163 #endif /* ~NO_CONSOLE_IO */
164
165
166 /*---------------------------------------------------------------------*
167 * Intermediate function for single page, multi-image conversion *
168 *---------------------------------------------------------------------*/
169 /*!
170 * \brief pixConvertToPdfData()
171 *
172 * \param[in] pix all depths; cmap OK
173 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
174 * L_JP2K_ENCODE
175 * \param[in] quality for jpeg: 1-100; 0 for default (75)
176 * for jp2k: 27-45; 0 for default (34)
177 * \param[out] pdata pdf array
178 * \param[out] pnbytes number of bytes in pdf array
179 * \param[in] x, y location of lower-left corner of image, in pixels,
180 * relative to the PostScript origin (0,0) at
181 * the lower-left corner of the page)
182 * \param[in] res override the resolution of the input image, in ppi;
183 * use 0 to respect resolution embedded in the input
184 * \param[in] title [optional] pdf title; can be null
185 * \param[in,out] plpd ptr to lpd; created on the first invocation and
186 * returned until last image is processed
187 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
188 * L_LAST_IMAGE
189 * \return 0 if OK, 1 on error
190 *
191 * <pre>
192 * Notes:
193 * (1) If %res == 0 and the input resolution field from the pix is 0,
194 * this will use DefaultInputRes.
195 * (2) This only writes %data if it is the last image to be
196 * written on the page.
197 * (3) See comments in convertToPdf().
198 * </pre>
199 */
200 l_ok
201 pixConvertToPdfData(PIX *pix,
202 l_int32 type,
203 l_int32 quality,
204 l_uint8 **pdata,
205 size_t *pnbytes,
206 l_int32 x,
207 l_int32 y,
208 l_int32 res,
209 const char *title,
210 L_PDF_DATA **plpd,
211 l_int32 position)
212 {
213 l_int32 pixres, w, h, ret;
214 l_float32 xpt, ypt, wpt, hpt;
215 L_COMP_DATA *cid = NULL;
216 L_PDF_DATA *lpd = NULL;
217
218 if (!pdata)
219 return ERROR_INT("&data not defined", __func__, 1);
220 *pdata = NULL;
221 if (!pnbytes)
222 return ERROR_INT("&nbytes not defined", __func__, 1);
223 *pnbytes = 0;
224 if (!pix)
225 return ERROR_INT("pix not defined", __func__, 1);
226 if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
227 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
228 selectDefaultPdfEncoding(pix, &type);
229 }
230 if (quality < 0 || quality > 100)
231 return ERROR_INT("invalid quality", __func__, 1);
232
233 if (plpd) { /* part of multi-page invocation */
234 if (position == L_FIRST_IMAGE)
235 *plpd = NULL;
236 }
237
238 /* Generate the compressed image data. It must NOT
239 * be ascii85 encoded. */
240 pixGenerateCIData(pix, type, quality, 0, &cid);
241 if (!cid)
242 return ERROR_INT("cid not made", __func__, 1);
243
244 /* Get media box in pts. Guess the input image resolution
245 * based on the input parameter %res, the resolution data in
246 * the pix, and the size of the image. */
247 pixres = cid->res;
248 w = cid->w;
249 h = cid->h;
250 if (res <= 0.0)
251 res = (pixres > 0) ? pixres : DefaultInputRes;
252 xpt = x * 72.f / res;
253 ypt = y * 72.f / res;
254 wpt = w * 72.f / res;
255 hpt = h * 72.f / res;
256
257 /* Set up lpd */
258 if (!plpd) { /* single image */
259 if ((lpd = pdfdataCreate(title)) == NULL)
260 return ERROR_INT("lpd not made", __func__, 1);
261 } else if (position == L_FIRST_IMAGE) { /* first of multiple images */
262 if ((lpd = pdfdataCreate(title)) == NULL)
263 return ERROR_INT("lpd not made", __func__, 1);
264 *plpd = lpd;
265 } else { /* not the first of multiple images */
266 lpd = *plpd;
267 }
268
269 /* Add the data to the lpd */
270 ptraAdd(lpd->cida, cid);
271 lpd->n++;
272 ptaAddPt(lpd->xy, xpt, ypt);
273 ptaAddPt(lpd->wh, wpt, hpt);
274
275 /* If a single image or the last of multiple images,
276 * generate the pdf and destroy the lpd */
277 if (!plpd || (position == L_LAST_IMAGE)) {
278 ret = l_generatePdf(pdata, pnbytes, lpd);
279 pdfdataDestroy(&lpd);
280 if (plpd) *plpd = NULL;
281 if (ret)
282 return ERROR_INT("pdf output not made", __func__, 1);
283 }
284
285 return 0;
286 }
287
288
289 /*---------------------------------------------------------------------*
290 * Intermediate function for generating multipage pdf output *
291 *---------------------------------------------------------------------*/
292 /*!
293 * \brief ptraConcatenatePdfToData()
294 *
295 * \param[in] pa_data ptra array of pdf strings, each for a
296 * single-page pdf file
297 * \param[in] sa [optional] string array of pathnames for
298 * input pdf files; can be null
299 * \param[out] pdata concatenated pdf data in memory
300 * \param[out] pnbytes number of bytes in pdf data
301 * \return 0 if OK, 1 on error
302 *
303 * <pre>
304 * Notes:
305 * (1) This only works with leptonica-formatted single-page pdf files.
306 * pdf files generated by other programs will have unpredictable
307 * (and usually bad) results. The requirements for each pdf file:
308 * (a) The Catalog and Info objects are the first two.
309 * (b) Object 3 is Pages
310 * (c) Object 4 is Page
311 * (d) The remaining objects are Contents, XObjects, and ColorSpace
312 * (2) We remove trailers from each page, and append the full trailer
313 * for all pages at the end.
314 * (3) For all but the first file, remove the ID and the first 3
315 * objects (catalog, info, pages), so that each subsequent
316 * file has only objects of these classes:
317 * Page, Contents, XObject, ColorSpace (Indexed RGB).
318 * For those objects, we substitute these refs to objects
319 * in the local file:
320 * Page: Parent(object 3), Contents, XObject(typically multiple)
321 * XObject: [ColorSpace if indexed]
322 * The Pages object on the first page (object 3) has a Kids array
323 * of references to all the Page objects, with a Count equal
324 * to the number of pages. Each Page object refers back to
325 * this parent.
326 * </pre>
327 */
328 l_ok
329 ptraConcatenatePdfToData(L_PTRA *pa_data,
330 SARRAY *sa,
331 l_uint8 **pdata,
332 size_t *pnbytes)
333 {
334 char *fname, *str_pages, *str_trailer;
335 l_uint8 *pdfdata, *data;
336 l_int32 i, j, index, nobj, npages;
337 l_int32 *sizes, *locs;
338 size_t size;
339 L_BYTEA *bas, *bad, *bat1, *bat2;
340 L_DNA *da_locs, *da_sizes, *da_outlocs, *da;
341 L_DNAA *daa_locs; /* object locations on each page */
342 NUMA *na_objs, *napage;
343 NUMAA *naa_objs; /* object mapping numbers to new values */
344
345 if (!pdata)
346 return ERROR_INT("&data not defined", __func__, 1);
347 *pdata = NULL;
348 if (!pnbytes)
349 return ERROR_INT("&nbytes not defined", __func__, 1);
350 *pnbytes = 0;
351 if (!pa_data)
352 return ERROR_INT("pa_data not defined", __func__, 1);
353
354 /* Parse the files and find the object locations.
355 * Remove file data that cannot be parsed. */
356 ptraGetActualCount(pa_data, &npages);
357 daa_locs = l_dnaaCreate(npages);
358 for (i = 0; i < npages; i++) {
359 bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
360 if (parseTrailerPdf(bas, &da_locs) != 0) {
361 bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
362 l_byteaDestroy(&bas);
363 if (sa) {
364 fname = sarrayGetString(sa, i, L_NOCOPY);
365 L_ERROR("can't parse file %s; skipping\n", __func__, fname);
366 } else {
367 L_ERROR("can't parse file %d; skipping\n", __func__, i);
368 }
369 } else {
370 l_dnaaAddDna(daa_locs, da_locs, L_INSERT);
371 }
372 }
373
374 /* Recompute npages in case some of the files were not pdf */
375 ptraCompactArray(pa_data);
376 ptraGetActualCount(pa_data, &npages);
377 if (npages == 0) {
378 l_dnaaDestroy(&daa_locs);
379 return ERROR_INT("no parsable pdf files found", __func__, 1);
380 }
381
382 /* Find the mapping from initial to final object numbers */
383 naa_objs = numaaCreate(npages); /* stores final object numbers */
384 napage = numaCreate(npages); /* stores "Page" object numbers */
385 index = 0;
386 for (i = 0; i < npages; i++) {
387 da = l_dnaaGetDna(daa_locs, i, L_CLONE);
388 nobj = l_dnaGetCount(da);
389 if (i == 0) {
390 numaAddNumber(napage, 4); /* object 4 on first page */
391 na_objs = numaMakeSequence(0.0, 1.0, nobj - 1);
392 index = nobj - 1;
393 } else { /* skip the first 3 objects in each file */
394 numaAddNumber(napage, index); /* Page object is first we add */
395 na_objs = numaMakeConstant(0.0, nobj - 1);
396 numaReplaceNumber(na_objs, 3, 3); /* refers to parent of all */
397 for (j = 4; j < nobj - 1; j++)
398 numaSetValue(na_objs, j, index++);
399 }
400 numaaAddNuma(naa_objs, na_objs, L_INSERT);
401 l_dnaDestroy(&da);
402 }
403
404 /* Make the Pages object (#3) */
405 str_pages = generatePagesObjStringPdf(napage);
406
407 /* Build the output */
408 bad = l_byteaCreate(5000);
409 da_outlocs = l_dnaCreate(0); /* locations of all output objects */
410 for (i = 0; i < npages; i++) {
411 bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
412 pdfdata = l_byteaGetData(bas, &size);
413 da_locs = l_dnaaGetDna(daa_locs, i, L_CLONE); /* locs on this page */
414 na_objs = numaaGetNuma(naa_objs, i, L_CLONE); /* obj # on this page */
415 nobj = l_dnaGetCount(da_locs) - 1;
416 da_sizes = l_dnaDiffAdjValues(da_locs); /* object sizes on this page */
417 sizes = l_dnaGetIArray(da_sizes);
418 locs = l_dnaGetIArray(da_locs);
419 if (i == 0) {
420 l_byteaAppendData(bad, pdfdata, sizes[0]);
421 l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]);
422 l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]);
423 l_byteaAppendString(bad, str_pages);
424 for (j = 0; j < 4; j++)
425 l_dnaAddNumber(da_outlocs, locs[j]);
426 }
427 for (j = 4; j < nobj; j++) {
428 l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
429 bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]);
430 bat2 = substituteObjectNumbers(bat1, na_objs);
431 data = l_byteaGetData(bat2, &size);
432 l_byteaAppendData(bad, data, size);
433 l_byteaDestroy(&bat1);
434 l_byteaDestroy(&bat2);
435 }
436 if (i == npages - 1) /* last one */
437 l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
438 LEPT_FREE(sizes);
439 LEPT_FREE(locs);
440 l_dnaDestroy(&da_locs);
441 numaDestroy(&na_objs);
442 l_dnaDestroy(&da_sizes);
443 }
444
445 /* Add the trailer */
446 str_trailer = makeTrailerStringPdf(da_outlocs);
447 l_byteaAppendString(bad, str_trailer);
448
449 /* Transfer the output data */
450 *pdata = l_byteaCopyData(bad, pnbytes);
451 l_byteaDestroy(&bad);
452
453 #if DEBUG_MULTIPAGE
454 lept_stderr("******** object mapper **********");
455 numaaWriteStream(stderr, naa_objs);
456
457 lept_stderr("******** Page object numbers ***********");
458 numaWriteStderr(napage);
459
460 lept_stderr("******** Pages object ***********\n");
461 lept_stderr("%s\n", str_pages);
462 #endif /* DEBUG_MULTIPAGE */
463
464 numaDestroy(&napage);
465 numaaDestroy(&naa_objs);
466 l_dnaDestroy(&da_outlocs);
467 l_dnaaDestroy(&daa_locs);
468 LEPT_FREE(str_pages);
469 LEPT_FREE(str_trailer);
470 return 0;
471 }
472
473
474 /*---------------------------------------------------------------------*
475 * Convert tiff multipage to pdf file *
476 *---------------------------------------------------------------------*/
477 /*!
478 * \brief convertTiffMultipageToPdf()
479 *
480 * \param[in] filein (tiff)
481 * \param[in] fileout (pdf)
482 * \return 0 if OK, 1 on error
483 *
484 * <pre>
485 * Notes:
486 * (1) A multipage tiff file can also be converted to PS, using
487 * convertTiffMultipageToPS()
488 * </pre>
489 */
490 l_ok
491 convertTiffMultipageToPdf(const char *filein,
492 const char *fileout)
493 {
494 l_int32 istiff;
495 PIXA *pixa;
496 FILE *fp;
497
498 if ((fp = fopenReadStream(filein)) == NULL)
499 return ERROR_INT_1("file not found", filein, __func__, 1);
500 istiff = fileFormatIsTiff(fp);
501 fclose(fp);
502 if (!istiff)
503 return ERROR_INT_1("file not tiff format", filein, __func__, 1);
504
505 pixa = pixaReadMultipageTiff(filein);
506 pixaConvertToPdf(pixa, 0, 1.0, 0, 0, "weasel2", fileout);
507 pixaDestroy(&pixa);
508 return 0;
509 }
510
511
512 /*---------------------------------------------------------------------*
513 * CID-based operations *
514 *---------------------------------------------------------------------*/
515 /*!
516 * \brief l_generateCIDataForPdf()
517 *
518 * \param[in] fname [optional] can be null
519 * \param[in] pix [optional] can be null
520 * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75)
521 * for jp2k if transcoded: 27-45; 0 for default (34)
522 * \param[out] pcid compressed data
523 * \return 0 if OK, 1 on error
524 *
525 * <pre>
526 * Notes:
527 * (1) You must set either filename or pix.
528 * (2) Given an image file and optionally a pix raster of that data,
529 * this provides a CID that is compatible with PDF, preferably
530 * without transcoding.
531 * (3) The pix is included for efficiency, in case transcoding
532 * is required and the pix is available to the caller.
533 * (4) We don't try to open files named "stdin" or "-" for Tesseract
534 * compatibility reasons. We may remove this restriction
535 * in the future.
536 * (5) Note that tiff-g4 must be transcoded to properly handle byte
537 * order and perhaps photometry (e.g., min-is-black). For a
538 * multipage tiff file, data will only be extracted from the
539 * first page, so this should not be invoked.
540 * </pre>
541 */
542 l_ok
543 l_generateCIDataForPdf(const char *fname,
544 PIX *pix,
545 l_int32 quality,
546 L_COMP_DATA **pcid)
547 {
548 l_int32 format, type;
549 L_COMP_DATA *cid;
550 PIX *pixt;
551
552 if (!pcid)
553 return ERROR_INT("&cid not defined", __func__, 1);
554 *pcid = cid = NULL;
555 if (!fname && !pix)
556 return ERROR_INT("neither fname nor pix are defined", __func__, 1);
557
558 /* If a compressed file is given that is not 'stdin', see if we
559 * can generate the pdf output without transcoding. */
560 if (fname && strcmp(fname, "-") != 0 && strcmp(fname, "stdin") != 0) {
561 findFileFormat(fname, &format);
562 if (format == IFF_UNKNOWN)
563 L_WARNING("file %s format is unknown\n", __func__, fname);
564 if (format == IFF_PS || format == IFF_LPDF) {
565 L_ERROR("file %s is unsupported format %d\n",
566 __func__, fname, format);
567 return 1;
568 }
569 if (format == IFF_JFIF_JPEG) {
570 cid = l_generateJpegData(fname, 0);
571 } else if (format == IFF_JP2) {
572 cid = l_generateJp2kData(fname);
573 } else if (format == IFF_PNG) {
574 cid = l_generateFlateDataPdf(fname, pix);
575 }
576 }
577
578 /* Otherwise, use the pix to generate the pdf output */
579 if (!cid) {
580 if (!pix)
581 pixt = pixRead(fname);
582 else
583 pixt = pixClone(pix);
584 if (!pixt)
585 return ERROR_INT("pixt not made", __func__, 1);
586 if (selectDefaultPdfEncoding(pixt, &type)) {
587 pixDestroy(&pixt);
588 return 1;
589 }
590 pixGenerateCIData(pixt, type, quality, 0, &cid);
591 pixDestroy(&pixt);
592 if (!cid)
593 return ERROR_INT("cid not made from pix", __func__, 1);
594 }
595 *pcid = cid;
596 return 0;
597 }
598
599
600 /*!
601 * \brief l_generateCIData()
602 *
603 * \param[in] fname
604 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
605 * L_JP2K_ENCODE
606 * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75)
607 * for jp2k if transcoded: 27-45; 0 for default (34)
608 * \param[in] ascii85 0 for binary; 1 for ascii85-encoded
609 * \param[out] pcid compressed data
610 * \return 0 if OK, 1 on error
611 *
612 * <pre>
613 * Notes:
614 * (1) This can be used for both PostScript and pdf.
615 * (1) Set ascii85:
616 * ~ 0 for binary data (PDF only)
617 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
618 * (2) This attempts to compress according to the requested type.
619 * If this can't be done, it falls back to ordinary flate encoding.
620 * (3) This differs from l_generateCIDataForPdf(), which determines
621 * the file format and only works for pdf.
622 * </pre>
623 */
624 l_ok
625 l_generateCIData(const char *fname,
626 l_int32 type,
627 l_int32 quality,
628 l_int32 ascii85,
629 L_COMP_DATA **pcid)
630 {
631 l_int32 format, d, bps, spp, iscmap;
632 L_COMP_DATA *cid;
633 PIX *pix;
634
635 if (!pcid)
636 return ERROR_INT("&cid not defined", __func__, 1);
637 *pcid = NULL;
638 if (!fname)
639 return ERROR_INT("fname not defined", __func__, 1);
640 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
641 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE)
642 return ERROR_INT("invalid conversion type", __func__, 1);
643 if (ascii85 != 0 && ascii85 != 1)
644 return ERROR_INT("invalid ascii85", __func__, 1);
645
646 /* Sanity check on requested encoding */
647 pixReadHeader(fname, &format, NULL, NULL, &bps, &spp, &iscmap);
648 d = bps * spp;
649 if (d == 24) d = 32;
650 if (iscmap && type != L_FLATE_ENCODE) {
651 L_WARNING("pixs has cmap; using flate encoding\n", __func__);
652 type = L_FLATE_ENCODE;
653 } else if (d < 8 && type == L_JPEG_ENCODE) {
654 L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
655 type = L_FLATE_ENCODE;
656 } else if (d < 8 && type == L_JP2K_ENCODE) {
657 L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
658 type = L_FLATE_ENCODE;
659 } else if (d > 1 && type == L_G4_ENCODE) {
660 L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__);
661 type = L_FLATE_ENCODE;
662 }
663
664 if (type == L_JPEG_ENCODE) {
665 if (format == IFF_JFIF_JPEG) { /* do not transcode */
666 cid = l_generateJpegData(fname, ascii85);
667 } else {
668 if ((pix = pixRead(fname)) == NULL)
669 return ERROR_INT("pix not returned for JPEG", __func__, 1);
670 cid = pixGenerateJpegData(pix, ascii85, quality);
671 pixDestroy(&pix);
672 }
673 if (!cid)
674 return ERROR_INT("jpeg data not made", __func__, 1);
675 } else if (type == L_JP2K_ENCODE) {
676 if (format == IFF_JP2) { /* do not transcode */
677 cid = l_generateJp2kData(fname);
678 } else {
679 if ((pix = pixRead(fname)) == NULL)
680 return ERROR_INT("pix not returned for JP2K", __func__, 1);
681 cid = pixGenerateJp2kData(pix, quality);
682 pixDestroy(&pix);
683 }
684 if (!cid)
685 return ERROR_INT("jp2k data not made", __func__, 1);
686 } else if (type == L_G4_ENCODE) {
687 if ((pix = pixRead(fname)) == NULL)
688 return ERROR_INT("pix not returned for G4", __func__, 1);
689 cid = pixGenerateG4Data(pix, ascii85);
690 pixDestroy(&pix);
691 if (!cid)
692 return ERROR_INT("g4 data not made", __func__, 1);
693 } else if (type == L_FLATE_ENCODE) {
694 if ((cid = l_generateFlateData(fname, ascii85)) == NULL)
695 return ERROR_INT("flate data not made", __func__, 1);
696 } else {
697 return ERROR_INT("invalid conversion type", __func__, 1);
698 }
699 *pcid = cid;
700
701 return 0;
702 }
703
704
705 /*---------------------------------------------------------------------*
706 * Low-level CID-based operations *
707 *---------------------------------------------------------------------*/
708 /*!
709 * \brief l_generateFlateDataPdf()
710 *
711 * \param[in] fname preferably png
712 * \param[in] pixs [optional] can be null
713 * \return cid containing png data, or NULL on error
714 *
715 * <pre>
716 * Notes:
717 * (1) If you hand this a png file, you are going to get
718 * png predictors embedded in the flate data. So it has
719 * come to this. http://xkcd.com/1022/
720 * (2) Exception: if the png is interlaced or if it is RGBA,
721 * it will be transcoded.
722 * (3) If transcoding is required, this will not have to read from
723 * file if a pix is input.
724 * </pre>
725 */
726 L_COMP_DATA *
727 l_generateFlateDataPdf(const char *fname,
728 PIX *pixs)
729 {
730 l_uint8 *pngcomp = NULL; /* entire PNG compressed file */
731 l_uint8 *datacomp = NULL; /* gzipped raster data */
732 l_uint8 *cmapdata = NULL; /* uncompressed colormap */
733 char *cmapdatahex = NULL; /* hex ascii uncompressed colormap */
734 l_uint32 i, j, n;
735 l_int32 format, interlaced;
736 l_int32 ncolors; /* in colormap */
737 l_int32 bps; /* bits/sample: usually 8 */
738 l_int32 spp; /* samples/pixel: 1-grayscale/cmap); 3-rgb; 4-rgba */
739 l_int32 w, h, cmapflag;
740 l_int32 xres, yres;
741 size_t nbytescomp = 0, nbytespng = 0;
742 FILE *fp;
743 L_COMP_DATA *cid;
744 PIX *pix;
745 PIXCMAP *cmap = NULL;
746
747 if (!fname)
748 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
749
750 findFileFormat(fname, &format);
751 spp = 0; /* init to spp != 4 if not png */
752 interlaced = 0; /* initialize to no interlacing */
753 bps = 0; /* initialize to a nonsense value */
754 if (format == IFF_PNG) {
755 isPngInterlaced(fname, &interlaced);
756 if (readHeaderPng(fname, NULL, NULL, &bps, &spp, NULL))
757 return (L_COMP_DATA *)ERROR_PTR("bad png input", __func__, NULL);
758 }
759
760 /* PDF is capable of inlining some types of PNG files, but not all
761 of them. We need to transcode anything with interlacing, an
762 alpha channel, or 1 bpp (which would otherwise be photo-inverted).
763
764 Note: any PNG image file with an alpha channel is converted on
765 reading to RGBA (spp == 4). This includes the (gray + alpha) format
766 with spp == 2. Because of the conversion, readHeaderPng() gives
767 spp = 2, whereas pixGetSpp() gives spp = 4 on the converted pix. */
768 if (format != IFF_PNG ||
769 (format == IFF_PNG && (interlaced || bps == 1 || spp == 4 || spp == 2)))
770 { /* lgtm+ analyzer needed the logic expanded */
771 if (!pixs)
772 pix = pixRead(fname);
773 else
774 pix = pixClone(pixs);
775 if (!pix)
776 return (L_COMP_DATA *)ERROR_PTR("pix not made", __func__, NULL);
777 cid = pixGenerateFlateData(pix, 0);
778 pixDestroy(&pix);
779 return cid;
780 }
781
782 /* It's png. Generate the pdf data without transcoding.
783 * Implementation by Jeff Breidenbach.
784 * First, read the metadata */
785 if ((fp = fopenReadStream(fname)) == NULL)
786 return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
787 fname, __func__, NULL);
788 freadHeaderPng(fp, &w, &h, &bps, &spp, &cmapflag);
789 fgetPngResolution(fp, &xres, &yres);
790 fclose(fp);
791
792 /* We get pdf corruption when inlining the data from 16 bpp png. */
793 if (bps == 16)
794 return l_generateFlateData(fname, 0);
795
796 /* Read the entire png file */
797 if ((pngcomp = l_binaryRead(fname, &nbytespng)) == NULL)
798 return (L_COMP_DATA *)ERROR_PTR_1("unable to read file",
799 fname, __func__, NULL);
800
801 /* Extract flate data, copying portions of it to memory, including
802 * the predictor information in a byte at the beginning of each
803 * raster line. The flate data makes up the vast majority of
804 * the png file, so after extraction we expect datacomp to
805 * be nearly full (i.e., nbytescomp will be only slightly less
806 * than nbytespng). Also extract the colormap if present. */
807 if ((datacomp = (l_uint8 *)LEPT_CALLOC(1, nbytespng)) == NULL) {
808 LEPT_FREE(pngcomp);
809 return (L_COMP_DATA *)ERROR_PTR("unable to allocate memory",
810 __func__, NULL);
811 }
812
813 /* Parse the png file. Each chunk consists of:
814 * length: 4 bytes
815 * name: 4 bytes (e.g., "IDAT")
816 * data: n bytes
817 * CRC: 4 bytes
818 * Start at the beginning of the data section of the first chunk,
819 * byte 16, because the png file begins with 8 bytes of header,
820 * followed by the first 8 bytes of the first chunk
821 * (length and name). On each loop, increment by 12 bytes to
822 * skip over the CRC, length and name of the next chunk. */
823 for (i = 16; i < nbytespng; i += 12) { /* do each successive chunk */
824 /* Get the chunk length */
825 n = pngcomp[i - 8] << 24;
826 n += pngcomp[i - 7] << 16;
827 n += pngcomp[i - 6] << 8;
828 n += pngcomp[i - 5] << 0;
829 if (n >= nbytespng - i) { /* "n + i" can overflow */
830 LEPT_FREE(pngcomp);
831 LEPT_FREE(datacomp);
832 pixcmapDestroy(&cmap);
833 L_ERROR("invalid png: i = %d, n = %d, nbytes = %zu\n", __func__,
834 i, n, nbytespng);
835 return NULL;
836 }
837
838 /* Is it a data chunk? */
839 if (memcmp(pngcomp + i - 4, "IDAT", 4) == 0) {
840 memcpy(datacomp + nbytescomp, pngcomp + i, n);
841 nbytescomp += n;
842 }
843
844 /* Is it a palette chunk? */
845 if (cmapflag && !cmap &&
846 memcmp(pngcomp + i - 4, "PLTE", 4) == 0) {
847 if ((n / 3) > (1 << bps)) {
848 LEPT_FREE(pngcomp);
849 LEPT_FREE(datacomp);
850 pixcmapDestroy(&cmap);
851 L_ERROR("invalid png: i = %d, n = %d, cmapsize = %d\n",
852 __func__, i, n, (1 << bps));
853 return NULL;
854 }
855 cmap = pixcmapCreate(bps);
856 for (j = i; j < i + n; j += 3) {
857 pixcmapAddColor(cmap, pngcomp[j], pngcomp[j + 1],
858 pngcomp[j + 2]);
859 }
860 }
861 i += n; /* move to the end of the data chunk */
862 }
863 LEPT_FREE(pngcomp);
864
865 if (nbytescomp == 0) {
866 LEPT_FREE(datacomp);
867 pixcmapDestroy(&cmap);
868 return (L_COMP_DATA *)ERROR_PTR("invalid PNG file", __func__, NULL);
869 }
870
871 /* Extract and encode the colormap data as hexascii */
872 ncolors = 0;
873 if (cmap) {
874 pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
875 pixcmapDestroy(&cmap);
876 if (!cmapdata) {
877 LEPT_FREE(datacomp);
878 return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
879 __func__, NULL);
880 }
881 cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
882 LEPT_FREE(cmapdata);
883 }
884
885 /* Note that this is the only situation where the predictor
886 * field of the CID is set to 1. Adobe's predictor values on
887 * p. 76 of pdf_reference_1-7.pdf give 1 for no predictor and
888 * 10-14 for inline predictors, the specifics of which are
889 * ignored by the pdf interpreter, which just needs to know that
890 * the first byte on each compressed scanline is some predictor
891 * whose type can be inferred from the byte itself. */
892 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
893 cid->datacomp = datacomp;
894 cid->type = L_FLATE_ENCODE;
895 cid->cmapdatahex = cmapdatahex;
896 cid->nbytescomp = nbytescomp;
897 cid->ncolors = ncolors;
898 cid->predictor = TRUE;
899 cid->w = w;
900 cid->h = h;
901 cid->bps = bps;
902 cid->spp = spp;
903 cid->res = xres;
904 return cid;
905 }
906
907
908 /*!
909 * \brief l_generateJpegData()
910 *
911 * \param[in] fname of jpeg file
912 * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg
913 * \return cid containing jpeg data, or NULL on error
914 *
915 * <pre>
916 * Notes:
917 * (1) Set ascii85flag:
918 * ~ 0 for binary data (PDF only)
919 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
920 * (2) Most of this function is repeated in l_generateJpegMemData(),
921 * which is required in pixacompFastConvertToPdfData().
922 * </pre>
923 */
924 L_COMP_DATA *
925 l_generateJpegData(const char *fname,
926 l_int32 ascii85flag)
927 {
928 char *data85 = NULL; /* ascii85 encoded jpeg compressed file */
929 l_uint8 *data = NULL;
930 l_int32 w, h, xres, yres, bps, spp;
931 size_t nbytes, nbytes85;
932 L_COMP_DATA *cid;
933 FILE *fp;
934
935 if (!fname)
936 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
937
938 if (ascii85flag != 0 && ascii85flag != 1)
939 return (L_COMP_DATA *)ERROR_PTR("wrong ascii85flags", __func__, NULL);
940
941 /* Read the metadata */
942 if (readHeaderJpeg(fname, &w, &h, &spp, NULL, NULL))
943 return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL);
944 bps = 8;
945 if ((fp = fopenReadStream(fname)) == NULL)
946 return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
947 fname, __func__, NULL);
948 fgetJpegResolution(fp, &xres, &yres);
949 fclose(fp);
950
951 /* Read the entire jpeg file. The returned jpeg data in memory
952 * starts with ffd8 and ends with ffd9 */
953 if ((data = l_binaryRead(fname, &nbytes)) == NULL)
954 return (L_COMP_DATA *)ERROR_PTR_1("data not extracted",
955 fname, __func__, NULL);
956
957 /* Optionally, encode the compressed data */
958 if (ascii85flag == 1) {
959 data85 = encodeAscii85(data, nbytes, &nbytes85);
960 LEPT_FREE(data);
961 if (!data85)
962 return (L_COMP_DATA *)ERROR_PTR_1("data85 not made",
963 fname, __func__, NULL);
964 else
965 data85[nbytes85 - 1] = '\0'; /* remove the newline */
966 }
967
968 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
969 if (ascii85flag == 0) {
970 cid->datacomp = data;
971 } else { /* ascii85 */
972 cid->data85 = data85;
973 cid->nbytes85 = nbytes85;
974 }
975 cid->type = L_JPEG_ENCODE;
976 cid->nbytescomp = nbytes;
977 cid->w = w;
978 cid->h = h;
979 cid->bps = bps;
980 cid->spp = spp;
981 cid->res = xres;
982 return cid;
983 }
984
985
986 /*!
987 * \brief l_generateJpegDataMem()
988 *
989 * \param[in] data of jpeg-encoded file
990 * \param[in] nbytes size of jpeg-encoded file
991 * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg
992 * \return cid containing jpeg data, or NULL on error
993 *
994 * <pre>
995 * Notes:
996 * (1) Set ascii85flag:
997 * ~ 0 for binary data (PDF only)
998 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
999 * </pre>
1000 */
1001 L_COMP_DATA *
1002 l_generateJpegDataMem(l_uint8 *data,
1003 size_t nbytes,
1004 l_int32 ascii85flag)
1005 {
1006 char *data85 = NULL; /* ascii85 encoded jpeg compressed file */
1007 l_int32 w, h, xres, yres, bps, spp;
1008 size_t nbytes85;
1009 L_COMP_DATA *cid;
1010
1011 if (!data)
1012 return (L_COMP_DATA *)ERROR_PTR("data not defined", __func__, NULL);
1013
1014 /* Read the metadata */
1015 if (readHeaderMemJpeg(data, nbytes, &w, &h, &spp, NULL, NULL)) {
1016 LEPT_FREE(data);
1017 return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL);
1018 }
1019 bps = 8;
1020 readResolutionMemJpeg(data, nbytes, &xres, &yres);
1021
1022 /* Optionally, encode the compressed data */
1023 if (ascii85flag == 1) {
1024 data85 = encodeAscii85(data, nbytes, &nbytes85);
1025 LEPT_FREE(data);
1026 if (!data85)
1027 return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
1028 else
1029 data85[nbytes85 - 1] = '\0'; /* remove the newline */
1030 }
1031
1032 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1033 if (ascii85flag == 0) {
1034 cid->datacomp = data;
1035 } else { /* ascii85 */
1036 cid->data85 = data85;
1037 cid->nbytes85 = nbytes85;
1038 }
1039 cid->type = L_JPEG_ENCODE;
1040 cid->nbytescomp = nbytes;
1041 cid->w = w;
1042 cid->h = h;
1043 cid->bps = bps;
1044 cid->spp = spp;
1045 cid->res = xres;
1046 return cid;
1047 }
1048
1049
1050 /*!
1051 * \brief l_generateJp2kData()
1052 *
1053 * \param[in] fname of jp2k file
1054 * \return cid containing jp2k data, or NULL on error
1055 *
1056 * <pre>
1057 * Notes:
1058 * (1) This is only called after the file is verified to be jp2k.
1059 * </pre>
1060 */
1061 static L_COMP_DATA *
1062 l_generateJp2kData(const char *fname)
1063 {
1064 l_int32 w, h, bps, spp, xres, yres;
1065 size_t nbytes;
1066 L_COMP_DATA *cid;
1067 FILE *fp;
1068
1069 if (!fname)
1070 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1071
1072 if (readHeaderJp2k(fname, &w, &h, &bps, &spp, NULL))
1073 return (L_COMP_DATA *)ERROR_PTR("bad jp2k metadata", __func__, NULL);
1074
1075 /* The returned jp2k data in memory is the entire jp2k file */
1076 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1077 if ((cid->datacomp = l_binaryRead(fname, &nbytes)) == NULL) {
1078 l_CIDataDestroy(&cid);
1079 return (L_COMP_DATA *)ERROR_PTR("data not extracted", __func__, NULL);
1080 }
1081
1082 xres = yres = 0;
1083 if ((fp = fopenReadStream(fname)) != NULL) {
1084 fgetJp2kResolution(fp, &xres, &yres);
1085 fclose(fp);
1086 }
1087 cid->type = L_JP2K_ENCODE;
1088 cid->nbytescomp = nbytes;
1089 cid->w = w;
1090 cid->h = h;
1091 cid->bps = bps;
1092 cid->spp = spp;
1093 cid->res = xres;
1094 return cid;
1095 }
1096
1097
1098 /*!
1099 * \brief l_generateG4Data()
1100 *
1101 * \param[in] fname of g4 compressed file
1102 * \param[in] ascii85flag 0 for g4 compressed; 1 for ascii85-encoded g4
1103 * \return cid g4 compressed image data, or NULL on error
1104 *
1105 * <pre>
1106 * Notes:
1107 * (1) Set ascii85flag:
1108 * ~ 0 for binary data (PDF only)
1109 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1110 * (2) This does not work for multipage tiff files.
1111 * </pre>
1112 */
1113 L_COMP_DATA *
1114 l_generateG4Data(const char *fname,
1115 l_int32 ascii85flag)
1116 {
1117 l_uint8 *datacomp = NULL; /* g4 compressed raster data */
1118 char *data85 = NULL; /* ascii85 encoded g4 compressed data */
1119 l_int32 w, h, xres, yres, npages;
1120 l_int32 minisblack; /* TRUE or FALSE */
1121 size_t nbytes85, nbytescomp;
1122 L_COMP_DATA *cid;
1123 FILE *fp;
1124
1125 if (!fname)
1126 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1127
1128 /* Make sure this is a single page tiff file */
1129 if ((fp = fopenReadStream(fname)) == NULL)
1130 return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
1131 fname, __func__, NULL);
1132 tiffGetCount(fp, &npages);
1133 fclose(fp);
1134 if (npages != 1) {
1135 L_ERROR(" %d page tiff; only works with 1 page (file: %s)\n", __func__, npages, fname);
1136 return NULL;
1137 }
1138
1139 /* Read the resolution */
1140 if ((fp = fopenReadStream(fname)) == NULL)
1141 return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
1142 fname, __func__, NULL);
1143 getTiffResolution(fp, &xres, &yres);
1144 fclose(fp);
1145
1146 /* The returned ccitt g4 data in memory is the block of
1147 * bytes in the tiff file, starting after 8 bytes and
1148 * ending before the directory. */
1149 if (extractG4DataFromFile(fname, &datacomp, &nbytescomp,
1150 &w, &h, &minisblack)) {
1151 return (L_COMP_DATA *)ERROR_PTR_1("datacomp not extracted",
1152 fname, __func__, NULL);
1153 }
1154
1155 /* Optionally, encode the compressed data */
1156 if (ascii85flag == 1) {
1157 data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
1158 LEPT_FREE(datacomp);
1159 if (!data85)
1160 return (L_COMP_DATA *)ERROR_PTR_1("data85 not made",
1161 fname, __func__, NULL);
1162 else
1163 data85[nbytes85 - 1] = '\0'; /* remove the newline */
1164 }
1165
1166 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1167 if (ascii85flag == 0) {
1168 cid->datacomp = datacomp;
1169 } else { /* ascii85 */
1170 cid->data85 = data85;
1171 cid->nbytes85 = nbytes85;
1172 }
1173 cid->type = L_G4_ENCODE;
1174 cid->nbytescomp = nbytescomp;
1175 cid->w = w;
1176 cid->h = h;
1177 cid->bps = 1;
1178 cid->spp = 1;
1179 cid->minisblack = minisblack;
1180 cid->res = xres;
1181 return cid;
1182 }
1183
1184
1185 /*!
1186 * \brief pixGenerateCIData()
1187 *
1188 * \param[in] pixs 8 or 32 bpp, no colormap
1189 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE or
1190 * L_JP2K_ENCODE
1191 * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75)
1192 * for jp2k if transcoded: 27-45; 0 for default (34)
1193 * \param[in] ascii85 0 for binary; 1 for ascii85-encoded
1194 * \param[out] pcid compressed data
1195 * \return 0 if OK, 1 on error
1196 *
1197 * <pre>
1198 * Notes:
1199 * (1) Set ascii85:
1200 * ~ 0 for binary data (PDF only)
1201 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1202 * (2) Do not accept images with an asperity ratio greater than 10.
1203 * </pre>
1204 */
1205 l_ok
1206 pixGenerateCIData(PIX *pixs,
1207 l_int32 type,
1208 l_int32 quality,
1209 l_int32 ascii85,
1210 L_COMP_DATA **pcid)
1211 {
1212 l_int32 w, h, d, maxAsp;
1213 PIXCMAP *cmap;
1214
1215 if (!pcid)
1216 return ERROR_INT("&cid not defined", __func__, 1);
1217 *pcid = NULL;
1218 if (!pixs)
1219 return ERROR_INT("pixs not defined", __func__, 1);
1220 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1221 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
1222 selectDefaultPdfEncoding(pixs, &type);
1223 }
1224 if (ascii85 != 0 && ascii85 != 1)
1225 return ERROR_INT("invalid ascii85", __func__, 1);
1226 pixGetDimensions(pixs, &w, &h, NULL);
1227 if (w == 0 || h == 0)
1228 return ERROR_INT("invalid w or h", __func__, 1);
1229 maxAsp = L_MAX(w / h, h / w);
1230 if (maxAsp > 10)
1231 return ERROR_INT("max asperity > 10", __func__, 1);
1232
1233 /* Conditionally modify the encoding type if libz is
1234 * available and the requested library is missing. */
1235 #if defined(HAVE_LIBZ)
1236 # if !defined(HAVE_LIBJPEG)
1237 if (type == L_JPEG_ENCODE) {
1238 L_WARNING("no libjpeg; using flate encoding\n", __func__);
1239 type = L_FLATE_ENCODE;
1240 }
1241 # endif /* !defined(HAVE_LIBJPEG) */
1242 # if !defined(HAVE_LIBJP2K)
1243 if (type == L_JP2K_ENCODE) {
1244 L_WARNING("no libjp2k; using flate encoding\n", __func__);
1245 type = L_FLATE_ENCODE;
1246 }
1247 # endif /* !defined(HAVE_LIBJP2K) */
1248 # if !defined(HAVE_LIBTIFF)
1249 if (type == L_G4_ENCODE) {
1250 L_WARNING("no libtiff; using flate encoding\n", __func__);
1251 type = L_FLATE_ENCODE;
1252 }
1253 # endif /* !defined(HAVE_LIBTIFF) */
1254 #endif /* defined(HAVE_LIBZ) */
1255
1256 /* Sanity check on requested encoding */
1257 d = pixGetDepth(pixs);
1258 cmap = pixGetColormap(pixs);
1259 if (cmap && type != L_FLATE_ENCODE) {
1260 L_WARNING("pixs has cmap; using flate encoding\n", __func__);
1261 type = L_FLATE_ENCODE;
1262 } else if (d < 8 && (type == L_JPEG_ENCODE || type == L_JP2K_ENCODE)) {
1263 L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
1264 type = L_FLATE_ENCODE;
1265 } else if (d > 1 && type == L_G4_ENCODE) {
1266 L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__);
1267 type = L_FLATE_ENCODE;
1268 }
1269
1270 if (type == L_JPEG_ENCODE) {
1271 if ((*pcid = pixGenerateJpegData(pixs, ascii85, quality)) == NULL)
1272 return ERROR_INT("jpeg data not made", __func__, 1);
1273 } else if (type == L_JP2K_ENCODE) {
1274 if ((*pcid = pixGenerateJp2kData(pixs, quality)) == NULL)
1275 return ERROR_INT("jp2k data not made", __func__, 1);
1276 } else if (type == L_G4_ENCODE) {
1277 if ((*pcid = pixGenerateG4Data(pixs, ascii85)) == NULL)
1278 return ERROR_INT("g4 data not made", __func__, 1);
1279 } else { /* type == L_FLATE_ENCODE */
1280 if ((*pcid = pixGenerateFlateData(pixs, ascii85)) == NULL)
1281 return ERROR_INT("flate data not made", __func__, 1);
1282 }
1283 return 0;
1284 }
1285
1286
1287 /*!
1288 * \brief l_generateFlateData()
1289 *
1290 * \param[in] fname
1291 * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped
1292 * \return cid flate compressed image data, or NULL on error
1293 *
1294 * <pre>
1295 * Notes:
1296 * (1) The input image is converted to one of these 4 types:
1297 * ~ 1 bpp
1298 * ~ 8 bpp, no colormap
1299 * ~ 8 bpp, colormap
1300 * ~ 32 bpp rgb
1301 * (2) Set ascii85flag:
1302 * ~ 0 for binary data (PDF only)
1303 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1304 * (3) Always transcodes (i.e., first decodes the png file)
1305 * </pre>
1306 */
1307 L_COMP_DATA *
1308 l_generateFlateData(const char *fname,
1309 l_int32 ascii85flag)
1310 {
1311 L_COMP_DATA *cid;
1312 PIX *pixs;
1313
1314 if (!fname)
1315 return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1316
1317 if ((pixs = pixRead(fname)) == NULL)
1318 return (L_COMP_DATA *)ERROR_PTR("pixs not made", __func__, NULL);
1319 cid = pixGenerateFlateData(pixs, ascii85flag);
1320 pixDestroy(&pixs);
1321 return cid;
1322 }
1323
1324
1325 /*!
1326 * \brief pixGenerateFlateData()
1327 *
1328 * \param[in] pixs
1329 * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped
1330 * \return cid flate compressed image data, or NULL on error
1331 *
1332 * <pre>
1333 * Notes:
1334 * (1) If called with an RGBA pix (spp == 4), the alpha channel
1335 * will be removed, projecting a white backgrouond through
1336 * any transparency.
1337 * (2) If called with a colormapped pix, any transparency in the
1338 * alpha component in the colormap will be ignored, as it is
1339 * for all leptonica operations on colormapped pix.
1340 * </pre>
1341 */
1342 static L_COMP_DATA *
1343 pixGenerateFlateData(PIX *pixs,
1344 l_int32 ascii85flag)
1345 {
1346 l_uint8 *data = NULL; /* uncompressed raster data in required format */
1347 l_uint8 *datacomp = NULL; /* gzipped raster data */
1348 char *data85 = NULL; /* ascii85 encoded gzipped raster data */
1349 l_uint8 *cmapdata = NULL; /* uncompressed colormap */
1350 char *cmapdata85 = NULL; /* ascii85 encoded uncompressed colormap */
1351 char *cmapdatahex = NULL; /* hex ascii uncompressed colormap */
1352 l_int32 ncolors; /* in colormap; not used if cmapdata85 is null */
1353 l_int32 bps; /* bits/sample: usually 8 */
1354 l_int32 spp; /* samples/pixel: 1-grayscale/cmap); 3-rgb */
1355 l_int32 w, h, d, cmapflag;
1356 size_t ncmapbytes85 = 0;
1357 size_t nbytes85 = 0;
1358 size_t nbytes, nbytescomp;
1359 L_COMP_DATA *cid;
1360 PIX *pixt;
1361 PIXCMAP *cmap;
1362
1363 if (!pixs)
1364 return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1365
1366 /* Convert the image to one of these 4 types:
1367 * 1 bpp
1368 * 8 bpp, no colormap
1369 * 8 bpp, colormap
1370 * 32 bpp rgb */
1371 pixGetDimensions(pixs, &w, &h, &d);
1372 cmap = pixGetColormap(pixs);
1373 cmapflag = (cmap) ? 1 : 0;
1374 if (d == 2 || d == 4 || d == 16) {
1375 pixt = pixConvertTo8(pixs, cmapflag);
1376 cmap = pixGetColormap(pixt);
1377 d = pixGetDepth(pixt);
1378 } else if (d == 32 && pixGetSpp(pixs) == 4) { /* remove alpha */
1379 pixt = pixAlphaBlendUniform(pixs, 0xffffff00);
1380 } else {
1381 pixt = pixClone(pixs);
1382 }
1383 if (!pixt)
1384 return (L_COMP_DATA *)ERROR_PTR("pixt not made", __func__, NULL);
1385 spp = (d == 32) ? 3 : 1;
1386 bps = (d == 32) ? 8 : d;
1387
1388 /* Extract and encode the colormap data as both ascii85 and hexascii */
1389 ncolors = 0;
1390 if (cmap) {
1391 pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
1392 if (!cmapdata) {
1393 pixDestroy(&pixt);
1394 return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
1395 __func__, NULL);
1396 }
1397
1398 cmapdata85 = encodeAscii85(cmapdata, 3 * ncolors, &ncmapbytes85);
1399 cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
1400 LEPT_FREE(cmapdata);
1401 }
1402
1403 /* Extract and compress the raster data */
1404 pixGetRasterData(pixt, &data, &nbytes);
1405 pixDestroy(&pixt);
1406 if (!data) {
1407 LEPT_FREE(cmapdata85);
1408 LEPT_FREE(cmapdatahex);
1409 return (L_COMP_DATA *)ERROR_PTR("data not returned", __func__, NULL);
1410 }
1411 datacomp = zlibCompress(data, nbytes, &nbytescomp);
1412 LEPT_FREE(data);
1413 if (!datacomp) {
1414 LEPT_FREE(cmapdata85);
1415 LEPT_FREE(cmapdatahex);
1416 return (L_COMP_DATA *)ERROR_PTR("datacomp not made", __func__, NULL);
1417 }
1418
1419 /* Optionally, encode the compressed data */
1420 if (ascii85flag == 1) {
1421 data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
1422 LEPT_FREE(datacomp);
1423 if (!data85) {
1424 LEPT_FREE(cmapdata85);
1425 LEPT_FREE(cmapdatahex);
1426 return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
1427 } else {
1428 data85[nbytes85 - 1] = '\0'; /* remove the newline */
1429 }
1430 }
1431
1432 cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1433 if (ascii85flag == 0) {
1434 cid->datacomp = datacomp;
1435 } else { /* ascii85 */
1436 cid->data85 = data85;
1437 cid->nbytes85 = nbytes85;
1438 }
1439 cid->type = L_FLATE_ENCODE;
1440 cid->cmapdatahex = cmapdatahex;
1441 cid->cmapdata85 = cmapdata85;
1442 cid->nbytescomp = nbytescomp;
1443 cid->ncolors = ncolors;
1444 cid->w = w;
1445 cid->h = h;
1446 cid->bps = bps;
1447 cid->spp = spp;
1448 cid->res = pixGetXRes(pixs);
1449 cid->nbytes = nbytes; /* only for debugging */
1450 return cid;
1451 }
1452
1453
1454 /*!
1455 * \brief pixGenerateJpegData()
1456 *
1457 * \param[in] pixs 8, 16 or 32 bpp, no colormap
1458 * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg
1459 * \param[in] quality 0 for default, which is 75
1460 * \return cid jpeg compressed data, or NULL on error
1461 *
1462 * <pre>
1463 * Notes:
1464 * (1) Set ascii85flag:
1465 * ~ 0 for binary data (PDF only)
1466 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1467 * (2) If 16 bpp, convert first to 8 bpp, using the MSB
1468 * </pre>
1469 */
1470 static L_COMP_DATA *
1471 pixGenerateJpegData(PIX *pixs,
1472 l_int32 ascii85flag,
1473 l_int32 quality)
1474 {
1475 l_int32 d;
1476 char *fname;
1477 L_COMP_DATA *cid;
1478
1479 if (!pixs)
1480 return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1481 if (pixGetColormap(pixs))
1482 return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1483 d = pixGetDepth(pixs);
1484 if (d != 8 && d != 16 && d != 32)
1485 return (L_COMP_DATA *)ERROR_PTR("pixs not 8, 16 or 32 bpp",
1486 __func__, NULL);
1487
1488 /* Compress to a temp jpeg file */
1489 fname = l_makeTempFilename();
1490 if (pixWriteJpeg(fname, pixs, quality, 0)) {
1491 LEPT_FREE(fname);
1492 return NULL;
1493 }
1494
1495 /* Generate the data */
1496 cid = l_generateJpegData(fname, ascii85flag);
1497 if (lept_rmfile(fname) != 0)
1498 L_ERROR("temp file %s was not deleted\n", __func__, fname);
1499 LEPT_FREE(fname);
1500 return cid;
1501 }
1502
1503
1504 /*!
1505 * \brief pixGenerateJp2kData()
1506 *
1507 * \param[in] pixs 8 or 32 bpp, no colormap
1508 * \param[in] quality 0 for default, which is 34
1509 * \return cid jp2k compressed data, or NULL on error
1510 *
1511 * <pre>
1512 * Notes:
1513 * (1) The quality can be set between 27 (very poor) and 45
1514 * (nearly perfect). Use 0 for default (34). Use 100 for lossless,
1515 * but this is very expensive and not recommended.
1516 * </pre>
1517 */
1518 static L_COMP_DATA *
1519 pixGenerateJp2kData(PIX *pixs,
1520 l_int32 quality)
1521 {
1522 l_int32 d;
1523 char *fname;
1524 L_COMP_DATA *cid;
1525
1526 if (!pixs)
1527 return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1528 if (pixGetColormap(pixs))
1529 return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1530 d = pixGetDepth(pixs);
1531 if (d != 8 && d != 32)
1532 return (L_COMP_DATA *)ERROR_PTR("pixs not 8 or 32 bpp", __func__, NULL);
1533
1534 /* Compress to a temp jp2k file */
1535 fname = l_makeTempFilename();
1536 if (pixWriteJp2k(fname, pixs, quality, 5, 0, 0)) {
1537 LEPT_FREE(fname);
1538 return NULL;
1539 }
1540
1541 /* Generate the data */
1542 cid = l_generateJp2kData(fname);
1543 if (lept_rmfile(fname) != 0)
1544 L_ERROR("temp file %s was not deleted\n", __func__, fname);
1545 LEPT_FREE(fname);
1546 return cid;
1547 }
1548
1549
1550 /*!
1551 * \brief pixGenerateG4Data()
1552 *
1553 * \param[in] pixs 1 bpp, no colormap
1554 * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped
1555 * \return cid g4 compressed image data, or NULL on error
1556 *
1557 * <pre>
1558 * Notes:
1559 * (1) Set ascii85flag:
1560 * ~ 0 for binary data (PDF only)
1561 * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1562 * </pre>
1563 */
1564 static L_COMP_DATA *
1565 pixGenerateG4Data(PIX *pixs,
1566 l_int32 ascii85flag)
1567 {
1568 char *fname;
1569 L_COMP_DATA *cid;
1570
1571 if (!pixs)
1572 return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1573 if (pixGetDepth(pixs) != 1)
1574 return (L_COMP_DATA *)ERROR_PTR("pixs not 1 bpp", __func__, NULL);
1575 if (pixGetColormap(pixs))
1576 return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1577
1578 /* Compress to a temp tiff g4 file */
1579 fname = l_makeTempFilename();
1580 if (pixWrite(fname, pixs, IFF_TIFF_G4)) {
1581 LEPT_FREE(fname);
1582 return NULL;
1583 }
1584
1585 cid = l_generateG4Data(fname, ascii85flag);
1586 if (lept_rmfile(fname) != 0)
1587 L_ERROR("temp file %s was not deleted\n", __func__, fname);
1588 LEPT_FREE(fname);
1589 return cid;
1590 }
1591
1592
1593 /*!
1594 * \brief cidConvertToPdfData()
1595 *
1596 * \param[in] cid compressed image data
1597 * \param[in] title [optional] pdf title; can be null
1598 * \param[out] pdata output pdf data for image
1599 * \param[out] pnbytes size of output pdf data
1600 * \return 0 if OK, 1 on error
1601 *
1602 * <pre>
1603 * Notes:
1604 * (1) Caller must not destroy the cid. It is absorbed in the
1605 * lpd and destroyed by this function.
1606 * </pre>
1607 */
1608 l_ok
1609 cidConvertToPdfData(L_COMP_DATA *cid,
1610 const char *title,
1611 l_uint8 **pdata,
1612 size_t *pnbytes)
1613 {
1614 l_int32 res, ret;
1615 l_float32 wpt, hpt;
1616 L_PDF_DATA *lpd = NULL;
1617
1618 if (!pdata || !pnbytes)
1619 return ERROR_INT("&data and &nbytes not both defined", __func__, 1);
1620 *pdata = NULL;
1621 *pnbytes = 0;
1622 if (!cid)
1623 return ERROR_INT("cid not defined", __func__, 1);
1624
1625 /* Get media box parameters, in pts */
1626 res = cid->res;
1627 if (res <= 0)
1628 res = DefaultInputRes;
1629 wpt = cid->w * 72.f / res;
1630 hpt = cid->h * 72.f / res;
1631
1632 /* Set up the pdf data struct (lpd) */
1633 if ((lpd = pdfdataCreate(title)) == NULL)
1634 return ERROR_INT("lpd not made", __func__, 1);
1635 ptraAdd(lpd->cida, cid);
1636 lpd->n++;
1637 ptaAddPt(lpd->xy, 0, 0); /* xpt = ypt = 0 */
1638 ptaAddPt(lpd->wh, wpt, hpt);
1639
1640 /* Generate the pdf string and destroy the lpd */
1641 ret = l_generatePdf(pdata, pnbytes, lpd);
1642 pdfdataDestroy(&lpd);
1643 if (ret)
1644 return ERROR_INT("pdf output not made", __func__, 1);
1645 return 0;
1646 }
1647
1648
1649 /*!
1650 * \brief l_CIDataDestroy()
1651 *
1652 * \param[in,out] pcid will be set to null before returning
1653 * \return void
1654 */
1655 void
1656 l_CIDataDestroy(L_COMP_DATA **pcid)
1657 {
1658 L_COMP_DATA *cid;
1659
1660 if (pcid == NULL) {
1661 L_WARNING("ptr address is null!\n", __func__);
1662 return;
1663 }
1664 if ((cid = *pcid) == NULL)
1665 return;
1666
1667 if (cid->datacomp) LEPT_FREE(cid->datacomp);
1668 if (cid->data85) LEPT_FREE(cid->data85);
1669 if (cid->cmapdata85) LEPT_FREE(cid->cmapdata85);
1670 if (cid->cmapdatahex) LEPT_FREE(cid->cmapdatahex);
1671 LEPT_FREE(cid);
1672 *pcid = NULL;
1673 }
1674
1675
1676 /*---------------------------------------------------------------------*
1677 * Helper functions for generating the output pdf string *
1678 *---------------------------------------------------------------------*/
1679 /*!
1680 * \brief l_generatePdf()
1681 *
1682 * \param[out] pdata pdf array
1683 * \param[out] pnbytes number of bytes in pdf array
1684 * \param[in] lpd all the required input image data
1685 * \return 0 if OK, 1 on error
1686 *
1687 * <pre>
1688 * Notes:
1689 * (1) On error, no data is returned.
1690 * (2) The objects are:
1691 * 1: Catalog
1692 * 2: Info
1693 * 3: Pages
1694 * 4: Page
1695 * 5: Contents (rendering command)
1696 * 6 to 6+n-1: n XObjects
1697 * 6+n to 6+n+m-1: m colormaps
1698 * </pre>
1699 */
1700 static l_int32
1701 l_generatePdf(l_uint8 **pdata,
1702 size_t *pnbytes,
1703 L_PDF_DATA *lpd)
1704 {
1705 if (!pdata)
1706 return ERROR_INT("&data not defined", __func__, 1);
1707 *pdata = NULL;
1708 if (!pnbytes)
1709 return ERROR_INT("&nbytes not defined", __func__, 1);
1710 *pnbytes = 0;
1711 if (!lpd)
1712 return ERROR_INT("lpd not defined", __func__, 1);
1713
1714 generateFixedStringsPdf(lpd);
1715 generateMediaboxPdf(lpd);
1716 generatePageStringPdf(lpd);
1717 generateContentStringPdf(lpd);
1718 generatePreXStringsPdf(lpd);
1719 generateColormapStringsPdf(lpd);
1720 generateTrailerPdf(lpd);
1721 return generateOutputDataPdf(pdata, pnbytes, lpd);
1722 }
1723
1724
1725 static void
1726 generateFixedStringsPdf(L_PDF_DATA *lpd)
1727 {
1728 char buf[L_SMALLBUF];
1729 char *version, *datestr;
1730 SARRAY *sa;
1731
1732 /* Accumulate data for the header and objects 1-3 */
1733 lpd->id = stringNew("%PDF-1.5\n");
1734 l_dnaAddNumber(lpd->objsize, strlen(lpd->id));
1735
1736 lpd->obj1 = stringNew("1 0 obj\n"
1737 "<<\n"
1738 "/Type /Catalog\n"
1739 "/Pages 3 0 R\n"
1740 ">>\n"
1741 "endobj\n");
1742 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj1));
1743
1744 sa = sarrayCreate(0);
1745 sarrayAddString(sa, "2 0 obj\n"
1746 "<<\n", L_COPY);
1747 if (var_WRITE_DATE_AND_VERSION) {
1748 datestr = l_getFormattedDate();
1749 snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr);
1750 sarrayAddString(sa, buf, L_COPY);
1751 LEPT_FREE(datestr);
1752 version = getLeptonicaVersion();
1753 snprintf(buf, sizeof(buf),
1754 "/Producer (leptonica: %s)\n", version);
1755 LEPT_FREE(version);
1756 } else {
1757 snprintf(buf, sizeof(buf), "/Producer (leptonica)\n");
1758 }
1759 sarrayAddString(sa, buf, L_COPY);
1760 if (lpd->title) {
1761 char *hexstr;
1762 if ((hexstr = generateEscapeString(lpd->title)) != NULL) {
1763 snprintf(buf, sizeof(buf), "/Title %s\n", hexstr);
1764 sarrayAddString(sa, buf, L_COPY);
1765 } else {
1766 L_ERROR("title string is not ascii\n", __func__);
1767 }
1768 LEPT_FREE(hexstr);
1769 }
1770 sarrayAddString(sa, ">>\n"
1771 "endobj\n", L_COPY);
1772 lpd->obj2 = sarrayToString(sa, 0);
1773 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj2));
1774 sarrayDestroy(&sa);
1775
1776 lpd->obj3 = stringNew("3 0 obj\n"
1777 "<<\n"
1778 "/Type /Pages\n"
1779 "/Kids [ 4 0 R ]\n"
1780 "/Count 1\n"
1781 ">>\n");
1782 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj3));
1783
1784 /* Do the post-datastream string */
1785 lpd->poststream = stringNew("\n"
1786 "endstream\n"
1787 "endobj\n");
1788 }
1789
1790
/*!
 * \brief   generateEscapeString()
 *
 * \param[in]   str     input string
 * \return   hex escape string, or null on error
 *
 * <pre>
 * Notes:
 *      (1) If the input string is not ascii, returns null.  A byte
 *          is not ascii if its value is 0x80 or larger; this is
 *          tested on the unsigned value so that the result does not
 *          depend on whether plain char is signed.
 *      (2) This takes an input ascii string and generates a hex
 *          ascii output string with 4 hex characters out for each
 *          byte in, enclosed in angle brackets.
 *          The feff code at the beginning tells the pdf interpreter
 *          that the data is to be interpreted as big-endian, 4 hex
 *          characters at a time.  For ascii, the first two characters
 *          are "00" and the last two encode a value less than 0x80.
 *          Example: "Hi" gives "<feff00480069>".
 *      (3) The returned string is allocated here and is freed by
 *          the caller.
 * </pre>
 */
static char  *
generateEscapeString(const char  *str)
{
    char    *buffer;
    size_t   i, nchar, buflen;

    if (!str)
        return (char *)ERROR_PTR("str not defined", __func__, NULL);
    nchar = strlen(str);
    for (i = 0; i < nchar; i++) {
        if ((unsigned char)str[i] >= 0x80)
            return (char *)ERROR_PTR("str not all ascii", __func__, NULL);
    }

        /* 5 for "<feff", 4 per character, 1 for ">", 1 for the
         * terminating nul; the rest is slack. */
    buflen = 4 * nchar + 10;
    if ((buffer = (char *)LEPT_CALLOC(buflen, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("buffer not made", __func__, NULL);

        /* Write each group at its known offset.  snprintf also writes
         * a nul after each group; that is overwritten by the next
         * group or by the closing '>'. */
    memcpy(buffer, "<feff", 5);
    for (i = 0; i < nchar; i++) {
        snprintf(buffer + 5 + 4 * i, 5, "%04x",
                 (unsigned int)(unsigned char)str[i]);
    }
    buffer[5 + 4 * nchar] = '>';  /* next byte is still 0 from calloc */
    return buffer;
}
1833
1834
1835 static void
1836 generateMediaboxPdf(L_PDF_DATA *lpd)
1837 {
1838 l_int32 i;
1839 l_float32 xpt, ypt, wpt, hpt, maxx, maxy;
1840
1841 /* First get the full extent of all the images.
1842 * This is the mediabox, in pts. */
1843 maxx = maxy = 0;
1844 for (i = 0; i < lpd->n; i++) {
1845 ptaGetPt(lpd->xy, i, &xpt, &ypt);
1846 ptaGetPt(lpd->wh, i, &wpt, &hpt);
1847 maxx = L_MAX(maxx, xpt + wpt);
1848 maxy = L_MAX(maxy, ypt + hpt);
1849 }
1850
1851 lpd->mediabox = boxCreate(0, 0, (l_int32)(maxx + 0.5),
1852 (l_int32)(maxy + 0.5));
1853
1854 /* ypt is in standard image coordinates: the location of
1855 * the UL image corner with respect to the UL media box corner.
1856 * Rewrite each ypt for PostScript coordinates: the location of
1857 * the LL image corner with respect to the LL media box corner. */
1858 for (i = 0; i < lpd->n; i++) {
1859 ptaGetPt(lpd->xy, i, &xpt, &ypt);
1860 ptaGetPt(lpd->wh, i, &wpt, &hpt);
1861 ptaSetPt(lpd->xy, i, xpt, maxy - ypt - hpt);
1862 }
1863 }
1864
1865
1866 static l_int32
1867 generatePageStringPdf(L_PDF_DATA *lpd)
1868 {
1869 char *buf;
1870 char *xstr;
1871 l_int32 bufsize, i, wpt, hpt;
1872 SARRAY *sa;
1873
1874 /* Allocate 1000 bytes for the boilerplate text, and
1875 * 50 bytes for each reference to an image in the
1876 * ProcSet array. */
1877 bufsize = 1000 + 50 * lpd->n;
1878 if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL)
1879 return ERROR_INT("calloc fail for buf", __func__, 1);
1880
1881 boxGetGeometry(lpd->mediabox, NULL, NULL, &wpt, &hpt);
1882 sa = sarrayCreate(lpd->n);
1883 for (i = 0; i < lpd->n; i++) {
1884 snprintf(buf, bufsize, "/Im%d %d 0 R ", i + 1, 6 + i);
1885 sarrayAddString(sa, buf, L_COPY);
1886 }
1887 xstr = sarrayToString(sa, 0);
1888 sarrayDestroy(&sa);
1889 if (!xstr) {
1890 LEPT_FREE(buf);
1891 return ERROR_INT("xstr not made", __func__, 1);
1892 }
1893
1894 snprintf(buf, bufsize, "4 0 obj\n"
1895 "<<\n"
1896 "/Type /Page\n"
1897 "/Parent 3 0 R\n"
1898 "/MediaBox [%d %d %d %d]\n"
1899 "/Contents 5 0 R\n"
1900 "/Resources\n"
1901 "<<\n"
1902 "/XObject << %s >>\n"
1903 "/ProcSet [ /ImageB /ImageI /ImageC ]\n"
1904 ">>\n"
1905 ">>\n"
1906 "endobj\n",
1907 0, 0, wpt, hpt, xstr);
1908
1909 lpd->obj4 = stringNew(buf);
1910 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj4));
1911 sarrayDestroy(&sa);
1912 LEPT_FREE(buf);
1913 LEPT_FREE(xstr);
1914 return 0;
1915 }
1916
1917
1918 static l_int32
1919 generateContentStringPdf(L_PDF_DATA *lpd)
1920 {
1921 char *buf;
1922 char *cstr;
1923 l_int32 i, bufsize;
1924 l_float32 xpt, ypt, wpt, hpt;
1925 SARRAY *sa;
1926
1927 bufsize = 1000 + 200 * lpd->n;
1928 if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL)
1929 return ERROR_INT("calloc fail for buf", __func__, 1);
1930
1931 sa = sarrayCreate(lpd->n);
1932 for (i = 0; i < lpd->n; i++) {
1933 ptaGetPt(lpd->xy, i, &xpt, &ypt);
1934 ptaGetPt(lpd->wh, i, &wpt, &hpt);
1935 snprintf(buf, bufsize,
1936 "q %.4f %.4f %.4f %.4f %.4f %.4f cm /Im%d Do Q\n",
1937 wpt, 0.0, 0.0, hpt, xpt, ypt, i + 1);
1938 sarrayAddString(sa, buf, L_COPY);
1939 }
1940 cstr = sarrayToString(sa, 0);
1941 sarrayDestroy(&sa);
1942 if (!cstr) {
1943 LEPT_FREE(buf);
1944 return ERROR_INT("cstr not made", __func__, 1);
1945 }
1946
1947 snprintf(buf, bufsize, "5 0 obj\n"
1948 "<< /Length %d >>\n"
1949 "stream\n"
1950 "%s"
1951 "endstream\n"
1952 "endobj\n",
1953 (l_int32)strlen(cstr), cstr);
1954
1955 lpd->obj5 = stringNew(buf);
1956 l_dnaAddNumber(lpd->objsize, strlen(lpd->obj5));
1957 sarrayDestroy(&sa);
1958 LEPT_FREE(buf);
1959 LEPT_FREE(cstr);
1960 return 0;
1961 }
1962
1963
1964 static l_int32
1965 generatePreXStringsPdf(L_PDF_DATA *lpd)
1966 {
1967 char buff[256];
1968 char buf[L_BIGBUF];
1969 char *cstr, *bstr, *fstr, *pstr, *xstr, *photometry;
1970 l_int32 i, cmindex;
1971 L_COMP_DATA *cid;
1972 SARRAY *sa;
1973
1974 sa = lpd->saprex;
1975 cmindex = 6 + lpd->n; /* starting value */
1976 for (i = 0; i < lpd->n; i++) {
1977 pstr = cstr = NULL;
1978 if ((cid = pdfdataGetCid(lpd, i)) == NULL)
1979 return ERROR_INT("cid not found", __func__, 1);
1980
1981 if (cid->type == L_G4_ENCODE) {
1982 if (var_WRITE_G4_IMAGE_MASK) {
1983 cstr = stringNew("/ImageMask true\n"
1984 "/ColorSpace /DeviceGray");
1985 } else {
1986 cstr = stringNew("/ColorSpace /DeviceGray");
1987 }
1988 bstr = stringNew("/BitsPerComponent 1\n"
1989 "/Interpolate true");
1990 /* Note: the reversal is deliberate. The BlackIs1 flag
1991 * is misleadingly named: it says whether to invert the
1992 * image on decoding because the black pixels are 0,
1993 * not whether the black pixels are 1! The default for
1994 * BlackIs1 is "false", which means "don't invert because
1995 * black is 1." Yikes. */
1996 photometry = (cid->minisblack) ? stringNew("true")
1997 : stringNew("false");
1998 snprintf(buff, sizeof(buff),
1999 "/Filter /CCITTFaxDecode\n"
2000 "/DecodeParms\n"
2001 "<<\n"
2002 "/BlackIs1 %s\n"
2003 "/K -1\n"
2004 "/Columns %d\n"
2005 ">>", photometry, cid->w);
2006 fstr = stringNew(buff);
2007 LEPT_FREE(photometry);
2008 } else if (cid->type == L_JPEG_ENCODE) {
2009 if (cid->spp == 1)
2010 cstr = stringNew("/ColorSpace /DeviceGray");
2011 else if (cid->spp == 3)
2012 cstr = stringNew("/ColorSpace /DeviceRGB");
2013 else if (cid->spp == 4) /* pdf supports cmyk */
2014 cstr = stringNew("/ColorSpace /DeviceCMYK");
2015 else
2016 L_ERROR("in jpeg: spp != 1, 3 or 4\n", __func__);
2017 bstr = stringNew("/BitsPerComponent 8");
2018 fstr = stringNew("/Filter /DCTDecode");
2019 } else if (cid->type == L_JP2K_ENCODE) {
2020 if (cid->spp == 1)
2021 cstr = stringNew("/ColorSpace /DeviceGray");
2022 else if (cid->spp == 3)
2023 cstr = stringNew("/ColorSpace /DeviceRGB");
2024 else
2025 L_ERROR("in jp2k: spp != 1 && spp != 3\n", __func__);
2026 bstr = stringNew("/BitsPerComponent 8");
2027 fstr = stringNew("/Filter /JPXDecode");
2028 } else { /* type == L_FLATE_ENCODE */
2029 if (cid->ncolors > 0) { /* cmapped */
2030 snprintf(buff, sizeof(buff), "/ColorSpace %d 0 R", cmindex++);
2031 cstr = stringNew(buff);
2032 } else {
2033 if (cid->spp == 1 && cid->bps == 1)
2034 cstr = stringNew("/ColorSpace /DeviceGray\n"
2035 "/Decode [1 0]");
2036 else if (cid->spp == 1) /* 8 bpp */
2037 cstr = stringNew("/ColorSpace /DeviceGray");
2038 else if (cid->spp == 3)
2039 cstr = stringNew("/ColorSpace /DeviceRGB");
2040 else
2041 L_ERROR("unknown colorspace: spp = %d\n",
2042 __func__, cid->spp);
2043 }
2044 snprintf(buff, sizeof(buff), "/BitsPerComponent %d", cid->bps);
2045 bstr = stringNew(buff);
2046 fstr = stringNew("/Filter /FlateDecode");
2047 if (cid->predictor == TRUE) {
2048 snprintf(buff, sizeof(buff),
2049 "/DecodeParms\n"
2050 "<<\n"
2051 " /Columns %d\n"
2052 " /Predictor 14\n"
2053 " /Colors %d\n"
2054 " /BitsPerComponent %d\n"
2055 ">>\n", cid->w, cid->spp, cid->bps);
2056 pstr = stringNew(buff);
2057 }
2058 }
2059 if (!pstr) /* no decode parameters */
2060 pstr = stringNew("");
2061
2062 snprintf(buf, sizeof(buf),
2063 "%d 0 obj\n"
2064 "<<\n"
2065 "/Length %zu\n"
2066 "/Subtype /Image\n"
2067 "%s\n" /* colorspace */
2068 "/Width %d\n"
2069 "/Height %d\n"
2070 "%s\n" /* bits/component */
2071 "%s\n" /* filter */
2072 "%s" /* decode parms; can be empty */
2073 ">>\n"
2074 "stream\n",
2075 6 + i, cid->nbytescomp, cstr,
2076 cid->w, cid->h, bstr, fstr, pstr);
2077 xstr = stringNew(buf);
2078 sarrayAddString(sa, xstr, L_INSERT);
2079 l_dnaAddNumber(lpd->objsize,
2080 strlen(xstr) + cid->nbytescomp + strlen(lpd->poststream));
2081 LEPT_FREE(cstr);
2082 LEPT_FREE(bstr);
2083 LEPT_FREE(fstr);
2084 LEPT_FREE(pstr);
2085 }
2086
2087 return 0;
2088 }
2089
2090
2091 static l_int32
2092 generateColormapStringsPdf(L_PDF_DATA *lpd)
2093 {
2094 char buf[L_BIGBUF];
2095 char *cmstr;
2096 l_int32 i, cmindex, ncmap;
2097 L_COMP_DATA *cid;
2098 SARRAY *sa;
2099
2100 /* In our canonical format, we have 5 objects, followed
2101 * by n XObjects, followed by m colormaps, so the index of
2102 * the first colormap object is 6 + n. */
2103 sa = lpd->sacmap;
2104 cmindex = 6 + lpd->n; /* starting value */
2105 ncmap = 0;
2106 for (i = 0; i < lpd->n; i++) {
2107 if ((cid = pdfdataGetCid(lpd, i)) == NULL)
2108 return ERROR_INT("cid not found", __func__, 1);
2109 if (cid->ncolors == 0) continue;
2110
2111 ncmap++;
2112 snprintf(buf, sizeof(buf), "%d 0 obj\n"
2113 "[ /Indexed /DeviceRGB\n"
2114 "%d\n"
2115 "%s\n"
2116 "]\n"
2117 "endobj\n",
2118 cmindex, cid->ncolors - 1, cid->cmapdatahex);
2119 cmindex++;
2120 cmstr = stringNew(buf);
2121 l_dnaAddNumber(lpd->objsize, strlen(cmstr));
2122 sarrayAddString(sa, cmstr, L_INSERT);
2123 }
2124
2125 lpd->ncmap = ncmap;
2126 return 0;
2127 }
2128
2129
2130 static void
2131 generateTrailerPdf(L_PDF_DATA *lpd)
2132 {
2133 l_int32 i, n, size, linestart;
2134 L_DNA *daloc, *dasize;
2135
2136 /* Let nobj be the number of numbered objects. These numbered
2137 * objects are indexed by their pdf number in arrays naloc[]
2138 * and nasize[]. The 0th object is the 9 byte header. Then
2139 * the number of objects in nasize, which includes the header,
2140 * is n = nobj + 1. The array naloc[] has n + 1 elements,
2141 * because it includes as the last element the starting
2142 * location of xref. The indexing of these objects, their
2143 * starting locations and sizes are:
2144 *
2145 * Object number Starting location Size
2146 * ------------- ----------------- --------------
2147 * 0 daloc[0] = 0 dasize[0] = 9
2148 * 1 daloc[1] = 9 dasize[1] = 49
2149 * n daloc[n] dasize[n]
2150 * xref daloc[n+1]
2151 *
2152 * We first generate daloc.
2153 */
2154 dasize = lpd->objsize;
2155 daloc = lpd->objloc;
2156 linestart = 0;
2157 l_dnaAddNumber(daloc, linestart); /* header */
2158 n = l_dnaGetCount(dasize);
2159 for (i = 0; i < n; i++) {
2160 l_dnaGetIValue(dasize, i, &size);
2161 linestart += size;
2162 l_dnaAddNumber(daloc, linestart);
2163 }
2164 l_dnaGetIValue(daloc, n, &lpd->xrefloc); /* save it */
2165
2166 /* Now make the actual trailer string */
2167 lpd->trailer = makeTrailerStringPdf(daloc);
2168 }
2169
2170
2171 static char *
2172 makeTrailerStringPdf(L_DNA *daloc)
2173 {
2174 char *outstr;
2175 char buf[L_BIGBUF];
2176 l_int32 i, n, linestart, xrefloc;
2177 SARRAY *sa;
2178
2179 if (!daloc)
2180 return (char *)ERROR_PTR("daloc not defined", __func__, NULL);
2181 n = l_dnaGetCount(daloc) - 1; /* numbered objects + 1 (yes, +1) */
2182
2183 sa = sarrayCreate(0);
2184 snprintf(buf, sizeof(buf), "xref\n"
2185 "0 %d\n"
2186 "0000000000 65535 f \n", n);
2187 sarrayAddString(sa, buf, L_COPY);
2188 for (i = 1; i < n; i++) {
2189 l_dnaGetIValue(daloc, i, &linestart);
2190 snprintf(buf, sizeof(buf), "%010d 00000 n \n", linestart);
2191 sarrayAddString(sa, buf, L_COPY);
2192 }
2193
2194 l_dnaGetIValue(daloc, n, &xrefloc);
2195 snprintf(buf, sizeof(buf), "trailer\n"
2196 "<<\n"
2197 "/Size %d\n"
2198 "/Root 1 0 R\n"
2199 "/Info 2 0 R\n"
2200 ">>\n"
2201 "startxref\n"
2202 "%d\n"
2203 "%%%%EOF\n", n, xrefloc);
2204 sarrayAddString(sa, buf, L_COPY);
2205 outstr = sarrayToString(sa, 0);
2206 sarrayDestroy(&sa);
2207 return outstr;
2208 }
2209
2210
2211 /*!
2212 * \brief generateOutputDataPdf()
2213 *
2214 * \param[out] pdata pdf data array
2215 * \param[out] pnbytes size of pdf data array
2216 * \param[in] lpd input data used to make pdf
2217 * \return 0 if OK, 1 on error
2218 *
2219 * <pre>
2220 * Notes:
2221 * (1) Only called from l_generatePdf(). On error, no data is returned.
2222 * </pre>
2223 */
2224 static l_int32
2225 generateOutputDataPdf(l_uint8 **pdata,
2226 size_t *pnbytes,
2227 L_PDF_DATA *lpd)
2228 {
2229 char *str;
2230 l_uint8 *data;
2231 l_int32 nimages, i, len;
2232 l_int32 *sizes, *locs;
2233 size_t nbytes;
2234 L_COMP_DATA *cid;
2235
2236 if (!pdata)
2237 return ERROR_INT("&data not defined", __func__, 1);
2238 *pdata = NULL;
2239 if (!pnbytes)
2240 return ERROR_INT("&nbytes not defined", __func__, 1);
2241 nbytes = lpd->xrefloc + strlen(lpd->trailer);
2242 *pnbytes = nbytes;
2243 if ((data = (l_uint8 *)LEPT_CALLOC(nbytes, sizeof(l_uint8))) == NULL)
2244 return ERROR_INT("calloc fail for data", __func__, 1);
2245 *pdata = data;
2246
2247 sizes = l_dnaGetIArray(lpd->objsize);
2248 locs = l_dnaGetIArray(lpd->objloc);
2249 memcpy(data, lpd->id, sizes[0]);
2250 memcpy(data + locs[1], lpd->obj1, sizes[1]);
2251 memcpy(data + locs[2], lpd->obj2, sizes[2]);
2252 memcpy(data + locs[3], lpd->obj3, sizes[3]);
2253 memcpy(data + locs[4], lpd->obj4, sizes[4]);
2254 memcpy(data + locs[5], lpd->obj5, sizes[5]);
2255
2256 /* Each image has 3 parts: variable preamble, the compressed
2257 * data stream, and the fixed poststream. */
2258 nimages = lpd->n;
2259 for (i = 0; i < nimages; i++) {
2260 if ((cid = pdfdataGetCid(lpd, i)) == NULL) { /* should not happen */
2261 LEPT_FREE(sizes);
2262 LEPT_FREE(locs);
2263 return ERROR_INT("cid not found", __func__, 1);
2264 }
2265 str = sarrayGetString(lpd->saprex, i, L_NOCOPY);
2266 len = strlen(str);
2267 memcpy(data + locs[6 + i], str, len);
2268 memcpy(data + locs[6 + i] + len,
2269 cid->datacomp, cid->nbytescomp);
2270 memcpy(data + locs[6 + i] + len + cid->nbytescomp,
2271 lpd->poststream, strlen(lpd->poststream));
2272 }
2273
2274 /* Each colormap is simply a stored string */
2275 for (i = 0; i < lpd->ncmap; i++) {
2276 str = sarrayGetString(lpd->sacmap, i, L_NOCOPY);
2277 memcpy(data + locs[6 + nimages + i], str, strlen(str));
2278 }
2279
2280 /* And finally the trailer */
2281 memcpy(data + lpd->xrefloc, lpd->trailer, strlen(lpd->trailer));
2282 LEPT_FREE(sizes);
2283 LEPT_FREE(locs);
2284 return 0;
2285 }
2286
2287
2288 /*---------------------------------------------------------------------*
2289 * Helper functions for generating multipage pdf output *
2290 *---------------------------------------------------------------------*/
2291 /*!
2292 * \brief parseTrailerPdf()
2293 *
2294 * \param[in] bas lba of a pdf file
2295 * \param[out] pda byte locations of the beginning of each object
2296 * \return 0 if OK, 1 on error
2297 */
2298 static l_int32
2299 parseTrailerPdf(L_BYTEA *bas,
2300 L_DNA **pda)
2301 {
2302 char *str;
2303 l_uint8 nl = '\n';
2304 l_uint8 *data;
2305 l_int32 i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok;
2306 size_t size;
2307 L_DNA *da, *daobj, *daxref;
2308 SARRAY *sa;
2309
2310 if (!pda)
2311 return ERROR_INT("&da not defined", __func__, 1);
2312 *pda = NULL;
2313 if (!bas)
2314 return ERROR_INT("bas not defined", __func__, 1);
2315 data = l_byteaGetData(bas, &size);
2316 if (memcmp(data, "%PDF-1.", 7) != 0)
2317 return ERROR_INT("PDF header signature not found", __func__, 1);
2318
2319 /* Search for "startxref" starting 50 bytes from the EOF */
2320 start = 0;
2321 if (size > 50)
2322 start = size - 50;
2323 arrayFindSequence(data + start, size - start,
2324 (l_uint8 *)"startxref\n", 10, &loc, &found);
2325 if (!found)
2326 return ERROR_INT("startxref not found!", __func__, 1);
2327 if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1)
2328 return ERROR_INT("xrefloc not found!", __func__, 1);
2329 if (xrefloc < 0 || xrefloc >= size)
2330 return ERROR_INT("invalid xrefloc!", __func__, 1);
2331 sa = sarrayCreateLinesFromString((char *)(data + xrefloc), 0);
2332 str = sarrayGetString(sa, 1, L_NOCOPY);
2333 if ((sscanf(str, "0 %d", &nobj)) != 1) {
2334 sarrayDestroy(&sa);
2335 return ERROR_INT("nobj not found", __func__, 1);
2336 }
2337
2338 /* Get starting locations. The numa index is the
2339 * object number. loc[0] is the ID; loc[nobj + 1] is xrefloc. */
2340 da = l_dnaCreate(nobj + 1);
2341 *pda = da;
2342 for (i = 0; i < nobj; i++) {
2343 str = sarrayGetString(sa, i + 2, L_NOCOPY);
2344 sscanf(str, "%d", &startloc);
2345 l_dnaAddNumber(da, startloc);
2346 }
2347 l_dnaAddNumber(da, xrefloc);
2348
2349 #if DEBUG_MULTIPAGE
2350 lept_stderr("************** Trailer string ************\n");
2351 lept_stderr("xrefloc = %d", xrefloc);
2352 sarrayWriteStderr(sa);
2353
2354 lept_stderr("************** Object locations ************");
2355 l_dnaWriteStderr(da);
2356 #endif /* DEBUG_MULTIPAGE */
2357 sarrayDestroy(&sa);
2358
2359 /* Verify correct parsing */
2360 trailer_ok = TRUE;
2361 for (i = 1; i < nobj; i++) {
2362 l_dnaGetIValue(da, i, &startloc);
2363 if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) {
2364 L_ERROR("bad trailer for object %d\n", __func__, i);
2365 trailer_ok = FALSE;
2366 break;
2367 }
2368 }
2369
2370 /* If the trailer is broken, reconstruct the correct obj locations */
2371 if (!trailer_ok) {
2372 L_INFO("rebuilding pdf trailer\n", __func__);
2373 l_dnaEmpty(da);
2374 l_dnaAddNumber(da, 0);
2375 l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &daobj);
2376 nobj = l_dnaGetCount(daobj);
2377 for (i = 0; i < nobj; i++) {
2378 l_dnaGetIValue(daobj, i, &loc);
2379 for (j = loc - 1; j > 0; j--) {
2380 if (data[j] == nl)
2381 break;
2382 }
2383 l_dnaAddNumber(da, j + 1);
2384 }
2385 l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &daxref);
2386 l_dnaGetIValue(daxref, 0, &loc);
2387 l_dnaAddNumber(da, loc);
2388 l_dnaDestroy(&daobj);
2389 l_dnaDestroy(&daxref);
2390 }
2391
2392 return 0;
2393 }
2394
2395
2396 static char *
2397 generatePagesObjStringPdf(NUMA *napage)
2398 {
2399 char *str;
2400 char *buf;
2401 l_int32 i, n, index, bufsize;
2402 SARRAY *sa;
2403
2404 if (!napage)
2405 return (char *)ERROR_PTR("napage not defined", __func__, NULL);
2406
2407 n = numaGetCount(napage);
2408 bufsize = 100 + 16 * n; /* large enough to hold the output string */
2409 buf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
2410 sa = sarrayCreate(n);
2411 for (i = 0; i < n; i++) {
2412 numaGetIValue(napage, i, &index);
2413 snprintf(buf, bufsize, " %d 0 R ", index);
2414 sarrayAddString(sa, buf, L_COPY);
2415 }
2416
2417 str = sarrayToString(sa, 0);
2418 snprintf(buf, bufsize - 1, "3 0 obj\n"
2419 "<<\n"
2420 "/Type /Pages\n"
2421 "/Kids [%s]\n"
2422 "/Count %d\n"
2423 ">>\n"
2424 "endobj\n",
2425 str, n);
2426 sarrayDestroy(&sa);
2427 LEPT_FREE(str);
2428 return buf;
2429 }
2430
2431
2432 /*!
2433 * \brief substituteObjectNumbers()
2434 *
2435 * \param[in] bas lba of a pdf object
2436 * \param[in] na_objs object number mapping array
2437 * \return bad lba of rewritten pdf for the object
2438 *
2439 * <pre>
2440 * Notes:
2441 * (1) Interpret the first set of bytes as the object number,
2442 * map to the new number, and write it out.
2443 * (2) Find all occurrences of this 4-byte sequence: " 0 R"
2444 * (3) Find the location and value of the integer preceding this,
2445 * and map it to the new value.
2446 * (4) Rewrite the object with new object numbers.
2447 * </pre>
2448 */
2449 static L_BYTEA *
2450 substituteObjectNumbers(L_BYTEA *bas,
2451 NUMA *na_objs)
2452 {
2453 l_uint8 space = ' ';
2454 l_uint8 *datas;
2455 l_uint8 buf[32]; /* only needs to hold one integer in ascii format */
2456 l_int32 start, nrepl, i, j, nobjs, objin, objout, found;
2457 l_int32 *objs, *matches;
2458 size_t size;
2459 L_BYTEA *bad;
2460 L_DNA *da_match;
2461
2462 if (!bas)
2463 return (L_BYTEA *)ERROR_PTR("bas not defined", __func__, NULL);
2464 if (!na_objs)
2465 return (L_BYTEA *)ERROR_PTR("na_objs not defined", __func__, NULL);
2466
2467 datas = l_byteaGetData(bas, &size);
2468 bad = l_byteaCreate(100);
2469 objs = numaGetIArray(na_objs); /* object number mapper */
2470 nobjs = numaGetCount(na_objs); /* use for sanity checking */
2471
2472 /* Substitute the object number on the first line */
2473 sscanf((char *)datas, "%d", &objin);
2474 if (objin < 0 || objin >= nobjs) {
2475 L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs);
2476 LEPT_FREE(objs);
2477 return bad;
2478 }
2479 objout = objs[objin];
2480 snprintf((char *)buf, 32, "%d", objout);
2481 l_byteaAppendString(bad, (char *)buf);
2482
2483 /* Find the set of matching locations for object references */
2484 arrayFindSequence(datas, size, &space, 1, &start, &found);
2485 da_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4);
2486 if (!da_match) {
2487 l_byteaAppendData(bad, datas + start, size - start);
2488 LEPT_FREE(objs);
2489 return bad;
2490 }
2491
2492 /* Substitute all the object reference numbers */
2493 nrepl = l_dnaGetCount(da_match);
2494 matches = l_dnaGetIArray(da_match);
2495 for (i = 0; i < nrepl; i++) {
2496 /* Find the first space before the object number */
2497 for (j = matches[i] - 1; j > 0; j--) {
2498 if (datas[j] == space)
2499 break;
2500 }
2501 /* Copy bytes from 'start' up to the object number */
2502 l_byteaAppendData(bad, datas + start, j - start + 1);
2503 sscanf((char *)(datas + j + 1), "%d", &objin);
2504 if (objin < 0 || objin >= nobjs) {
2505 L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs);
2506 LEPT_FREE(objs);
2507 LEPT_FREE(matches);
2508 l_dnaDestroy(&da_match);
2509 return bad;
2510 }
2511 objout = objs[objin];
2512 snprintf((char *)buf, 32, "%d", objout);
2513 l_byteaAppendString(bad, (char *)buf);
2514 start = matches[i];
2515 }
2516 l_byteaAppendData(bad, datas + start, size - start);
2517
2518 LEPT_FREE(objs);
2519 LEPT_FREE(matches);
2520 l_dnaDestroy(&da_match);
2521 return bad;
2522 }
2523
2524
2525 /*---------------------------------------------------------------------*
2526 * Create/destroy/access pdf data *
2527 *---------------------------------------------------------------------*/
2528 static L_PDF_DATA *
2529 pdfdataCreate(const char *title)
2530 {
2531 L_PDF_DATA *lpd;
2532
2533 lpd = (L_PDF_DATA *)LEPT_CALLOC(1, sizeof(L_PDF_DATA));
2534 if (title) lpd->title = stringNew(title);
2535 lpd->cida = ptraCreate(10);
2536 lpd->xy = ptaCreate(10);
2537 lpd->wh = ptaCreate(10);
2538 lpd->saprex = sarrayCreate(10);
2539 lpd->sacmap = sarrayCreate(10);
2540 lpd->objsize = l_dnaCreate(20);
2541 lpd->objloc = l_dnaCreate(20);
2542 return lpd;
2543 }
2544
2545 static void
2546 pdfdataDestroy(L_PDF_DATA **plpd)
2547 {
2548 l_int32 i;
2549 L_COMP_DATA *cid;
2550 L_PDF_DATA *lpd;
2551
2552 if (plpd== NULL) {
2553 L_WARNING("ptr address is null!\n", __func__);
2554 return;
2555 }
2556 if ((lpd = *plpd) == NULL)
2557 return;
2558
2559 if (lpd->title) LEPT_FREE(lpd->title);
2560 for (i = 0; i < lpd->n; i++) {
2561 cid = (L_COMP_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION);
2562 l_CIDataDestroy(&cid);
2563 }
2564
2565 ptraDestroy(&lpd->cida, 0, 0);
2566 if (lpd->id) LEPT_FREE(lpd->id);
2567 if (lpd->obj1) LEPT_FREE(lpd->obj1);
2568 if (lpd->obj2) LEPT_FREE(lpd->obj2);
2569 if (lpd->obj3) LEPT_FREE(lpd->obj3);
2570 if (lpd->obj4) LEPT_FREE(lpd->obj4);
2571 if (lpd->obj5) LEPT_FREE(lpd->obj5);
2572 if (lpd->poststream) LEPT_FREE(lpd->poststream);
2573 if (lpd->trailer) LEPT_FREE(lpd->trailer);
2574 if (lpd->xy) ptaDestroy(&lpd->xy);
2575 if (lpd->wh) ptaDestroy(&lpd->wh);
2576 if (lpd->mediabox) boxDestroy(&lpd->mediabox);
2577 if (lpd->saprex) sarrayDestroy(&lpd->saprex);
2578 if (lpd->sacmap) sarrayDestroy(&lpd->sacmap);
2579 if (lpd->objsize) l_dnaDestroy(&lpd->objsize);
2580 if (lpd->objloc) l_dnaDestroy(&lpd->objloc);
2581 LEPT_FREE(lpd);
2582 *plpd = NULL;
2583 }
2584
2585
2586 static L_COMP_DATA *
2587 pdfdataGetCid(L_PDF_DATA *lpd,
2588 l_int32 index)
2589 {
2590 if (!lpd)
2591 return (L_COMP_DATA *)ERROR_PTR("lpd not defined", __func__, NULL);
2592 if (index < 0 || index >= lpd->n)
2593 return (L_COMP_DATA *)ERROR_PTR("invalid image index", __func__, NULL);
2594
2595 return (L_COMP_DATA *)ptraGetPtrToItem(lpd->cida, index);
2596 }
2597
2598
2599 /*---------------------------------------------------------------------*
2600 * Find number of pages in a pdf *
2601 *---------------------------------------------------------------------*/
2602 /*!
2603 * \brief getPdfPageCount()
2604 *
2605 * \param[in] fname filename
2606 * \param[out] pnpages number of pages
2607 * \return 0 if OK, 1 on error
2608 *
2609 * <pre>
2610 * Notes:
2611 * (1) Looks for the argument of the first instance of /Count in the file.
2612 * (2) This first reads 10000 bytes from the beginning of the file.
2613 * If "/Count" is not in that string, it reads the entire file
2614 * and looks for "/Count".
2615 * (3) This will not work on encrypted pdf files or on files where
2616 * the "/Count" field is binary compressed. Not finding the
2617 * "/Count" field is not an error, but a warning is given.
2618 * </pre>
2619 */
2620 l_ok
2621 getPdfPageCount(const char *fname,
2622 l_int32 *pnpages)
2623 {
2624 l_uint8 *data;
2625 l_int32 format, loc, ret, npages, found;
2626 size_t nread;
2627
2628 if (!pnpages)
2629 return ERROR_INT("&npages not defined", __func__, 1);
2630 *pnpages = 0;
2631 if (!fname)
2632 return ERROR_INT("fname not defined", __func__, 1);
2633
2634 /* Make sure this a pdf file */
2635 findFileFormat(fname, &format);
2636 if (format != IFF_LPDF)
2637 return ERROR_INT("file is not pdf", __func__, 1);
2638
2639 /* Read 10000 bytes from the beginning of the file */
2640 if ((data = l_binaryReadSelect(fname, 0, 10000, &nread))
2641 == NULL)
2642 return ERROR_INT("partial data not read", __func__, 1);
2643
2644 /* Find the location of the first instance of "/Count".
2645 * If it is not found, try reading the entire file and
2646 * looking again. */
2647 arrayFindSequence(data, nread, (const l_uint8 *)"/Count",
2648 strlen("/Count"), &loc, &found);
2649 if (!found) {
2650 lept_stderr("Reading entire file looking for '/Count'\n");
2651 LEPT_FREE(data);
2652 if ((data = l_binaryRead(fname, &nread)) == NULL)
2653 return ERROR_INT("full data not read", __func__, 1);
2654 arrayFindSequence(data, nread, (const l_uint8 *)"/Count",
2655 strlen("/Count"), &loc, &found);
2656 if (!found) {
2657 LEPT_FREE(data);
2658 L_WARNING("/Count not found\n", __func__);
2659 return 0;
2660 }
2661 }
2662
2663 /* Unlikely: make sure we can read the count field */
2664 if (nread - loc < 12) { /* haven't read enough to capture page count */
2665 LEPT_FREE(data);
2666 return ERROR_INT("data may not include page count field", __func__, 1);
2667 }
2668
2669 /* Read the page count; if not found, puts garbage in npages */
2670 ret = sscanf((char *)&data[loc], "/Count %d", &npages);
2671 LEPT_FREE(data);
2672 if (ret != 1)
2673 return ERROR_INT("npages not found", __func__, 1);
2674 *pnpages = npages;
2675 /* lept_stderr("bytes read = %d, loc = %d, npages = %d\n",
2676 nread, loc, *pnpages); */
2677 return 0;
2678 }
2679
2680
2681 /*---------------------------------------------------------------------*
2682 * Find widths and heights of pages and media boxes in a pdf *
2683 *---------------------------------------------------------------------*/
2684 /*!
2685 * \brief getPdfPageSizes()
2686 *
2687 * \param[in] fname filename
2688 * \param[out] pnaw [optional] array of page widths
2689 * \param[out] pnah [optional] array of page heights
2690 * \param[out] pmedw [optional] median page width
2691 * \param[out] pmedh [optional] median page height
2692 * \return 0 if OK, 1 on error
2693 *
2694 * <pre>
2695 * Notes:
2696 * (1) Finds the arguments of each instance of '/Width' and '/Height'
2697 * in the file.
2698 * (2) This will not work on encrypted pdf files or on files where
2699 * the "/Width" and "/Height" fields are binary compressed.
2700 * Not finding the "/Width" and /Height" fields is not an error,
2701 * but a warning is given.
2702 * </pre>
2703 */
2704 l_ok
2705 getPdfPageSizes(const char *fname,
2706 NUMA **pnaw,
2707 NUMA **pnah,
2708 l_int32 *pmedw,
2709 l_int32 *pmedh)
2710 {
2711 l_uint8 *data;
2712 l_int32 i, nw, nh, format, ret, loc, width, height;
2713 l_float32 fval;
2714 size_t nread;
2715 L_DNA *dnaw; /* width locations */
2716 L_DNA *dnah; /* height locations */
2717 NUMA *naw; /* widths */
2718 NUMA *nah; /* heights */
2719
2720 if (pnaw) *pnaw = NULL;
2721 if (pnah) *pnah = NULL;
2722 if (pmedw) *pmedw = 0;
2723 if (pmedh) *pmedh = 0;
2724 if (!pnaw && !pnah && !pmedw && !pmedh)
2725 return ERROR_INT("no output requested", __func__, 1);
2726 if (!fname)
2727 return ERROR_INT("fname not defined", __func__, 1);
2728
2729 /* Make sure this a pdf file */
2730 findFileFormat(fname, &format);
2731 if (format != IFF_LPDF)
2732 return ERROR_INT("file is not pdf", __func__, 1);
2733
2734 /* Read the file into memory and find all locations of
2735 * '/Width' and '/Height' */
2736 if ((data = l_binaryRead(fname, &nread)) == NULL)
2737 return ERROR_INT("full data not read", __func__, 1);
2738 dnaw = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Width",
2739 strlen("/Width"));
2740 dnah = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Height",
2741 strlen("/Height"));
2742 if (!dnaw)
2743 L_WARNING("unable to find widths\n", __func__);
2744 if (!dnah)
2745 L_WARNING("unable to find heights\n", __func__);
2746 if (!dnaw && !dnah) {
2747 LEPT_FREE(data);
2748 L_WARNING("no fields found\n", __func__);
2749 return 0;
2750 }
2751
2752 /* Find the page widths and heights */
2753 nw = l_dnaGetCount(dnaw);
2754 naw = numaCreate(nw);
2755 for (i = 0; i < nw; i++) {
2756 l_dnaGetIValue(dnaw, i, &loc);
2757 ret = sscanf((char *)&data[loc], "/Width %d", &width);
2758 if (ret != 1) {
2759 L_ERROR("width not found for item %d at loc %d\n",
2760 __func__, i, loc);
2761 continue;
2762 }
2763 numaAddNumber(naw, width);
2764 }
2765 nh = l_dnaGetCount(dnah);
2766 nah = numaCreate(nh);
2767 for (i = 0; i < nh; i++) {
2768 l_dnaGetIValue(dnah, i, &loc);
2769 ret = sscanf((char *)&data[loc], "/Height %d", &height);
2770 if (ret != 1) {
2771 L_ERROR("height not found for item %d at loc %d\n",
2772 __func__, i, loc);
2773 continue;
2774 }
2775 numaAddNumber(nah, height);
2776 }
2777
2778 LEPT_FREE(data);
2779 l_dnaDestroy(&dnaw);
2780 l_dnaDestroy(&dnah);
2781 if (pmedw) {
2782 numaGetMedian(naw, &fval);
2783 *pmedw = lept_roundftoi(fval);
2784 }
2785 if (pnaw)
2786 *pnaw = naw;
2787 else
2788 numaDestroy(&naw);
2789 if (pmedh) {
2790 numaGetMedian(nah, &fval);
2791 *pmedh = lept_roundftoi(fval);
2792 }
2793 if (pnah)
2794 *pnah = nah;
2795 else
2796 numaDestroy(&nah);
2797 return 0;
2798 }
2799
2800
2801 /*!
2802 * \brief getPdfMediaBoxSizes()
2803 *
2804 * \param[in] fname filename
2805 * \param[out] pnaw [optional] array of mediabox widths
2806 * \param[out] pnah [optional] array of mediabox heights
2807 * \param[out] pmedw [optional] median mediabox width
2808 * \param[out] pmedh [optional] median mediabox height
2809 * \return 0 if OK, 1 on error
2810 *
2811 * <pre>
2812 * Notes:
2813 * (1) Finds the arguments of each instance of '/MediaBox' in the file.
2814 * (2) This will not work on encrypted pdf files or on files where
2815 * the "/MediaBoxes" field is binary compressed. Not finding
2816 * the "/MediaBoxes" field is not an error, but a warning is given.
2817 * (3) This is useful for determining if the media boxes are
2818 * incorrectly assigned, such as assuming the resolution is 72 ppi.
2819 * If that happens and the input the the renderer assumes the
2820 * resolution is 300 ppi, the rendered images will be over 4x too
2821 * large in each dimension.
2822 * (4) An image dimension of 11 inches corresponds to a MediaBox
2823 * parameter of 792. We consider a value > 850 to be oversized
2824 * and not to be taken literally.
2825 * </pre>
2826 */
2827 l_ok
2828 getPdfMediaBoxSizes(const char *fname,
2829 NUMA **pnaw,
2830 NUMA **pnah,
2831 l_int32 *pmedw,
2832 l_int32 *pmedh)
2833 {
2834 l_uint8 *data;
2835 l_int32 i, n, format, ret, loc;
2836 l_float32 fval, ignore1, ignore2, w, h;
2837 size_t nread;
2838 L_DNA *dna; /* mediabox locations */
2839 NUMA *naw; /* mediabox widths */
2840 NUMA *nah; /* mediabox heights */
2841
2842 if (pnaw) *pnaw = NULL;
2843 if (pnah) *pnah = NULL;
2844 if (pmedw) *pmedw = 0;
2845 if (pmedh) *pmedh = 0;
2846 if (!pnaw && !pnah && !pmedw && !pmedh)
2847 return ERROR_INT("no output requested", __func__, 1);
2848 if (!fname)
2849 return ERROR_INT("fname not defined", __func__, 1);
2850
2851 /* Make sure this a pdf file */
2852 findFileFormat(fname, &format);
2853 if (format != IFF_LPDF)
2854 return ERROR_INT("file is not pdf", __func__, 1);
2855
2856 /* Read the file into memory and find all locations of '/MediaBox' */
2857 if ((data = l_binaryRead(fname, &nread)) == NULL)
2858 return ERROR_INT("full data not read", __func__, 1);
2859 dna = arrayFindEachSequence(data, nread, (const l_uint8 *)"/MediaBox",
2860 strlen("/MediaBox"));
2861 if (!dna) {
2862 LEPT_FREE(data);
2863 L_WARNING("no mediaboxes found\n", __func__);
2864 return 1;
2865 }
2866
2867 /* Find the mediabox widths and heights */
2868 n = l_dnaGetCount(dna);
2869 naw = numaCreate(n);
2870 nah = numaCreate(n);
2871 for (i = 0; i < n; i++) {
2872 l_dnaGetIValue(dna, i, &loc);
2873 ret = sscanf((char *)&data[loc], "/MediaBox [ %f %f %f %f",
2874 &ignore1, &ignore2, &w, &h);
2875 if (ret != 4) {
2876 L_ERROR("mediabox sizes not found for item %d at loc %d\n",
2877 __func__, i, loc);
2878 continue;
2879 }
2880 numaAddNumber(naw, w);
2881 numaAddNumber(nah, h);
2882 }
2883 LEPT_FREE(data);
2884 l_dnaDestroy(&dna);
2885
2886 if (pmedw) {
2887 numaGetMedian(naw, &fval);
2888 *pmedw = lept_roundftoi(fval);
2889 if (*pmedw > 850) lept_stderr("oversize width: %d\n", *pmedw);
2890 }
2891 if (pnaw)
2892 *pnaw = naw;
2893 else
2894 numaDestroy(&naw);
2895 if (pmedh) {
2896 numaGetMedian(nah, &fval);
2897 *pmedh = lept_roundftoi(fval);
2898 if (*pmedh > 850) lept_stderr("oversize height: %d\n", *pmedh);
2899 }
2900 if (pnah)
2901 *pnah = nah;
2902 else
2903 numaDestroy(&nah);
2904 return 0;
2905 }
2906
2907
2908 /*---------------------------------------------------------------------*
2909 * Find effective resolution of images rendered from a pdf *
2910 *---------------------------------------------------------------------*/
2911 /*!
2912 * \brief getPdfRendererResolution()
2913 *
2914 * \param[in] infile filename of input pdf file
2915 * \param[in] outdir directory of rendered output images
2916 * \param[out] pres desired resolution to use with renderer
2917 * \return 0 if OK, 1 on error
2918 *
2919 * <pre>
2920 * Notes:
2921 * (1) Finds the input resolution to pdftoppm that will generate
2922 * images with a maximum dimension of about 3300 pixels,
2923 * representing a full page at 300 ppi.
2924 * (2) It is most important to make sure the renderer does
2925 * not make huge images because of an error in /MediaBox.
2926 * An image dimension of 11 inches corresponds to a MediaBox
2927 * parameter of 792. We consider a value > 850 to be oversized
2928 * and not to be taken literally. If the mediaboxes are
2929 * oversized, choose an appropriate lower resolution.
2930 * (3) If the mediaboxes are not accessible, render an image at
2931 * a low known resolution (say, 72 ppi) and based on the image
2932 * size, determine the resolution necessary to make an image
2933 * with 3300 pixels in the largest dimension.
2934 * (4) Requires pdftoppm, so this is disabled on windows for now.
2935 * (5) Requires the ability to call an external program, so it is
2936 * necessary to call setLeptDebugOK(1) before this function.
2937 * </pre>
2938 */
2939 l_ok
2940 getPdfRendererResolution(const char *infile,
2941 const char *outdir,
2942 l_int32 *pres)
2943 {
2944 char buf[256];
2945 char *tail, *basename, *fname;
2946 l_int32 ret, res, medw, medh, medmax, npages, pageno, w, h;
2947 SARRAY *sa;
2948
2949 if (!pres)
2950 return ERROR_INT("&res not defined", __func__, 1);
2951 *pres = 300; /* default */
2952
2953 #ifdef _WIN32
2954 L_INFO("Requires pdftoppm, so this is disabled on windows.\n"
2955 "Returns default resolution 300 ppi", __func__);
2956 return 0;
2957 #endif /* _WIN32 */
2958
2959 if (!LeptDebugOK) {
2960 L_INFO("Running pdftoppm is disabled; "
2961 "use setLeptDebugOK(1) to enable\n"
2962 "returns default resolution 300 ppi\n", __func__);
2963 return 1;
2964 }
2965
2966 if (!infile)
2967 return ERROR_INT("infile not defined", __func__, 1);
2968 if (!outdir)
2969 return ERROR_INT("outdir not defined", __func__, 1);
2970
2971 res = 300; /* default value */
2972 ret = getPdfMediaBoxSizes(infile, NULL, NULL, &medw, &medh);
2973 if (ret == 0) { /* Check for oversize mediaboxes */
2974 lept_stderr("Media Box medians: medw = %d, medh = %d\n", medw, medh);
2975 medmax = L_MAX(medw, medh);
2976 if (medmax > 850) {
2977 res = 300 * ((l_float32)792 / (l_float32)medmax);
2978 lept_stderr(" Oversize media box; use resolution = %d\n", res);
2979 *pres = res;
2980 }
2981 return 0;
2982 }
2983
2984 /* No mediaboxes; render one page and measure the max dimension */
2985 lept_stderr("Media Box dimensions not found\n");
2986 getPdfPageCount(infile, &npages);
2987 pageno = (npages > 0) ? (npages + 1) / 2 : 1;
2988 splitPathAtDirectory(infile, NULL, &tail);
2989 splitPathAtExtension(tail, &basename, NULL);
2990 snprintf(buf, sizeof(buf), "pdftoppm -f %d -l %d -r 72 %s %s/%s",
2991 pageno, pageno, infile, outdir, basename);
2992 LEPT_FREE(tail);
2993 LEPT_FREE(basename);
2994 callSystemDebug(buf); /* pdftoppm */
2995
2996 /* Get the page size */
2997 sa = getSortedPathnamesInDirectory(outdir, NULL, 0, 0);
2998 fname = sarrayGetString(sa, 0, L_NOCOPY);
2999 pixReadHeader(fname, NULL, &w, &h, NULL, NULL, NULL);
3000 sarrayDestroy(&sa);
3001 if (w > 0 && h > 0) {
3002 res = L_MIN((72 * 3300 / L_MAX(w, h)), 600);
3003 *pres = res;
3004 lept_stderr("Use resolution = %d\n", res);
3005 } else {
3006 L_ERROR("page size not found; assuming res = 300\n", __func__);
3007 }
3008
3009 return 0;
3010 }
3011
3012
3013 /*---------------------------------------------------------------------*
3014 * Set flags for special modes *
3015 *---------------------------------------------------------------------*/
3016 /*!
3017 * \brief l_pdfSetG4ImageMask()
3018 *
3019 * \param[in] flag 1 for writing g4 data as fg only through a mask;
3020 * 0 for writing fg and bg
3021 * \return void
3022 *
3023 * <pre>
3024 * Notes:
3025 * (1) The default is for writing only the fg (through the mask).
3026 * That way when you write a 1 bpp image, the bg is transparent,
3027 * so any previously written image remains visible behind it.
3028 * </pre>
3029 */
3030 void
3031 l_pdfSetG4ImageMask(l_int32 flag)
3032 {
3033 var_WRITE_G4_IMAGE_MASK = flag;
3034 }
3035
3036
3037 /*!
3038 * \brief l_pdfSetDateAndVersion()
3039 *
3040 * \param[in] flag 1 for writing date/time and leptonica version;
3041 * 0 for omitting this from the metadata
3042 * \return void
3043 *
3044 * <pre>
3045 * Notes:
3046 * (1) The default is for writing this data. For regression tests
3047 * that compare output against golden files, it is useful to omit.
3048 * </pre>
3049 */
3050 void
3051 l_pdfSetDateAndVersion(l_int32 flag)
3052 {
3053 var_WRITE_DATE_AND_VERSION = flag;
3054 }
3055
3056 /* --------------------------------------------*/
3057 #endif /* USE_PDFIO */
3058 /* --------------------------------------------*/