comparison mupdf-source/thirdparty/leptonica/src/pdfio1.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file pdfio1.c
29 * <pre>
30 *
31 * Higher-level operations for generating pdf from images.
32 * Use poppler's pdftoppm or pdfimages to invert the process,
33 * extracting raster images from pdf.
34 *
35 * |=============================================================|
36 * | Important notes |
37 * |=============================================================|
38 * | Some of these functions require I/O libraries such as |
39 * | libtiff, libjpeg, libpng, libz and libopenjp2. If you do |
40 * | not have these libraries, some calls will fail. For |
41 * | example, if you do not have libopenjp2, you cannot write a |
42 * | pdf where transcoding is required to incorporate a |
43 * | jp2k image. |
44 * | |
45 * | You can manually deactivate all pdf writing by setting |
46 * | this in environ.h: |
47 * | \code |
48 * | #define USE_PDFIO 0 |
49 * | \endcode |
50 * | This will link the stub file pdfiostub.c. |
51 * |=============================================================|
52 *
53 * Set 1. These functions convert a set of image files
54 * to a multi-page pdf file, with one image on each page.
55 * All images are rendered at the same (input) resolution.
56 * The images can be specified as being in a directory, or they
57 * can be in an sarray. The output pdf can be either a file
58 * or an array of bytes in memory.
59 *
60 * Set 2. These functions are a special case of set 1, where
61 * no scaling or change in quality is required. For jpeg, jp2k and
62 * tiffg4 images, the bytes in each file can be directly incorporated
63 * into the output pdf, and the wrapping up of multiple image
64 * files is very fast. For non-interlaced png, the data bytes
65 * including the predictors can also be written directly into the
66 * flate pdf data. For other image formats transcoding is required,
67 * where the image data is first decompressed and then flate (gzip),
68 * DCT (jpeg) or tiffg4 (1 bpp) encodings are generated.
69 *
70 * Set 3. These functions convert a set of images in memory
71 * to a multi-page pdf, with one image on each page. The pdf
72 * output can be either a file or an array of bytes in memory.
73 *
74 * Set 4. These functions implement a pdf output "device driver"
75 * for wrapping (encoding) any number of images on a single page
76 * in pdf. The input can be either an image file or a Pix;
77 * the pdf output can be either a file or an array of bytes in memory.
78 *
79 * Set 5. These "segmented" functions take a set of image
80 * files, along with optional segmentation information, and
81 * generate a multi-page pdf file, where each page consists
82 * in general of a mixed raster pdf of image and non-image regions.
83 * The segmentation information for each page can be input as
84 * either a mask over the image parts, or as a Boxa of those
85 * regions.
86 *
87 * Set 6. These "segmented" functions convert an image and
88 * an optional Boxa of image regions into a mixed raster pdf file
89 * for the page. The input image can be either a file or a Pix.
90 *
91 * Set 7. These functions take a set of single-page pdf files
92 * and concatenate them into a multi-page pdf. The input can be
93 * a set of either single page pdf files or pdf 'strings' in memory.
94 * The output can be either a file or an array of bytes in memory.
95 *
96 * The images in the pdf file can be rendered using a pdf viewer,
97 * such as evince, gv, xpdf or acroread.
98 *
99 * Reference on the pdf file format:
100 * http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
101 *
102 * 1. Convert specified image files to pdf (one image file per page)
103 * l_int32 convertFilesToPdf()
104 * l_int32 saConvertFilesToPdf()
105 * l_int32 saConvertFilesToPdfData()
106 * l_int32 selectDefaultPdfEncoding()
107 *
108 * 2. Convert specified image files to pdf without scaling
109 * l_int32 convertUnscaledFilesToPdf()
110 * l_int32 saConvertUnscaledFilesToPdf()
111 * l_int32 saConvertUnscaledFilesToPdfData()
112 * l_int32 convertUnscaledToPdfData()
113 *
114 * 3. Convert multiple images to pdf (one image per page)
115 * l_int32 pixaConvertToPdf()
116 * l_int32 pixaConvertToPdfData()
117 *
118 * 4. Single page, multi-image converters
119 * l_int32 convertToPdf()
120 * l_int32 convertImageDataToPdf()
121 * l_int32 convertToPdfData()
122 * l_int32 convertImageDataToPdfData()
123 * l_int32 pixConvertToPdf()
124 * l_int32 pixWriteStreamPdf()
125 * l_int32 pixWriteMemPdf()
126 *
127 * 5. Segmented multi-page, multi-image converter
128 * l_int32 convertSegmentedFilesToPdf()
129 * BOXAA *convertNumberedMasksToBoxaa()
130 *
131 * 6. Segmented single page, multi-image converters
132 * l_int32 convertToPdfSegmented()
133 * l_int32 pixConvertToPdfSegmented()
134 * l_int32 convertToPdfDataSegmented()
135 * l_int32 pixConvertToPdfDataSegmented()
136 *
137 * 7. Multipage concatenation
138 * l_int32 concatenatePdf()
139 * l_int32 saConcatenatePdf()
140 * l_int32 ptraConcatenatePdf()
141 * l_int32 concatenatePdfToData()
142 * l_int32 saConcatenatePdfToData()
143 *
144 * The top-level multi-image functions can be visualized as follows:
145 * Output pdf data to file:
146 * convertToPdf() and convertImageDataToPdf()
147 * --> pixConvertToPdf()
148 * --> pixConvertToPdfData()
149 *
150 * Output pdf data to array in memory:
151 * convertToPdfData() and convertImageDataToPdfData()
152 * --> pixConvertToPdfData()
153 *
154 * The top-level segmented image functions can be visualized as follows:
155 * Output pdf data to file:
156 * convertToPdfSegmented()
157 * --> pixConvertToPdfSegmented()
158 * --> pixConvertToPdfDataSegmented()
159 *
160 * Output pdf data to array in memory:
161 * convertToPdfDataSegmented()
162 * --> pixConvertToPdfDataSegmented()
163 *
164 * For multi-page concatenation, there are three different types of input
165 * (1) directory and optional filename filter
166 * (2) sarray of filenames
167 * (3) ptra of byte arrays of pdf data
168 * and two types of output for the concatenated pdf data
169 * (1) filename
170 * (2) data array and size
171 * High-level interfaces are given for each of the six combinations.
172 *
173 * Note: When wrapping small images into pdf, it is useful to give
174 * them a relatively low resolution value, to avoid rounding errors
175 * when rendering the images. For example, if you want an image
176 * of width w pixels to be 5 inches wide on a screen, choose a
177 * resolution w/5.
178 *
179 * The very fast functions in section (2) require neither transcoding
180 * nor parsing of the compressed jpeg file. With three types of image
181 * compression, the compressed strings can be incorporated into
182 * the pdf data without decompression and re-encoding: jpeg, jp2k
183 * and png. The DCTDecode and JPXDecode filters can handle the
184 * entire jpeg and jp2k encoded string as a byte array in the pdf file.
185 * The FlateDecode filter can handle the png compressed image data,
186 * including predictors that occur as the first byte in each
187 * raster line, but it is necessary to store only the png IDAT chunk
188 * data in the pdf array. The alternative for wrapping png images
189 * is to transcode them: uncompress into a raster (a pix) and then
190 * gzip the raster data. This typically results in a larger pdf file
191 * because it doesn't use the two-dimensional png predictor.
192 * Colormaps, which are found in png PLTE chunks, must always be
193 * pulled out and included separately in the pdf. For CCITT-G4
194 * compression, you can not simply include a tiff G4 file -- you must
195 * either parse it and extract the G4 compressed data within it,
196 * or uncompress to a raster and G4 compress again.
197 * </pre>
198 */
199
200 #ifdef HAVE_CONFIG_H
201 #include <config_auto.h>
202 #endif /* HAVE_CONFIG_H */
203
204 #include <string.h>
205 #include <math.h>
206 #include "allheaders.h"
207
208 /* --------------------------------------------*/
209 #if USE_PDFIO /* defined in environ.h */
210 /* --------------------------------------------*/
211
212 /* Typical scan resolution in ppi (pixels/inch) */
213 static const l_int32 DefaultInputRes = 300;
214
215 /*---------------------------------------------------------------------*
216 * Convert specified image files to pdf (one image file per page) *
217 *---------------------------------------------------------------------*/
218 /*!
219 * \brief convertFilesToPdf()
220 *
221 * \param[in] dirname directory name containing images
222 * \param[in] substr [optional] substring filter on filenames;
223 * can be null
224 * \param[in] res input resolution of all images
225 * \param[in] scalefactor scaling factor applied to each image; > 0.0
226 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
227 * L_FLATE_ENCODE, L_JP2K_ENCODE or
228 * L_DEFAULT_ENCODE for default)
229 * \param[in] quality for jpeg: 1-100; 0 for default (75)
230 * for jp2k: 27-45; 0 for default (34)
231 * \param[in] title [optional] pdf title; can be null
232 * \param[in] fileout pdf file of all images
233 * \return 0 if OK, 1 on error
234 *
235 * <pre>
236 * Notes:
237 * (1) If %substr is not NULL, only image filenames that contain
238 * the substring can be used. If %substr == NULL, all files
239 * in the directory are used.
240 * (2) The files in the directory, after optional filtering by
241 * the substring, are lexically sorted in increasing order
242 * before concatenation.
243 * (3) The scalefactor is applied to each image before encoding.
244 * If you enter a value <= 0.0, it will be set to 1.0.
245 * (4) Specifying one of the four encoding types for %type forces
246 * all images to be compressed with that type. Use 0 to have
247 * the type determined for each image based on depth and whether
248 * or not it has a colormap.
249 * </pre>
250 */
251 l_ok
252 convertFilesToPdf(const char *dirname,
253 const char *substr,
254 l_int32 res,
255 l_float32 scalefactor,
256 l_int32 type,
257 l_int32 quality,
258 const char *title,
259 const char *fileout)
260 {
261 l_int32 ret;
262 SARRAY *sa;
263
264 if (!dirname)
265 return ERROR_INT("dirname not defined", __func__, 1);
266 if (!fileout)
267 return ERROR_INT("fileout not defined", __func__, 1);
268
269 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
270 return ERROR_INT("sa not made", __func__, 1);
271 ret = saConvertFilesToPdf(sa, res, scalefactor, type, quality,
272 title, fileout);
273 sarrayDestroy(&sa);
274 return ret;
275 }
276
277
278 /*!
279 * \brief saConvertFilesToPdf()
280 *
281 * \param[in] sa string array of pathnames for images
282 * \param[in] res input resolution of all images
283 * \param[in] scalefactor scaling factor applied to each image; > 0.0
284 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
285 * L_FLATE_ENCODE, L_JP2K_ENCODE or
286 * L_DEFAULT_ENCODE for default)
287 * \param[in] quality for jpeg: 1-100; 0 for default (75)
288 * for jp2k: 27-45; 0 for default (34)
289 * \param[in] title [optional] pdf title; can be null
290 * \param[in] fileout pdf file of all images
291 * \return 0 if OK, 1 on error
292 *
293 * <pre>
294 * Notes:
295 * (1) See convertFilesToPdf().
296 * </pre>
297 */
298 l_ok
299 saConvertFilesToPdf(SARRAY *sa,
300 l_int32 res,
301 l_float32 scalefactor,
302 l_int32 type,
303 l_int32 quality,
304 const char *title,
305 const char *fileout)
306 {
307 l_uint8 *data;
308 l_int32 ret;
309 size_t nbytes;
310
311 if (!sa)
312 return ERROR_INT("sa not defined", __func__, 1);
313
314 ret = saConvertFilesToPdfData(sa, res, scalefactor, type, quality,
315 title, &data, &nbytes);
316 if (ret) {
317 if (data) LEPT_FREE(data);
318 return ERROR_INT("pdf data not made", __func__, 1);
319 }
320
321 ret = l_binaryWrite(fileout, "w", data, nbytes);
322 LEPT_FREE(data);
323 if (ret)
324 L_ERROR("pdf data not written to file\n", __func__);
325 return ret;
326 }
327
328
329 /*!
330 * \brief saConvertFilesToPdfData()
331 *
332 * \param[in] sa string array of pathnames for images
333 * \param[in] res input resolution of all images
334 * \param[in] scalefactor scaling factor applied to each image; > 0.0
335 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
336 * L_FLATE_ENCODE, L_JP2K_ENCODE or
337 * L_DEFAULT_ENCODE for default)
338 * \param[in] quality for jpeg: 1-100; 0 for default (75)
339 * for jp2k: 27-45; 0 for default (34)
340 * \param[in] title [optional] pdf title; can be null
341 * \param[out] pdata output pdf data (of all images
342 * \param[out] pnbytes size of output pdf data
343 * \return 0 if OK, 1 on error
344 *
345 * <pre>
346 * Notes:
347 * (1) See convertFilesToPdf().
348 * </pre>
349 */
350 l_ok
351 saConvertFilesToPdfData(SARRAY *sa,
352 l_int32 res,
353 l_float32 scalefactor,
354 l_int32 type,
355 l_int32 quality,
356 const char *title,
357 l_uint8 **pdata,
358 size_t *pnbytes)
359 {
360 char *fname;
361 l_uint8 *imdata;
362 l_int32 i, n, ret, pagetype, npages, scaledres;
363 size_t imbytes;
364 L_BYTEA *ba;
365 PIX *pixs, *pix;
366 L_PTRA *pa_data;
367
368 if (!pdata)
369 return ERROR_INT("&data not defined", __func__, 1);
370 *pdata = NULL;
371 if (!pnbytes)
372 return ERROR_INT("&nbytes not defined", __func__, 1);
373 *pnbytes = 0;
374 if (!sa)
375 return ERROR_INT("sa not defined", __func__, 1);
376 if (scalefactor <= 0.0) scalefactor = 1.0;
377 if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
378 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
379 type = L_DEFAULT_ENCODE;
380 }
381
382 /* Generate all the encoded pdf strings */
383 n = sarrayGetCount(sa);
384 pa_data = ptraCreate(n);
385 for (i = 0; i < n; i++) {
386 if (i && (i % 10 == 0)) lept_stderr(".. %d ", i);
387 fname = sarrayGetString(sa, i, L_NOCOPY);
388 if ((pixs = pixRead(fname)) == NULL) {
389 L_ERROR("image not readable from file %s\n", __func__, fname);
390 continue;
391 }
392 if (scalefactor != 1.0)
393 pix = pixScale(pixs, scalefactor, scalefactor);
394 else
395 pix = pixClone(pixs);
396 pixDestroy(&pixs);
397 scaledres = (l_int32)(res * scalefactor);
398
399 /* Select the encoding type */
400 if (type != L_DEFAULT_ENCODE) {
401 pagetype = type;
402 } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
403 pixDestroy(&pix);
404 L_ERROR("encoding type selection failed for file %s\n",
405 __func__, fname);
406 continue;
407 }
408
409 ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
410 0, 0, scaledres, title, NULL, 0);
411 pixDestroy(&pix);
412 if (ret) {
413 LEPT_FREE(imdata);
414 L_ERROR("pdf encoding failed for %s\n", __func__, fname);
415 continue;
416 }
417 ba = l_byteaInitFromMem(imdata, imbytes);
418 LEPT_FREE(imdata);
419 ptraAdd(pa_data, ba);
420 }
421 ptraGetActualCount(pa_data, &npages);
422 if (npages == 0) {
423 L_ERROR("no pdf files made\n", __func__);
424 ptraDestroy(&pa_data, FALSE, FALSE);
425 return 1;
426 }
427
428 /* Concatenate them */
429 lept_stderr("\nconcatenating ... ");
430 ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
431 lept_stderr("done\n");
432
433 ptraGetActualCount(pa_data, &npages); /* recalculate in case it changes */
434 for (i = 0; i < npages; i++) {
435 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
436 l_byteaDestroy(&ba);
437 }
438 ptraDestroy(&pa_data, FALSE, FALSE);
439 return ret;
440 }
441
442
443 /*!
444 * \brief selectDefaultPdfEncoding()
445 *
446 * \param[in] pix
447 * \param[out] ptype L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
448 * \return 0 if OK, 1 on error
449 *
450 * <pre>
451 * Notes:
452 * (1) This attempts to choose an encoding for the pix that results
453 * in the smallest file, assuming that if jpeg encoded, it will
454 * use quality = 75. The decision is approximate, in that
455 * (a) all colormapped images will be losslessly encoded with
456 * gzip (flate), and (b) an image with less than about 20 colors
457 * is likely to be smaller if flate encoded than if encoded
458 * as a jpeg (dct). For example, an image made by pixScaleToGray3()
459 * will have 10 colors, and flate encoding will give about
460 * twice the compression as jpeg with quality = 75.
461 * (2) We could have used L_JP2K_ENCODE instead of L_JPEG_ENCODE.
462 * However, the jp2k compression is not much better than jpeg, and
463 * the jpeg library is more commonly available than the jp2k library.
464 * </pre>
465 */
466 l_ok
467 selectDefaultPdfEncoding(PIX *pix,
468 l_int32 *ptype)
469 {
470 l_int32 w, h, d, factor, ncolors;
471 PIXCMAP *cmap;
472
473 if (!ptype)
474 return ERROR_INT("&type not defined", __func__, 1);
475 *ptype = L_FLATE_ENCODE; /* default universal encoding */
476 if (!pix)
477 return ERROR_INT("pix not defined", __func__, 1);
478 pixGetDimensions(pix, &w, &h, &d);
479 cmap = pixGetColormap(pix);
480 if (d == 8 && !cmap) {
481 factor = L_MAX(1, (l_int32)sqrt((l_float64)(w * h) / 20000.));
482 pixNumColors(pix, factor, &ncolors);
483 if (ncolors < 20)
484 *ptype = L_FLATE_ENCODE;
485 else
486 *ptype = L_JPEG_ENCODE;
487 } else if (d == 1) {
488 *ptype = L_G4_ENCODE;
489 } else if (cmap || d == 2 || d == 4) {
490 *ptype = L_FLATE_ENCODE;
491 } else if (d == 8 || d == 32) {
492 *ptype = L_JPEG_ENCODE;
493 } else if (d == 16) {
494 *ptype = L_FLATE_ENCODE;
495 } else {
496 return ERROR_INT("type selection failure", __func__, 1);
497 }
498
499 return 0;
500 }
501
502
503 /*---------------------------------------------------------------------*
504 * Convert specified image files to pdf without scaling *
505 *---------------------------------------------------------------------*/
506 /*!
507 * \brief convertUnscaledFilesToPdf()
508 *
509 * \param[in] dirname directory name containing images
510 * \param[in] substr [optional] substring filter on filenames;
511 * can be null
512 * \param[in] title [optional] pdf title; can be null
513 * \param[in] fileout pdf file of all images
514 * \return 0 if OK, 1 on error
515 *
516 * <pre>
517 * Notes:
518 * (1) If %substr is not NULL, only image filenames that contain
519 * the substring can be used. If %substr == NULL, all files
520 * in the directory are used.
521 * (2) The files in the directory, after optional filtering by
522 * the substring, are lexically sorted in increasing order
523 * before concatenation.
524 * (3) This is very fast for jpeg, jp2k and some png files,
525 * because the compressed data is wrapped up and concatenated.
526 * For other types of png, the images must be read and recompressed.
527 * </pre>
528 */
529 l_ok
530 convertUnscaledFilesToPdf(const char *dirname,
531 const char *substr,
532 const char *title,
533 const char *fileout)
534 {
535 l_int32 ret;
536 SARRAY *sa;
537
538 if (!dirname)
539 return ERROR_INT("dirname not defined", __func__, 1);
540 if (!fileout)
541 return ERROR_INT("fileout not defined", __func__, 1);
542
543 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
544 return ERROR_INT("sa not made", __func__, 1);
545 ret = saConvertUnscaledFilesToPdf(sa, title, fileout);
546 sarrayDestroy(&sa);
547 return ret;
548 }
549
550
551 /*!
552 * \brief saConvertUnscaledFilesToPdf()
553 *
554 * \param[in] sa string array of pathnames for images
555 * \param[in] title [optional] pdf title; can be null
556 * \param[in] fileout pdf file of all images
557 * \return 0 if OK, 1 on error
558 *
559 * <pre>
560 * Notes:
561 * (1) See convertUnscaledFilesToPdf().
562 * </pre>
563 */
564 l_ok
565 saConvertUnscaledFilesToPdf(SARRAY *sa,
566 const char *title,
567 const char *fileout)
568 {
569 l_uint8 *data;
570 l_int32 ret;
571 size_t nbytes;
572
573 if (!sa)
574 return ERROR_INT("sa not defined", __func__, 1);
575
576 ret = saConvertUnscaledFilesToPdfData(sa, title, &data, &nbytes);
577 if (ret) {
578 if (data) LEPT_FREE(data);
579 return ERROR_INT("pdf data not made", __func__, 1);
580 }
581
582 ret = l_binaryWrite(fileout, "w", data, nbytes);
583 LEPT_FREE(data);
584 if (ret)
585 L_ERROR("pdf data not written to file\n", __func__);
586 return ret;
587 }
588
589
590 /*!
591 * \brief saConvertUnscaledFilesToPdfData()
592 *
593 * \param[in] sa string array of pathnames for image files
594 * \param[in] title [optional] pdf title; can be null
595 * \param[out] pdata output pdf data (of all images)
596 * \param[out] pnbytes size of output pdf data
597 * \return 0 if OK, 1 on error
598 *
599 * <pre>
600 * Notes:
601 * (1) This is very fast for jpeg, jp2k and some png files,
602 * because the compressed data is wrapped up and concatenated.
603 * For other types of png, the images must be read and recompressed.
604 * </pre>
605 */
606 l_ok
607 saConvertUnscaledFilesToPdfData(SARRAY *sa,
608 const char *title,
609 l_uint8 **pdata,
610 size_t *pnbytes)
611 {
612 char *fname;
613 l_uint8 *imdata;
614 l_int32 i, n, ret, npages;
615 size_t imbytes;
616 L_BYTEA *ba;
617 L_PTRA *pa_data;
618
619 if (!pdata)
620 return ERROR_INT("&data not defined", __func__, 1);
621 *pdata = NULL;
622 if (!pnbytes)
623 return ERROR_INT("&nbytes not defined", __func__, 1);
624 *pnbytes = 0;
625 if (!sa)
626 return ERROR_INT("sa not defined", __func__, 1);
627
628 /* Generate all the encoded pdf strings */
629 n = sarrayGetCount(sa);
630 pa_data = ptraCreate(n);
631 for (i = 0; i < n; i++) {
632 if (i && (i % 10 == 0)) lept_stderr(".. %d ", i);
633 fname = sarrayGetString(sa, i, L_NOCOPY);
634
635 /* Generate the pdf data */
636 if (convertUnscaledToPdfData(fname, title, &imdata, &imbytes))
637 continue;
638
639 /* ... and add it to the array of single page data */
640 ba = l_byteaInitFromMem(imdata, imbytes);
641 if (imdata) LEPT_FREE(imdata);
642 ptraAdd(pa_data, ba);
643 }
644 ptraGetActualCount(pa_data, &npages);
645 if (npages == 0) {
646 L_ERROR("no pdf files made\n", __func__);
647 ptraDestroy(&pa_data, FALSE, FALSE);
648 return 1;
649 }
650
651 /* Concatenate to generate a multipage pdf */
652 lept_stderr("\nconcatenating ... ");
653 ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
654 lept_stderr("done\n");
655
656 /* Clean up */
657 ptraGetActualCount(pa_data, &npages); /* maybe failed to read some files */
658 for (i = 0; i < npages; i++) {
659 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
660 l_byteaDestroy(&ba);
661 }
662 ptraDestroy(&pa_data, FALSE, FALSE);
663 return ret;
664 }
665
666
667 /*!
668 * \brief convertUnscaledToPdfData()
669 *
670 * \param[in] fname of image file in all formats
671 * \param[in] title [optional] pdf title; can be null
672 * \param[out] pdata output pdf data for image
673 * \param[out] pnbytes size of output pdf data
674 * \return 0 if OK, 1 on error
675 *
676 * <pre>
677 * Notes:
678 * (1) This is very fast for jpeg, jp2k and some png files,
679 * because the compressed data is wrapped up and concatenated.
680 * For other types of png, the images must be read and recompressed.
681 * </pre>
682 */
683 l_ok
684 convertUnscaledToPdfData(const char *fname,
685 const char *title,
686 l_uint8 **pdata,
687 size_t *pnbytes)
688 {
689 l_int32 format;
690 L_COMP_DATA *cid;
691
692 if (!pdata)
693 return ERROR_INT("&data not defined", __func__, 1);
694 *pdata = NULL;
695 if (!pnbytes)
696 return ERROR_INT("&nbytes not defined", __func__, 1);
697 *pnbytes = 0;
698 if (!fname)
699 return ERROR_INT("fname not defined", __func__, 1);
700
701 findFileFormat(fname, &format);
702 if (format == IFF_UNKNOWN) {
703 L_WARNING("file %s format is unknown; skip\n", __func__, fname);
704 return 1;
705 }
706 if (format == IFF_PS || format == IFF_LPDF) {
707 L_WARNING("file %s format is %d; skip\n", __func__, fname, format);
708 return 1;
709 }
710
711 /* Generate the image data required for pdf generation, always
712 * in binary (not ascii85) coding. Note that jpeg, jp2k and some
713 * png files are not transcoded. */
714 l_generateCIDataForPdf(fname, NULL, 0, &cid);
715 if (!cid) {
716 L_ERROR("file %s format is %d; unreadable\n", __func__, fname, format);
717 return 1;
718 }
719
720 /* Generate the pdf string for this page (image). This destroys
721 * the cid by attaching it to an lpd and destroying the lpd. */
722 cidConvertToPdfData(cid, title, pdata, pnbytes);
723 return 0;
724 }
725
726
727 /*---------------------------------------------------------------------*
728 * Convert multiple images to pdf (one image per page) *
729 *---------------------------------------------------------------------*/
730 /*!
731 * \brief pixaConvertToPdf()
732 *
733 * \param[in] pixa containing images all at the same resolution
734 * \param[in] res override the resolution of each input image,
735 * in ppi; use 0 to respect the resolution
736 * embedded in the input images
737 * \param[in] scalefactor scaling factor applied to each image; > 0.0
738 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
739 * L_FLATE_ENCODE, L_JP2K_ENCODE, or
740 * L_DEFAULT_ENCODE for default)
741 * \param[in] quality for jpeg: 1-100; 0 for default (75)
742 * for jp2k: 27-45; 0 for default (34)
743 * \param[in] title [optional] pdf title; can be null
744 * \param[in] fileout pdf file of all images
745 * \return 0 if OK, 1 on error
746 *
747 * <pre>
748 * Notes:
749 * (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
750 * colormap and many colors, or 32 bpp; FLATE for anything else.
751 * (2) The scalefactor must be > 0.0; otherwise it is set to 1.0.
752 * (3) Specifying one of the three encoding types for %type forces
753 * all images to be compressed with that type. Use 0 to have
754 * the type determined for each image based on depth and whether
755 * or not it has a colormap.
756 * </pre>
757 */
758 l_ok
759 pixaConvertToPdf(PIXA *pixa,
760 l_int32 res,
761 l_float32 scalefactor,
762 l_int32 type,
763 l_int32 quality,
764 const char *title,
765 const char *fileout)
766 {
767 l_uint8 *data;
768 l_int32 ret;
769 size_t nbytes;
770
771 if (!pixa)
772 return ERROR_INT("pixa not defined", __func__, 1);
773
774 ret = pixaConvertToPdfData(pixa, res, scalefactor, type, quality,
775 title, &data, &nbytes);
776 if (ret) {
777 LEPT_FREE(data);
778 return ERROR_INT("conversion to pdf failed", __func__, 1);
779 }
780
781 ret = l_binaryWrite(fileout, "w", data, nbytes);
782 LEPT_FREE(data);
783 if (ret)
784 L_ERROR("pdf data not written to file\n", __func__);
785 return ret;
786 }
787
788
789 /*!
790 * \brief pixaConvertToPdfData()
791 *
792 * \param[in] pixa containing images all at the same resolution
793 * \param[in] res input resolution of all images
794 * \param[in] scalefactor scaling factor applied to each image; > 0.0; <50
795 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
796 * L_FLATE_ENCODE, L_JP2K_ENCODE, or
797 * L_DEFAULT_ENCODE for default)
798 * \param[in] quality for jpeg: 1-100; 0 for default (75)
799 * for jp2k: 27-45; 0 for default (34)
800 * \param[in] title [optional] pdf title; can be null
801 * \param[out] pdata output pdf data of all images
802 * \param[out] pnbytes size of output pdf data
803 * \return 0 if OK, 1 on error
804 *
805 * <pre>
806 * Notes:
807 * (1) See pixaConvertToPdf().
808 * </pre>
809 */
810 l_ok
811 pixaConvertToPdfData(PIXA *pixa,
812 l_int32 res,
813 l_float32 scalefactor,
814 l_int32 type,
815 l_int32 quality,
816 const char *title,
817 l_uint8 **pdata,
818 size_t *pnbytes)
819 {
820 l_uint8 *imdata;
821 l_int32 i, n, ret, scaledres, pagetype;
822 size_t imbytes;
823 L_BYTEA *ba;
824 PIX *pixs, *pix;
825 L_PTRA *pa_data;
826
827 if (!pdata)
828 return ERROR_INT("&data not defined", __func__, 1);
829 *pdata = NULL;
830 if (!pnbytes)
831 return ERROR_INT("&nbytes not defined", __func__, 1);
832 *pnbytes = 0;
833 if (!pixa)
834 return ERROR_INT("pixa not defined", __func__, 1);
835 if (scalefactor <= 0.0) scalefactor = 1.0;
836 if (scalefactor >= 50.0)
837 return ERROR_INT("scalefactor too large", __func__, 1);
838 if (type != L_DEFAULT_ENCODE && type != L_JPEG_ENCODE &&
839 type != L_G4_ENCODE && type != L_FLATE_ENCODE &&
840 type != L_JP2K_ENCODE) {
841 L_WARNING("invalid compression type; using per-page default\n",
842 __func__);
843 type = L_DEFAULT_ENCODE;
844 }
845 if (quality < 0 || quality > 100)
846 return ERROR_INT("invalid quality", __func__, 1);
847
848 /* Generate all the encoded pdf strings */
849 n = pixaGetCount(pixa);
850 pa_data = ptraCreate(n);
851 for (i = 0; i < n; i++) {
852 if ((pixs = pixaGetPix(pixa, i, L_CLONE)) == NULL) {
853 L_ERROR("pixs[%d] not retrieved\n", __func__, i);
854 continue;
855 }
856 if (scalefactor != 1.0)
857 pix = pixScale(pixs, scalefactor, scalefactor);
858 else
859 pix = pixClone(pixs);
860 pixDestroy(&pixs);
861 if (!pix) {
862 L_ERROR("pix[%d] not made\n", __func__, i);
863 continue;
864 }
865 scaledres = (l_int32)(res * scalefactor);
866
867 /* Select the encoding type */
868 if (type != L_DEFAULT_ENCODE) {
869 pagetype = type;
870 } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
871 L_ERROR("encoding type selection failed for pix[%d]\n",
872 __func__, i);
873 pixDestroy(&pix);
874 continue;
875 }
876
877 ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
878 0, 0, scaledres, title, NULL, 0);
879 pixDestroy(&pix);
880 if (ret) {
881 LEPT_FREE(imdata);
882 L_ERROR("pdf encoding failed for pix[%d]\n", __func__, i);
883 continue;
884 }
885 ba = l_byteaInitFromMem(imdata, imbytes);
886 LEPT_FREE(imdata);
887 ptraAdd(pa_data, ba);
888 }
889 ptraGetActualCount(pa_data, &n);
890 if (n == 0) {
891 L_ERROR("no pdf files made\n", __func__);
892 ptraDestroy(&pa_data, FALSE, FALSE);
893 return 1;
894 }
895
896 /* Concatenate them */
897 ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
898
899 ptraGetActualCount(pa_data, &n); /* recalculate in case it changes */
900 for (i = 0; i < n; i++) {
901 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
902 l_byteaDestroy(&ba);
903 }
904 ptraDestroy(&pa_data, FALSE, FALSE);
905 return ret;
906 }
907
908
909 /*---------------------------------------------------------------------*
910 * Single page, multi-image converters *
911 *---------------------------------------------------------------------*/
912 /*!
913 * \brief convertToPdf()
914 *
915 * \param[in] filein input image file -- any format
916 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
917 * L_FLATE_ENCODE, or L_JP2K_ENCODE)
918 * \param[in] quality for jpeg: 1-100; 0 for default (75)
919 * for jp2k: 27-45; 0 for default (34)
920 * \param[in] fileout output pdf file; only required on last
921 * image on page
922 * \param[in] x, y location of lower-left corner of image,
923 * in pixels, relative to the PostScript origin
924 * (0,0) at the lower-left corner of the page
925 * \param[in] res override the resolution of the input image,
926 * in ppi; use 0 to respect the resolution
927 * embedded in the input images
928 * \param[in] title [optional] pdf title; can be null
929 * \param[in,out] plpd ptr to lpd, which is created on the first
930 * invocation and returned until last image is
931 * processed, at which time it is destroyed
932 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
933 * L_LAST_IMAGE
934 * \return 0 if OK, 1 on error
935 *
936 * <pre>
937 * Notes:
938 * (1) To wrap only one image in pdf, input %plpd = NULL, and
939 * the value of %position will be ignored:
940 * convertToPdf(... type, quality, x, y, res, NULL, 0);
941 * (2) To wrap multiple images on a single pdf page, this is called
942 * once for each successive image. Do it this way:
943 * L_PDF_DATA *lpd;
944 * convertToPdf(... type, quality, x, y, res, &lpd, L_FIRST_IMAGE);
945 * convertToPdf(... type, quality, x, y, res, &lpd, L_NEXT_IMAGE);
946 * ...
947 * convertToPdf(... type, quality, x, y, res, &lpd, L_LAST_IMAGE);
948 * This will write the result to the value of %fileout specified
949 * in the first call; succeeding values of %fileout are ignored.
950 * On the last call: the pdf data bytes are computed and written
951 * to %fileout, lpd is destroyed internally, and the returned
952 * value of lpd is null. So the client has nothing to clean up.
953 * (3) (a) Set %res == 0 to respect the resolution embedded in the
954 * image file. If no resolution is embedded, it will be set
955 * to the default value.
956 * (b) Set %res to some other value to override the file resolution.
957 * (4) (a) If the input %res and the resolution of the output device
958 * are equal, the image will be "displayed" at the same size
959 * as the original.
960 * (b) If the input %res is 72, the output device will render
961 * the image at 1 pt/pixel.
962 * (c) Some possible choices for the default input pix resolution are:
963 * 72 ppi Render pix on any output device at one pt/pixel
964 * 96 ppi Windows default for generated display images
965 * 300 ppi Typical default for scanned images.
966 * We choose 300, which is sensible for rendering page images.
967 * However, images come from a variety of sources, and
968 * some are explicitly created for viewing on a display.
969 * </pre>
970 */
971 l_ok
972 convertToPdf(const char *filein,
973 l_int32 type,
974 l_int32 quality,
975 const char *fileout,
976 l_int32 x,
977 l_int32 y,
978 l_int32 res,
979 const char *title,
980 L_PDF_DATA **plpd,
981 l_int32 position)
982 {
983 l_uint8 *data;
984 l_int32 ret;
985 size_t nbytes;
986
987 if (!filein)
988 return ERROR_INT("filein not defined", __func__, 1);
989 if (!plpd || (position == L_LAST_IMAGE)) {
990 if (!fileout)
991 return ERROR_INT("fileout not defined", __func__, 1);
992 }
993
994 if (convertToPdfData(filein, type, quality, &data, &nbytes, x, y,
995 res, title, plpd, position))
996 return ERROR_INT("pdf data not made", __func__, 1);
997
998 if (!plpd || (position == L_LAST_IMAGE)) {
999 ret = l_binaryWrite(fileout, "w", data, nbytes);
1000 LEPT_FREE(data);
1001 if (ret)
1002 return ERROR_INT("pdf data not written to file", __func__, 1);
1003 }
1004
1005 return 0;
1006 }
1007
1008
1009 /*!
1010 * \brief convertImageDataToPdf()
1011 *
1012 * \param[in] imdata array of formatted image data; e.g., png, jpeg
1013 * \param[in] size size of image data
1014 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
1015 * L_FLATE_ENCODE, or L_JP2K_ENCODE)
1016 * \param[in] quality for jpeg: 1-100; 0 for default (75)
1017 * for jp2k: 27-45; 0 for default (34)
1018 * \param[in] fileout output pdf file; only required on last
1019 * image on page
1020 * \param[in] x, y location of lower-left corner of image,
1021 * in pixels, relative to the PostScript origin
1022 * (0,0) at the lower-left corner of the page
1023 * \param[in] res override the resolution of the input image,
1024 * in ppi; use 0 to respect the resolution
1025 * embedded in the input images
1026 * \param[in] title [optional] pdf title; can be null
1027 * \param[in,out] plpd ptr to lpd, which is created on the first
1028 * invocation and returned until last image is
1029 * processed, at which time it is destroyed
1030 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1031 * L_LAST_IMAGE
1032 * \return 0 if OK, 1 on error
1033 *
1034 * <pre>
1035 * Notes:
1036 * (1) If %res == 0 and the input resolution field is 0,
1037 * this will use DefaultInputRes.
1038 * (2) See comments in convertToPdf().
1039 * </pre>
1040 */
1041 l_ok
1042 convertImageDataToPdf(l_uint8 *imdata,
1043 size_t size,
1044 l_int32 type,
1045 l_int32 quality,
1046 const char *fileout,
1047 l_int32 x,
1048 l_int32 y,
1049 l_int32 res,
1050 const char *title,
1051 L_PDF_DATA **plpd,
1052 l_int32 position)
1053 {
1054 l_int32 ret;
1055 PIX *pix;
1056
1057 if (!imdata)
1058 return ERROR_INT("image data not defined", __func__, 1);
1059 if (!plpd || (position == L_LAST_IMAGE)) {
1060 if (!fileout)
1061 return ERROR_INT("fileout not defined", __func__, 1);
1062 }
1063
1064 if ((pix = pixReadMem(imdata, size)) == NULL)
1065 return ERROR_INT("pix not read", __func__, 1);
1066 if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
1067 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
1068 selectDefaultPdfEncoding(pix, &type);
1069 }
1070 ret = pixConvertToPdf(pix, type, quality, fileout, x, y, res,
1071 title, plpd, position);
1072 pixDestroy(&pix);
1073 return ret;
1074 }
1075
1076
1077 /*!
1078 * \brief convertToPdfData()
1079 *
1080 * \param[in] filein input image file -- any format
1081 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
1082 * L_FLATE_ENCODE, or L_JP2K_ENCODE)
1083 * \param[in] quality for jpeg: 1-100; 0 for default (75)
1084 * for jp2k: 27-45; 0 for default (34)
1085 * \param[out] pdata pdf data in memory
1086 * \param[out] pnbytes number of bytes in pdf data
1087 * \param[in] x, y location of lower-left corner of image,
1088 * in pixels, relative to the PostScript origin
1089 * (0,0) at the lower-left corner of the page
1090 * \param[in] res override the resolution of the input image,
1091 * in ppi; use 0 to respect the resolution
1092 * embedded in the input images
1093 * \param[in] title [optional] pdf title; can be null
1094 * \param[in,out] plpd ptr to lpd, which is created on the first
1095 * invocation and returned until last image is
1096 * processed, at which time it is destroyed
1097 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1098 * L_LAST_IMAGE
1099 * \return 0 if OK, 1 on error
1100 *
1101 * <pre>
1102 * Notes:
1103 * (1) If %res == 0 and the input resolution field is 0,
1104 * this will use DefaultInputRes.
1105 * (2) See comments in convertToPdf().
1106 * </pre>
1107 */
1108 l_ok
1109 convertToPdfData(const char *filein,
1110 l_int32 type,
1111 l_int32 quality,
1112 l_uint8 **pdata,
1113 size_t *pnbytes,
1114 l_int32 x,
1115 l_int32 y,
1116 l_int32 res,
1117 const char *title,
1118 L_PDF_DATA **plpd,
1119 l_int32 position)
1120 {
1121 PIX *pix;
1122
1123 if (!pdata)
1124 return ERROR_INT("&data not defined", __func__, 1);
1125 *pdata = NULL;
1126 if (!pnbytes)
1127 return ERROR_INT("&nbytes not defined", __func__, 1);
1128 *pnbytes = 0;
1129 if (!filein)
1130 return ERROR_INT("filein not defined", __func__, 1);
1131
1132 if ((pix = pixRead(filein)) == NULL)
1133 return ERROR_INT("pix not made", __func__, 1);
1134
1135 pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
1136 x, y, res, title, plpd, position);
1137 pixDestroy(&pix);
1138 return 0;
1139 }
1140
1141
1142 /*!
1143 * \brief convertImageDataToPdfData()
1144 *
1145 * \param[in] imdata array of formatted image data; e.g., png, jpeg
1146 * \param[in] size size of image data
1147 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
1148 * L_FLATE_ENCODE, or L_JP2K_ENCODE)
1149 * \param[in] quality for jpeg: 1-100; 0 for default (75)
1150 * for jp2k: 27-45; 0 for default (34)
1151 * \param[out] pdata pdf data in memory
1152 * \param[out] pnbytes number of bytes in pdf data
1153 * \param[in] x, y location of lower-left corner of image,
1154 * in pixels, relative to the PostScript origin
1155 * (0,0) at the lower-left corner of the page
1156 * \param[in] res override the resolution of the input image,
1157 * in ppi; use 0 to respect the resolution
1158 * embedded in the input images
1159 * \param[in] title [optional] pdf title; can be null
1160 * \param[out] plpd ptr to lpd, which is created on the first
1161 * invocation and returned until last image is
1162 * processed, at which time it is destroyed
1163 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1164 * L_LAST_IMAGE
1165 * \return 0 if OK, 1 on error
1166 *
1167 * <pre>
1168 * Notes:
1169 * (1) If %res == 0 and the input resolution field is 0,
1170 * this will use DefaultInputRes.
1171 * (2) See comments in convertToPdf().
1172 * </pre>
1173 */
1174 l_ok
1175 convertImageDataToPdfData(l_uint8 *imdata,
1176 size_t size,
1177 l_int32 type,
1178 l_int32 quality,
1179 l_uint8 **pdata,
1180 size_t *pnbytes,
1181 l_int32 x,
1182 l_int32 y,
1183 l_int32 res,
1184 const char *title,
1185 L_PDF_DATA **plpd,
1186 l_int32 position)
1187 {
1188 l_int32 ret;
1189 PIX *pix;
1190
1191 if (!pdata)
1192 return ERROR_INT("&data not defined", __func__, 1);
1193 *pdata = NULL;
1194 if (!pnbytes)
1195 return ERROR_INT("&nbytes not defined", __func__, 1);
1196 *pnbytes = 0;
1197 if (!imdata)
1198 return ERROR_INT("image data not defined", __func__, 1);
1199 if (plpd) { /* part of multi-page invocation */
1200 if (position == L_FIRST_IMAGE)
1201 *plpd = NULL;
1202 }
1203
1204 if ((pix = pixReadMem(imdata, size)) == NULL)
1205 return ERROR_INT("pix not read", __func__, 1);
1206 if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
1207 type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
1208 selectDefaultPdfEncoding(pix, &type);
1209 }
1210 ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
1211 x, y, res, title, plpd, position);
1212 pixDestroy(&pix);
1213 return ret;
1214 }
1215
1216
1217 /*!
1218 * \brief pixConvertToPdf()
1219 *
1220 * \param[in] pix
1221 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
1222 * L_FLATE_ENCODE, L_JP2K_ENCODE)
1223 * \param[in] quality for jpeg: 1-100; 0 for default (75)
1224 * for jp2k: 27-45; 0 for default (34)
1225 * \param[in] fileout output pdf file; only required on last
1226 * image on page
1227 * \param[in] x, y location of lower-left corner of image,
1228 * in pixels, relative to the PostScript origin
1229 * (0,0) at the lower-left corner of the page
1230 * \param[in] res override the resolution of the input image,
1231 * in ppi; use 0 to respect the resolution
1232 * embedded in the input images
1233 * \param[in] title [optional] pdf title; can be null
1234 * \param[in,out] plpd ptr to lpd, which is created on the first
1235 * invocation and returned until last image is
1236 * processed, at which time it is destroyed
1237 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1238 * L_LAST_IMAGE
1239 * \return 0 if OK, 1 on error
1240 *
1241 * <pre>
1242 * Notes:
1243 * (1) If %res == 0 and the input resolution field is 0,
1244 * this will use DefaultInputRes.
1245 * (2) This only writes data to fileout if it is the last
1246 * image to be written on the page.
1247 * (3) See comments in convertToPdf().
1248 * </pre>
1249 */
1250 l_ok
1251 pixConvertToPdf(PIX *pix,
1252 l_int32 type,
1253 l_int32 quality,
1254 const char *fileout,
1255 l_int32 x,
1256 l_int32 y,
1257 l_int32 res,
1258 const char *title,
1259 L_PDF_DATA **plpd,
1260 l_int32 position)
1261 {
1262 l_uint8 *data;
1263 l_int32 ret;
1264 size_t nbytes;
1265
1266 if (!pix)
1267 return ERROR_INT("pix not defined", __func__, 1);
1268 if (!plpd || (position == L_LAST_IMAGE)) {
1269 if (!fileout)
1270 return ERROR_INT("fileout not defined", __func__, 1);
1271 }
1272
1273 if (pixConvertToPdfData(pix, type, quality, &data, &nbytes,
1274 x, y, res, title, plpd, position)) {
1275 LEPT_FREE(data);
1276 return ERROR_INT("pdf data not made", __func__, 1);
1277 }
1278
1279 if (!plpd || (position == L_LAST_IMAGE)) {
1280 ret = l_binaryWrite(fileout, "w", data, nbytes);
1281 LEPT_FREE(data);
1282 if (ret)
1283 return ERROR_INT("pdf data not written to file", __func__, 1);
1284 }
1285 return 0;
1286 }
1287
1288
1289 /*!
1290 * \brief pixWriteStreamPdf()
1291 *
1292 * \param[in] fp file stream opened for writing
1293 * \param[in] pix all depths, cmap OK
1294 * \param[in] res override the resolution of the input image, in ppi;
1295 * use 0 to respect the resolution embedded in the input
1296 * \param[in] title [optional] pdf title; can be null
1297 * \return 0 if OK, 1 on error
1298 *
1299 * <pre>
1300 * Notes:
1301 * (1) This is the simplest interface for writing a single image
1302 * with pdf encoding to a stream. It uses G4 encoding for 1 bpp,
1303 * JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
1304 * encoding for everything else.
1305 * </pre>
1306 */
1307 l_ok
1308 pixWriteStreamPdf(FILE *fp,
1309 PIX *pix,
1310 l_int32 res,
1311 const char *title)
1312 {
1313 l_uint8 *data;
1314 size_t nbytes, nbytes_written;
1315
1316 if (!fp)
1317 return ERROR_INT("stream not opened", __func__, 1);
1318 if (!pix)
1319 return ERROR_INT("pix not defined", __func__, 1);
1320
1321 if (pixWriteMemPdf(&data, &nbytes, pix, res, title) != 0) {
1322 LEPT_FREE(data);
1323 return ERROR_INT("pdf data not made", __func__, 1);
1324 }
1325
1326 nbytes_written = fwrite(data, 1, nbytes, fp);
1327 LEPT_FREE(data);
1328 if (nbytes != nbytes_written)
1329 return ERROR_INT("failure writing pdf data to stream", __func__, 1);
1330 return 0;
1331 }
1332
1333
1334 /*!
1335 * \brief pixWriteMemPdf()
1336 *
1337 * \param[out] pdata pdf as byte array
1338 * \param[out] pnbytes number of bytes in pdf array
1339 * \param[in] pix all depths, cmap OK
1340 * \param[in] res override the resolution of the input image, in ppi;
1341 * use 0 to respect the res embedded in the input
1342 * \param[in] title [optional] pdf title; can be null
1343 * \return 0 if OK, 1 on error
1344 *
1345 * <pre>
1346 * Notes:
1347 * (1) This is the simplest interface for writing a single image
1348 * with pdf encoding to memory. It uses G4 encoding for 1 bpp,
1349 * and makes a guess whether to use JPEG or FLATE encoding for
1350 * everything else.
1351 * </pre>
1352 */
1353 l_ok
1354 pixWriteMemPdf(l_uint8 **pdata,
1355 size_t *pnbytes,
1356 PIX *pix,
1357 l_int32 res,
1358 const char *title)
1359 {
1360 l_int32 ret, type;
1361
1362 if (pdata) *pdata = NULL;
1363 if (pnbytes) *pnbytes = 0;
1364 if (!pdata || !pnbytes)
1365 return ERROR_INT("&data or &nbytes not defined", __func__, 1);
1366 if (!pix)
1367 return ERROR_INT("pix not defined", __func__, 1);
1368
1369 selectDefaultPdfEncoding(pix, &type);
1370 ret = pixConvertToPdfData(pix, type, 75, pdata, pnbytes,
1371 0, 0, res, title, NULL, 0);
1372 if (ret)
1373 return ERROR_INT("pdf data not made", __func__, 1);
1374 return 0;
1375 }
1376
1377
1378 /*---------------------------------------------------------------------*
1379 * Segmented multi-page, multi-image converter *
1380 *---------------------------------------------------------------------*/
1381 /*!
1382 * \brief convertSegmentedFilesToPdf()
1383 *
1384 * \param[in] dirname directory name containing images
1385 * \param[in] substr [optional] substring filter on filenames;
1386 * can be null
1387 * \param[in] res input resolution of all images
1388 * \param[in] type compression type for non-image regions; the
1389 * image regions are always compressed with
1390 * L_JPEG_ENCODE
1391 * \param[in] thresh used for converting gray --> 1 bpp with
1392 * L_G4_ENCODE
1393 * \param[in] baa [optional] boxaa of image regions
1394 * \param[in] quality used for JPEG only; 0 for default (75)
1395 * \param[in] scalefactor scaling factor applied to each image region
1396 * \param[in] title [optional] pdf title; can be null
1397 * \param[in] fileout pdf file of all images
1398 * \return 0 if OK, 1 on error
1399 *
1400 * <pre>
1401 * Notes:
1402 * (1) If %substr is not NULL, only image filenames that contain
1403 * the substring can be used. If %substr == NULL, all files
1404 * in the directory are used.
1405 * (2) The files in the directory, after optional filtering by
1406 * the substring, are lexically sorted in increasing order
1407 * before concatenation.
1408 * (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
1409 * colormap and many colors, or 32 bpp; FLATE for anything else.
1410 * (4) The boxaa, if it exists, contains one boxa of "image regions"
1411 * for each image file. The boxa must be aligned with the
1412 * sorted set of images.
1413 * (5) The scalefactor is applied to each image region. It is
1414 * typically < 1.0, to save bytes in the final pdf, because
1415 * the resolution is often not critical in non-text regions.
1416 * (6) If the non-image regions have pixel depth > 1 and the encoding
1417 * type is G4, they are automatically scaled up by 2x and
1418 * thresholded. Otherwise, no scaling is performed on them.
1419 * (7) Note that this function can be used to generate multipage
1420 * G4 compressed pdf from any input, by using %boxaa == NULL
1421 * and %type == L_G4_ENCODE.
1422 * </pre>
1423 */
1424 l_ok
1425 convertSegmentedFilesToPdf(const char *dirname,
1426 const char *substr,
1427 l_int32 res,
1428 l_int32 type,
1429 l_int32 thresh,
1430 BOXAA *baa,
1431 l_int32 quality,
1432 l_float32 scalefactor,
1433 const char *title,
1434 const char *fileout)
1435 {
1436 char *fname;
1437 l_uint8 *imdata, *data;
1438 l_int32 i, npages, nboxa, nboxes, ret;
1439 size_t imbytes, databytes;
1440 BOXA *boxa;
1441 L_BYTEA *ba;
1442 L_PTRA *pa_data;
1443 SARRAY *sa;
1444
1445 if (!dirname)
1446 return ERROR_INT("dirname not defined", __func__, 1);
1447 if (!fileout)
1448 return ERROR_INT("fileout not defined", __func__, 1);
1449
1450 if ((sa = getNumberedPathnamesInDirectory(dirname, substr, 0, 0, 10000))
1451 == NULL)
1452 return ERROR_INT("sa not made", __func__, 1);
1453
1454 npages = sarrayGetCount(sa);
1455 /* If necessary, extend the boxaa, which is page-aligned with
1456 * the image files, to be as large as the set of images. */
1457 if (baa) {
1458 nboxa = boxaaGetCount(baa);
1459 if (nboxa < npages) {
1460 boxa = boxaCreate(1);
1461 boxaaExtendWithInit(baa, npages, boxa);
1462 boxaDestroy(&boxa);
1463 }
1464 }
1465
1466 /* Generate and save all the encoded pdf strings */
1467 pa_data = ptraCreate(npages);
1468 for (i = 0; i < npages; i++) {
1469 fname = sarrayGetString(sa, i, L_NOCOPY);
1470 if (!strcmp(fname, "")) continue;
1471 boxa = NULL;
1472 if (baa) {
1473 boxa = boxaaGetBoxa(baa, i, L_CLONE);
1474 nboxes = boxaGetCount(boxa);
1475 if (nboxes == 0)
1476 boxaDestroy(&boxa);
1477 }
1478 ret = convertToPdfDataSegmented(fname, res, type, thresh, boxa,
1479 quality, scalefactor, title,
1480 &imdata, &imbytes);
1481 boxaDestroy(&boxa); /* safe; in case nboxes > 0 */
1482 if (ret) {
1483 L_ERROR("pdf encoding failed for %s\n", __func__, fname);
1484 continue;
1485 }
1486 ba = l_byteaInitFromMem(imdata, imbytes);
1487 if (imdata) LEPT_FREE(imdata);
1488 ptraAdd(pa_data, ba);
1489 }
1490 sarrayDestroy(&sa);
1491
1492 ptraGetActualCount(pa_data, &npages);
1493 if (npages == 0) {
1494 L_ERROR("no pdf files made\n", __func__);
1495 ptraDestroy(&pa_data, FALSE, FALSE);
1496 return 1;
1497 }
1498
1499 /* Concatenate */
1500 ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes);
1501
1502 /* Clean up */
1503 ptraGetActualCount(pa_data, &npages); /* recalculate in case it changes */
1504 for (i = 0; i < npages; i++) {
1505 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
1506 l_byteaDestroy(&ba);
1507 }
1508 ptraDestroy(&pa_data, FALSE, FALSE);
1509
1510 if (ret) {
1511 if (data) LEPT_FREE(data);
1512 return ERROR_INT("pdf data not made", __func__, 1);
1513 }
1514
1515 ret = l_binaryWrite(fileout, "w", data, databytes);
1516 LEPT_FREE(data);
1517 if (ret)
1518 L_ERROR("pdf data not written to file\n", __func__);
1519 return ret;
1520 }
1521
1522
1523 /*!
1524 * \brief convertNumberedMasksToBoxaa()
1525 *
1526 * \param[in] dirname directory name containing mask images
1527 * \param[in] substr [optional] substring filter on filenames;
1528 * can be null
1529 * \param[in] numpre number of characters in name before number
1530 * \param[in] numpost number of characters in name after number,
1531 * up to a dot before an extension
1532 * \return boxaa of mask regions, or NULL on error
1533 *
1534 * <pre>
1535 * Notes:
1536 * (1) This is conveniently used to generate the input boxaa
1537 * for convertSegmentedFilesToPdf(). It guarantees that the
1538 * boxa will be aligned with the page images, even if some
1539 * of the boxa are empty.
1540 * </pre>
1541 */
1542 BOXAA *
1543 convertNumberedMasksToBoxaa(const char *dirname,
1544 const char *substr,
1545 l_int32 numpre,
1546 l_int32 numpost)
1547 {
1548 char *fname;
1549 l_int32 i, n;
1550 BOXA *boxa;
1551 BOXAA *baa;
1552 PIX *pix;
1553 SARRAY *sa;
1554
1555 if (!dirname)
1556 return (BOXAA *)ERROR_PTR("dirname not defined", __func__, NULL);
1557
1558 if ((sa = getNumberedPathnamesInDirectory(dirname, substr, numpre,
1559 numpost, 10000)) == NULL)
1560 return (BOXAA *)ERROR_PTR("sa not made", __func__, NULL);
1561
1562 /* Generate and save all the encoded pdf strings */
1563 n = sarrayGetCount(sa);
1564 baa = boxaaCreate(n);
1565 boxa = boxaCreate(1);
1566 boxaaInitFull(baa, boxa);
1567 boxaDestroy(&boxa);
1568 for (i = 0; i < n; i++) {
1569 fname = sarrayGetString(sa, i, L_NOCOPY);
1570 if (!strcmp(fname, "")) continue;
1571 if ((pix = pixRead(fname)) == NULL) {
1572 L_WARNING("invalid image on page %d\n", __func__, i);
1573 continue;
1574 }
1575 boxa = pixConnComp(pix, NULL, 8);
1576 boxaaReplaceBoxa(baa, i, boxa);
1577 pixDestroy(&pix);
1578 }
1579
1580 sarrayDestroy(&sa);
1581 return baa;
1582 }
1583
1584
1585 /*---------------------------------------------------------------------*
1586 * Segmented single page, multi-image converters *
1587 *---------------------------------------------------------------------*/
1588 /*!
1589 * \brief convertToPdfSegmented()
1590 *
1591 * \param[in] filein input image file -- any format
1592 * \param[in] res input image resolution; typ. 300 ppi;
1593 * use 0 for default
1594 * \param[in] type compression type for non-image regions; image
1595 * regions are always compressed with L_JPEG_ENCODE
1596 * \param[in] thresh for converting gray --> 1 bpp with L_G4_ENCODE
1597 * \param[in] boxa [optional] of image regions; can be null
1598 * \param[in] quality used for jpeg image regions; 0 for default
1599 * \param[in] scalefactor used for jpeg regions; must be <= 1.0
1600 * \param[in] title [optional] pdf title; can be null
1601 * \param[in] fileout output pdf file
1602 * \return 0 if OK, 1 on error
1603 *
1604 * <pre>
1605 * Notes:
1606 * (1) If there are no image regions, set %boxa == NULL;
1607 * %quality and %scalefactor are ignored.
1608 * (2) Typically, %scalefactor is < 1.0, because the image regions
1609 * can be rendered at a lower resolution (for better compression)
1610 * than the text regions. If %scalefactor == 0, we use 1.0.
1611 * If the input image is 1 bpp and scalefactor < 1.0, we
1612 * use scaleToGray() to downsample the image regions to gray
1613 * before compressing them.
1614 * (3) If the compression type for non-image regions is L_G4_ENCODE
1615 * and bpp > 1, the image is upscaled 2x and thresholded
1616 * to 1 bpp. That is the only situation where %thresh is used.
1617 * (4) The parameter %quality is only used for image regions.
1618 * If %type == L_JPEG_ENCODE, default jpeg quality (75) is
1619 * used for the non-image regions.
1620 * (5) Processing matrix for non-image regions.
1621 *
1622 * Input G4 JPEG FLATE
1623 * ----------|---------------------------------------------------
1624 * 1 bpp | 1x, 1 bpp 1x flate, 1 bpp 1x, 1 bpp
1625 * |
1626 * cmap | 2x, 1 bpp 1x flate, cmap 1x, cmap
1627 * |
1628 * 2,4 bpp | 2x, 1 bpp 1x flate 1x, 2,4 bpp
1629 * no cmap | 2,4 bpp
1630 * |
1631 * 8,32 bpp | 2x, 1 bpp 1x (jpeg) 1x, 8,32 bpp
1632 * no cmap | 8,32 bpp
1633 *
1634 * Summary:
1635 * (a) if G4 is requested, G4 is used, with 2x upscaling
1636 * for all cases except 1 bpp.
1637 * (b) if JPEG is requested, use flate encoding for all cases
1638 * except 8 bpp without cmap and 32 bpp (rgb).
1639 * (c) if FLATE is requested, use flate with no transformation
1640 * of the raster data.
1641 * (6) Calling options/sequence for these functions:
1642 * file --> file (convertToPdfSegmented)
1643 * pix --> file (pixConvertToPdfSegmented)
1644 * pix --> data (pixConvertToPdfDataSegmented)
1645 * file --> data (convertToPdfDataSegmented)
1646 * pix --> data (pixConvertToPdfDataSegmented)
1647 * </pre>
1648 */
1649 l_ok
1650 convertToPdfSegmented(const char *filein,
1651 l_int32 res,
1652 l_int32 type,
1653 l_int32 thresh,
1654 BOXA *boxa,
1655 l_int32 quality,
1656 l_float32 scalefactor,
1657 const char *title,
1658 const char *fileout)
1659 {
1660 l_int32 ret;
1661 PIX *pixs;
1662
1663 if (!filein)
1664 return ERROR_INT("filein not defined", __func__, 1);
1665 if (!fileout)
1666 return ERROR_INT("fileout not defined", __func__, 1);
1667 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1668 type != L_FLATE_ENCODE)
1669 return ERROR_INT("invalid conversion type", __func__, 1);
1670 if (boxa && scalefactor > 1.0) {
1671 L_WARNING("setting scalefactor to 1.0\n", __func__);
1672 scalefactor = 1.0;
1673 }
1674
1675 if ((pixs = pixRead(filein)) == NULL)
1676 return ERROR_INT("pixs not made", __func__, 1);
1677
1678 ret = pixConvertToPdfSegmented(pixs, res, type, thresh, boxa, quality,
1679 scalefactor, title, fileout);
1680 pixDestroy(&pixs);
1681 return ret;
1682 }
1683
1684
1685 /*!
1686 * \brief pixConvertToPdfSegmented()
1687 *
1688 * \param[in] pixs any depth, cmap OK
1689 * \param[in] res input image resolution; typ. 300 ppi;
1690 * use 0 for default
1691 * \param[in] type compression type for non-image regions; image
1692 * regions are always compressed with L_JPEG_ENCODE
1693 * \param[in] thresh for converting gray --> 1 bpp with L_G4_ENCODE
1694 * \param[in] boxa [optional] of image regions; can be null
1695 * \param[in] quality used for jpeg image regions; 0 for default
1696 * \param[in] scalefactor used for jpeg regions; must be <= 1.0
1697 * \param[in] title [optional] pdf title; can be null
1698 * \param[in] fileout output pdf file
1699 * \return 0 if OK, 1 on error
1700 *
1701 * <pre>
1702 * Notes:
1703 * (1) See convertToPdfSegmented() for details.
1704 * </pre>
1705 */
1706 l_ok
1707 pixConvertToPdfSegmented(PIX *pixs,
1708 l_int32 res,
1709 l_int32 type,
1710 l_int32 thresh,
1711 BOXA *boxa,
1712 l_int32 quality,
1713 l_float32 scalefactor,
1714 const char *title,
1715 const char *fileout)
1716 {
1717 l_uint8 *data;
1718 l_int32 ret;
1719 size_t nbytes;
1720
1721 if (!pixs)
1722 return ERROR_INT("pixs not defined", __func__, 1);
1723 if (!fileout)
1724 return ERROR_INT("fileout not defined", __func__, 1);
1725 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1726 type != L_FLATE_ENCODE)
1727 return ERROR_INT("invalid conversion type", __func__, 1);
1728 if (boxa && scalefactor > 1.0) {
1729 L_WARNING("setting scalefactor to 1.0\n", __func__);
1730 scalefactor = 1.0;
1731 }
1732
1733 ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, quality,
1734 scalefactor, title, &data, &nbytes);
1735 if (ret)
1736 return ERROR_INT("pdf generation failure", __func__, 1);
1737
1738 ret = l_binaryWrite(fileout, "w", data, nbytes);
1739 if (data) LEPT_FREE(data);
1740 return ret;
1741 }
1742
1743
1744 /*!
1745 * \brief convertToPdfDataSegmented()
1746 *
1747 * \param[in] filein input image file -- any format
1748 * \param[in] res input image resolution; typ. 300 ppi;
1749 * use 0 for default
1750 * \param[in] type compression type for non-image regions; image
1751 * regions are always compressed with L_JPEG_ENCODE
1752 * \param[in] thresh for converting gray --> 1 bpp with L_G4_ENCODE
1753 * \param[in] boxa [optional] image regions; can be null
1754 * \param[in] quality used for jpeg image regions; 0 for default
1755 * \param[in] scalefactor used for jpeg regions; must be <= 1.0
1756 * \param[in] title [optional] pdf title; can be null
1757 * \param[out] pdata pdf data in memory
1758 * \param[out] pnbytes number of bytes in pdf data
1759 * \return 0 if OK, 1 on error
1760 *
1761 * <pre>
1762 * Notes:
1763 * (1) If there are no image regions, set %boxa == NULL;
1764 * %quality and %scalefactor are ignored.
1765 * (2) Typically, %scalefactor is < 1.0. The image regions are
1766 * </pre>
1767 */
1768 l_ok
1769 convertToPdfDataSegmented(const char *filein,
1770 l_int32 res,
1771 l_int32 type,
1772 l_int32 thresh,
1773 BOXA *boxa,
1774 l_int32 quality,
1775 l_float32 scalefactor,
1776 const char *title,
1777 l_uint8 **pdata,
1778 size_t *pnbytes)
1779 {
1780 l_int32 ret;
1781 PIX *pixs;
1782
1783 if (!pdata)
1784 return ERROR_INT("&data not defined", __func__, 1);
1785 *pdata = NULL;
1786 if (!pnbytes)
1787 return ERROR_INT("&nbytes not defined", __func__, 1);
1788 *pnbytes = 0;
1789 if (!filein)
1790 return ERROR_INT("filein not defined", __func__, 1);
1791 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1792 type != L_FLATE_ENCODE)
1793 return ERROR_INT("invalid conversion type", __func__, 1);
1794 if (boxa && scalefactor > 1.0) {
1795 L_WARNING("setting scalefactor to 1.0\n", __func__);
1796 scalefactor = 1.0;
1797 }
1798
1799 if ((pixs = pixRead(filein)) == NULL)
1800 return ERROR_INT("pixs not made", __func__, 1);
1801
1802 ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa,
1803 quality, scalefactor, title,
1804 pdata, pnbytes);
1805 pixDestroy(&pixs);
1806 return ret;
1807 }
1808
1809
1810 /*!
1811 * \brief pixConvertToPdfDataSegmented()
1812 *
1813 * \param[in] pixs any depth, cmap OK
1814 * \param[in] res input image resolution; typ. 300 ppi;
1815 * use 0 for default
1816 * \param[in] type compression type for non-image regions; image
1817 * regions are always compressed with L_JPEG_ENCODE
1818 * \param[in] thresh for converting gray --> 1 bpp with L_G4_ENCODE
1819 * \param[in] boxa [optional] of image regions; can be null
1820 * \param[in] quality used for jpeg image regions; 0 for default
1821 * \param[in] scalefactor used for jpeg regions; must be <= 1.0
1822 * \param[in] title [optional] pdf title; can be null
1823 * \param[out] pdata pdf data in memory
1824 * \param[out] pnbytes number of bytes in pdf data
1825 * \return 0 if OK, 1 on error
1826 *
1827 * <pre>
1828 * Notes:
1829 * (1) See convertToPdfSegmented() for details.
1830 * </pre>
1831 */
1832 l_ok
1833 pixConvertToPdfDataSegmented(PIX *pixs,
1834 l_int32 res,
1835 l_int32 type,
1836 l_int32 thresh,
1837 BOXA *boxa,
1838 l_int32 quality,
1839 l_float32 scalefactor,
1840 const char *title,
1841 l_uint8 **pdata,
1842 size_t *pnbytes)
1843 {
1844 l_int32 i, nbox, seq, bx, by, bw, bh, upscale;
1845 l_float32 scale;
1846 BOX *box, *boxc, *box2;
1847 PIX *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6;
1848 PIXCMAP *cmap;
1849 L_PDF_DATA *lpd;
1850
1851 if (!pdata)
1852 return ERROR_INT("&data not defined", __func__, 1);
1853 *pdata = NULL;
1854 if (!pnbytes)
1855 return ERROR_INT("&nbytes not defined", __func__, 1);
1856 *pnbytes = 0;
1857 if (!pixs)
1858 return ERROR_INT("pixs not defined", __func__, 1);
1859 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1860 type != L_FLATE_ENCODE)
1861 return ERROR_INT("invalid conversion type", __func__, 1);
1862 if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) {
1863 L_WARNING("setting scalefactor to 1.0\n", __func__);
1864 scalefactor = 1.0;
1865 }
1866
1867 /* Adjust scalefactor so that the product with res gives an integer */
1868 if (res <= 0)
1869 res = DefaultInputRes;
1870 scale = (l_float32)((l_int32)(scalefactor * res + 0.5)) / (l_float32)res;
1871 cmap = pixGetColormap(pixs);
1872
1873 /* Simple case: single image to be encoded */
1874 if (!boxa || boxaGetCount(boxa) == 0) {
1875 if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) {
1876 if (cmap)
1877 pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
1878 else
1879 pixt1 = pixConvertTo8(pixs, FALSE);
1880 pixt2 = pixScaleGray2xLIThresh(pixt1, thresh);
1881 pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes,
1882 0, 0, 2 * res, title, NULL, 0);
1883 pixDestroy(&pixt1);
1884 pixDestroy(&pixt2);
1885 } else {
1886 pixConvertToPdfData(pixs, type, quality, pdata, pnbytes,
1887 0, 0, res, title, NULL, 0);
1888 }
1889 return 0;
1890 }
1891
1892 /* Multiple images to be encoded. If %type == L_G4_ENCODE,
1893 * jpeg encode a version of pixs that is blanked in the non-image
1894 * regions, and paint the scaled non-image part onto it through a mask.
1895 * Otherwise, we must put the non-image part down first and
1896 * then render all the image regions separately on top of it,
1897 * at their own resolution. */
1898 pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE); /* non-image */
1899 nbox = boxaGetCount(boxa);
1900 if (type == L_G4_ENCODE) {
1901 pixt2 = pixCreateTemplate(pixs); /* only image regions */
1902 pixSetBlackOrWhite(pixt2, L_SET_WHITE);
1903 for (i = 0; i < nbox; i++) {
1904 box = boxaGetBox(boxa, i, L_CLONE);
1905 pix = pixClipRectangle(pixs, box, &boxc);
1906 boxGetGeometry(boxc, &bx, &by, &bw, &bh);
1907 pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0);
1908 pixDestroy(&pix);
1909 boxDestroy(&box);
1910 boxDestroy(&boxc);
1911 }
1912 pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
1913 if (pixGetDepth(pixt3) == 1)
1914 pixt4 = pixScaleToGray(pixt3, scale);
1915 else
1916 pixt4 = pixScale(pixt3, scale, scale);
1917 pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
1918 0, 0, (l_int32)(scale * res), title,
1919 &lpd, L_FIRST_IMAGE);
1920
1921 if (pixGetDepth(pixt1) == 1) {
1922 pixt5 = pixClone(pixt1);
1923 upscale = 1;
1924 } else {
1925 pixt6 = pixConvertTo8(pixt1, 0);
1926 pixt5 = pixScaleGray2xLIThresh(pixt6, thresh);
1927 pixDestroy(&pixt6);
1928 upscale = 2;
1929 }
1930 pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata, pnbytes,
1931 0, 0, upscale * res, title, &lpd, L_LAST_IMAGE);
1932 pixDestroy(&pixt2);
1933 pixDestroy(&pixt3);
1934 pixDestroy(&pixt4);
1935 pixDestroy(&pixt5);
1936 } else {
1937 /* Put the non-image part down first. This is the full
1938 size of the page, so we can use it to find the page
1939 height in pixels, which is required for determining
1940 the LL corner of the image relative to the LL corner
1941 of the page. */
1942 pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes, 0, 0,
1943 res, title, &lpd, L_FIRST_IMAGE);
1944 for (i = 0; i < nbox; i++) {
1945 box = boxaGetBox(boxa, i, L_CLONE);
1946 pixt2 = pixClipRectangle(pixs, box, &boxc);
1947 pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
1948 if (pixGetDepth(pixt3) == 1)
1949 pixt4 = pixScaleToGray(pixt3, scale);
1950 else
1951 pixt4 = pixScale(pixt3, scale, scale);
1952 box2 = boxTransform(boxc, 0, 0, scale, scale);
1953 boxGetGeometry(box2, &bx, &by, NULL, &bh);
1954 seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE;
1955 pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
1956 bx, by, (l_int32)(scale * res), title,
1957 &lpd, seq);
1958 pixDestroy(&pixt2);
1959 pixDestroy(&pixt3);
1960 pixDestroy(&pixt4);
1961 boxDestroy(&box);
1962 boxDestroy(&boxc);
1963 boxDestroy(&box2);
1964 }
1965 }
1966
1967 pixDestroy(&pixt1);
1968 return 0;
1969 }
1970
1971
1972 /*---------------------------------------------------------------------*
1973 * Multi-page concatenation *
1974 *---------------------------------------------------------------------*/
1975 /*!
1976 * \brief concatenatePdf()
1977 *
1978 * \param[in] dirname directory name containing single-page pdf files
1979 * \param[in] substr [optional] substring filter on filenames;
1980 * can be null
1981 * \param[in] fileout concatenated pdf file
1982 * \return 0 if OK, 1 on error
1983 *
1984 * <pre>
1985 * Notes:
1986 * (1) This only works with leptonica-formatted single-page pdf files.
1987 * (2) If %substr is not NULL, only filenames that contain
1988 * the substring can be returned. If %substr == NULL,
1989 * none of the filenames are filtered out.
1990 * (3) The files in the directory, after optional filtering by
1991 * the substring, are lexically sorted in increasing order
1992 * before concatenation.
1993 * </pre>
1994 */
1995 l_ok
1996 concatenatePdf(const char *dirname,
1997 const char *substr,
1998 const char *fileout)
1999 {
2000 l_int32 ret;
2001 SARRAY *sa;
2002
2003 if (!dirname)
2004 return ERROR_INT("dirname not defined", __func__, 1);
2005 if (!fileout)
2006 return ERROR_INT("fileout not defined", __func__, 1);
2007
2008 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
2009 return ERROR_INT("sa not made", __func__, 1);
2010 ret = saConcatenatePdf(sa, fileout);
2011 sarrayDestroy(&sa);
2012 return ret;
2013 }
2014
2015
2016 /*!
2017 * \brief saConcatenatePdf()
2018 *
2019 * \param[in] sa string array of pathnames for single-page pdf files
2020 * \param[in] fileout concatenated pdf file
2021 * \return 0 if OK, 1 on error
2022 *
2023 * <pre>
2024 * Notes:
2025 * (1) This only works with leptonica-formatted single-page pdf files.
2026 * </pre>
2027 */
2028 l_ok
2029 saConcatenatePdf(SARRAY *sa,
2030 const char *fileout)
2031 {
2032 l_uint8 *data;
2033 l_int32 ret;
2034 size_t nbytes;
2035
2036 if (!sa)
2037 return ERROR_INT("sa not defined", __func__, 1);
2038 if (!fileout)
2039 return ERROR_INT("fileout not defined", __func__, 1);
2040
2041 ret = saConcatenatePdfToData(sa, &data, &nbytes);
2042 if (ret)
2043 return ERROR_INT("pdf data not made", __func__, 1);
2044 ret = l_binaryWrite(fileout, "w", data, nbytes);
2045 LEPT_FREE(data);
2046 return ret;
2047 }
2048
2049
2050 /*!
2051 * \brief ptraConcatenatePdf()
2052 *
2053 * \param[in] pa array of pdf strings, each for a single-page pdf file
2054 * \param[in] fileout concatenated pdf file
2055 * \return 0 if OK, 1 on error
2056 *
2057 * <pre>
2058 * Notes:
2059 * (1) This only works with leptonica-formatted single-page pdf files.
2060 * </pre>
2061 */
2062 l_ok
2063 ptraConcatenatePdf(L_PTRA *pa,
2064 const char *fileout)
2065 {
2066 l_uint8 *data;
2067 l_int32 ret;
2068 size_t nbytes;
2069
2070 if (!pa)
2071 return ERROR_INT("pa not defined", __func__, 1);
2072 if (!fileout)
2073 return ERROR_INT("fileout not defined", __func__, 1);
2074
2075 ret = ptraConcatenatePdfToData(pa, NULL, &data, &nbytes);
2076 if (ret)
2077 return ERROR_INT("pdf data not made", __func__, 1);
2078 ret = l_binaryWrite(fileout, "w", data, nbytes);
2079 LEPT_FREE(data);
2080 return ret;
2081 }
2082
2083
2084 /*!
2085 * \brief concatenatePdfToData()
2086 *
2087 * \param[in] dirname directory name containing single-page pdf files
2088 * \param[in] substr [optional] substring filter on filenames;
2089 * can be null
2090 * \param[out] pdata concatenated pdf data in memory
2091 * \param[out] pnbytes number of bytes in pdf data
2092 * \return 0 if OK, 1 on error
2093 *
2094 * <pre>
2095 * Notes:
2096 * (1) This only works with leptonica-formatted single-page pdf files.
2097 * (2) If %substr is not NULL, only filenames that contain
2098 * the substring can be returned. If %substr == NULL,
2099 * none of the filenames are filtered out.
2100 * (3) The files in the directory, after optional filtering by
2101 * the substring, are lexically sorted in increasing order
2102 * before concatenation.
2103 * </pre>
2104 */
2105 l_ok
2106 concatenatePdfToData(const char *dirname,
2107 const char *substr,
2108 l_uint8 **pdata,
2109 size_t *pnbytes)
2110 {
2111 l_int32 ret;
2112 SARRAY *sa;
2113
2114 if (!pdata)
2115 return ERROR_INT("&data not defined", __func__, 1);
2116 *pdata = NULL;
2117 if (!pnbytes)
2118 return ERROR_INT("&nbytes not defined", __func__, 1);
2119 *pnbytes = 0;
2120 if (!dirname)
2121 return ERROR_INT("dirname not defined", __func__, 1);
2122
2123 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
2124 return ERROR_INT("sa not made", __func__, 1);
2125 ret = saConcatenatePdfToData(sa, pdata, pnbytes);
2126 sarrayDestroy(&sa);
2127 return ret;
2128 }
2129
2130
2131 /*!
2132 * \brief saConcatenatePdfToData()
2133 *
2134 * \param[in] sa string array of pathnames for single-page pdf files
2135 * \param[out] pdata concatenated pdf data in memory
2136 * \param[out] pnbytes number of bytes in pdf data
2137 * \return 0 if OK, 1 on error
2138 *
2139 * <pre>
2140 * Notes:
2141 * (1) This only works with leptonica-formatted single-page pdf files.
2142 * </pre>
2143 */
2144 l_ok
2145 saConcatenatePdfToData(SARRAY *sa,
2146 l_uint8 **pdata,
2147 size_t *pnbytes)
2148 {
2149 char *fname;
2150 l_int32 i, npages, ret;
2151 L_BYTEA *bas;
2152 L_PTRA *pa_data; /* input pdf data for each page */
2153
2154 if (!pdata)
2155 return ERROR_INT("&data not defined", __func__, 1);
2156 *pdata = NULL;
2157 if (!pnbytes)
2158 return ERROR_INT("&nbytes not defined", __func__, 1);
2159 *pnbytes = 0;
2160 if (!sa)
2161 return ERROR_INT("sa not defined", __func__, 1);
2162
2163 /* Read the pdf files into memory */
2164 if ((npages = sarrayGetCount(sa)) == 0)
2165 return ERROR_INT("no filenames found", __func__, 1);
2166 pa_data = ptraCreate(npages);
2167 for (i = 0; i < npages; i++) {
2168 fname = sarrayGetString(sa, i, L_NOCOPY);
2169 bas = l_byteaInitFromFile(fname);
2170 ptraAdd(pa_data, bas);
2171 }
2172
2173 ret = ptraConcatenatePdfToData(pa_data, sa, pdata, pnbytes);
2174
2175 /* Cleanup: some pages could have been removed */
2176 ptraGetActualCount(pa_data, &npages);
2177 for (i = 0; i < npages; i++) {
2178 bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
2179 l_byteaDestroy(&bas);
2180 }
2181 ptraDestroy(&pa_data, FALSE, FALSE);
2182 return ret;
2183 }
2184
2185 /* --------------------------------------------*/
2186 #endif /* USE_PDFIO */
2187 /* --------------------------------------------*/