Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/leptonica/src/pdfio1.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/leptonica/src/pdfio1.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,2187 @@
+/*====================================================================*
+ -  Copyright (C) 2001 Leptonica.  All rights reserved.
+ -
+ -  Redistribution and use in source and binary forms, with or without
+ -  modification, are permitted provided that the following conditions
+ -  are met:
+ -  1. Redistributions of source code must retain the above copyright
+ -     notice, this list of conditions and the following disclaimer.
+ -  2. Redistributions in binary form must reproduce the above
+ -     copyright notice, this list of conditions and the following
+ -     disclaimer in the documentation and/or other materials
+ -     provided with the distribution.
+ -
+ -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
+ -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *====================================================================*/
+
+/*!
+ * \file pdfio1.c
+ * <pre>
+ *
+ *    Higher-level operations for generating pdf from images.
+ *    Use poppler's pdftoppm or pdfimages to invert the process,
+ *    extracting raster images from pdf.
+ *
+ *    |=============================================================|
+ *    |                        Important notes                      |
+ *    |=============================================================|
+ *    | Some of these functions require I/O libraries such as       |
+ *    | libtiff, libjpeg, libpng, libz and libopenjp2.  If you do   |
+ *    | not have these libraries, some calls will fail.  For        |
+ *    | example, if you do not have libopenjp2, you cannot write a  |
+ *    | pdf where transcoding is required to incorporate a          |
+ *    | jp2k image.                                                 |
+ *    |                                                             |
+ *    | You can manually deactivate all pdf writing by setting      |
+ *    | this in environ.h:                                          |
+ *    | \code                                                       |
+ *    |      #define  USE_PDFIO     0                               |
+ *    | \endcode                                                    |
+ *    | This will link the stub file pdfiostub.c.                   |
+ *    |=============================================================|
+ *
+ *     Set 1. These functions convert a set of image files
+ *     to a multi-page pdf file, with one image on each page.
+ *     All images are rendered at the same (input) resolution.
+ *     The images can be specified as being in a directory, or they
+ *     can be in an sarray.  The output pdf can be either a file
+ *     or an array of bytes in memory.
+ *
+ *     Set 2. These functions are a special case of set 1, where
+ *     no scaling or change in quality is required.  For jpeg, jp2k and
+ *     tiffg4 images, the bytes in each file can be directly incorporated
+ *     into the output pdf, and the wrapping up of multiple image
+ *     files is very fast.  For non-interlaced png, the data bytes
+ *     including the predictors can also be written directly into the
+ *     flate pdf data.  For other image formats transcoding is required,
+ *     where the image data is first decompressed and then flate (gzip),
+ *     DCT (jpeg) or tiffg4 (1 bpp) encodings are generated.
+ *
+ *     Set 3. These functions convert a set of images in memory
+ *     to a multi-page pdf, with one image on each page.  The pdf
+ *     output can be either a file or an array of bytes in memory.
+ *
+ *     Set 4. These functions implement a pdf output "device driver"
+ *     for wrapping (encoding) any number of images on a single page
+ *     in pdf.  The input can be either an image file or a Pix;
+ *     the pdf output can be either a file or an array of bytes in memory.
+ *
+ *     Set 5. These "segmented" functions take a set of image
+ *     files, along with optional segmentation information, and
+ *     generate a multi-page pdf file, where each page consists
+ *     in general of a mixed raster pdf of image and non-image regions.
+ *     The segmentation information for each page can be input as
+ *     either a mask over the image parts, or as a Boxa of those
+ *     regions.
+ *
+ *     Set 6. These "segmented" functions convert an image and
+ *     an optional Boxa of image regions into a mixed raster pdf file
+ *     for the page.  The input image can be either a file or a Pix.
+ *
+ *     Set 7. These functions take a set of single-page pdf files
+ *     and concatenate them into a multi-page pdf.  The input can be
+ *     a set of either single-page pdf files or pdf 'strings' in memory.
+ *     The output can be either a file or an array of bytes in memory.
+ *
+ *     The images in the pdf file can be rendered using a pdf viewer,
+ *     such as evince, gv, xpdf or acroread.
+ *
+ *     Reference on the pdf file format:
+ *         http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
+ *
+ *     1. Convert specified image files to pdf (one image file per page)
+ *          l_int32             convertFilesToPdf()
+ *          l_int32             saConvertFilesToPdf()
+ *          l_int32             saConvertFilesToPdfData()
+ *          l_int32             selectDefaultPdfEncoding()
+ *
+ *     2. Convert specified image files to pdf without scaling
+ *          l_int32             convertUnscaledFilesToPdf()
+ *          l_int32             saConvertUnscaledFilesToPdf()
+ *          l_int32             saConvertUnscaledFilesToPdfData()
+ *          l_int32             convertUnscaledToPdfData()
+ *
+ *     3. Convert multiple images to pdf (one image per page)
+ *          l_int32             pixaConvertToPdf()
+ *          l_int32             pixaConvertToPdfData()
+ *
+ *     4. Single page, multi-image converters
+ *          l_int32             convertToPdf()
+ *          l_int32             convertImageDataToPdf()
+ *          l_int32             convertToPdfData()
+ *          l_int32             convertImageDataToPdfData()
+ *          l_int32             pixConvertToPdf()
+ *          l_int32             pixWriteStreamPdf()
+ *          l_int32             pixWriteMemPdf()
+ *
+ *     5. Segmented multi-page, multi-image converter
+ *          l_int32             convertSegmentedFilesToPdf()
+ *          BOXAA              *convertNumberedMasksToBoxaa()
+ *
+ *     6. Segmented single page, multi-image converters
+ *          l_int32             convertToPdfSegmented()
+ *          l_int32             pixConvertToPdfSegmented()
+ *          l_int32             convertToPdfDataSegmented()
+ *          l_int32             pixConvertToPdfDataSegmented()
+ *
+ *     7. Multipage concatenation
+ *          l_int32             concatenatePdf()
+ *          l_int32             saConcatenatePdf()
+ *          l_int32             ptraConcatenatePdf()
+ *          l_int32             concatenatePdfToData()
+ *          l_int32             saConcatenatePdfToData()
+ *
+ *     The top-level multi-image functions can be visualized as follows:
+ *          Output pdf data to file:
+ *             convertToPdf()  and  convertImageDataToPdf()
+ *                     --> pixConvertToPdf()
+ *                           --> pixConvertToPdfData()
+ *
+ *          Output pdf data to array in memory:
+ *             convertToPdfData()  and  convertImageDataToPdfData()
+ *                     --> pixConvertToPdfData()
+ *
+ *     The top-level segmented image functions can be visualized as follows:
+ *          Output pdf data to file:
+ *             convertToPdfSegmented()
+ *                     --> pixConvertToPdfSegmented()
+ *                           --> pixConvertToPdfDataSegmented()
+ *
+ *          Output pdf data to array in memory:
+ *             convertToPdfDataSegmented()
+ *                     --> pixConvertToPdfDataSegmented()
+ *
+ *     For multi-page concatenation, there are three different types of input
+ *        (1) directory and optional filename filter
+ *        (2) sarray of filenames
+ *        (3) ptra of byte arrays of pdf data
+ *     and two types of output for the concatenated pdf data
+ *        (1) filename
+ *        (2) data array and size
+ *     High-level interfaces are given for each of the six combinations.
+ *
+ *     Note: When wrapping small images into pdf, it is useful to give
+ *     them a relatively low resolution value, to avoid rounding errors
+ *     when rendering the images.  For example, if you want an image
+ *     of width w pixels to be 5 inches wide on a screen, choose a
+ *     resolution w/5.
+ *
+ *     The very fast functions in section (2) require neither transcoding
+ *     nor parsing of the compressed jpeg file.  With three types of image
+ *     compression, the compressed strings can be incorporated into
+ *     the pdf data without decompression and re-encoding: jpeg, jp2k
+ *     and png.  The DCTDecode and JPXDecode filters can handle the
+ *     entire jpeg and jp2k encoded string as a byte array in the pdf file.
+ *     The FlateDecode filter can handle the png compressed image data,
+ *     including predictors that occur as the first byte in each
+ *     raster line, but it is necessary to store only the png IDAT chunk
+ *     data in the pdf array.  The alternative for wrapping png images
+ *     is to transcode them: uncompress into a raster (a pix) and then
+ *     gzip the raster data.  This typically results in a larger pdf file
+ *     because it doesn't use the two-dimensional png predictor.
+ *     Colormaps, which are found in png PLTE chunks, must always be
+ *     pulled out and included separately in the pdf.  For CCITT-G4
+ *     compression, you can not simply include a tiff G4 file -- you must
+ *     either parse it and extract the G4 compressed data within it,
+ *     or uncompress to a raster and G4 compress again.
+ * </pre>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config_auto.h>
+#endif  /* HAVE_CONFIG_H */
+
+#include <string.h>
+#include <math.h>
+#include "allheaders.h"
+
+/* --------------------------------------------*/
+#if  USE_PDFIO   /* defined in environ.h */
+ /* --------------------------------------------*/
+
+    /* Typical scan resolution in ppi (pixels/inch).
+     * NOTE(review): not referenced in the functions near the top of this
+     * file; presumably used further down as a fallback when no input
+     * resolution is available -- confirm against the callers. */
+static const l_int32  DefaultInputRes = 300;
+
+/*---------------------------------------------------------------------*
+ *    Convert specified image files to pdf (one image file per page)   *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   convertFilesToPdf()
+ *
+ * \param[in]    dirname       name of the directory holding the images
+ * \param[in]    substr        [optional] substring that a filename must
+ *                             contain to be selected; can be null
+ * \param[in]    res           input resolution of all images
+ * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
+ * \param[in]    type          encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                             L_FLATE_ENCODE, L_JP2K_ENCODE or
+ *                             L_DEFAULT_ENCODE for default)
+ * \param[in]    quality       for jpeg: 1-100; 0 for default (75)
+ *                             for jp2k: 27-45; 0 for default (34)
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) When %substr is given, only files whose names contain it
+ *          are used; with %substr == NULL every file in the directory
+ *          is used.
+ *      (2) After the optional filtering, the pathnames are put in
+ *          increasing lexical order, and the pages appear in the pdf
+ *          in that order.
+ *      (3) Each image is scaled by %scalefactor before it is encoded.
+ *          A value <= 0.0 is replaced by 1.0.
+ *      (4) Giving one of the four explicit encoding types for %type
+ *          compresses every image with that type.  Use 0
+ *          (L_DEFAULT_ENCODE) to have the type chosen per image from
+ *          its depth and whether or not it has a colormap.
+ *      (5) This is a thin wrapper: it builds the sorted list of
+ *          pathnames and hands it to saConvertFilesToPdf().
+ * </pre>
+ */
+l_ok
+convertFilesToPdf(const char  *dirname,
+                  const char  *substr,
+                  l_int32      res,
+                  l_float32    scalefactor,
+                  l_int32      type,
+                  l_int32      quality,
+                  const char  *title,
+                  const char  *fileout)
+{
+l_int32  status;
+SARRAY  *paths;
+
+    if (dirname == NULL)
+        return ERROR_INT("dirname not defined", __func__, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+        /* Gather the (optionally filtered) pathnames in sorted order */
+    paths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (paths == NULL)
+        return ERROR_INT("sa not made", __func__, 1);
+
+        /* Convert, then release the pathname array whatever the outcome */
+    status = saConvertFilesToPdf(paths, res, scalefactor, type, quality,
+                                 title, fileout);
+    sarrayDestroy(&paths);
+    return status;
+}
+
+
+/*!
+ * \brief   saConvertFilesToPdf()
+ *
+ * \param[in]    sa            string array of pathnames for images
+ * \param[in]    res           input resolution of all images
+ * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
+ * \param[in]    type          encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                             L_FLATE_ENCODE, L_JP2K_ENCODE or
+ *                             L_DEFAULT_ENCODE for default)
+ * \param[in]    quality       for jpeg: 1-100; 0 for default (75)
+ *                             for jp2k: 27-45; 0 for default (34)
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) See convertFilesToPdf().
+ *      (2) The pdf is first built in memory by saConvertFilesToPdfData()
+ *          and then written to %fileout in a single call.
+ *      (3) %fileout is validated before any image is read, so that a
+ *          missing output name is reported without first encoding
+ *          every page.
+ * </pre>
+ */
+l_ok
+saConvertFilesToPdf(SARRAY      *sa,
+                    l_int32      res,
+                    l_float32    scalefactor,
+                    l_int32      type,
+                    l_int32      quality,
+                    const char  *title,
+                    const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+    ret = saConvertFilesToPdfData(sa, res, scalefactor, type, quality,
+                                  title, &data, &nbytes);
+    if (ret) {
+            /* Release anything that may have been returned with the error */
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", __func__, 1);
+    }
+
+        /* The in-memory pdf is no longer needed once it has been written */
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", __func__);
+    return ret;
+}
+
+
+/*!
+ * \brief   saConvertFilesToPdfData()
+ *
+ * \param[in]    sa            string array of pathnames for images
+ * \param[in]    res           input resolution of all images
+ * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
+ * \param[in]    type          encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                             L_FLATE_ENCODE, L_JP2K_ENCODE or
+ *                             L_DEFAULT_ENCODE for default)
+ * \param[in]    quality       for jpeg: 1-100; 0 for default (75)
+ *                             for jp2k: 27-45; 0 for default (34)
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[out]   pdata         output pdf data (of all images)
+ * \param[out]   pnbytes       size of output pdf data
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) See convertFilesToPdf().
+ *      (2) A %scalefactor <= 0.0 is replaced by 1.0, and any %type that
+ *          is not one of the four explicit encodings is treated as
+ *          L_DEFAULT_ENCODE.
+ *      (3) A file that cannot be read, scaled or encoded is reported
+ *          and skipped; it is an error only if no page at all could
+ *          be generated.
+ *      (4) Progress is reported on stderr.
+ * </pre>
+ */
+l_ok
+saConvertFilesToPdfData(SARRAY      *sa,
+                        l_int32      res,
+                        l_float32    scalefactor,
+                        l_int32      type,
+                        l_int32      quality,
+                        const char  *title,
+                        l_uint8    **pdata,
+                        size_t      *pnbytes)
+{
+char     *fname;
+l_uint8  *imdata;
+l_int32   i, n, ret, pagetype, scaledres;
+l_int32   npages = 0;
+size_t    imbytes;
+L_BYTEA  *ba;
+PIX      *pixs, *pix;
+L_PTRA   *pa_data;
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (scalefactor <= 0.0) scalefactor = 1.0;
+    if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
+        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
+        type = L_DEFAULT_ENCODE;
+    }
+
+        /* Generate all the encoded pdf strings */
+    n = sarrayGetCount(sa);
+    if ((pa_data = ptraCreate(n)) == NULL)
+        return ERROR_INT("pa_data not made", __func__, 1);
+    for (i = 0; i < n; i++) {
+        if (i && (i % 10 == 0)) lept_stderr(".. %d ", i);
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        if ((pixs = pixRead(fname)) == NULL) {
+            L_ERROR("image not readable from file %s\n", __func__, fname);
+            continue;
+        }
+        if (scalefactor != 1.0)
+            pix = pixScale(pixs, scalefactor, scalefactor);
+        else
+            pix = pixClone(pixs);
+        pixDestroy(&pixs);
+        if (!pix) {
+            L_ERROR("scaled image not made for file %s\n", __func__, fname);
+            continue;
+        }
+        scaledres = (l_int32)(res * scalefactor);
+
+            /* Select the encoding type */
+        if (type != L_DEFAULT_ENCODE) {
+            pagetype = type;
+        } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
+            pixDestroy(&pix);
+            L_ERROR("encoding type selection failed for file %s\n",
+                    __func__, fname);
+            continue;
+        }
+
+            /* Encode this image as a single-page pdf.  The outputs are
+             * cleared first so the error path never frees a stale pointer. */
+        imdata = NULL;
+        imbytes = 0;
+        ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
+                                  0, 0, scaledres, title, NULL, 0);
+        pixDestroy(&pix);
+        if (ret) {
+            if (imdata) LEPT_FREE(imdata);
+            L_ERROR("pdf encoding failed for %s\n", __func__, fname);
+            continue;
+        }
+
+            /* Copy the page into a byte array owned by pa_data; if the
+             * copy cannot be stored, release it rather than leak it. */
+        ba = l_byteaInitFromMem(imdata, imbytes);
+        if (imdata) LEPT_FREE(imdata);
+        if (!ba) {
+            L_ERROR("byte array not made for %s\n", __func__, fname);
+            continue;
+        }
+        if (ptraAdd(pa_data, ba)) {
+            L_ERROR("page not stored for %s\n", __func__, fname);
+            l_byteaDestroy(&ba);
+        }
+    }
+    ptraGetActualCount(pa_data, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", __func__);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate them */
+    lept_stderr("\nconcatenating ... ");
+    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
+    lept_stderr("done\n");
+
+        /* Destroy the per-page byte arrays, then the (now empty) ptra */
+    ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
+    for (i = 0; i < npages; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+    return ret;
+}
+
+
+/*!
+ * \brief   selectDefaultPdfEncoding()
+ *
+ * \param[in]    pix
+ * \param[out]   ptype     L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This attempts to choose an encoding for the pix that results
+ *          in the smallest file, assuming that if jpeg encoded, it will
+ *          use quality = 75.  The decision is approximate, in that
+ *          (a) all colormapped images will be losslessly encoded with
+ *          gzip (flate), and (b) an image with less than about 20 colors
+ *          is likely to be smaller if flate encoded than if encoded
+ *          as a jpeg (dct).  For example, an image made by pixScaleToGray3()
+ *          will have 10 colors, and flate encoding will give about
+ *          twice the compression as jpeg with quality = 75.
+ *      (2) We could have used L_JP2K_ENCODE instead of L_JPEG_ENCODE.
+ *          However, the jp2k compression is not much better than jpeg, and
+ *          the jpeg library is more commonly available than the jp2k library.
+ *      (3) The selection is:
+ *             8 bpp, no colormap:  flate if < 20 colors, else jpeg
+ *             1 bpp:               g4
+ *             colormapped, 2 bpp or 4 bpp:  flate
+ *             32 bpp:              jpeg
+ *             16 bpp:              flate
+ *          Any other depth is an error.
+ *      (4) %ptype is set to L_FLATE_ENCODE before %pix is examined, so
+ *          it holds that value when an error is returned.
+ *      (5) The pixel count is computed in floating point, so that
+ *          w * h cannot overflow a 32-bit integer for very large images.
+ * </pre>
+ */
+l_ok
+selectDefaultPdfEncoding(PIX      *pix,
+                         l_int32  *ptype)
+{
+l_int32   w, h, d, factor;
+l_int32   ncolors = 0;  /* stays 0 (--> flate) if the count is not made */
+PIXCMAP  *cmap;
+
+    if (!ptype)
+        return ERROR_INT("&type not defined", __func__, 1);
+    *ptype = L_FLATE_ENCODE;  /* default universal encoding */
+    if (!pix)
+        return ERROR_INT("pix not defined", __func__, 1);
+    pixGetDimensions(pix, &w, &h, &d);
+    cmap = pixGetColormap(pix);
+    if (d == 8 && !cmap) {
+            /* The factor grows as the square root of (pixels / 20000);
+             * presumably it is a subsampling factor for the color count
+             * -- confirm against pixNumColors().  Multiply as doubles:
+             * w * h in l_int32 overflows for large images. */
+        factor = L_MAX(1, (l_int32)sqrt((l_float64)w * (l_float64)h
+                                        / 20000.));
+        pixNumColors(pix, factor, &ncolors);
+        if (ncolors < 20)
+            *ptype = L_FLATE_ENCODE;
+        else
+            *ptype = L_JPEG_ENCODE;
+    } else if (d == 1) {
+        *ptype = L_G4_ENCODE;
+    } else if (cmap || d == 2 || d == 4) {
+        *ptype = L_FLATE_ENCODE;
+    } else if (d == 32) {
+            /* 8 bpp never gets here: it is handled above, both with
+             * and without a colormap. */
+        *ptype = L_JPEG_ENCODE;
+    } else if (d == 16) {
+        *ptype = L_FLATE_ENCODE;
+    } else {
+        return ERROR_INT("type selection failure", __func__, 1);
+    }
+
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *          Convert specified image files to pdf without scaling       *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   convertUnscaledFilesToPdf()
+ *
+ * \param[in]    dirname   name of the directory holding the images
+ * \param[in]    substr    [optional] substring that a filename must
+ *                         contain to be selected; can be null
+ * \param[in]    title     [optional] pdf title; can be null
+ * \param[in]    fileout   pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) When %substr is given, only files whose names contain it
+ *          are used; with %substr == NULL every file in the directory
+ *          is used.
+ *      (2) After the optional filtering, the pathnames are put in
+ *          increasing lexical order, and the pages appear in the pdf
+ *          in that order.
+ *      (3) For jpeg, jp2k and some png files this is very fast, because
+ *          the compressed data is simply wrapped up and concatenated.
+ *          Other kinds of png must be read and recompressed.
+ *      (4) This is a thin wrapper: it builds the sorted list of
+ *          pathnames and hands it to saConvertUnscaledFilesToPdf().
+ * </pre>
+ */
+l_ok
+convertUnscaledFilesToPdf(const char  *dirname,
+                          const char  *substr,
+                          const char  *title,
+                          const char  *fileout)
+{
+l_int32  status;
+SARRAY  *paths;
+
+    if (dirname == NULL)
+        return ERROR_INT("dirname not defined", __func__, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+        /* Gather the (optionally filtered) pathnames in sorted order */
+    paths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (paths == NULL)
+        return ERROR_INT("sa not made", __func__, 1);
+
+        /* Convert, then release the pathname array whatever the outcome */
+    status = saConvertUnscaledFilesToPdf(paths, title, fileout);
+    sarrayDestroy(&paths);
+    return status;
+}
+
+
+/*!
+ * \brief   saConvertUnscaledFilesToPdf()
+ *
+ * \param[in]    sa        string array of pathnames for images
+ * \param[in]    title     [optional] pdf title; can be null
+ * \param[in]    fileout   pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) See convertUnscaledFilesToPdf().
+ *      (2) The pdf is first built in memory by
+ *          saConvertUnscaledFilesToPdfData() and then written to
+ *          %fileout in a single call.
+ *      (3) %fileout is validated before any image is processed, so that
+ *          a missing output name is reported without first wrapping
+ *          every page.
+ * </pre>
+ */
+l_ok
+saConvertUnscaledFilesToPdf(SARRAY      *sa,
+                            const char  *title,
+                            const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+    ret = saConvertUnscaledFilesToPdfData(sa, title, &data, &nbytes);
+    if (ret) {
+            /* Release anything that may have been returned with the error */
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", __func__, 1);
+    }
+
+        /* The in-memory pdf is no longer needed once it has been written */
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", __func__);
+    return ret;
+}
+
+
+/*!
+ * \brief   saConvertUnscaledFilesToPdfData()
+ *
+ * \param[in]    sa        string array of pathnames for image files
+ * \param[in]    title     [optional] pdf title; can be null
+ * \param[out]   pdata     output pdf data (of all images)
+ * \param[out]   pnbytes   size of output pdf data
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is very fast for jpeg, jp2k and some png files,
+ *          because the compressed data is wrapped up and concatenated.
+ *          For other types of png, the images must be read and recompressed.
+ *      (2) A file for which no single-page pdf can be generated is
+ *          skipped; it is an error only if no page at all could be
+ *          generated.
+ *      (3) Progress is reported on stderr.
+ * </pre>
+ */
+l_ok
+saConvertUnscaledFilesToPdfData(SARRAY      *sa,
+                                const char  *title,
+                                l_uint8    **pdata,
+                                size_t      *pnbytes)
+{
+char         *fname;
+l_uint8      *imdata;
+l_int32       i, n, ret;
+l_int32       npages = 0;
+size_t        imbytes;
+L_BYTEA      *ba;
+L_PTRA       *pa_data;
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+
+        /* Generate all the encoded pdf strings */
+    n = sarrayGetCount(sa);
+    if ((pa_data = ptraCreate(n)) == NULL)
+        return ERROR_INT("pa_data not made", __func__, 1);
+    for (i = 0; i < n; i++) {
+        if (i && (i % 10 == 0)) lept_stderr(".. %d ", i);
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+
+            /* Generate the pdf data */
+        imdata = NULL;
+        imbytes = 0;
+        if (convertUnscaledToPdfData(fname, title, &imdata, &imbytes))
+            continue;
+
+            /* ... and add it to the array of single page data.  If the
+             * copy cannot be made or stored, skip the page without
+             * leaking the byte array. */
+        ba = l_byteaInitFromMem(imdata, imbytes);
+        if (imdata) LEPT_FREE(imdata);
+        if (!ba) {
+            L_ERROR("byte array not made for %s\n", __func__, fname);
+            continue;
+        }
+        if (ptraAdd(pa_data, ba)) {
+            L_ERROR("page not stored for %s\n", __func__, fname);
+            l_byteaDestroy(&ba);
+        }
+    }
+    ptraGetActualCount(pa_data, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", __func__);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate to generate a multipage pdf */
+    lept_stderr("\nconcatenating ... ");
+    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
+    lept_stderr("done\n");
+
+        /* Clean up: destroy the per-page byte arrays, then the ptra */
+    ptraGetActualCount(pa_data, &npages);  /* maybe failed to read some files */
+    for (i = 0; i < npages; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+    return ret;
+}
+
+
+/*!
+ * \brief   convertUnscaledToPdfData()
+ *
+ * \param[in]    fname      of image file in all formats
+ * \param[in]    title      [optional] pdf title; can be null
+ * \param[out]   pdata      output pdf data for image
+ * \param[out]   pnbytes    size of output pdf data
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is very fast for jpeg, jp2k and some png files,
+ *          because the compressed data is wrapped up and concatenated.
+ *          For other types of png, the images must be read and recompressed.
+ *      (2) Files of unknown format, and files in the IFF_PS and
+ *          IFF_LPDF formats, are skipped with a warning and an error
+ *          return.
+ *      (3) A failure to generate the pdf string from the compressed
+ *          image data is returned as an error, and in that case
+ *          %pdata is NULL and %pnbytes is 0.
+ * </pre>
+ */
+l_ok
+convertUnscaledToPdfData(const char  *fname,
+                         const char  *title,
+                         l_uint8    **pdata,
+                         size_t      *pnbytes)
+{
+l_int32       format = IFF_UNKNOWN;
+L_COMP_DATA  *cid = NULL;
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (!fname)
+        return ERROR_INT("fname not defined", __func__, 1);
+
+    findFileFormat(fname, &format);
+    if (format == IFF_UNKNOWN) {
+        L_WARNING("file %s format is unknown; skip\n", __func__, fname);
+        return 1;
+    }
+    if (format == IFF_PS || format == IFF_LPDF) {
+        L_WARNING("file %s format is %d; skip\n", __func__, fname, format);
+        return 1;
+    }
+
+        /* Generate the image data required for pdf generation, always
+         * in binary (not ascii85) coding.  Note that jpeg, jp2k and some
+         * png files are not transcoded.  */
+    l_generateCIDataForPdf(fname, NULL, 0, &cid);
+    if (!cid) {
+        L_ERROR("file %s format is %d; unreadable\n", __func__, fname, format);
+        return 1;
+    }
+
+        /* Generate the pdf string for this page (image).  This destroys
+         * the cid by attaching it to an lpd and destroying the lpd.
+         * Do not report success if the pdf string was not made. */
+    if (cidConvertToPdfData(cid, title, pdata, pnbytes)) {
+        if (*pdata) {
+            LEPT_FREE(*pdata);
+            *pdata = NULL;
+        }
+        *pnbytes = 0;
+        L_ERROR("pdf data not made for file %s\n", __func__, fname);
+        return 1;
+    }
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *          Convert multiple images to pdf (one image per page)        *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   pixaConvertToPdf()
+ *
+ * \param[in]    pixa          containing images all at the same resolution
+ * \param[in]    res           override the resolution of each input image,
+ *                             in ppi; use 0 to respect the resolution
+ *                             embedded in the input images
+ * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
+ * \param[in]    type          encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                             L_FLATE_ENCODE, L_JP2K_ENCODE, or
+ *                             L_DEFAULT_ENCODE for default)
+ * \param[in]    quality       for jpeg: 1-100; 0 for default (75)
+ *                             for jp2k: 27-45; 0 for default (34)
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
+ *          colormap and many colors, or 32 bpp; FLATE for anything else.
+ *      (2) The scalefactor must be > 0.0; otherwise it is set to 1.0.
+ *      (3) Specifying one of the four encoding types for %type forces
+ *          all images to be compressed with that type.  Use 0
+ *          (L_DEFAULT_ENCODE) to have the type determined for each
+ *          image based on depth and whether or not it has a colormap.
+ *      (4) The pdf is first built in memory by pixaConvertToPdfData()
+ *          and then written to %fileout in a single call.  %fileout is
+ *          validated before any image is encoded.
+ * </pre>
+ */
+l_ok
+pixaConvertToPdf(PIXA        *pixa,
+                 l_int32      res,
+                 l_float32    scalefactor,
+                 l_int32      type,
+                 l_int32      quality,
+                 const char  *title,
+                 const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    if (!pixa)
+        return ERROR_INT("pixa not defined", __func__, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+    ret = pixaConvertToPdfData(pixa, res, scalefactor, type, quality,
+                               title, &data, &nbytes);
+    if (ret) {
+            /* Release anything that may have been returned with the error */
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("conversion to pdf failed", __func__, 1);
+    }
+
+        /* The in-memory pdf is no longer needed once it has been written */
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", __func__);
+    return ret;
+}
+
+
+/*!
+ * \brief   pixaConvertToPdfData()
+ *
+ * \param[in]    pixa           containing images all at the same resolution
+ * \param[in]    res            input resolution of all images
+ * \param[in]    scalefactor    scaling factor applied to each image;
+ *                              > 0.0 and < 50.0
+ * \param[in]    type           encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                              L_FLATE_ENCODE, L_JP2K_ENCODE, or
+ *                              L_DEFAULT_ENCODE for default)
+ * \param[in]    quality        for jpeg: 1-100; 0 for default (75)
+ *                              for jp2k: 27-45; 0 for default (34)
+ * \param[in]    title          [optional] pdf title; can be null
+ * \param[out]   pdata          output pdf data of all images
+ * \param[out]   pnbytes        size of output pdf data
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) See pixaConvertToPdf().
+ *      (2) *pdata is set to NULL and *pnbytes to 0 before any other
+ *          argument is validated.
+ *      (3) A %scalefactor <= 0.0 is replaced by 1.0; a value >= 50.0
+ *          is an error.  A %quality outside [0 ... 100] is an error.
+ *      (4) An unrecognized %type gives a warning and is replaced by
+ *          L_DEFAULT_ENCODE, in which case the encoding is selected
+ *          separately for each image.
+ *      (5) An image that cannot be retrieved, scaled or encoded is
+ *          skipped with an error message.  The function fails only
+ *          if no image at all was encoded, or if the concatenation
+ *          of the encoded images fails.
+ * </pre>
+ */
+l_ok
+pixaConvertToPdfData(PIXA        *pixa,
+                     l_int32      res,
+                     l_float32    scalefactor,
+                     l_int32      type,
+                     l_int32      quality,
+                     const char  *title,
+                     l_uint8    **pdata,
+                     size_t      *pnbytes)
+{
+l_uint8  *imdata;
+l_int32   i, n, ret, scaledres, pagetype;
+size_t    imbytes;
+L_BYTEA  *ba;
+PIX      *pixs, *pix;
+L_PTRA   *pa_data;
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (!pixa)
+        return ERROR_INT("pixa not defined", __func__, 1);
+    if (scalefactor <= 0.0) scalefactor = 1.0;
+    if (scalefactor >= 50.0)
+        return ERROR_INT("scalefactor too large", __func__, 1);
+    if (type != L_DEFAULT_ENCODE && type != L_JPEG_ENCODE &&
+        type != L_G4_ENCODE && type != L_FLATE_ENCODE &&
+        type != L_JP2K_ENCODE) {
+        L_WARNING("invalid compression type; using per-page default\n",
+                  __func__);
+        type = L_DEFAULT_ENCODE;
+    }
+    if (quality < 0 || quality > 100)
+        return ERROR_INT("invalid quality", __func__, 1);
+
+        /* Generate all the encoded pdf strings */
+    n = pixaGetCount(pixa);
+    if ((pa_data = ptraCreate(n)) == NULL)
+        return ERROR_INT("pa_data not made", __func__, 1);
+    scaledres = (l_int32)(res * scalefactor);  /* same for every image */
+    for (i = 0; i < n; i++) {
+        if ((pixs = pixaGetPix(pixa, i, L_CLONE)) == NULL) {
+            L_ERROR("pixs[%d] not retrieved\n", __func__, i);
+            continue;
+        }
+        if (scalefactor != 1.0)
+            pix = pixScale(pixs, scalefactor, scalefactor);
+        else
+            pix = pixClone(pixs);
+        pixDestroy(&pixs);
+        if (!pix) {
+            L_ERROR("pix[%d] not made\n", __func__, i);
+            continue;
+        }
+
+            /* Select the encoding type */
+        if (type != L_DEFAULT_ENCODE) {
+            pagetype = type;
+        } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
+            L_ERROR("encoding type selection failed for pix[%d]\n",
+                        __func__, i);
+            pixDestroy(&pix);
+            continue;
+        }
+
+            /* Start from a known state, so that the free on the
+             * error path does not depend on what the encoder did. */
+        imdata = NULL;
+        imbytes = 0;
+        ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
+                                  0, 0, scaledres, title, NULL, 0);
+        pixDestroy(&pix);
+        if (ret) {
+            LEPT_FREE(imdata);
+            L_ERROR("pdf encoding failed for pix[%d]\n", __func__, i);
+            continue;
+        }
+
+            /* The bytes are copied into a byte array for the ptra;
+             * do not store a null item if that copy fails. */
+        ba = l_byteaInitFromMem(imdata, imbytes);
+        LEPT_FREE(imdata);
+        if (!ba) {
+            L_ERROR("ba not made for pix[%d]\n", __func__, i);
+            continue;
+        }
+        ptraAdd(pa_data, ba);
+    }
+    ptraGetActualCount(pa_data, &n);
+    if (n == 0) {
+        L_ERROR("no pdf files made\n", __func__);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate them */
+    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
+
+        /* Destroy the per-image byte arrays, then the (empty) ptra */
+    ptraGetActualCount(pa_data, &n);  /* recalculate in case it changes */
+    for (i = 0; i < n; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+    return ret;
+}
+
+
+/*---------------------------------------------------------------------*
+ *                Single page, multi-image converters                  *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   convertToPdf()
+ *
+ * \param[in]      filein       input image file -- any format
+ * \param[in]      type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                              L_FLATE_ENCODE, or L_JP2K_ENCODE)
+ * \param[in]      quality      for jpeg: 1-100; 0 for default (75)
+ *                              for jp2k: 27-45; 0 for default (34)
+ * \param[in]      fileout      output pdf file; only required on last
+ *                              image on page
+ * \param[in]      x, y         location of lower-left corner of image,
+ *                              in pixels, relative to the PostScript origin
+ *                              (0,0) at the lower-left corner of the page
+ * \param[in]      res          override the resolution of the input image,
+ *                              in ppi; use 0 to respect the resolution
+ *                              embedded in the input images
+ * \param[in]      title        [optional] pdf title; can be null
+ * \param[in,out]  plpd         ptr to lpd, which is created on the first
+ *                              invocation and returned until last image is
+ *                              processed, at which time it is destroyed
+ * \param[in]      position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                              L_LAST_IMAGE
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) To wrap only one image in pdf, input %plpd = NULL, and
+ *          the value of %position will be ignored:
+ *            convertToPdf(filein, type, quality, fileout, x, y, res,
+ *                         title, NULL, 0);
+ *      (2) To wrap multiple images on a single pdf page, this is called
+ *          once for each successive image.  Do it this way:
+ *            L_PDF_DATA   *lpd;
+ *            convertToPdf(file1, type, quality, NULL, x, y, res,
+ *                         title, &lpd, L_FIRST_IMAGE);
+ *            convertToPdf(file2, type, quality, NULL, x, y, res,
+ *                         title, &lpd, L_NEXT_IMAGE);
+ *            ...
+ *            convertToPdf(fileN, type, quality, fileout, x, y, res,
+ *                         title, &lpd, L_LAST_IMAGE);
+ *          The pdf is written to the %fileout given in the last call
+ *          (%position == L_LAST_IMAGE).  On the earlier calls %fileout
+ *          is not used by this function and may be NULL.
+ *          On the last call: the pdf data bytes are computed and written
+ *          to %fileout, lpd is destroyed internally, and the returned
+ *          value of lpd is null.  So the client has nothing to clean up.
+ *      (3) (a) Set %res == 0 to respect the resolution embedded in the
+ *              image file.  If no resolution is embedded, it will be set
+ *              to the default value.
+ *          (b) Set %res to some other value to override the file resolution.
+ *      (4) (a) If the input %res and the resolution of the output device
+ *              are equal, the image will be "displayed" at the same size
+ *              as the original.
+ *          (b) If the input %res is 72, the output device will render
+ *              the image at 1 pt/pixel.
+ *          (c) Some possible choices for the default input pix resolution are:
+ *                 72 ppi     Render pix on any output device at one pt/pixel
+ *                 96 ppi     Windows default for generated display images
+ *                300 ppi     Typical default for scanned images.
+ *              We choose 300, which is sensible for rendering page images.
+ *              However,  images come from a variety of sources, and
+ *              some are explicitly created for viewing on a display.
+ * </pre>
+ */
+l_ok
+convertToPdf(const char   *filein,
+             l_int32       type,
+             l_int32       quality,
+             const char   *fileout,
+             l_int32       x,
+             l_int32       y,
+             l_int32       res,
+             const char   *title,
+             L_PDF_DATA  **plpd,
+             l_int32       position)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    if (!filein)
+        return ERROR_INT("filein not defined", __func__, 1);
+        /* The file is written for a single image (no plpd) or for
+         * the last image of a sequence; only then is fileout needed. */
+    if (!plpd || (position == L_LAST_IMAGE)) {
+        if (!fileout)
+            return ERROR_INT("fileout not defined", __func__, 1);
+    }
+
+    if (convertToPdfData(filein, type, quality, &data, &nbytes, x, y,
+                         res, title, plpd, position)) {
+            /* Release anything handed back, as pixConvertToPdf() does */
+        LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", __func__, 1);
+    }
+
+    if (!plpd || (position == L_LAST_IMAGE)) {
+        ret = l_binaryWrite(fileout, "w", data, nbytes);
+        LEPT_FREE(data);
+        if (ret)
+            return ERROR_INT("pdf data not written to file", __func__, 1);
+    }
+
+    return 0;
+}
+
+
+/*!
+ * \brief   convertImageDataToPdf()
+ *
+ * \param[in]      imdata       array of formatted image data; e.g., png, jpeg
+ * \param[in]      size         size of image data
+ * \param[in]      type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                              L_FLATE_ENCODE, or L_JP2K_ENCODE)
+ * \param[in]      quality      for jpeg: 1-100; 0 for default (75)
+ *                              for jp2k: 27-45; 0 for default (34)
+ * \param[in]      fileout      output pdf file; only required on last
+ *                              image on page
+ * \param[in]      x, y         location of lower-left corner of image,
+ *                              in pixels, relative to the PostScript origin
+ *                              (0,0) at the lower-left corner of the page
+ * \param[in]      res          override the resolution of the input image,
+ *                              in ppi; use 0 to respect the resolution
+ *                              embedded in the input images
+ * \param[in]      title        [optional] pdf title; can be null
+ * \param[in,out]  plpd         ptr to lpd, which is created on the first
+ *                              invocation and returned until last image is
+ *                              processed, at which time it is destroyed
+ * \param[in]      position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                              L_LAST_IMAGE
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) If %res == 0 and the input resolution field is 0,
+ *          this will use DefaultInputRes.
+ *      (2) The image data is decoded into a pix, which is then handed
+ *          to pixConvertToPdf().  If %type is not one of the four
+ *          encodings listed above, selectDefaultPdfEncoding() is asked
+ *          to choose one for the decoded pix.
+ *      (3) See comments in convertToPdf().
+ * </pre>
+ */
+l_ok
+convertImageDataToPdf(l_uint8      *imdata,
+                      size_t        size,
+                      l_int32       type,
+                      l_int32       quality,
+                      const char   *fileout,
+                      l_int32       x,
+                      l_int32       y,
+                      l_int32       res,
+                      const char   *title,
+                      L_PDF_DATA  **plpd,
+                      l_int32       position)
+{
+l_int32  status, known_type, must_write;
+PIX     *pixin;
+
+    if (imdata == NULL)
+        return ERROR_INT("image data not defined", __func__, 1);
+
+        /* An output file is needed for a single image or the last one */
+    must_write = (plpd == NULL) || (position == L_LAST_IMAGE);
+    if (must_write && fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+    pixin = pixReadMem(imdata, size);
+    if (pixin == NULL)
+        return ERROR_INT("pix not read", __func__, 1);
+
+        /* Fall back to a default encoding for an unrecognized type */
+    known_type = (type == L_JPEG_ENCODE || type == L_G4_ENCODE ||
+                  type == L_FLATE_ENCODE || type == L_JP2K_ENCODE);
+    if (!known_type)
+        selectDefaultPdfEncoding(pixin, &type);
+
+    status = pixConvertToPdf(pixin, type, quality, fileout, x, y, res,
+                             title, plpd, position);
+    pixDestroy(&pixin);
+    return status;
+}
+
+
+/*!
+ * \brief   convertToPdfData()
+ *
+ * \param[in]      filein       input image file -- any format
+ * \param[in]      type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                              L_FLATE_ENCODE, or L_JP2K_ENCODE)
+ * \param[in]      quality      for jpeg: 1-100; 0 for default (75)
+ *                              for jp2k: 27-45; 0 for default (34)
+ * \param[out]     pdata        pdf data in memory
+ * \param[out]     pnbytes      number of bytes in pdf data
+ * \param[in]      x, y         location of lower-left corner of image,
+ *                              in pixels, relative to the PostScript origin
+ *                              (0,0) at the lower-left corner of the page
+ * \param[in]      res          override the resolution of the input image,
+ *                              in ppi; use 0 to respect the resolution
+ *                              embedded in the input images
+ * \param[in]      title        [optional] pdf title; can be null
+ * \param[in,out]  plpd         ptr to lpd, which is created on the first
+ *                              invocation and returned until last image is
+ *                              processed, at which time it is destroyed
+ * \param[in]      position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                              L_LAST_IMAGE
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) If %res == 0 and the input resolution field is 0,
+ *          this will use DefaultInputRes.
+ *      (2) See comments in convertToPdf().
+ *      (3) *pdata is set to NULL and *pnbytes to 0 on entry.  If the
+ *          image cannot be read or the pdf encoding fails, 1 is
+ *          returned and *pdata is NULL with *pnbytes 0.
+ * </pre>
+ */
+l_ok
+convertToPdfData(const char   *filein,
+                 l_int32       type,
+                 l_int32       quality,
+                 l_uint8     **pdata,
+                 size_t       *pnbytes,
+                 l_int32       x,
+                 l_int32       y,
+                 l_int32       res,
+                 const char   *title,
+                 L_PDF_DATA  **plpd,
+                 l_int32       position)
+{
+l_int32  ret;
+PIX     *pix;
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (!filein)
+        return ERROR_INT("filein not defined", __func__, 1);
+
+    if ((pix = pixRead(filein)) == NULL)
+        return ERROR_INT("pix not made", __func__, 1);
+
+        /* Propagate an encoding failure instead of reporting success */
+    ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
+                              x, y, res, title, plpd, position);
+    pixDestroy(&pix);
+    if (ret) {
+            /* Hand back clean outputs, so a caller that does not
+             * free on error cannot leak or use a partial result. */
+        LEPT_FREE(*pdata);
+        *pdata = NULL;
+        *pnbytes = 0;
+        return ERROR_INT("pdf data not made", __func__, 1);
+    }
+    return 0;
+}
+
+
+/*!
+ * \brief   convertImageDataToPdfData()
+ *
+ * \param[in]      imdata     array of formatted image data; e.g., png, jpeg
+ * \param[in]      size       size of image data
+ * \param[in]      type       encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                            L_FLATE_ENCODE, or L_JP2K_ENCODE)
+ * \param[in]      quality    for jpeg: 1-100; 0 for default (75)
+ *                            for jp2k: 27-45; 0 for default (34)
+ * \param[out]     pdata      pdf data in memory
+ * \param[out]     pnbytes    number of bytes in pdf data
+ * \param[in]      x, y       location of lower-left corner of image,
+ *                            in pixels, relative to the PostScript origin
+ *                            (0,0) at the lower-left corner of the page
+ * \param[in]      res        override the resolution of the input image,
+ *                            in ppi; use 0 to respect the resolution
+ *                            embedded in the input images
+ * \param[in]      title      [optional] pdf title; can be null
+ * \param[in,out]  plpd       ptr to lpd, which is created on the first
+ *                            invocation and returned until last image is
+ *                            processed, at which time it is destroyed
+ * \param[in]      position   in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                            L_LAST_IMAGE
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) If %res == 0 and the input resolution field is 0,
+ *          this will use DefaultInputRes.
+ *      (2) *pdata is set to NULL and *pnbytes to 0 on entry.  When
+ *          %plpd is given and %position == L_FIRST_IMAGE, *plpd is
+ *          also reset to NULL before the image is decoded.
+ *      (3) If %type is not one of the four encodings listed above,
+ *          selectDefaultPdfEncoding() is asked to choose one.
+ *      (4) See comments in convertToPdf().
+ * </pre>
+ */
+l_ok
+convertImageDataToPdfData(l_uint8      *imdata,
+                          size_t        size,
+                          l_int32       type,
+                          l_int32       quality,
+                          l_uint8     **pdata,
+                          size_t       *pnbytes,
+                          l_int32       x,
+                          l_int32       y,
+                          l_int32       res,
+                          const char   *title,
+                          L_PDF_DATA  **plpd,
+                          l_int32       position)
+{
+l_int32  status, known_type;
+PIX     *pixin;
+
+        /* Clear each output as soon as it is known to be usable */
+    if (pdata == NULL)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (pnbytes == NULL)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (imdata == NULL)
+        return ERROR_INT("image data not defined", __func__, 1);
+
+        /* The first image of a sequence starts with no lpd */
+    if (plpd != NULL && position == L_FIRST_IMAGE)
+        *plpd = NULL;
+
+    pixin = pixReadMem(imdata, size);
+    if (pixin == NULL)
+        return ERROR_INT("pix not read", __func__, 1);
+
+        /* Fall back to a default encoding for an unrecognized type */
+    known_type = (type == L_JPEG_ENCODE || type == L_G4_ENCODE ||
+                  type == L_FLATE_ENCODE || type == L_JP2K_ENCODE);
+    if (!known_type)
+        selectDefaultPdfEncoding(pixin, &type);
+
+    status = pixConvertToPdfData(pixin, type, quality, pdata, pnbytes,
+                                 x, y, res, title, plpd, position);
+    pixDestroy(&pixin);
+    return status;
+}
+
+
+/*!
+ * \brief   pixConvertToPdf()
+ *
+ * \param[in]      pix
+ * \param[in]      type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                              L_FLATE_ENCODE, L_JP2K_ENCODE)
+ * \param[in]      quality      for jpeg: 1-100; 0 for default (75)
+ *                              for jp2k: 27-45; 0 for default (34)
+ * \param[in]      fileout      output pdf file; only required on last
+ *                              image on page
+ * \param[in]      x, y         location of lower-left corner of image,
+ *                              in pixels, relative to the PostScript origin
+ *                              (0,0) at the lower-left corner of the page
+ * \param[in]      res          override the resolution of the input image,
+ *                              in ppi; use 0 to respect the resolution
+ *                              embedded in the input images
+ * \param[in]      title        [optional] pdf title; can be null
+ * \param[in,out]  plpd         ptr to lpd, which is created on the first
+ *                              invocation and returned until last image is
+ *                              processed, at which time it is destroyed
+ * \param[in]      position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                              L_LAST_IMAGE
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) If %res == 0 and the input resolution field is 0,
+ *          this will use DefaultInputRes.
+ *      (2) Data is written to %fileout only when %plpd is NULL (a
+ *          single image) or %position == L_LAST_IMAGE (the last image
+ *          to be put on the page).  In all other cases %fileout is
+ *          neither required nor used.
+ *      (3) See comments in convertToPdf().
+ * </pre>
+ */
+l_ok
+pixConvertToPdf(PIX          *pix,
+                l_int32       type,
+                l_int32       quality,
+                const char   *fileout,
+                l_int32       x,
+                l_int32       y,
+                l_int32       res,
+                const char   *title,
+                L_PDF_DATA  **plpd,
+                l_int32       position)
+{
+l_uint8  *pdfbuf;
+l_int32   status, must_write;
+size_t    pdfsize;
+
+    if (pix == NULL)
+        return ERROR_INT("pix not defined", __func__, 1);
+
+        /* Decide once whether this call produces the output file */
+    must_write = (plpd == NULL) || (position == L_LAST_IMAGE);
+    if (must_write && fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+    status = pixConvertToPdfData(pix, type, quality, &pdfbuf, &pdfsize,
+                                 x, y, res, title, plpd, position);
+    if (status != 0) {
+        LEPT_FREE(pdfbuf);
+        return ERROR_INT("pdf data not made", __func__, 1);
+    }
+
+        /* Nothing to write for an intermediate image */
+    if (!must_write)
+        return 0;
+
+    status = l_binaryWrite(fileout, "w", pdfbuf, pdfsize);
+    LEPT_FREE(pdfbuf);
+    if (status != 0)
+        return ERROR_INT("pdf data not written to file", __func__, 1);
+    return 0;
+}
+
+
+/*!
+ * \brief   pixWriteStreamPdf()
+ *
+ * \param[in]    fp       file stream opened for writing
+ * \param[in]    pix      all depths, cmap OK
+ * \param[in]    res      override the resolution of the input image, in ppi;
+ *                        use 0 to respect the resolution embedded in the input
+ * \param[in]    title    [optional] pdf title; can be null
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is the simplest interface for writing a single image
+ *          with pdf encoding to a stream.  The pdf is generated in
+ *          memory by pixWriteMemPdf(), which also chooses the encoding,
+ *          and is then written to %fp with a single fwrite().
+ *      (2) A short write is reported as an error.  The stream is
+ *          neither flushed nor closed here.
+ * </pre>
+ */
+l_ok
+pixWriteStreamPdf(FILE        *fp,
+                  PIX         *pix,
+                  l_int32      res,
+                  const char  *title)
+{
+l_uint8  *pdfbuf;
+l_int32   complete;
+size_t    pdfsize, nwritten;
+
+    if (fp == NULL)
+        return ERROR_INT("stream not opened", __func__, 1);
+    if (pix == NULL)
+        return ERROR_INT("pix not defined", __func__, 1);
+
+    if (pixWriteMemPdf(&pdfbuf, &pdfsize, pix, res, title)) {
+        LEPT_FREE(pdfbuf);
+        return ERROR_INT("pdf data not made", __func__, 1);
+    }
+
+        /* The buffer is released before the write result is examined */
+    nwritten = fwrite(pdfbuf, 1, pdfsize, fp);
+    complete = (nwritten == pdfsize);
+    LEPT_FREE(pdfbuf);
+    if (!complete)
+        return ERROR_INT("failure writing pdf data to stream", __func__, 1);
+    return 0;
+}
+
+
+/*!
+ * \brief   pixWriteMemPdf()
+ *
+ * \param[out]   pdata      pdf as byte array
+ * \param[out]   pnbytes    number of bytes in pdf array
+ * \param[in]    pix        all depths, cmap OK
+ * \param[in]    res        override the resolution of the input image, in ppi;
+ *                          use 0 to respect the res embedded in the input
+ * \param[in]    title      [optional] pdf title; can be null
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is the simplest interface for writing a single image
+ *          with pdf encoding to memory.  It uses G4 encoding for 1 bpp,
+ *          and makes a guess whether to use JPEG or FLATE encoding for
+ *          everything else.
+ *      (2) The encoding is chosen by selectDefaultPdfEncoding(); if
+ *          that fails, no pdf is generated and 1 is returned.
+ *      (3) The quality passed to the encoder is fixed at 75, and the
+ *          image is placed at (0, 0) as a single image (no lpd).
+ *      (4) Whichever of *pdata and *pnbytes is available is reset
+ *          (to NULL and 0) before the arguments are validated.
+ * </pre>
+ */
+l_ok
+pixWriteMemPdf(l_uint8    **pdata,
+               size_t      *pnbytes,
+               PIX         *pix,
+               l_int32      res,
+               const char  *title)
+{
+l_int32  ret;
+l_int32  type = L_DEFAULT_ENCODE;
+
+    if (pdata) *pdata = NULL;
+    if (pnbytes) *pnbytes = 0;
+    if (!pdata || !pnbytes)
+        return ERROR_INT("&data or &nbytes not defined", __func__, 1);
+    if (!pix)
+        return ERROR_INT("pix not defined", __func__, 1);
+
+        /* Do not encode with a type that was never selected */
+    if (selectDefaultPdfEncoding(pix, &type) != 0)
+        return ERROR_INT("encoding type selection failed", __func__, 1);
+
+    ret = pixConvertToPdfData(pix, type, 75, pdata, pnbytes,
+                              0, 0, res, title, NULL, 0);
+    if (ret)
+        return ERROR_INT("pdf data not made", __func__, 1);
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *            Segmented multi-page, multi-image converter              *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   convertSegmentedFilesToPdf()
+ *
+ * \param[in]    dirname       directory name containing images
+ * \param[in]    substr        [optional] substring filter on filenames;
+ *                             can be null
+ * \param[in]    res           input resolution of all images
+ * \param[in]    type          compression type for non-image regions; the
+ *                             image regions are always compressed with
+ *                             L_JPEG_ENCODE
+ * \param[in]    thresh        used for converting gray --> 1 bpp with
+ *                             L_G4_ENCODE
+ * \param[in]    baa           [optional] boxaa of image regions; it is
+ *                             extended in place if it has fewer boxa
+ *                             than there are pages
+ * \param[in]    quality       used for JPEG only; 0 for default (75)
+ * \param[in]    scalefactor   scaling factor applied to each image region
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) If %substr is not NULL, only image filenames that contain
+ *          the substring can be used.  If %substr == NULL, all files
+ *          in the directory are used.
+ *      (2) The files in the directory, after optional filtering by
+ *          the substring, are lexically sorted in increasing order
+ *          before concatenation.
+ *      (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
+ *          colormap and many colors, or 32 bpp; FLATE for anything else.
+ *      (4) The boxaa, if it exists, contains one boxa of "image regions"
+ *          for each image file.  The boxa must be aligned with the
+ *          sorted set of images.
+ *      (5) The scalefactor is applied to each image region.  It is
+ *          typically < 1.0, to save bytes in the final pdf, because
+ *          the resolution is often not critical in non-text regions.
+ *      (6) If the non-image regions have pixel depth > 1 and the encoding
+ *          type is G4, they are automatically scaled up by 2x and
+ *          thresholded.  Otherwise, no scaling is performed on them.
+ *      (7) Note that this function can be used to generate multipage
+ *          G4 compressed pdf from any input, by using %boxaa == NULL
+ *          and %type == L_G4_ENCODE.
+ *      (8) Pathnames that are empty strings are skipped, as are files
+ *          for which the pdf encoding fails.  The function fails only
+ *          if no page was encoded, or if the concatenation or the
+ *          final write fails.
+ * </pre>
+ */
+l_ok
+convertSegmentedFilesToPdf(const char  *dirname,
+                           const char  *substr,
+                           l_int32      res,
+                           l_int32      type,
+                           l_int32      thresh,
+                           BOXAA       *baa,
+                           l_int32      quality,
+                           l_float32    scalefactor,
+                           const char  *title,
+                           const char  *fileout)
+{
+char     *fname;
+l_uint8  *imdata;
+l_uint8  *data = NULL;
+l_int32   i, npages, nboxa, nboxes, ret;
+size_t    imbytes;
+size_t    databytes = 0;
+BOXA     *boxa;
+L_BYTEA  *ba;
+L_PTRA   *pa_data;
+SARRAY   *sa;
+
+    if (!dirname)
+        return ERROR_INT("dirname not defined", __func__, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+    if ((sa = getNumberedPathnamesInDirectory(dirname, substr, 0, 0, 10000))
+            == NULL)
+        return ERROR_INT("sa not made", __func__, 1);
+
+    npages = sarrayGetCount(sa);
+        /* If necessary, extend the boxaa, which is page-aligned with
+         * the image files, to be as large as the set of images. */
+    if (baa) {
+        nboxa = boxaaGetCount(baa);
+        if (nboxa < npages) {
+            boxa = boxaCreate(1);
+            boxaaExtendWithInit(baa, npages, boxa);
+            boxaDestroy(&boxa);
+        }
+    }
+
+        /* Generate and save all the encoded pdf strings */
+    if ((pa_data = ptraCreate(npages)) == NULL) {
+        sarrayDestroy(&sa);
+        return ERROR_INT("pa_data not made", __func__, 1);
+    }
+    for (i = 0; i < npages; i++) {
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        if (!strcmp(fname, "")) continue;
+        boxa = NULL;
+        if (baa) {
+                /* An empty boxa is passed on as NULL (no image regions) */
+            boxa = boxaaGetBoxa(baa, i, L_CLONE);
+            nboxes = boxaGetCount(boxa);
+            if (nboxes == 0)
+                boxaDestroy(&boxa);
+        }
+
+            /* Start each page from a known state */
+        imdata = NULL;
+        imbytes = 0;
+        ret = convertToPdfDataSegmented(fname, res, type, thresh, boxa,
+                                        quality, scalefactor, title,
+                                        &imdata, &imbytes);
+        boxaDestroy(&boxa);  /* safe; in case nboxes > 0 */
+        if (ret) {
+            L_ERROR("pdf encoding failed for %s\n", __func__, fname);
+            continue;
+        }
+
+            /* The bytes are copied into a byte array for the ptra;
+             * do not store a null item if that copy fails. */
+        ba = l_byteaInitFromMem(imdata, imbytes);
+        if (imdata) LEPT_FREE(imdata);
+        if (!ba) {
+            L_ERROR("ba not made for %s\n", __func__, fname);
+            continue;
+        }
+        ptraAdd(pa_data, ba);
+    }
+    sarrayDestroy(&sa);
+
+    ptraGetActualCount(pa_data, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", __func__);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate */
+    ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes);
+
+        /* Clean up */
+    ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
+    for (i = 0; i < npages; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+
+    if (ret) {
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", __func__, 1);
+    }
+
+    ret = l_binaryWrite(fileout, "w", data, databytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", __func__);
+    return ret;
+}
+
+
+/*!
+ * \brief   convertNumberedMasksToBoxaa()
+ *
+ * \param[in]    dirname   directory name containing mask images
+ * \param[in]    substr    [optional] substring filter on filenames;
+ *                         can be null
+ * \param[in]    numpre    number of characters in name before number
+ * \param[in]    numpost   number of characters in name after number,
+ *                         up to a dot before an extension
+ * \return  boxaa of mask regions, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is conveniently used to generate the input boxaa
+ *          for convertSegmentedFilesToPdf().  It guarantees that the
+ *          boxa will be aligned with the page images, even if some
+ *          of the boxa are empty.
+ *      (2) The boxaa is first filled with one empty boxa per pathname.
+ *          For each page whose mask can be read, that placeholder is
+ *          replaced by the boxa returned by pixConnComp().  Pages with
+ *          an empty pathname or an unreadable mask keep the empty boxa.
+ * </pre>
+ */
+BOXAA *
+convertNumberedMasksToBoxaa(const char  *dirname,
+                            const char  *substr,
+                            l_int32      numpre,
+                            l_int32      numpost)
+{
+char    *path;
+l_int32  page, npages;
+BOXA    *regions;
+BOXAA   *result;
+PIX     *mask;
+SARRAY  *paths;
+
+    if (dirname == NULL)
+        return (BOXAA *)ERROR_PTR("dirname not defined", __func__, NULL);
+
+    paths = getNumberedPathnamesInDirectory(dirname, substr, numpre,
+                                            numpost, 10000);
+    if (paths == NULL)
+        return (BOXAA *)ERROR_PTR("sa not made", __func__, NULL);
+
+        /* Start with an empty boxa in every page slot */
+    npages = sarrayGetCount(paths);
+    result = boxaaCreate(npages);
+    regions = boxaCreate(1);
+    boxaaInitFull(result, regions);
+    boxaDestroy(&regions);
+
+        /* Replace the placeholder wherever a mask image is available */
+    for (page = 0; page < npages; page++) {
+        path = sarrayGetString(paths, page, L_NOCOPY);
+        if (strcmp(path, "") == 0)
+            continue;  /* no file for this page number */
+        mask = pixRead(path);
+        if (mask == NULL) {
+            L_WARNING("invalid image on page %d\n", __func__, page);
+            continue;
+        }
+            /* NOTE(review): the 8 is presumably the connectivity
+             * used for the connected components -- confirm. */
+        regions = pixConnComp(mask, NULL, 8);
+        boxaaReplaceBoxa(result, page, regions);
+        pixDestroy(&mask);
+    }
+
+    sarrayDestroy(&paths);
+    return result;
+}
+
+
+/*---------------------------------------------------------------------*
+ *            Segmented single page, multi-image converters            *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   convertToPdfSegmented()
+ *
+ * \param[in]    filein        input image file; any supported format
+ * \param[in]    res           resolution of the input image, typ. 300 ppi;
+ *                             use 0 for default
+ * \param[in]    type          compression used for the non-image regions;
+ *                             image regions always use L_JPEG_ENCODE
+ * \param[in]    thresh        threshold for gray --> 1 bpp conversion
+ *                             with L_G4_ENCODE
+ * \param[in]    boxa          [optional] image regions; can be null
+ * \param[in]    quality       jpeg quality for image regions; 0 for default
+ * \param[in]    scalefactor   scaling of jpeg regions; must be <= 1.0
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       output pdf file
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) When the page has no image regions, pass %boxa == NULL;
+ *          %quality and %scalefactor are then ignored.
+ *      (2) %scalefactor is typically < 1.0, because image regions can
+ *          be rendered at a lower resolution than the text regions,
+ *          which compresses better.  If %scalefactor == 0, we use 1.0.
+ *          For 1 bpp input with scalefactor < 1.0, the image regions
+ *          are downsampled to gray with scaleToGray() before they
+ *          are compressed.
+ *      (3) %thresh is used in exactly one situation: the non-image
+ *          regions are to be compressed with L_G4_ENCODE and bpp > 1.
+ *          The image is then upscaled 2x and thresholded to 1 bpp.
+ *      (4) %quality applies to the image regions only.  With
+ *          %type == L_JPEG_ENCODE, the non-image regions use the
+ *          default jpeg quality (75).
+ *      (5) Processing matrix for non-image regions.
+ *
+ *          Input           G4              JPEG                FLATE
+ *          ----------|---------------------------------------------------
+ *          1 bpp     |  1x, 1 bpp       1x flate, 1 bpp     1x, 1 bpp
+ *                    |
+ *          cmap      |  2x, 1 bpp       1x flate, cmap      1x, cmap
+ *                    |
+ *          2,4 bpp   |  2x, 1 bpp       1x flate            1x, 2,4 bpp
+ *          no cmap   |                  2,4 bpp
+ *                    |
+ *          8,32 bpp  |  2x, 1 bpp       1x (jpeg)           1x, 8,32 bpp
+ *          no cmap   |                  8,32 bpp
+ *
+ *          Summary:
+ *          (a) G4 requested: G4 is used, with 2x upscaling in every
+ *              case except 1 bpp.
+ *          (b) JPEG requested: flate encoding is used in every case
+ *              except 8 bpp without cmap and 32 bpp (rgb).
+ *          (c) FLATE requested: flate is used and the raster data
+ *              is not transformed.
+ *      (6) Calling options/sequence for these functions:
+ *              file  -->  file      (convertToPdfSegmented)
+ *                  pix  -->  file      (pixConvertToPdfSegmented)
+ *                      pix  -->  data      (pixConvertToPdfDataSegmented)
+ *              file  -->  data      (convertToPdfDataSegmented)
+ *                      pix  -->  data      (pixConvertToPdfDataSegmented)
+ * </pre>
+ */
+l_ok
+convertToPdfSegmented(const char  *filein,
+                      l_int32      res,
+                      l_int32      type,
+                      l_int32      thresh,
+                      BOXA        *boxa,
+                      l_int32      quality,
+                      l_float32    scalefactor,
+                      const char  *title,
+                      const char  *fileout)
+{
+l_int32  status;
+PIX     *piximg;
+
+        /* Validate the arguments before touching the file system */
+    if (filein == NULL)
+        return ERROR_INT("filein not defined", __func__, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", __func__, 1);
+    if (boxa != NULL && scalefactor > 1.0) {
+        L_WARNING("setting scalefactor to 1.0\n", __func__);
+        scalefactor = 1.0;
+    }
+
+        /* Read the page image and hand it to the pix --> file converter */
+    piximg = pixRead(filein);
+    if (piximg == NULL)
+        return ERROR_INT("pixs not made", __func__, 1);
+
+    status = pixConvertToPdfSegmented(piximg, res, type, thresh, boxa,
+                                      quality, scalefactor, title, fileout);
+    pixDestroy(&piximg);
+    return status;
+}
+
+
+/*!
+ * \brief   pixConvertToPdfSegmented()
+ *
+ * \param[in]    pixs          any depth, cmap OK
+ * \param[in]    res           resolution of the input image, typ. 300 ppi;
+ *                             use 0 for default
+ * \param[in]    type          compression used for the non-image regions;
+ *                             image regions always use L_JPEG_ENCODE
+ * \param[in]    thresh        threshold for gray --> 1 bpp conversion
+ *                             with L_G4_ENCODE
+ * \param[in]    boxa          [optional] image regions; can be null
+ * \param[in]    quality       jpeg quality for image regions; 0 for default
+ * \param[in]    scalefactor   scaling of jpeg regions; must be <= 1.0
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       output pdf file
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The pdf is first generated in memory with
+ *          pixConvertToPdfDataSegmented() and then written to %fileout.
+ *      (2) See convertToPdfSegmented() for details.
+ * </pre>
+ */
+l_ok
+pixConvertToPdfSegmented(PIX         *pixs,
+                         l_int32      res,
+                         l_int32      type,
+                         l_int32      thresh,
+                         BOXA        *boxa,
+                         l_int32      quality,
+                         l_float32    scalefactor,
+                         const char  *title,
+                         const char  *fileout)
+{
+l_uint8  *pdfbytes;
+l_int32   status;
+size_t    pdfsize;
+
+    if (pixs == NULL)
+        return ERROR_INT("pixs not defined", __func__, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", __func__, 1);
+    if (boxa != NULL && scalefactor > 1.0) {
+        L_WARNING("setting scalefactor to 1.0\n", __func__);
+        scalefactor = 1.0;
+    }
+
+        /* Build the pdf in memory */
+    if (pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, quality,
+                                     scalefactor, title,
+                                     &pdfbytes, &pdfsize) != 0)
+        return ERROR_INT("pdf generation failure", __func__, 1);
+
+        /* Write it out and release the buffer */
+    status = l_binaryWrite(fileout, "w", pdfbytes, pdfsize);
+    if (pdfbytes != NULL)
+        LEPT_FREE(pdfbytes);
+    return status;
+}
+
+
+/*!
+ * \brief   convertToPdfDataSegmented()
+ *
+ * \param[in]    filein        input image file; any supported format
+ * \param[in]    res           resolution of the input image, typ. 300 ppi;
+ *                             use 0 for default
+ * \param[in]    type          compression used for the non-image regions;
+ *                             image regions always use L_JPEG_ENCODE
+ * \param[in]    thresh        threshold for gray --> 1 bpp conversion
+ *                             with L_G4_ENCODE
+ * \param[in]    boxa          [optional] image regions; can be null
+ * \param[in]    quality       jpeg quality for image regions; 0 for default
+ * \param[in]    scalefactor   scaling of jpeg regions; must be <= 1.0
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[out]   pdata         pdf data in memory
+ * \param[out]   pnbytes       number of bytes in pdf data
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) When the page has no image regions, pass %boxa == NULL;
+ *          %quality and %scalefactor are then ignored.
+ *      (2) Typically, %scalefactor is < 1.0.  The image regions can be
+ *          rendered at a lower resolution (for better compression)
+ *          than the text regions.
+ *      (3) %pdata and %pnbytes are reset to NULL and 0 before any
+ *          other argument is examined.
+ * </pre>
+ */
+l_ok
+convertToPdfDataSegmented(const char  *filein,
+                          l_int32      res,
+                          l_int32      type,
+                          l_int32      thresh,
+                          BOXA        *boxa,
+                          l_int32      quality,
+                          l_float32    scalefactor,
+                          const char  *title,
+                          l_uint8    **pdata,
+                          size_t      *pnbytes)
+{
+l_int32  status;
+PIX     *piximg;
+
+        /* Reset each output as soon as its pointer is known to be valid */
+    if (pdata == NULL)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (pnbytes == NULL)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (filein == NULL)
+        return ERROR_INT("filein not defined", __func__, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", __func__, 1);
+    if (boxa != NULL && scalefactor > 1.0) {
+        L_WARNING("setting scalefactor to 1.0\n", __func__);
+        scalefactor = 1.0;
+    }
+
+        /* Read the page image and convert it in memory */
+    piximg = pixRead(filein);
+    if (piximg == NULL)
+        return ERROR_INT("pixs not made", __func__, 1);
+
+    status = pixConvertToPdfDataSegmented(piximg, res, type, thresh, boxa,
+                                          quality, scalefactor, title,
+                                          pdata, pnbytes);
+    pixDestroy(&piximg);
+    return status;
+}
+
+
+/*!
+ * \brief   pixConvertToPdfDataSegmented()
+ *
+ * \param[in]    pixs          any depth, cmap OK
+ * \param[in]    res           input image resolution; typ. 300 ppi;
+ *                             use 0 for default
+ * \param[in]    type          compression type for non-image regions; image
+ *                             regions are always compressed with L_JPEG_ENCODE
+ * \param[in]    thresh        for converting gray --> 1 bpp with L_G4_ENCODE
+ * \param[in]    boxa          [optional] of image regions; can be null
+ * \param[in]    quality       used for jpeg image regions; 0 for default
+ * \param[in]    scalefactor   used for jpeg regions; must be <= 1.0
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[out]   pdata         pdf data in memory
+ * \param[out]   pnbytes       number of bytes in pdf data
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) See convertToPdfSegmented() for details.
+ *      (2) If %boxa is null or empty, the whole page is encoded as a
+ *          single image.  Otherwise the non-image part and the image
+ *          regions are encoded separately and combined into one page.
+ *      (3) When %boxa is given, a %scalefactor outside (0.0, 1.0] is
+ *          replaced by 1.0, with a warning.
+ *      (4) An error is returned if the pdf data cannot be generated;
+ *          %pdata is not expected to hold usable data in that case.
+ * </pre>
+ */
+l_ok
+pixConvertToPdfDataSegmented(PIX         *pixs,
+                             l_int32      res,
+                             l_int32      type,
+                             l_int32      thresh,
+                             BOXA        *boxa,
+                             l_int32      quality,
+                             l_float32    scalefactor,
+                             const char  *title,
+                             l_uint8    **pdata,
+                             size_t      *pnbytes)
+{
+l_int32      i, nbox, seq, bx, by, bw, bh, upscale, ret;
+l_float32    scale;
+BOX         *box, *boxc, *box2;
+PIX         *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6;
+PIXCMAP     *cmap;
+L_PDF_DATA  *lpd = NULL;  /* accumulator shared by the multi-image calls */
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (!pixs)
+        return ERROR_INT("pixs not defined", __func__, 1);
+    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
+        type != L_FLATE_ENCODE)
+        return ERROR_INT("invalid conversion type", __func__, 1);
+    if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) {
+        L_WARNING("setting scalefactor to 1.0\n", __func__);
+        scalefactor = 1.0;
+    }
+
+        /* Adjust scalefactor so that the product with res gives an integer */
+    if (res <= 0)
+        res = DefaultInputRes;
+    scale = (l_float32)((l_int32)(scalefactor * res + 0.5)) / (l_float32)res;
+    cmap = pixGetColormap(pixs);
+
+        /* Simple case: single image to be encoded */
+    if (!boxa || boxaGetCount(boxa) == 0) {
+        if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) {
+                /* G4 needs 1 bpp: go to 8 bpp gray, then upscale 2x
+                 * and threshold; hence the doubled resolution below. */
+            if (cmap)
+                pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
+            else
+                pixt1 = pixConvertTo8(pixs, FALSE);
+            pixt2 = pixScaleGray2xLIThresh(pixt1, thresh);
+            ret = pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes,
+                                      0, 0, 2 * res, title, NULL, 0);
+            pixDestroy(&pixt1);
+            pixDestroy(&pixt2);
+        } else {
+            ret = pixConvertToPdfData(pixs, type, quality, pdata, pnbytes,
+                                      0, 0, res, title, NULL, 0);
+        }
+        if (ret)
+            return ERROR_INT("pdf data not made", __func__, 1);
+        return 0;
+    }
+
+        /* Multiple images to be encoded.  If %type == L_G4_ENCODE,
+         * jpeg encode a version of pixs that is blanked in the non-image
+         * regions, and paint the scaled non-image part onto it through a mask.
+         * Otherwise, we must put the non-image part down first and
+         * then render all the image regions separately on top of it,
+         * at their own resolution. */
+    pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE);  /* non-image */
+    if (!pixt1)
+        return ERROR_INT("pixt1 not made", __func__, 1);
+    nbox = boxaGetCount(boxa);
+    if (type == L_G4_ENCODE) {
+        pixt2 = pixCreateTemplate(pixs);  /* only image regions */
+        pixSetBlackOrWhite(pixt2, L_SET_WHITE);
+        for (i = 0; i < nbox; i++) {
+             box = boxaGetBox(boxa, i, L_CLONE);
+             boxc = NULL;
+             pix = pixClipRectangle(pixs, box, &boxc);
+             if (pix) {  /* nothing to paint if the clip failed */
+                 boxGetGeometry(boxc, &bx, &by, &bw, &bh);
+                 pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0);
+             }
+             pixDestroy(&pix);
+             boxDestroy(&box);
+             boxDestroy(&boxc);
+        }
+        pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
+        if (pixGetDepth(pixt3) == 1)
+            pixt4 = pixScaleToGray(pixt3, scale);
+        else
+            pixt4 = pixScale(pixt3, scale, scale);
+        ret = pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata,
+                                  pnbytes, 0, 0, (l_int32)(scale * res),
+                                  title, &lpd, L_FIRST_IMAGE);
+
+            /* Only add the G4 layer if the first image was accepted.
+             * NOTE(review): assumes a failed L_FIRST_IMAGE call does not
+             * leave a usable lpd behind -- confirm in pixConvertToPdfData(). */
+        pixt5 = NULL;
+        if (!ret) {
+            if (pixGetDepth(pixt1) == 1) {
+                pixt5 = pixClone(pixt1);
+                upscale = 1;
+            } else {
+                pixt6 = pixConvertTo8(pixt1, 0);
+                pixt5 = pixScaleGray2xLIThresh(pixt6, thresh);
+                pixDestroy(&pixt6);
+                upscale = 2;
+            }
+            ret = pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata,
+                                      pnbytes, 0, 0, upscale * res, title,
+                                      &lpd, L_LAST_IMAGE);
+        }
+        pixDestroy(&pixt2);
+        pixDestroy(&pixt3);
+        pixDestroy(&pixt4);
+        pixDestroy(&pixt5);
+    } else {
+            /* Put the non-image part down first.  This is the full
+               size of the page, so we can use it to find the page
+               height in pixels, which is required for determining
+               the LL corner of the image relative to the LL corner
+               of the page. */
+        ret = pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes, 0, 0,
+                                  res, title, &lpd, L_FIRST_IMAGE);
+        if (ret) {  /* do not continue with an lpd that was never set up */
+            pixDestroy(&pixt1);
+            return ERROR_INT("pdf data not made", __func__, 1);
+        }
+        for (i = 0; i < nbox; i++) {
+            box = boxaGetBox(boxa, i, L_CLONE);
+            pixt2 = pixClipRectangle(pixs, box, &boxc);
+            pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
+            if (pixGetDepth(pixt3) == 1)
+                pixt4 = pixScaleToGray(pixt3, scale);
+            else
+                pixt4 = pixScale(pixt3, scale, scale);
+            box2 = boxTransform(boxc, 0, 0, scale, scale);
+            boxGetGeometry(box2, &bx, &by, NULL, &bh);
+            seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE;
+
+                /* After the loop, ret holds the status of the
+                 * L_LAST_IMAGE call, which decides the return value. */
+            ret = pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata,
+                                      pnbytes, bx, by, (l_int32)(scale * res),
+                                      title, &lpd, seq);
+            if (ret && i < nbox - 1)
+                L_WARNING("image region %d not added\n", __func__, i);
+            pixDestroy(&pixt2);
+            pixDestroy(&pixt3);
+            pixDestroy(&pixt4);
+            boxDestroy(&box);
+            boxDestroy(&boxc);
+            boxDestroy(&box2);
+        }
+    }
+
+    pixDestroy(&pixt1);
+    if (ret)
+        return ERROR_INT("pdf data not made", __func__, 1);
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *                         Multi-page concatenation                    *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   concatenatePdf()
+ *
+ * \param[in]    dirname   directory holding the single-page pdf files
+ * \param[in]    substr    [optional] substring filter on filenames;
+ *                         can be null
+ * \param[in]    fileout   concatenated pdf file
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) With a non-NULL %substr, only filenames containing that
+ *          substring are used.  With %substr == NULL, no filenames
+ *          are filtered out.
+ *      (3) After the optional substring filtering, the files in the
+ *          directory are lexically sorted in increasing order and
+ *          concatenated in that order.
+ * </pre>
+ */
+l_ok
+concatenatePdf(const char  *dirname,
+               const char  *substr,
+               const char  *fileout)
+{
+l_int32  status;
+SARRAY  *sapaths;
+
+    if (dirname == NULL)
+        return ERROR_INT("dirname not defined", __func__, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+        /* Collect the sorted pathnames, then concatenate them */
+    sapaths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (sapaths == NULL)
+        return ERROR_INT("sa not made", __func__, 1);
+
+    status = saConcatenatePdf(sapaths, fileout);
+    sarrayDestroy(&sapaths);
+    return status;
+}
+
+
+/*!
+ * \brief   saConcatenatePdf()
+ *
+ * \param[in]    sa        string array of pathnames for single-page pdf files
+ * \param[in]    fileout   concatenated pdf file
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) The concatenated pdf is built in memory with
+ *          saConcatenatePdfToData() and then written to %fileout.
+ * </pre>
+ */
+l_ok
+saConcatenatePdf(SARRAY      *sa,
+                 const char  *fileout)
+{
+l_uint8  *pdfbytes;
+l_int32   status;
+size_t    pdfsize;
+
+    if (sa == NULL)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+    if (saConcatenatePdfToData(sa, &pdfbytes, &pdfsize) != 0)
+        return ERROR_INT("pdf data not made", __func__, 1);
+
+        /* Write the result and release the in-memory copy */
+    status = l_binaryWrite(fileout, "w", pdfbytes, pdfsize);
+    LEPT_FREE(pdfbytes);
+    return status;
+}
+
+
+/*!
+ * \brief   ptraConcatenatePdf()
+ *
+ * \param[in]    pa       array of pdf strings, each for a single-page pdf file
+ * \param[in]    fileout  concatenated pdf file
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) The concatenated pdf is built in memory with
+ *          ptraConcatenatePdfToData(), called without a filename
+ *          array, and then written to %fileout.
+ * </pre>
+ */
+l_ok
+ptraConcatenatePdf(L_PTRA      *pa,
+                   const char  *fileout)
+{
+l_uint8  *pdfbytes;
+l_int32   status;
+size_t    pdfsize;
+
+    if (pa == NULL)
+        return ERROR_INT("pa not defined", __func__, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", __func__, 1);
+
+    if (ptraConcatenatePdfToData(pa, NULL, &pdfbytes, &pdfsize) != 0)
+        return ERROR_INT("pdf data not made", __func__, 1);
+
+        /* Write the result and release the in-memory copy */
+    status = l_binaryWrite(fileout, "w", pdfbytes, pdfsize);
+    LEPT_FREE(pdfbytes);
+    return status;
+}
+
+
+/*!
+ * \brief   concatenatePdfToData()
+ *
+ * \param[in]    dirname   directory holding the single-page pdf files
+ * \param[in]    substr    [optional] substring filter on filenames;
+ *                         can be null
+ * \param[out]   pdata     concatenated pdf data in memory
+ * \param[out]   pnbytes   number of bytes in pdf data
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) With a non-NULL %substr, only filenames containing that
+ *          substring are used.  With %substr == NULL, no filenames
+ *          are filtered out.
+ *      (3) After the optional substring filtering, the files in the
+ *          directory are lexically sorted in increasing order and
+ *          concatenated in that order.
+ * </pre>
+ */
+l_ok
+concatenatePdfToData(const char  *dirname,
+                     const char  *substr,
+                     l_uint8    **pdata,
+                     size_t      *pnbytes)
+{
+l_int32  status;
+SARRAY  *sapaths;
+
+        /* Reset each output as soon as its pointer is known to be valid */
+    if (pdata == NULL)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (pnbytes == NULL)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (dirname == NULL)
+        return ERROR_INT("dirname not defined", __func__, 1);
+
+    sapaths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (sapaths == NULL)
+        return ERROR_INT("sa not made", __func__, 1);
+
+    status = saConcatenatePdfToData(sapaths, pdata, pnbytes);
+    sarrayDestroy(&sapaths);
+    return status;
+}
+
+
+/*!
+ * \brief   saConcatenatePdfToData()
+ *
+ * \param[in]    sa        string array of pathnames for single-page pdf files
+ * \param[out]   pdata     concatenated pdf data in memory
+ * \param[out]   pnbytes   number of bytes in pdf data
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) A file that cannot be read into memory is skipped with a
+ *          warning; the remaining files are still concatenated.
+ * </pre>
+ */
+l_ok
+saConcatenatePdfToData(SARRAY    *sa,
+                       l_uint8  **pdata,
+                       size_t    *pnbytes)
+{
+char     *fname;
+l_int32   i, npages, maxindex, ret;
+L_BYTEA  *bas;
+L_PTRA   *pa_data;  /* input pdf data for each page */
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", __func__, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", __func__, 1);
+    *pnbytes = 0;
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+
+        /* Read the pdf files into memory */
+    if ((npages = sarrayGetCount(sa)) == 0)
+        return ERROR_INT("no filenames found", __func__, 1);
+    if ((pa_data = ptraCreate(npages)) == NULL)
+        return ERROR_INT("pa_data not made", __func__, 1);
+    for (i = 0; i < npages; i++) {
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        if ((bas = l_byteaInitFromFile(fname)) == NULL) {
+                /* Name the file here rather than hand NULL to ptraAdd() */
+            L_WARNING("can't read file %s; skipping\n", __func__, fname);
+            continue;
+        }
+        ptraAdd(pa_data, bas);
+    }
+
+    ret = ptraConcatenatePdfToData(pa_data, sa, pdata, pnbytes);
+
+        /* Cleanup: some pages could have been removed.  Walk up to the
+         * largest occupied index, not the item count, so that nothing
+         * is left behind if the array has empty slots. */
+    maxindex = -1;
+    ptraGetMaxIndex(pa_data, &maxindex);
+    for (i = 0; i <= maxindex; i++) {
+        bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&bas);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+    return ret;
+}
+
+/* --------------------------------------------*/
+#endif  /* USE_PDFIO */
+/* --------------------------------------------*/
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children