Mercurial > hgrepos > Python2 > PyMuPDF

/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -
 -  Redistribution and use in source and binary forms, with or without
 -  modification, are permitted provided that the following conditions
 -  are met:
 -  1. Redistributions of source code must retain the above copyright
 -     notice, this list of conditions and the following disclaimer.
 -  2. Redistributions in binary form must reproduce the above
 -     copyright notice, this list of conditions and the following
 -     disclaimer in the documentation and/or other materials
 -     provided with the distribution.
 -
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *====================================================================*/

/*!
 * \file pdfio2.c
 * <pre>
 *
 *    Lower-level operations for generating pdf.
 *
 *     Intermediate function for single page, multi-image conversion
 *          l_int32              pixConvertToPdfData()
 *
 *     Intermediate function for generating multipage pdf output
 *          l_int32              ptraConcatenatePdfToData()
 *
 *     Convert tiff multipage to pdf file
 *          l_int32              convertTiffMultipageToPdf()
 *
 *     Generates the CID, transcoding under some conditions
 *          l_int32              l_generateCIDataForPdf()
 *          l_int32              l_generateCIData()
 *
 *       Lower-level CID generation without transcoding
 *          L_COMP_DATA         *l_generateFlateDataPdf()
 *          L_COMP_DATA         *l_generateJpegData()
 *          L_COMP_DATA         *l_generateJpegDataMem()
 *          static L_COMP_DATA  *l_generateJp2kData()
 *          L_COMP_DATA         *l_generateG4Data()
 *
 *       Lower-level CID generation with transcoding
 *          l_int32              pixGenerateCIData()
 *          L_COMP_DATA         *l_generateFlateData()
 *          static L_COMP_DATA  *pixGenerateFlateData()
 *          static L_COMP_DATA  *pixGenerateJpegData()
 *          static L_COMP_DATA  *pixGenerateJp2kData()
 *          static L_COMP_DATA  *pixGenerateG4Data()
 *
 *       Other CID operations
 *          l_int32              cidConvertToPdfData()
 *          void                 l_CIDataDestroy()
 *
 *     Helper functions for generating the output pdf string
 *          static l_int32       l_generatePdf()
 *          static void          generateFixedStringsPdf()
 *          static char         *generateEscapeString()
 *          static void          generateMediaboxPdf()
 *          static l_int32       generatePageStringPdf()
 *          static l_int32       generateContentStringPdf()
 *          static l_int32       generatePreXStringsPdf()
 *          static l_int32       generateColormapStringsPdf()
 *          static void          generateTrailerPdf()
 *          static l_int32       makeTrailerStringPdf()
 *          static l_int32       generateOutputDataPdf()
 *
 *     Helper functions for generating multipage pdf output
 *          static l_int32       parseTrailerPdf()
 *          static char         *generatePagesObjStringPdf()
 *          static L_BYTEA      *substituteObjectNumbers()
 *
 *     Create/destroy/access pdf data
 *          static L_PDF_DATA   *pdfdataCreate()
 *          static void          pdfdataDestroy()
 *          static L_COMP_DATA  *pdfdataGetCid()
 *
 *     Find number of pages in a pdf
 *          l_int32              getPdfPageCount()
 *
 *     Find widths and heights of pages and media boxes in a pdf
 *          l_int32              getPdfPageSizes()
 *          l_int32              getPdfMediaBoxSizes()
 *
 *     Find effective resolution of images rendered from a pdf
 *          l_int32              getPdfRendererResolution()
 *
 *     Set flags for special modes
 *          void                 l_pdfSetG4ImageMask()
 *          void                 l_pdfSetDateAndVersion()
 *
 * </pre>
 */

#ifdef HAVE_CONFIG_H
#include <config_auto.h>
#endif  /* HAVE_CONFIG_H */

#include <string.h>
#include <math.h>
#include "allheaders.h"

/* --------------------------------------------*/
#if  USE_PDFIO   /* defined in environ.h */
 /* --------------------------------------------*/

    /* Typical scan resolution in ppi (pixels/inch) */
static const l_int32  DefaultInputRes = 300;

    /* Static helpers */
static L_COMP_DATA  *l_generateJp2kData(const char *fname);
static L_COMP_DATA  *pixGenerateFlateData(PIX *pixs, l_int32 ascii85flag);
static L_COMP_DATA  *pixGenerateJpegData(PIX *pixs, l_int32 ascii85flag,
                                         l_int32 quality);
static L_COMP_DATA  *pixGenerateJp2kData(PIX *pixs, l_int32 quality);
static L_COMP_DATA  *pixGenerateG4Data(PIX *pixs, l_int32 ascii85flag);

static l_int32       l_generatePdf(l_uint8 **pdata, size_t *pnbytes,
                                   L_PDF_DATA  *lpd);
static void          generateFixedStringsPdf(L_PDF_DATA *lpd);
static char         *generateEscapeString(const char  *str);
static void          generateMediaboxPdf(L_PDF_DATA *lpd);
static l_int32       generatePageStringPdf(L_PDF_DATA *lpd);
static l_int32       generateContentStringPdf(L_PDF_DATA *lpd);
static l_int32       generatePreXStringsPdf(L_PDF_DATA *lpd);
static l_int32       generateColormapStringsPdf(L_PDF_DATA *lpd);
static void          generateTrailerPdf(L_PDF_DATA *lpd);
static char         *makeTrailerStringPdf(L_DNA *daloc);
static l_int32       generateOutputDataPdf(l_uint8 **pdata, size_t *pnbytes,
                                       L_PDF_DATA *lpd);

static l_int32       parseTrailerPdf(L_BYTEA *bas, L_DNA **pda);
static char         *generatePagesObjStringPdf(NUMA *napage);
static L_BYTEA      *substituteObjectNumbers(L_BYTEA *bas, NUMA *na_objs);

static L_PDF_DATA   *pdfdataCreate(const char *title);
static void          pdfdataDestroy(L_PDF_DATA **plpd);
static L_COMP_DATA  *pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index);


/* ---------------- Defaults for rendering options ----------------- */
    /* Output G4 as writing through image mask; this is the default */
static l_int32   var_WRITE_G4_IMAGE_MASK = 1;
    /* Write date/time and lib version into pdf; this is the default */
static l_int32   var_WRITE_DATE_AND_VERSION = 1;

#define L_SMALLBUF   256
#define L_BIGBUF    2048   /* must be able to hold hex colormap */


#ifndef  NO_CONSOLE_IO
#define  DEBUG_MULTIPAGE      0
#endif  /* ~NO_CONSOLE_IO */


/*---------------------------------------------------------------------*
 *       Intermediate function for generating multipage pdf output     *
 *---------------------------------------------------------------------*/
/*!
 * \brief   pixConvertToPdfData()
 *
 * \param[in]      pix       all depths; cmap OK
 * \param[in]      type      L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
 *                           L_JP2K_ENCODE
 * \param[in]      quality   for jpeg: 1-100; 0 for default (75)
 *                           for jp2k: 27-45; 0 for default (34)
 * \param[out]     pdata     pdf array
 * \param[out]     pnbytes   number of bytes in pdf array
 * \param[in]      x, y      location of lower-left corner of image, in pixels,
 *                           relative to the PostScript origin (0,0) at
 *                           the lower-left corner of the page)
 * \param[in]      res       override the resolution of the input image, in ppi;
 *                           use 0 to respect resolution embedded in the input
 * \param[in]      title     [optional] pdf title; can be null
 * \param[in,out]  plpd      ptr to lpd; created on the first invocation and
 *                           returned until last image is processed
 * \param[in]      position  in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
 *                           L_LAST_IMAGE
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If %res == 0 and the input resolution field from the pix is 0,
 *          this will use DefaultInputRes.
 *      (2) This only writes %data if it is the last image to be
 *          written on the page.
 *      (3) See comments in convertToPdf().
 * </pre>
 */
l_ok
pixConvertToPdfData(PIX          *pix,
                    l_int32       type,
                    l_int32       quality,
                    l_uint8     **pdata,
                    size_t       *pnbytes,
                    l_int32       x,
                    l_int32       y,
                    l_int32       res,
                    const char   *title,
                    L_PDF_DATA  **plpd,
                    l_int32       position)
{
l_int32       pixres, w, h, ret;
l_float32     xpt, ypt, wpt, hpt;
L_COMP_DATA  *cid = NULL;
L_PDF_DATA   *lpd = NULL;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!pix)
        return ERROR_INT("pix not defined", __func__, 1);
    if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
        selectDefaultPdfEncoding(pix, &type);
    }
    if (quality < 0 || quality > 100)
        return ERROR_INT("invalid quality", __func__, 1);

    if (plpd) {  /* part of multi-page invocation */
        if (position == L_FIRST_IMAGE)
            *plpd = NULL;
    }

        /* Generate the compressed image data.  It must NOT
         * be ascii85 encoded. */
    pixGenerateCIData(pix, type, quality, 0, &cid);
    if (!cid)
        return ERROR_INT("cid not made", __func__, 1);

        /* Get media box in pts.  Guess the input image resolution
         * based on the input parameter %res, the resolution data in
         * the pix, and the size of the image. */
    pixres = cid->res;
    w = cid->w;
    h = cid->h;
    if (res <= 0.0)
        res = (pixres > 0) ? pixres : DefaultInputRes;
    xpt = x * 72.f / res;
    ypt = y * 72.f / res;
    wpt = w * 72.f / res;
    hpt = h * 72.f / res;

        /* Set up lpd */
    if (!plpd) {  /* single image */
        if ((lpd = pdfdataCreate(title)) == NULL)
            return ERROR_INT("lpd not made", __func__, 1);
    } else if (position == L_FIRST_IMAGE) {  /* first of multiple images */
        if ((lpd = pdfdataCreate(title)) == NULL)
            return ERROR_INT("lpd not made", __func__, 1);
        *plpd = lpd;
    } else {  /* not the first of multiple images */
        lpd = *plpd;
    }

        /* Add the data to the lpd */
    ptraAdd(lpd->cida, cid);
    lpd->n++;
    ptaAddPt(lpd->xy, xpt, ypt);
    ptaAddPt(lpd->wh, wpt, hpt);

        /* If a single image or the last of multiple images,
         * generate the pdf and destroy the lpd */
    if (!plpd || (position == L_LAST_IMAGE)) {
        ret = l_generatePdf(pdata, pnbytes, lpd);
        pdfdataDestroy(&lpd);
        if (plpd) *plpd = NULL;
        if (ret)
            return ERROR_INT("pdf output not made", __func__, 1);
    }

    return 0;
}


/*---------------------------------------------------------------------*
 *      Intermediate function for generating multipage pdf output      *
 *---------------------------------------------------------------------*/
/*!
 * \brief   ptraConcatenatePdfToData()
 *
 * \param[in]    pa_data    ptra array of pdf strings, each for a
 *                          single-page pdf file
 * \param[in]    sa         [optional] string array of pathnames for
 *                          input pdf files; can be null
 * \param[out]   pdata      concatenated pdf data in memory
 * \param[out]   pnbytes    number of bytes in pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This only works with leptonica-formatted single-page pdf files.
 *          pdf files generated by other programs will have unpredictable
 *          (and usually bad) results.  The requirements for each pdf file:
 *            (a) The Catalog and Info objects are the first two.
 *            (b) Object 3 is Pages
 *            (c) Object 4 is Page
 *            (d) The remaining objects are Contents, XObjects, and ColorSpace
 *      (2) We remove trailers from each page, and append the full trailer
 *          for all pages at the end.
 *      (3) For all but the first file, remove the ID and the first 3
 *          objects (catalog, info, pages), so that each subsequent
 *          file has only objects of these classes:
 *              Page, Contents, XObject, ColorSpace (Indexed RGB).
 *          For those objects, we substitute these refs to objects
 *          in the local file:
 *              Page:  Parent(object 3), Contents, XObject(typically multiple)
 *              XObject:  [ColorSpace if indexed]
 *          The Pages object on the first page (object 3) has a Kids array
 *          of references to all the Page objects, with a Count equal
 *          to the number of pages.  Each Page object refers back to
 *          this parent.
 * </pre>
 */
l_ok
ptraConcatenatePdfToData(L_PTRA    *pa_data,
                         SARRAY    *sa,
                         l_uint8  **pdata,
                         size_t    *pnbytes)
{
char     *fname, *str_pages, *str_trailer;
l_uint8  *pdfdata, *data;
l_int32   i, j, index, nobj, npages;
l_int32  *sizes, *locs;
size_t    size;
L_BYTEA  *bas, *bad, *bat1, *bat2;
L_DNA    *da_locs, *da_sizes, *da_outlocs, *da;
L_DNAA   *daa_locs;  /* object locations on each page */
NUMA     *na_objs, *napage;
NUMAA    *naa_objs;  /* object mapping numbers to new values */

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!pa_data)
        return ERROR_INT("pa_data not defined", __func__, 1);

        /* Parse the files and find the object locations.
         * Remove file data that cannot be parsed. */
    ptraGetActualCount(pa_data, &npages);
    daa_locs = l_dnaaCreate(npages);
    for (i = 0; i < npages; i++) {
        bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
        if (parseTrailerPdf(bas, &da_locs) != 0) {
            bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
            l_byteaDestroy(&bas);
            if (sa) {
                fname = sarrayGetString(sa, i, L_NOCOPY);
                L_ERROR("can't parse file %s; skipping\n", __func__, fname);
            } else {
                L_ERROR("can't parse file %d; skipping\n", __func__, i);
            }
        } else {
            l_dnaaAddDna(daa_locs, da_locs, L_INSERT);
        }
    }

        /* Recompute npages in case some of the files were not pdf */
    ptraCompactArray(pa_data);
    ptraGetActualCount(pa_data, &npages);
    if (npages == 0) {
        l_dnaaDestroy(&daa_locs);
        return ERROR_INT("no parsable pdf files found", __func__, 1);
    }

        /* Find the mapping from initial to final object numbers */
    naa_objs = numaaCreate(npages);  /* stores final object numbers */
    napage = numaCreate(npages);  /* stores "Page" object numbers */
    index = 0;
    for (i = 0; i < npages; i++) {
        da = l_dnaaGetDna(daa_locs, i, L_CLONE);
        nobj = l_dnaGetCount(da);
        if (i == 0) {
            numaAddNumber(napage, 4);  /* object 4 on first page */
            na_objs = numaMakeSequence(0.0, 1.0, nobj - 1);
            index = nobj - 1;
        } else {  /* skip the first 3 objects in each file */
            numaAddNumber(napage, index);  /* Page object is first we add */
            na_objs = numaMakeConstant(0.0, nobj - 1);
            numaReplaceNumber(na_objs, 3, 3);  /* refers to parent of all */
            for (j = 4; j < nobj - 1; j++)
                numaSetValue(na_objs, j, index++);
        }
        numaaAddNuma(naa_objs, na_objs, L_INSERT);
        l_dnaDestroy(&da);
    }

        /* Make the Pages object (#3) */
    str_pages = generatePagesObjStringPdf(napage);

        /* Build the output */
    bad = l_byteaCreate(5000);
    da_outlocs = l_dnaCreate(0);  /* locations of all output objects */
    for (i = 0; i < npages; i++) {
        bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
        pdfdata = l_byteaGetData(bas, &size);
        da_locs = l_dnaaGetDna(daa_locs, i, L_CLONE);  /* locs on this page */
        na_objs = numaaGetNuma(naa_objs, i, L_CLONE);  /* obj # on this page */
        nobj = l_dnaGetCount(da_locs) - 1;
        da_sizes = l_dnaDiffAdjValues(da_locs);  /* object sizes on this page */
        sizes = l_dnaGetIArray(da_sizes);
        locs = l_dnaGetIArray(da_locs);
        if (i == 0) {
            l_byteaAppendData(bad, pdfdata, sizes[0]);
            l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]);
            l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]);
            l_byteaAppendString(bad, str_pages);
            for (j = 0; j < 4; j++)
                l_dnaAddNumber(da_outlocs, locs[j]);
        }
        for (j = 4; j < nobj; j++) {
            l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
            bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]);
            bat2 = substituteObjectNumbers(bat1, na_objs);
            data = l_byteaGetData(bat2, &size);
            l_byteaAppendData(bad, data, size);
            l_byteaDestroy(&bat1);
            l_byteaDestroy(&bat2);
        }
        if (i == npages - 1)  /* last one */
            l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
        LEPT_FREE(sizes);
        LEPT_FREE(locs);
        l_dnaDestroy(&da_locs);
        numaDestroy(&na_objs);
        l_dnaDestroy(&da_sizes);
    }

        /* Add the trailer */
    str_trailer = makeTrailerStringPdf(da_outlocs);
    l_byteaAppendString(bad, str_trailer);

        /* Transfer the output data */
    *pdata = l_byteaCopyData(bad, pnbytes);
    l_byteaDestroy(&bad);

#if  DEBUG_MULTIPAGE
    lept_stderr("******** object mapper **********");
    numaaWriteStream(stderr, naa_objs);

    lept_stderr("******** Page object numbers ***********");
    numaWriteStderr(napage);

    lept_stderr("******** Pages object ***********\n");
    lept_stderr("%s\n", str_pages);
#endif  /* DEBUG_MULTIPAGE */

    numaDestroy(&napage);
    numaaDestroy(&naa_objs);
    l_dnaDestroy(&da_outlocs);
    l_dnaaDestroy(&daa_locs);
    LEPT_FREE(str_pages);
    LEPT_FREE(str_trailer);
    return 0;
}


/*---------------------------------------------------------------------*
 *                  Convert tiff multipage to pdf file                 *
 *---------------------------------------------------------------------*/
/*!
 * \brief   convertTiffMultipageToPdf()
 *
 * \param[in]    filein    (tiff)
 * \param[in]    fileout   (pdf)
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) A multipage tiff file can also be converted to PS, using
 *          convertTiffMultipageToPS()
 * </pre>
 */
l_ok
convertTiffMultipageToPdf(const char  *filein,
                          const char  *fileout)
{
l_int32  istiff;
PIXA    *pixa;
FILE    *fp;

    if ((fp = fopenReadStream(filein)) == NULL)
        return ERROR_INT_1("file not found", filein, __func__, 1);
    istiff = fileFormatIsTiff(fp);
    fclose(fp);
    if (!istiff)
        return ERROR_INT_1("file not tiff format", filein, __func__, 1);

    pixa = pixaReadMultipageTiff(filein);
    pixaConvertToPdf(pixa, 0, 1.0, 0, 0, "weasel2", fileout);
    pixaDestroy(&pixa);
    return 0;
}


/*---------------------------------------------------------------------*
 *                          CID-based operations                       *
 *---------------------------------------------------------------------*/
/*!
 * \brief   l_generateCIDataForPdf()
 *
 * \param[in]    fname      [optional] can be null
 * \param[in]    pix        [optional] can be null
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
 * \param[out]   pcid       compressed data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) You must set either filename or pix.
 *      (2) Given an image file and optionally a pix raster of that data,
 *          this provides a CID that is compatible with PDF, preferably
 *          without transcoding.
 *      (3) The pix is included for efficiency, in case transcoding
 *          is required and the pix is available to the caller.
 *      (4) We don't try to open files named "stdin" or "-" for Tesseract
 *          compatibility reasons. We may remove this restriction
 *          in the future.
 *      (5) Note that tiff-g4 must be transcoded to properly handle byte
 *          order and perhaps photometry (e.g., min-is-black).  For a
 *          multipage tiff file, data will only be extracted from the
 *          first page, so this should not be invoked.
 * </pre>
 */
l_ok
l_generateCIDataForPdf(const char    *fname,
                       PIX           *pix,
                       l_int32        quality,
                       L_COMP_DATA  **pcid)
{
l_int32       format, type;
L_COMP_DATA  *cid;
PIX          *pixt;

    if (!pcid)
        return ERROR_INT("&cid not defined", __func__, 1);
    *pcid = cid = NULL;
    if (!fname && !pix)
        return ERROR_INT("neither fname nor pix are defined", __func__, 1);

        /* If a compressed file is given that is not 'stdin', see if we
         * can generate the pdf output without transcoding. */
    if (fname && strcmp(fname, "-") != 0 && strcmp(fname, "stdin") != 0) {
        findFileFormat(fname, &format);
        if (format == IFF_UNKNOWN)
            L_WARNING("file %s format is unknown\n", __func__, fname);
        if (format == IFF_PS || format == IFF_LPDF) {
            L_ERROR("file %s is unsupported format %d\n",
                  __func__, fname, format);
            return 1;
        }
        if (format == IFF_JFIF_JPEG) {
            cid = l_generateJpegData(fname, 0);
        } else if (format == IFF_JP2) {
            cid = l_generateJp2kData(fname);
        } else if (format == IFF_PNG) {
            cid = l_generateFlateDataPdf(fname, pix);
        }
    }

        /* Otherwise, use the pix to generate the pdf output */
    if  (!cid) {
        if (!pix)
            pixt = pixRead(fname);
        else
            pixt = pixClone(pix);
        if (!pixt)
            return ERROR_INT("pixt not made", __func__, 1);
        if (selectDefaultPdfEncoding(pixt, &type)) {
            pixDestroy(&pixt);
            return 1;
        }
        pixGenerateCIData(pixt, type, quality, 0, &cid);
        pixDestroy(&pixt);
        if (!cid)
            return ERROR_INT("cid not made from pix", __func__, 1);
    }
    *pcid = cid;
    return 0;
}


/*!
 * \brief   l_generateCIData()
 *
 * \param[in]    fname
 * \param[in]    type       L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
 *                          L_JP2K_ENCODE
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
 * \param[in]    ascii85    0 for binary; 1 for ascii85-encoded
 * \param[out]   pcid       compressed data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This can be used for both PostScript and pdf.
 *      (1) Set ascii85:
 *           ~ 0 for binary data (PDF only)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
 *      (2) This attempts to compress according to the requested type.
 *          If this can't be done, it falls back to ordinary flate encoding.
 *      (3) This differs from l_generateCIDataForPdf(), which determines
 *          the file format and only works for pdf.
 * </pre>
 */
l_ok
l_generateCIData(const char    *fname,
                 l_int32        type,
                 l_int32        quality,
                 l_int32        ascii85,
                 L_COMP_DATA  **pcid)
{
l_int32       format, d, bps, spp, iscmap;
L_COMP_DATA  *cid;
PIX          *pix;

    if (!pcid)
        return ERROR_INT("&cid not defined", __func__, 1);
    *pcid = NULL;
    if (!fname)
        return ERROR_INT("fname not defined", __func__, 1);
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE)
        return ERROR_INT("invalid conversion type", __func__, 1);
    if (ascii85 != 0 && ascii85 != 1)
        return ERROR_INT("invalid ascii85", __func__, 1);

        /* Sanity check on requested encoding */
    pixReadHeader(fname, &format, NULL, NULL, &bps, &spp, &iscmap);
    d = bps * spp;
    if (d == 24) d = 32;
    if (iscmap && type != L_FLATE_ENCODE) {
        L_WARNING("pixs has cmap; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    } else if (d < 8 && type == L_JPEG_ENCODE) {
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    } else if (d < 8 && type == L_JP2K_ENCODE) {
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    } else if (d > 1 && type == L_G4_ENCODE) {
        L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    }

    if (type == L_JPEG_ENCODE) {
        if (format == IFF_JFIF_JPEG) {  /* do not transcode */
            cid = l_generateJpegData(fname, ascii85);
        } else {
            if ((pix = pixRead(fname)) == NULL)
                return ERROR_INT("pix not returned for JPEG", __func__, 1);
            cid = pixGenerateJpegData(pix, ascii85, quality);
            pixDestroy(&pix);
        }
        if (!cid)
            return ERROR_INT("jpeg data not made", __func__, 1);
    } else if (type == L_JP2K_ENCODE) {
        if (format == IFF_JP2) {  /* do not transcode */
            cid = l_generateJp2kData(fname);
        } else {
            if ((pix = pixRead(fname)) == NULL)
                return ERROR_INT("pix not returned for JP2K", __func__, 1);
            cid = pixGenerateJp2kData(pix, quality);
            pixDestroy(&pix);
        }
        if (!cid)
            return ERROR_INT("jp2k data not made", __func__, 1);
    } else if (type == L_G4_ENCODE) {
        if ((pix = pixRead(fname)) == NULL)
            return ERROR_INT("pix not returned for G4", __func__, 1);
        cid = pixGenerateG4Data(pix, ascii85);
        pixDestroy(&pix);
        if (!cid)
            return ERROR_INT("g4 data not made", __func__, 1);
    } else if (type == L_FLATE_ENCODE) {
        if ((cid = l_generateFlateData(fname, ascii85)) == NULL)
            return ERROR_INT("flate data not made", __func__, 1);
    } else {
        return ERROR_INT("invalid conversion type", __func__, 1);
    }
    *pcid = cid;

    return 0;
}


/*---------------------------------------------------------------------*
 *                     Low-level CID-based operations                  *
 *---------------------------------------------------------------------*/
/*!
 * \brief   l_generateFlateDataPdf()
 *
 * \param[in]    fname     preferably png
 * \param[in]    pixs      [optional] can be null
 * \return  cid containing png data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) If you hand this a png file, you are going to get
 *          png predictors embedded in the flate data. So it has
 *          come to this. http://xkcd.com/1022/
 *      (2) Exception: if the png is interlaced or if it is RGBA,
 *          it will be transcoded.
 *      (3) If transcoding is required, this will not have to read from
 *          file if a pix is input.
 * </pre>
 */
L_COMP_DATA *
l_generateFlateDataPdf(const char  *fname,
                       PIX         *pixs)
{
l_uint8      *pngcomp = NULL;  /* entire PNG compressed file */
l_uint8      *datacomp = NULL;  /* gzipped raster data */
l_uint8      *cmapdata = NULL;  /* uncompressed colormap */
char         *cmapdatahex = NULL;  /* hex ascii uncompressed colormap */
l_uint32      i, j, n;
l_int32       format, interlaced;
l_int32       ncolors;  /* in colormap */
l_int32       bps;  /* bits/sample: usually 8 */
l_int32       spp;  /* samples/pixel: 1-grayscale/cmap); 3-rgb; 4-rgba */
l_int32       w, h, cmapflag;
l_int32       xres, yres;
size_t        nbytescomp = 0, nbytespng = 0;
FILE         *fp;
L_COMP_DATA  *cid;
PIX          *pix;
PIXCMAP      *cmap = NULL;

    if (!fname)
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);

    findFileFormat(fname, &format);
    spp = 0;  /* init to spp != 4 if not png */
    interlaced = 0;  /* initialize to no interlacing */
    bps = 0;  /* initialize to a nonsense value */
    if (format == IFF_PNG) {
        isPngInterlaced(fname, &interlaced);
        if (readHeaderPng(fname, NULL, NULL, &bps, &spp, NULL))
            return (L_COMP_DATA *)ERROR_PTR("bad png input", __func__, NULL);
    }

        /* PDF is capable of inlining some types of PNG files, but not all
           of them. We need to transcode anything with interlacing, an
           alpha channel, or 1 bpp (which would otherwise be photo-inverted).

           Note: any PNG image file with an alpha channel is converted on
           reading to RGBA (spp == 4). This includes the (gray + alpha) format
           with spp == 2.  Because of the conversion, readHeaderPng() gives
           spp = 2, whereas pixGetSpp() gives spp = 4 on the converted pix. */
    if (format != IFF_PNG ||
       (format == IFF_PNG && (interlaced || bps == 1 || spp == 4 || spp == 2)))
    {  /* lgtm+ analyzer needed the logic expanded */
        if (!pixs)
            pix = pixRead(fname);
        else
            pix = pixClone(pixs);
        if (!pix)
            return (L_COMP_DATA *)ERROR_PTR("pix not made", __func__, NULL);
        cid = pixGenerateFlateData(pix, 0);
        pixDestroy(&pix);
        return cid;
    }

        /* It's png.  Generate the pdf data without transcoding.
         * Implementation by Jeff Breidenbach.
         * First, read the metadata */
    if ((fp = fopenReadStream(fname)) == NULL)
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
                                          fname, __func__, NULL);
    freadHeaderPng(fp, &w, &h, &bps, &spp, &cmapflag);
    fgetPngResolution(fp, &xres, &yres);
    fclose(fp);

        /* We get pdf corruption when inlining the data from 16 bpp png. */
    if (bps == 16)
        return l_generateFlateData(fname, 0);

        /* Read the entire png file */
    if ((pngcomp = l_binaryRead(fname, &nbytespng)) == NULL)
        return (L_COMP_DATA *)ERROR_PTR_1("unable to read file",
                                          fname, __func__, NULL);

        /* Extract flate data, copying portions of it to memory, including
         * the predictor information in a byte at the beginning of each
         * raster line.  The flate data makes up the vast majority of
         * the png file, so after extraction we expect datacomp to
         * be nearly full (i.e., nbytescomp will be only slightly less
         * than nbytespng).  Also extract the colormap if present. */
    if ((datacomp = (l_uint8 *)LEPT_CALLOC(1, nbytespng)) == NULL) {
        LEPT_FREE(pngcomp);
        return (L_COMP_DATA *)ERROR_PTR("unable to allocate memory",
                                        __func__, NULL);
    }

        /* Parse the png file.  Each chunk consists of:
         *    length: 4 bytes
         *    name:   4 bytes (e.g., "IDAT")
         *    data:   n bytes
         *    CRC:    4 bytes
         * Start at the beginning of the data section of the first chunk,
         * byte 16, because the png file begins with 8 bytes of header,
         * followed by the first 8 bytes of the first chunk
         * (length and name).  On each loop, increment by 12 bytes to
         * skip over the CRC, length and name of the next chunk. */
    for (i = 16; i < nbytespng; i += 12) {  /* do each successive chunk */
            /* Get the chunk length */
        n  = pngcomp[i - 8] << 24;
        n += pngcomp[i - 7] << 16;
        n += pngcomp[i - 6] << 8;
        n += pngcomp[i - 5] << 0;
        if (n >= nbytespng - i) {  /* "n + i" can overflow */
            LEPT_FREE(pngcomp);
            LEPT_FREE(datacomp);
            pixcmapDestroy(&cmap);
            L_ERROR("invalid png: i = %d, n = %d, nbytes = %zu\n", __func__,
                    i, n, nbytespng);
            return NULL;
        }

            /* Is it a data chunk? */
        if (memcmp(pngcomp + i - 4, "IDAT", 4) == 0) {
            memcpy(datacomp + nbytescomp, pngcomp + i, n);
            nbytescomp += n;
        }

            /* Is it a palette chunk? */
        if (cmapflag && !cmap &&
            memcmp(pngcomp + i - 4, "PLTE", 4) == 0) {
            if ((n / 3) > (1 << bps)) {
                LEPT_FREE(pngcomp);
                LEPT_FREE(datacomp);
                pixcmapDestroy(&cmap);
                L_ERROR("invalid png: i = %d, n = %d, cmapsize = %d\n",
                        __func__, i, n, (1 << bps));
                return NULL;
            }
            cmap = pixcmapCreate(bps);
            for (j = i; j < i + n; j += 3) {
                pixcmapAddColor(cmap, pngcomp[j], pngcomp[j + 1],
                                pngcomp[j + 2]);
            }
        }
        i += n;  /* move to the end of the data chunk */
    }
    LEPT_FREE(pngcomp);

    if (nbytescomp == 0) {
        LEPT_FREE(datacomp);
        pixcmapDestroy(&cmap);
        return (L_COMP_DATA *)ERROR_PTR("invalid PNG file", __func__, NULL);
    }

        /* Extract and encode the colormap data as hexascii  */
    ncolors = 0;
    if (cmap) {
        pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
        pixcmapDestroy(&cmap);
        if (!cmapdata) {
            LEPT_FREE(datacomp);
            return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
                                            __func__, NULL);
        }
        cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
        LEPT_FREE(cmapdata);
    }

        /* Note that this is the only situation where the predictor
         * field of the CID is set to 1.  Adobe's predictor values on
         * p. 76 of pdf_reference_1-7.pdf give 1 for no predictor and
         * 10-14 for inline predictors, the specifics of which are
         * ignored by the pdf interpreter, which just needs to know that
         * the first byte on each compressed scanline is some predictor
         * whose type can be inferred from the byte itself.  */
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    cid->datacomp = datacomp;
    cid->type = L_FLATE_ENCODE;
    cid->cmapdatahex = cmapdatahex;
    cid->nbytescomp = nbytescomp;
    cid->ncolors = ncolors;
    cid->predictor = TRUE;
    cid->w = w;
    cid->h = h;
    cid->bps = bps;
    cid->spp = spp;
    cid->res = xres;
    return cid;
}


/*!
 * \brief   l_generateJpegData()
 *
 * \param[in]    fname           of jpeg file
 * \param[in]    ascii85flag     0 for jpeg; 1 for ascii85-encoded jpeg
 * \return  cid containing jpeg data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Set ascii85flag:
 *           ~ 0 for binary data (PDF only)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
 *      (2) Most of this function is repeated in l_generateJpegMemData(),
 *          which is required in pixacompFastConvertToPdfData().
 * </pre>
 */
L_COMP_DATA *
l_generateJpegData(const char  *fname,
                   l_int32      ascii85flag)
{
char         *data85 = NULL;  /* ascii85 encoded jpeg compressed file */
l_uint8      *data = NULL;
l_int32       w, h, xres, yres, bps, spp;
size_t        nbytes, nbytes85;
L_COMP_DATA  *cid;
FILE         *fp;

    if (!fname)
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);

    if (ascii85flag != 0 && ascii85flag != 1)
        return (L_COMP_DATA *)ERROR_PTR("wrong ascii85flags", __func__, NULL);

        /* Read the metadata */
    if (readHeaderJpeg(fname, &w, &h, &spp, NULL, NULL))
        return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL);
    bps = 8;
    if ((fp = fopenReadStream(fname)) == NULL)
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
                                          fname, __func__, NULL);
    fgetJpegResolution(fp, &xres, &yres);
    fclose(fp);

        /* Read the entire jpeg file.  The returned jpeg data in memory
         * starts with ffd8 and ends with ffd9 */
    if ((data = l_binaryRead(fname, &nbytes)) == NULL)
        return (L_COMP_DATA *)ERROR_PTR_1("data not extracted",
                                          fname, __func__, NULL);

        /* Optionally, encode the compressed data */
    if (ascii85flag == 1) {
        data85 = encodeAscii85(data, nbytes, &nbytes85);
        LEPT_FREE(data);
        if (!data85)
            return (L_COMP_DATA *)ERROR_PTR_1("data85 not made",
                                              fname, __func__, NULL);
        else
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
    }

    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if (ascii85flag == 0) {
        cid->datacomp = data;
    } else {  /* ascii85 */
        cid->data85 = data85;
        cid->nbytes85 = nbytes85;
    }
    cid->type = L_JPEG_ENCODE;
    cid->nbytescomp = nbytes;
    cid->w = w;
    cid->h = h;
    cid->bps = bps;
    cid->spp = spp;
    cid->res = xres;
    return cid;
}


/*!
 * \brief   l_generateJpegDataMem()
 *
 * \param[in]    data           of jpeg-encoded file
 * \param[in]    nbytes         size of jpeg-encoded file
 * \param[in]    ascii85flag    0 for jpeg; 1 for ascii85-encoded jpeg
 * \return  cid containing jpeg data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Set ascii85flag:
 *           ~ 0 for binary data (PDF only)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
 * </pre>
 */
L_COMP_DATA *
l_generateJpegDataMem(l_uint8  *data,
                      size_t    nbytes,
                      l_int32   ascii85flag)
{
char         *data85 = NULL;  /* ascii85 encoded jpeg compressed file */
l_int32       w, h, xres, yres, bps, spp;
size_t        nbytes85;
L_COMP_DATA  *cid;

    if (!data)
        return (L_COMP_DATA *)ERROR_PTR("data not defined", __func__, NULL);

        /* Read the metadata */
    if (readHeaderMemJpeg(data, nbytes, &w, &h, &spp, NULL, NULL)) {
        LEPT_FREE(data);
        return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL);
    }
    bps = 8;
    readResolutionMemJpeg(data, nbytes, &xres, &yres);

        /* Optionally, encode the compressed data */
    if (ascii85flag == 1) {
        data85 = encodeAscii85(data, nbytes, &nbytes85);
        LEPT_FREE(data);
        if (!data85)
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
        else
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
    }

    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if (ascii85flag == 0) {
        cid->datacomp = data;
    } else {  /* ascii85 */
        cid->data85 = data85;
        cid->nbytes85 = nbytes85;
    }
    cid->type = L_JPEG_ENCODE;
    cid->nbytescomp = nbytes;
    cid->w = w;
    cid->h = h;
    cid->bps = bps;
    cid->spp = spp;
    cid->res = xres;
    return cid;
}


/*!
 * \brief   l_generateJp2kData()
 *
 * \param[in]    fname     of jp2k file
 * \return  cid containing jp2k data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is only called after the file is verified to be jp2k.
 * </pre>
 */
static L_COMP_DATA *
l_generateJp2kData(const char  *fname)
{
l_int32       w, h, bps, spp, xres, yres;
size_t        nbytes;
L_COMP_DATA  *cid;
FILE         *fp;

    if (!fname)
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);

    if (readHeaderJp2k(fname, &w, &h, &bps, &spp, NULL))
        return (L_COMP_DATA *)ERROR_PTR("bad jp2k metadata", __func__, NULL);

        /* The returned jp2k data in memory is the entire jp2k file */
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if ((cid->datacomp = l_binaryRead(fname, &nbytes)) == NULL) {
        l_CIDataDestroy(&cid);
        return (L_COMP_DATA *)ERROR_PTR("data not extracted", __func__, NULL);
    }

    xres = yres = 0;
    if ((fp = fopenReadStream(fname)) != NULL) {
        fgetJp2kResolution(fp, &xres, &yres);
        fclose(fp);
    }
    cid->type = L_JP2K_ENCODE;
    cid->nbytescomp = nbytes;
    cid->w = w;
    cid->h = h;
    cid->bps = bps;
    cid->spp = spp;
    cid->res = xres;
    return cid;
}


/*!
 * \brief   l_generateG4Data()
 *
 * \param[in]    fname          of g4 compressed file
 * \param[in]    ascii85flag    0 for g4 compressed; 1 for ascii85-encoded g4
 * \return  cid g4 compressed image data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Set ascii85flag:
 *           ~ 0 for binary data (PDF only)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
 *      (2) This does not work for multipage tiff files.
 * </pre>
 */
L_COMP_DATA *
l_generateG4Data(const char  *fname,
                 l_int32      ascii85flag)
{
l_uint8      *datacomp = NULL;  /* g4 compressed raster data */
char         *data85 = NULL;  /* ascii85 encoded g4 compressed data */
l_int32       w, h, xres, yres, npages;
l_int32       minisblack;  /* TRUE or FALSE */
size_t        nbytes85, nbytescomp;
L_COMP_DATA  *cid;
FILE         *fp;

    if (!fname)
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);

        /* Make sure this is a single page tiff file */
    if ((fp = fopenReadStream(fname)) == NULL)
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
                                          fname, __func__, NULL);
    tiffGetCount(fp, &npages);
    fclose(fp);
    if (npages != 1) {
        L_ERROR(" %d page tiff; only works with 1 page (file: %s)\n", __func__, npages, fname);
        return NULL;
    }

        /* Read the resolution */
    if ((fp = fopenReadStream(fname)) == NULL)
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
                                          fname, __func__, NULL);
    getTiffResolution(fp, &xres, &yres);
    fclose(fp);

        /* The returned ccitt g4 data in memory is the block of
         * bytes in the tiff file, starting after 8 bytes and
         * ending before the directory. */
    if (extractG4DataFromFile(fname, &datacomp, &nbytescomp,
                              &w, &h, &minisblack)) {
        return (L_COMP_DATA *)ERROR_PTR_1("datacomp not extracted",
                                          fname, __func__, NULL);
    }

        /* Optionally, encode the compressed data */
    if (ascii85flag == 1) {
        data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
        LEPT_FREE(datacomp);
        if (!data85)
            return (L_COMP_DATA *)ERROR_PTR_1("data85 not made",
                                              fname, __func__, NULL);
        else
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
    }

    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if (ascii85flag == 0) {
        cid->datacomp = datacomp;
    } else {  /* ascii85 */
        cid->data85 = data85;
        cid->nbytes85 = nbytes85;
    }
    cid->type = L_G4_ENCODE;
    cid->nbytescomp = nbytescomp;
    cid->w = w;
    cid->h = h;
    cid->bps = 1;
    cid->spp = 1;
    cid->minisblack = minisblack;
    cid->res = xres;
    return cid;
}


/*!
 * \brief   pixGenerateCIData()
 *
 * \param[in]    pixs       8 or 32 bpp, no colormap
 * \param[in]    type       L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE or
 *                          L_JP2K_ENCODE
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
 * \param[in]    ascii85    0 for binary; 1 for ascii85-encoded
 * \param[out]   pcid       compressed data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Set ascii85:
 *           ~ 0 for binary data (PDF only)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
 *      (2) Do not accept images with an asperity ratio greater than 10.
 * </pre>
 */
l_ok
pixGenerateCIData(PIX           *pixs,
                  l_int32        type,
                  l_int32        quality,
                  l_int32        ascii85,
                  L_COMP_DATA  **pcid)
{
l_int32   w, h, d, maxAsp;
PIXCMAP  *cmap;

    if (!pcid)
        return ERROR_INT("&cid not defined", __func__, 1);
    *pcid = NULL;
    if (!pixs)
        return ERROR_INT("pixs not defined", __func__, 1);
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
        selectDefaultPdfEncoding(pixs, &type);
    }
    if (ascii85 != 0 && ascii85 != 1)
        return ERROR_INT("invalid ascii85", __func__, 1);
    pixGetDimensions(pixs, &w, &h, NULL);
    if (w == 0 || h == 0)
        return ERROR_INT("invalid w or h", __func__, 1);
    maxAsp = L_MAX(w / h, h / w);
    if (maxAsp > 10)
        return ERROR_INT("max asperity > 10", __func__, 1);

        /* Conditionally modify the encoding type if libz is
         * available and the requested library is missing. */
#if defined(HAVE_LIBZ)
# if !defined(HAVE_LIBJPEG)
    if (type == L_JPEG_ENCODE) {
        L_WARNING("no libjpeg; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    }
# endif /* !defined(HAVE_LIBJPEG) */
# if !defined(HAVE_LIBJP2K)
    if (type == L_JP2K_ENCODE) {
        L_WARNING("no libjp2k; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    }
# endif /* !defined(HAVE_LIBJP2K) */
# if !defined(HAVE_LIBTIFF)
    if (type == L_G4_ENCODE) {
        L_WARNING("no libtiff; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    }
# endif /* !defined(HAVE_LIBTIFF) */
#endif /* defined(HAVE_LIBZ) */

        /* Sanity check on requested encoding */
    d = pixGetDepth(pixs);
    cmap = pixGetColormap(pixs);
    if (cmap && type != L_FLATE_ENCODE) {
        L_WARNING("pixs has cmap; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    } else if (d < 8 && (type == L_JPEG_ENCODE || type == L_JP2K_ENCODE)) {
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    } else if (d > 1 && type == L_G4_ENCODE) {
        L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__);
        type = L_FLATE_ENCODE;
    }

    if (type == L_JPEG_ENCODE) {
        if ((*pcid = pixGenerateJpegData(pixs, ascii85, quality)) == NULL)
            return ERROR_INT("jpeg data not made", __func__, 1);
    } else if (type == L_JP2K_ENCODE) {
        if ((*pcid = pixGenerateJp2kData(pixs, quality)) == NULL)
            return ERROR_INT("jp2k data not made", __func__, 1);
    } else if (type == L_G4_ENCODE) {
        if ((*pcid = pixGenerateG4Data(pixs, ascii85)) == NULL)
            return ERROR_INT("g4 data not made", __func__, 1);
    } else {  /* type == L_FLATE_ENCODE */
        if ((*pcid = pixGenerateFlateData(pixs, ascii85)) == NULL)
            return ERROR_INT("flate data not made", __func__, 1);
    }
    return 0;
}


/*!
 * \brief   l_generateFlateData()
 *
 * \param[in]    fname
 * \param[in]    ascii85flag    0 for gzipped; 1 for ascii85-encoded gzipped
 * \return  cid flate compressed image data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The input image is converted to one of these 4 types:
 *           ~ 1 bpp
 *           ~ 8 bpp, no colormap
 *           ~ 8 bpp, colormap
 *           ~ 32 bpp rgb
 *      (2) Set ascii85flag:
 *           ~ 0 for binary data (PDF only)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
 *      (3) Always transcodes (i.e., first decodes the png file)
 * </pre>
 */
L_COMP_DATA *
l_generateFlateData(const char  *fname,
                    l_int32      ascii85flag)
{
L_COMP_DATA  *cid;
PIX          *pixs;

    if (!fname)
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);

    if ((pixs = pixRead(fname)) == NULL)
        return (L_COMP_DATA *)ERROR_PTR("pixs not made", __func__, NULL);
    cid = pixGenerateFlateData(pixs, ascii85flag);
    pixDestroy(&pixs);
    return cid;
}


/*!
 * \brief   pixGenerateFlateData()
 *
 * \param[in]    pixs
 * \param[in]    ascii85flag 0    for gzipped; 1 for ascii85-encoded gzipped
 * \return  cid flate compressed image data, or NULL on error
 *
 * <pre>
 * Notes:
 *     (1) If called with an RGBA pix (spp == 4), the alpha channel
 *         will be removed, projecting a white backgrouond through
 *         any transparency.
 *     (2) If called with a colormapped pix, any transparency in the
 *         alpha component in the colormap will be ignored, as it is
 *         for all leptonica operations on colormapped pix.
 * </pre>
 */
static L_COMP_DATA *
pixGenerateFlateData(PIX     *pixs,
                     l_int32  ascii85flag)
{
l_uint8      *data = NULL;  /* uncompressed raster data in required format */
l_uint8      *datacomp = NULL;  /* gzipped raster data */
char         *data85 = NULL;  /* ascii85 encoded gzipped raster data */
l_uint8      *cmapdata = NULL;  /* uncompressed colormap */
char         *cmapdata85 = NULL;  /* ascii85 encoded uncompressed colormap */
char         *cmapdatahex = NULL;  /* hex ascii uncompressed colormap */
l_int32       ncolors;  /* in colormap; not used if cmapdata85 is null */
l_int32       bps;  /* bits/sample: usually 8 */
l_int32       spp;  /* samples/pixel: 1-grayscale/cmap); 3-rgb */
l_int32       w, h, d, cmapflag;
size_t        ncmapbytes85 = 0;
size_t        nbytes85 = 0;
size_t        nbytes, nbytescomp;
L_COMP_DATA  *cid;
PIX          *pixt;
PIXCMAP      *cmap;

    if (!pixs)
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);

        /* Convert the image to one of these 4 types:
         *     1 bpp
         *     8 bpp, no colormap
         *     8 bpp, colormap
         *     32 bpp rgb    */
    pixGetDimensions(pixs, &w, &h, &d);
    cmap = pixGetColormap(pixs);
    cmapflag = (cmap) ? 1 : 0;
    if (d == 2 || d == 4 || d == 16) {
        pixt = pixConvertTo8(pixs, cmapflag);
        cmap = pixGetColormap(pixt);
        d = pixGetDepth(pixt);
    } else if (d == 32 && pixGetSpp(pixs) == 4) {  /* remove alpha */
        pixt = pixAlphaBlendUniform(pixs, 0xffffff00);
    } else {
        pixt = pixClone(pixs);
    }
    if (!pixt)
        return (L_COMP_DATA *)ERROR_PTR("pixt not made", __func__, NULL);
    spp = (d == 32) ? 3 : 1;
    bps = (d == 32) ? 8 : d;

        /* Extract and encode the colormap data as both ascii85 and hexascii  */
    ncolors = 0;
    if (cmap) {
        pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
        if (!cmapdata) {
            pixDestroy(&pixt);
            return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
                                            __func__, NULL);
        }

        cmapdata85 = encodeAscii85(cmapdata, 3 * ncolors, &ncmapbytes85);
        cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
        LEPT_FREE(cmapdata);
    }

        /* Extract and compress the raster data */
    pixGetRasterData(pixt, &data, &nbytes);
    pixDestroy(&pixt);
    if (!data) {
        LEPT_FREE(cmapdata85);
        LEPT_FREE(cmapdatahex);
        return (L_COMP_DATA *)ERROR_PTR("data not returned", __func__, NULL);
    }
    datacomp = zlibCompress(data, nbytes, &nbytescomp);
    LEPT_FREE(data);
    if (!datacomp) {
        LEPT_FREE(cmapdata85);
        LEPT_FREE(cmapdatahex);
        return (L_COMP_DATA *)ERROR_PTR("datacomp not made", __func__, NULL);
    }

        /* Optionally, encode the compressed data */
    if (ascii85flag == 1) {
        data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
        LEPT_FREE(datacomp);
        if (!data85) {
            LEPT_FREE(cmapdata85);
            LEPT_FREE(cmapdatahex);
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
        } else {
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
        }
    }

    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if (ascii85flag == 0) {
        cid->datacomp = datacomp;
    } else {  /* ascii85 */
        cid->data85 = data85;
        cid->nbytes85 = nbytes85;
    }
    cid->type = L_FLATE_ENCODE;
    cid->cmapdatahex = cmapdatahex;
    cid->cmapdata85 = cmapdata85;
    cid->nbytescomp = nbytescomp;
    cid->ncolors = ncolors;
    cid->w = w;
    cid->h = h;
    cid->bps = bps;
    cid->spp = spp;
    cid->res = pixGetXRes(pixs);
    cid->nbytes = nbytes;  /* only for debugging */
    return cid;
}


/*!
 * \brief   pixGenerateJpegData()
 *
 * \param[in]    pixs           8, 16 or 32 bpp, no colormap
 * \param[in]    ascii85flag    0 for jpeg; 1 for ascii85-encoded jpeg
 * \param[in]    quality        0 for default, which is 75
 * \return  cid jpeg compressed data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Set ascii85flag:
 *           ~ 0 for binary data (PDF only)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
 *      (2) If 16 bpp, convert first to 8 bpp, using the MSB
 * </pre>
 */
static L_COMP_DATA *
pixGenerateJpegData(PIX     *pixs,
                    l_int32  ascii85flag,
                    l_int32  quality)
{
l_int32       d;
char         *fname;
L_COMP_DATA  *cid;

    if (!pixs)
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
    if (pixGetColormap(pixs))
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
    d = pixGetDepth(pixs);
    if (d != 8 && d != 16 && d != 32)
        return (L_COMP_DATA *)ERROR_PTR("pixs not 8, 16 or 32 bpp",
                __func__, NULL);

        /* Compress to a temp jpeg file */
    fname = l_makeTempFilename();
    if (pixWriteJpeg(fname, pixs, quality, 0)) {
        LEPT_FREE(fname);
        return NULL;
    }

        /* Generate the data */
    cid = l_generateJpegData(fname, ascii85flag);
    if (lept_rmfile(fname) != 0)
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
    LEPT_FREE(fname);
    return cid;
}


/*!
 * \brief   pixGenerateJp2kData()
 *
 * \param[in]    pixs           8 or 32 bpp, no colormap
 * \param[in]    quality        0 for default, which is 34
 * \return  cid jp2k compressed data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The quality can be set between 27 (very poor) and 45
 *          (nearly perfect).  Use 0 for default (34). Use 100 for lossless,
 *          but this is very expensive and not recommended.
 * </pre>
 */
static L_COMP_DATA *
pixGenerateJp2kData(PIX     *pixs,
                    l_int32  quality)
{
l_int32       d;
char         *fname;
L_COMP_DATA  *cid;

    if (!pixs)
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
    if (pixGetColormap(pixs))
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
    d = pixGetDepth(pixs);
    if (d != 8 && d != 32)
        return (L_COMP_DATA *)ERROR_PTR("pixs not 8 or 32 bpp", __func__, NULL);

        /* Compress to a temp jp2k file */
    fname = l_makeTempFilename();
    if (pixWriteJp2k(fname, pixs, quality, 5, 0, 0)) {
        LEPT_FREE(fname);
        return NULL;
    }

        /* Generate the data */
    cid = l_generateJp2kData(fname);
    if (lept_rmfile(fname) != 0)
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
    LEPT_FREE(fname);
    return cid;
}


/*!
 * \brief   pixGenerateG4Data()
 *
 * \param[in]    pixs           1 bpp, no colormap
 * \param[in]    ascii85flag    0 for gzipped; 1 for ascii85-encoded gzipped
 * \return  cid g4 compressed image data, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Set ascii85flag:
 *           ~ 0 for binary data (PDF only)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
 * </pre>
 */
static L_COMP_DATA *
pixGenerateG4Data(PIX     *pixs,
                  l_int32  ascii85flag)
{
char         *fname;
L_COMP_DATA  *cid;

    if (!pixs)
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
    if (pixGetDepth(pixs) != 1)
        return (L_COMP_DATA *)ERROR_PTR("pixs not 1 bpp", __func__, NULL);
    if (pixGetColormap(pixs))
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);

        /* Compress to a temp tiff g4 file */
    fname = l_makeTempFilename();
    if (pixWrite(fname, pixs, IFF_TIFF_G4)) {
        LEPT_FREE(fname);
        return NULL;
    }

    cid = l_generateG4Data(fname, ascii85flag);
    if (lept_rmfile(fname) != 0)
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
    LEPT_FREE(fname);
    return cid;
}


/*!
 * \brief   cidConvertToPdfData()
 *
 * \param[in]    cid       compressed image data
 * \param[in]    title     [optional] pdf title; can be null
 * \param[out]   pdata     output pdf data for image
 * \param[out]   pnbytes   size of output pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Caller must not destroy the cid.  It is absorbed in the
 *          lpd and destroyed by this function.
 * </pre>
 */
l_ok
cidConvertToPdfData(L_COMP_DATA  *cid,
                    const char   *title,
                    l_uint8     **pdata,
                    size_t       *pnbytes)
{
l_int32      res, ret;
l_float32    wpt, hpt;
L_PDF_DATA  *lpd = NULL;

    if (!pdata || !pnbytes)
        return ERROR_INT("&data and &nbytes not both defined", __func__, 1);
    *pdata = NULL;
    *pnbytes = 0;
    if (!cid)
        return ERROR_INT("cid not defined", __func__, 1);

        /* Get media box parameters, in pts */
    res = cid->res;
    if (res <= 0)
        res = DefaultInputRes;
    wpt = cid->w * 72.f / res;
    hpt = cid->h * 72.f / res;

        /* Set up the pdf data struct (lpd) */
    if ((lpd = pdfdataCreate(title)) == NULL)
        return ERROR_INT("lpd not made", __func__, 1);
    ptraAdd(lpd->cida, cid);
    lpd->n++;
    ptaAddPt(lpd->xy, 0, 0);   /* xpt = ypt = 0 */
    ptaAddPt(lpd->wh, wpt, hpt);

        /* Generate the pdf string and destroy the lpd */
    ret = l_generatePdf(pdata, pnbytes, lpd);
    pdfdataDestroy(&lpd);
    if (ret)
        return ERROR_INT("pdf output not made", __func__, 1);
    return 0;
}


/*!
 * \brief   l_CIDataDestroy()
 *
 * \param[in,out]   pcid     will be set to null before returning
 * \return  void
 */
void
l_CIDataDestroy(L_COMP_DATA  **pcid)
{
L_COMP_DATA  *cid;

    if (pcid == NULL) {
        L_WARNING("ptr address is null!\n", __func__);
        return;
    }
    if ((cid = *pcid) == NULL)
        return;

    if (cid->datacomp) LEPT_FREE(cid->datacomp);
    if (cid->data85) LEPT_FREE(cid->data85);
    if (cid->cmapdata85) LEPT_FREE(cid->cmapdata85);
    if (cid->cmapdatahex) LEPT_FREE(cid->cmapdatahex);
    LEPT_FREE(cid);
    *pcid = NULL;
}


/*---------------------------------------------------------------------*
 *         Helper functions for generating the output pdf string       *
 *---------------------------------------------------------------------*/
/*!
 * \brief   l_generatePdf()
 *
 * \param[out]   pdata     pdf array
 * \param[out]   pnbytes   number of bytes in pdf array
 * \param[in]    lpd       all the required input image data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) On error, no data is returned.
 *      (2) The objects are:
 *            1: Catalog
 *            2: Info
 *            3: Pages
 *            4: Page
 *            5: Contents  (rendering command)
 *            6 to 6+n-1: n XObjects
 *            6+n to 6+n+m-1: m colormaps
 * </pre>
 */
static l_int32
l_generatePdf(l_uint8    **pdata,
              size_t      *pnbytes,
              L_PDF_DATA  *lpd)
{
    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!lpd)
        return ERROR_INT("lpd not defined", __func__, 1);

    generateFixedStringsPdf(lpd);
    generateMediaboxPdf(lpd);
    generatePageStringPdf(lpd);
    generateContentStringPdf(lpd);
    generatePreXStringsPdf(lpd);
    generateColormapStringsPdf(lpd);
    generateTrailerPdf(lpd);
    return generateOutputDataPdf(pdata, pnbytes, lpd);
}


static void
generateFixedStringsPdf(L_PDF_DATA  *lpd)
{
char     buf[L_SMALLBUF];
char    *version, *datestr;
SARRAY  *sa;

        /* Accumulate data for the header and objects 1-3 */
    lpd->id = stringNew("%PDF-1.5\n");
    l_dnaAddNumber(lpd->objsize, strlen(lpd->id));

    lpd->obj1 = stringNew("1 0 obj\n"
                          "<<\n"
                          "/Type /Catalog\n"
                          "/Pages 3 0 R\n"
                          ">>\n"
                          "endobj\n");
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj1));

    sa = sarrayCreate(0);
    sarrayAddString(sa, "2 0 obj\n"
                        "<<\n", L_COPY);
    if (var_WRITE_DATE_AND_VERSION) {
        datestr = l_getFormattedDate();
        snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr);
        sarrayAddString(sa, buf, L_COPY);
        LEPT_FREE(datestr);
        version = getLeptonicaVersion();
        snprintf(buf, sizeof(buf),
                 "/Producer (leptonica: %s)\n", version);
        LEPT_FREE(version);
    } else {
        snprintf(buf, sizeof(buf), "/Producer (leptonica)\n");
    }
    sarrayAddString(sa, buf, L_COPY);
    if (lpd->title) {
        char *hexstr;
        if ((hexstr = generateEscapeString(lpd->title)) != NULL) {
            snprintf(buf, sizeof(buf), "/Title %s\n", hexstr);
            sarrayAddString(sa, buf, L_COPY);
        } else {
            L_ERROR("title string is not ascii\n", __func__);
        }
        LEPT_FREE(hexstr);
    }
    sarrayAddString(sa, ">>\n"
                                "endobj\n", L_COPY);
    lpd->obj2 = sarrayToString(sa, 0);
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj2));
    sarrayDestroy(&sa);

    lpd->obj3 = stringNew("3 0 obj\n"
                          "<<\n"
                          "/Type /Pages\n"
                          "/Kids [ 4 0 R ]\n"
                          "/Count 1\n"
                          ">>\n");
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj3));

        /* Do the post-datastream string */
    lpd->poststream = stringNew("\n"
                                "endstream\n"
                                "endobj\n");
}


/*!
 * \brief   generateEscapeString()
 *
 * \param[in]   str      input string
 * \return   hex escape string, or null on error
 *
 * <pre>
 * Notes:
 *      (1) If the input string is not ascii, returns null.
 *      (2) This takes an input ascii string and generates a hex
 *          ascii output string with 4 bytes out for each byte in.
 *          The feff code at the beginning tells the pdf interpreter
 *          that the data is to be interpreted as big-endian, 4 bytes
 *          at a time.  For ascii, the first two bytes are 0 and the
 *          last two bytes are less than 0x80.
 * </pre>
 */
static char  *
generateEscapeString(const char  *str)
{
char     smallbuf[8];
char    *buffer;
l_int32  i, nchar, buflen;

    if (!str)
        return (char *)ERROR_PTR("str not defined", __func__, NULL);
    nchar = strlen(str);
    for (i = 0; i < nchar; i++) {
        if (str[i] < 0)
            return (char *)ERROR_PTR("str not all ascii", __func__, NULL);
    }

    buflen = 4 * nchar + 10;
    buffer = (char *)LEPT_CALLOC(buflen, sizeof(char));
    stringCat(buffer, buflen, "<feff");
    for (i = 0; i < nchar; i++) {
        snprintf(smallbuf, sizeof(smallbuf), "%04x", str[i]);
        stringCat(buffer, buflen, smallbuf);
    }
    stringCat(buffer, buflen, ">");
    return buffer;
}


static void
generateMediaboxPdf(L_PDF_DATA  *lpd)
{
l_int32    i;
l_float32  xpt, ypt, wpt, hpt, maxx, maxy;

        /* First get the full extent of all the images.
         * This is the mediabox, in pts. */
    maxx = maxy = 0;
    for (i = 0; i < lpd->n; i++) {
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
        maxx = L_MAX(maxx, xpt + wpt);
        maxy = L_MAX(maxy, ypt + hpt);
    }

    lpd->mediabox = boxCreate(0, 0, (l_int32)(maxx + 0.5),
                              (l_int32)(maxy + 0.5));

        /* ypt is in standard image coordinates: the location of
         * the UL image corner with respect to the UL media box corner.
         * Rewrite each ypt for PostScript coordinates: the location of
         * the LL image corner with respect to the LL media box corner. */
    for (i = 0; i < lpd->n; i++) {
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
        ptaSetPt(lpd->xy, i, xpt, maxy - ypt - hpt);
    }
}


static l_int32
generatePageStringPdf(L_PDF_DATA  *lpd)
{
char    *buf;
char    *xstr;
l_int32  bufsize, i, wpt, hpt;
SARRAY  *sa;

        /* Allocate 1000 bytes for the boilerplate text, and
         * 50 bytes for each reference to an image in the
         * ProcSet array.  */
    bufsize = 1000 + 50 * lpd->n;
    if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL)
        return ERROR_INT("calloc fail for buf", __func__, 1);

    boxGetGeometry(lpd->mediabox, NULL, NULL, &wpt, &hpt);
    sa = sarrayCreate(lpd->n);
    for (i = 0; i < lpd->n; i++) {
        snprintf(buf, bufsize, "/Im%d %d 0 R   ", i + 1, 6 + i);
        sarrayAddString(sa, buf, L_COPY);
    }
    xstr = sarrayToString(sa, 0);
    sarrayDestroy(&sa);
    if (!xstr) {
        LEPT_FREE(buf);
        return ERROR_INT("xstr not made", __func__, 1);
    }

    snprintf(buf, bufsize, "4 0 obj\n"
                           "<<\n"
                           "/Type /Page\n"
                           "/Parent 3 0 R\n"
                           "/MediaBox [%d %d %d %d]\n"
                           "/Contents 5 0 R\n"
                           "/Resources\n"
                           "<<\n"
                           "/XObject << %s >>\n"
                           "/ProcSet [ /ImageB /ImageI /ImageC ]\n"
                           ">>\n"
                           ">>\n"
                           "endobj\n",
                           0, 0, wpt, hpt, xstr);

    lpd->obj4 = stringNew(buf);
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj4));
    sarrayDestroy(&sa);
    LEPT_FREE(buf);
    LEPT_FREE(xstr);
    return 0;
}


static l_int32
generateContentStringPdf(L_PDF_DATA  *lpd)
{
char      *buf;
char      *cstr;
l_int32    i, bufsize;
l_float32  xpt, ypt, wpt, hpt;
SARRAY    *sa;

    bufsize = 1000 + 200 * lpd->n;
    if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL)
        return ERROR_INT("calloc fail for buf", __func__, 1);

    sa = sarrayCreate(lpd->n);
    for (i = 0; i < lpd->n; i++) {
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
        snprintf(buf, bufsize,
                 "q %.4f %.4f %.4f %.4f %.4f %.4f cm /Im%d Do Q\n",
                 wpt, 0.0, 0.0, hpt, xpt, ypt, i + 1);
        sarrayAddString(sa, buf, L_COPY);
    }
    cstr = sarrayToString(sa, 0);
    sarrayDestroy(&sa);
    if (!cstr) {
        LEPT_FREE(buf);
        return ERROR_INT("cstr not made", __func__, 1);
    }

    snprintf(buf, bufsize, "5 0 obj\n"
                           "<< /Length %d >>\n"
                           "stream\n"
                           "%s"
                           "endstream\n"
                           "endobj\n",
                           (l_int32)strlen(cstr), cstr);

    lpd->obj5 = stringNew(buf);
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj5));
    sarrayDestroy(&sa);
    LEPT_FREE(buf);
    LEPT_FREE(cstr);
    return 0;
}


static l_int32
generatePreXStringsPdf(L_PDF_DATA  *lpd)
{
char          buff[256];
char          buf[L_BIGBUF];
char         *cstr, *bstr, *fstr, *pstr, *xstr, *photometry;
l_int32       i, cmindex;
L_COMP_DATA  *cid;
SARRAY       *sa;

    sa = lpd->saprex;
    cmindex = 6 + lpd->n;  /* starting value */
    for (i = 0; i < lpd->n; i++) {
        pstr = cstr = NULL;
        if ((cid = pdfdataGetCid(lpd, i)) == NULL)
            return ERROR_INT("cid not found", __func__, 1);

        if (cid->type == L_G4_ENCODE) {
            if (var_WRITE_G4_IMAGE_MASK) {
                cstr = stringNew("/ImageMask true\n"
                                 "/ColorSpace /DeviceGray");
            } else {
                cstr = stringNew("/ColorSpace /DeviceGray");
            }
            bstr = stringNew("/BitsPerComponent 1\n"
                             "/Interpolate true");
                /* Note: the reversal is deliberate.  The BlackIs1 flag
                 * is misleadingly named: it says whether to invert the
                 * image on decoding because the black pixels are 0,
                 * not whether the black pixels are 1!  The default for
                 * BlackIs1 is "false", which means "don't invert because
                 * black is 1."  Yikes. */
            photometry = (cid->minisblack) ? stringNew("true")
                                           : stringNew("false");
            snprintf(buff, sizeof(buff),
                     "/Filter /CCITTFaxDecode\n"
                     "/DecodeParms\n"
                     "<<\n"
                     "/BlackIs1 %s\n"
                     "/K -1\n"
                     "/Columns %d\n"
                     ">>", photometry, cid->w);
            fstr = stringNew(buff);
            LEPT_FREE(photometry);
        } else if (cid->type == L_JPEG_ENCODE) {
            if (cid->spp == 1)
                cstr = stringNew("/ColorSpace /DeviceGray");
            else if (cid->spp == 3)
                cstr = stringNew("/ColorSpace /DeviceRGB");
            else if (cid->spp == 4)   /* pdf supports cmyk */
                cstr = stringNew("/ColorSpace /DeviceCMYK");
            else
                L_ERROR("in jpeg: spp != 1, 3 or 4\n", __func__);
            bstr = stringNew("/BitsPerComponent 8");
            fstr = stringNew("/Filter /DCTDecode");
        } else if (cid->type == L_JP2K_ENCODE) {
            if (cid->spp == 1)
                cstr = stringNew("/ColorSpace /DeviceGray");
            else if (cid->spp == 3)
                cstr = stringNew("/ColorSpace /DeviceRGB");
            else
                L_ERROR("in jp2k: spp != 1 && spp != 3\n", __func__);
            bstr = stringNew("/BitsPerComponent 8");
            fstr = stringNew("/Filter /JPXDecode");
        } else {  /* type == L_FLATE_ENCODE */
            if (cid->ncolors > 0) {  /* cmapped */
                snprintf(buff, sizeof(buff), "/ColorSpace %d 0 R", cmindex++);
                cstr = stringNew(buff);
            } else {
                if (cid->spp == 1 && cid->bps == 1)
                    cstr = stringNew("/ColorSpace /DeviceGray\n"
                                     "/Decode [1 0]");
                else if (cid->spp == 1)  /* 8 bpp */
                    cstr = stringNew("/ColorSpace /DeviceGray");
                else if (cid->spp == 3)
                    cstr = stringNew("/ColorSpace /DeviceRGB");
                else
                    L_ERROR("unknown colorspace: spp = %d\n",
                            __func__, cid->spp);
            }
            snprintf(buff, sizeof(buff), "/BitsPerComponent %d", cid->bps);
            bstr = stringNew(buff);
            fstr = stringNew("/Filter /FlateDecode");
            if (cid->predictor == TRUE) {
                snprintf(buff, sizeof(buff),
                         "/DecodeParms\n"
                         "<<\n"
                         "  /Columns %d\n"
                         "  /Predictor 14\n"
                         "  /Colors %d\n"
                         "  /BitsPerComponent %d\n"
                         ">>\n", cid->w, cid->spp, cid->bps);
                pstr = stringNew(buff);
            }
        }
        if (!pstr)  /* no decode parameters */
            pstr = stringNew("");

        snprintf(buf, sizeof(buf),
                 "%d 0 obj\n"
                 "<<\n"
                 "/Length %zu\n"
                 "/Subtype /Image\n"
                 "%s\n"  /* colorspace */
                 "/Width %d\n"
                 "/Height %d\n"
                 "%s\n"  /* bits/component */
                 "%s\n"  /* filter */
                 "%s"   /* decode parms; can be empty */
                 ">>\n"
                 "stream\n",
                 6 + i, cid->nbytescomp, cstr,
                 cid->w, cid->h, bstr, fstr, pstr);
        xstr = stringNew(buf);
        sarrayAddString(sa, xstr, L_INSERT);
        l_dnaAddNumber(lpd->objsize,
                      strlen(xstr) + cid->nbytescomp + strlen(lpd->poststream));
        LEPT_FREE(cstr);
        LEPT_FREE(bstr);
        LEPT_FREE(fstr);
        LEPT_FREE(pstr);
    }

    return 0;
}


static l_int32
generateColormapStringsPdf(L_PDF_DATA  *lpd)
{
char          buf[L_BIGBUF];
char         *cmstr;
l_int32       i, cmindex, ncmap;
L_COMP_DATA  *cid;
SARRAY       *sa;

        /* In our canonical format, we have 5 objects, followed
         * by n XObjects, followed by m colormaps, so the index of
         * the first colormap object is 6 + n. */
    sa = lpd->sacmap;
    cmindex = 6 + lpd->n;  /* starting value */
    ncmap = 0;
    for (i = 0; i < lpd->n; i++) {
        if ((cid = pdfdataGetCid(lpd, i)) == NULL)
            return ERROR_INT("cid not found", __func__, 1);
        if (cid->ncolors == 0) continue;

        ncmap++;
        snprintf(buf, sizeof(buf), "%d 0 obj\n"
                                   "[ /Indexed /DeviceRGB\n"
                                   "%d\n"
                                   "%s\n"
                                   "]\n"
                                   "endobj\n",
                                   cmindex, cid->ncolors - 1, cid->cmapdatahex);
        cmindex++;
        cmstr = stringNew(buf);
        l_dnaAddNumber(lpd->objsize, strlen(cmstr));
        sarrayAddString(sa, cmstr, L_INSERT);
    }

    lpd->ncmap = ncmap;
    return 0;
}


static void
generateTrailerPdf(L_PDF_DATA  *lpd)
{
l_int32  i, n, size, linestart;
L_DNA   *daloc, *dasize;

        /* Let nobj be the number of numbered objects.  These numbered
         * objects are indexed by their pdf number in arrays naloc[]
         * and nasize[].  The 0th object is the 9 byte header.  Then
         * the number of objects in nasize, which includes the header,
         * is n = nobj + 1.  The array naloc[] has n + 1 elements,
         * because it includes as the last element the starting
         * location of xref.  The indexing of these objects, their
         * starting locations and sizes are:
         *
         *     Object number         Starting location         Size
         *     -------------         -----------------     --------------
         *          0                   daloc[0] = 0       dasize[0] = 9
         *          1                   daloc[1] = 9       dasize[1] = 49
         *          n                   daloc[n]           dasize[n]
         *          xref                daloc[n+1]
         *
         * We first generate daloc.
         */
    dasize = lpd->objsize;
    daloc = lpd->objloc;
    linestart = 0;
    l_dnaAddNumber(daloc, linestart);  /* header */
    n = l_dnaGetCount(dasize);
    for (i = 0; i < n; i++) {
        l_dnaGetIValue(dasize, i, &size);
        linestart += size;
        l_dnaAddNumber(daloc, linestart);
    }
    l_dnaGetIValue(daloc, n, &lpd->xrefloc);  /* save it */

        /* Now make the actual trailer string */
    lpd->trailer = makeTrailerStringPdf(daloc);
}


static char *
makeTrailerStringPdf(L_DNA  *daloc)
{
char    *outstr;
char     buf[L_BIGBUF];
l_int32  i, n, linestart, xrefloc;
SARRAY  *sa;

    if (!daloc)
        return (char *)ERROR_PTR("daloc not defined", __func__, NULL);
    n = l_dnaGetCount(daloc) - 1;  /* numbered objects + 1 (yes, +1) */

    sa = sarrayCreate(0);
    snprintf(buf, sizeof(buf), "xref\n"
                               "0 %d\n"
                               "0000000000 65535 f \n", n);
    sarrayAddString(sa, buf, L_COPY);
    for (i = 1; i < n; i++) {
        l_dnaGetIValue(daloc, i, &linestart);
        snprintf(buf, sizeof(buf), "%010d 00000 n \n", linestart);
        sarrayAddString(sa, buf, L_COPY);
    }

    l_dnaGetIValue(daloc, n, &xrefloc);
    snprintf(buf, sizeof(buf), "trailer\n"
                               "<<\n"
                               "/Size %d\n"
                               "/Root 1 0 R\n"
                               "/Info 2 0 R\n"
                               ">>\n"
                               "startxref\n"
                               "%d\n"
                               "%%%%EOF\n", n, xrefloc);
    sarrayAddString(sa, buf, L_COPY);
    outstr = sarrayToString(sa, 0);
    sarrayDestroy(&sa);
    return outstr;
}


/*!
 * \brief   generateOutputDataPdf()
 *
 * \param[out]   pdata      pdf data array
 * \param[out]   pnbytes    size of pdf data array
 * \param[in]    lpd        input data used to make pdf
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Only called from l_generatePdf().  On error, no data is returned.
 * </pre>
 */
static l_int32
generateOutputDataPdf(l_uint8    **pdata,
                      size_t      *pnbytes,
                      L_PDF_DATA  *lpd)
{
char         *str;
l_uint8      *data;
l_int32       nimages, i, len;
l_int32      *sizes, *locs;
size_t        nbytes;
L_COMP_DATA  *cid;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    nbytes = lpd->xrefloc + strlen(lpd->trailer);
    *pnbytes = nbytes;
    if ((data = (l_uint8 *)LEPT_CALLOC(nbytes, sizeof(l_uint8))) == NULL)
        return ERROR_INT("calloc fail for data", __func__, 1);
    *pdata = data;

    sizes = l_dnaGetIArray(lpd->objsize);
    locs = l_dnaGetIArray(lpd->objloc);
    memcpy(data, lpd->id, sizes[0]);
    memcpy(data + locs[1], lpd->obj1, sizes[1]);
    memcpy(data + locs[2], lpd->obj2, sizes[2]);
    memcpy(data + locs[3], lpd->obj3, sizes[3]);
    memcpy(data + locs[4], lpd->obj4, sizes[4]);
    memcpy(data + locs[5], lpd->obj5, sizes[5]);

        /* Each image has 3 parts: variable preamble, the compressed
         * data stream, and the fixed poststream. */
    nimages = lpd->n;
    for (i = 0; i < nimages; i++) {
        if ((cid = pdfdataGetCid(lpd, i)) == NULL) {  /* should not happen */
            LEPT_FREE(sizes);
            LEPT_FREE(locs);
            return ERROR_INT("cid not found", __func__, 1);
        }
        str = sarrayGetString(lpd->saprex, i, L_NOCOPY);
        len = strlen(str);
        memcpy(data + locs[6 + i], str, len);
        memcpy(data + locs[6 + i] + len,
               cid->datacomp, cid->nbytescomp);
        memcpy(data + locs[6 + i] + len + cid->nbytescomp,
               lpd->poststream, strlen(lpd->poststream));
    }

        /* Each colormap is simply a stored string */
    for (i = 0; i < lpd->ncmap; i++) {
        str = sarrayGetString(lpd->sacmap, i, L_NOCOPY);
        memcpy(data + locs[6 + nimages + i], str, strlen(str));
    }

        /* And finally the trailer */
    memcpy(data + lpd->xrefloc, lpd->trailer, strlen(lpd->trailer));
    LEPT_FREE(sizes);
    LEPT_FREE(locs);
    return 0;
}


/*---------------------------------------------------------------------*
 *          Helper functions for generating multipage pdf output       *
 *---------------------------------------------------------------------*/
/*!
 * \brief   parseTrailerPdf()
 *
 * \param[in]    bas     lba of a pdf file
 * \param[out]   pda     byte locations of the beginning of each object
 * \return  0 if OK, 1 on error
 */
static l_int32
parseTrailerPdf(L_BYTEA  *bas,
                L_DNA   **pda)
{
char     *str;
l_uint8   nl = '\n';
l_uint8  *data;
l_int32   i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok;
size_t    size;
L_DNA    *da, *daobj, *daxref;
SARRAY   *sa;

    if (!pda)
        return ERROR_INT("&da not defined", __func__, 1);
    *pda = NULL;
    if (!bas)
        return ERROR_INT("bas not defined", __func__, 1);
    data = l_byteaGetData(bas, &size);
    if (memcmp(data, "%PDF-1.", 7) != 0)
        return ERROR_INT("PDF header signature not found", __func__, 1);

        /* Search for "startxref" starting 50 bytes from the EOF */
    start = 0;
    if (size > 50)
        start = size - 50;
    arrayFindSequence(data + start, size - start,
                      (l_uint8 *)"startxref\n", 10, &loc, &found);
    if (!found)
        return ERROR_INT("startxref not found!", __func__, 1);
    if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1)
        return ERROR_INT("xrefloc not found!", __func__, 1);
    if (xrefloc < 0 || xrefloc >= size)
        return ERROR_INT("invalid xrefloc!", __func__, 1);
    sa = sarrayCreateLinesFromString((char *)(data + xrefloc), 0);
    str = sarrayGetString(sa, 1, L_NOCOPY);
    if ((sscanf(str, "0 %d", &nobj)) != 1) {
        sarrayDestroy(&sa);
        return ERROR_INT("nobj not found", __func__, 1);
    }

        /* Get starting locations.  The numa index is the
         * object number.  loc[0] is the ID; loc[nobj + 1] is xrefloc.  */
    da = l_dnaCreate(nobj + 1);
    *pda = da;
    for (i = 0; i < nobj; i++) {
        str = sarrayGetString(sa, i + 2, L_NOCOPY);
        sscanf(str, "%d", &startloc);
        l_dnaAddNumber(da, startloc);
    }
    l_dnaAddNumber(da, xrefloc);

#if  DEBUG_MULTIPAGE
    lept_stderr("************** Trailer string ************\n");
    lept_stderr("xrefloc = %d", xrefloc);
    sarrayWriteStderr(sa);

    lept_stderr("************** Object locations ************");
    l_dnaWriteStderr(da);
#endif  /* DEBUG_MULTIPAGE */
    sarrayDestroy(&sa);

        /* Verify correct parsing */
    trailer_ok = TRUE;
    for (i = 1; i < nobj; i++) {
        l_dnaGetIValue(da, i, &startloc);
        if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) {
            L_ERROR("bad trailer for object %d\n", __func__, i);
            trailer_ok = FALSE;
            break;
        }
    }

        /* If the trailer is broken, reconstruct the correct obj locations */
    if (!trailer_ok) {
        L_INFO("rebuilding pdf trailer\n", __func__);
        l_dnaEmpty(da);
        l_dnaAddNumber(da, 0);
        l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &daobj);
        nobj = l_dnaGetCount(daobj);
        for (i = 0; i < nobj; i++) {
            l_dnaGetIValue(daobj, i, &loc);
            for (j = loc - 1; j > 0; j--) {
                if (data[j] == nl)
                    break;
            }
            l_dnaAddNumber(da, j + 1);
        }
        l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &daxref);
        l_dnaGetIValue(daxref, 0, &loc);
        l_dnaAddNumber(da, loc);
        l_dnaDestroy(&daobj);
        l_dnaDestroy(&daxref);
    }

    return 0;
}


static char *
generatePagesObjStringPdf(NUMA  *napage)
{
char    *str;
char    *buf;
l_int32  i, n, index, bufsize;
SARRAY  *sa;

    if (!napage)
        return (char *)ERROR_PTR("napage not defined", __func__, NULL);

    n = numaGetCount(napage);
    bufsize = 100 + 16 * n;  /* large enough to hold the output string */
    buf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
    sa = sarrayCreate(n);
    for (i = 0; i < n; i++) {
        numaGetIValue(napage, i, &index);
        snprintf(buf, bufsize, " %d 0 R ", index);
        sarrayAddString(sa, buf, L_COPY);
    }

    str = sarrayToString(sa, 0);
    snprintf(buf, bufsize - 1, "3 0 obj\n"
                               "<<\n"
                               "/Type /Pages\n"
                               "/Kids [%s]\n"
                               "/Count %d\n"
                               ">>\n"
                               "endobj\n",
                               str, n);
    sarrayDestroy(&sa);
    LEPT_FREE(str);
    return buf;
}


/*!
 * \brief   substituteObjectNumbers()
 *
 * \param[in]   bas        lba of a pdf object
 * \param[in]   na_objs    object number mapping array
 * \return    bad   lba of rewritten pdf for the object
 *
 * <pre>
 * Notes:
 *      (1) Interpret the first set of bytes as the object number,
 *          map to the new number, and write it out.
 *      (2) Find all occurrences of this 4-byte sequence: " 0 R"
 *      (3) Find the location and value of the integer preceding this,
 *          and map it to the new value.
 *      (4) Rewrite the object with new object numbers.
 * </pre>
 */
static L_BYTEA *
substituteObjectNumbers(L_BYTEA  *bas,
                        NUMA     *na_objs)
{
l_uint8   space = ' ';
l_uint8  *datas;
l_uint8   buf[32];  /* only needs to hold one integer in ascii format */
l_int32   start, nrepl, i, j, nobjs, objin, objout, found;
l_int32  *objs, *matches;
size_t    size;
L_BYTEA  *bad;
L_DNA    *da_match;

    if (!bas)
        return (L_BYTEA *)ERROR_PTR("bas not defined", __func__, NULL);
    if (!na_objs)
        return (L_BYTEA *)ERROR_PTR("na_objs not defined", __func__, NULL);

    datas = l_byteaGetData(bas, &size);
    bad = l_byteaCreate(100);
    objs = numaGetIArray(na_objs);  /* object number mapper */
    nobjs = numaGetCount(na_objs);  /* use for sanity checking */

        /* Substitute the object number on the first line */
    sscanf((char *)datas, "%d", &objin);
    if (objin < 0 || objin >= nobjs) {
        L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs);
        LEPT_FREE(objs);
        return bad;
    }
    objout = objs[objin];
    snprintf((char *)buf, 32, "%d", objout);
    l_byteaAppendString(bad, (char *)buf);

        /* Find the set of matching locations for object references */
    arrayFindSequence(datas, size, &space, 1, &start, &found);
    da_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4);
    if (!da_match) {
        l_byteaAppendData(bad, datas + start, size - start);
        LEPT_FREE(objs);
        return bad;
    }

        /* Substitute all the object reference numbers */
    nrepl = l_dnaGetCount(da_match);
    matches = l_dnaGetIArray(da_match);
    for (i = 0; i < nrepl; i++) {
            /* Find the first space before the object number */
        for (j = matches[i] - 1; j > 0; j--) {
            if (datas[j] == space)
                break;
        }
            /* Copy bytes from 'start' up to the object number */
        l_byteaAppendData(bad, datas + start, j - start + 1);
        sscanf((char *)(datas + j + 1), "%d", &objin);
        if (objin < 0 || objin >= nobjs) {
            L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs);
            LEPT_FREE(objs);
            LEPT_FREE(matches);
            l_dnaDestroy(&da_match);
            return bad;
        }
        objout = objs[objin];
        snprintf((char *)buf, 32, "%d", objout);
        l_byteaAppendString(bad, (char *)buf);
        start = matches[i];
    }
    l_byteaAppendData(bad, datas + start, size - start);

    LEPT_FREE(objs);
    LEPT_FREE(matches);
    l_dnaDestroy(&da_match);
    return bad;
}


/*---------------------------------------------------------------------*
 *                     Create/destroy/access pdf data                  *
 *---------------------------------------------------------------------*/
static L_PDF_DATA *
pdfdataCreate(const char  *title)
{
L_PDF_DATA *lpd;

    lpd = (L_PDF_DATA *)LEPT_CALLOC(1, sizeof(L_PDF_DATA));
    if (title) lpd->title = stringNew(title);
    lpd->cida = ptraCreate(10);
    lpd->xy = ptaCreate(10);
    lpd->wh = ptaCreate(10);
    lpd->saprex = sarrayCreate(10);
    lpd->sacmap = sarrayCreate(10);
    lpd->objsize = l_dnaCreate(20);
    lpd->objloc = l_dnaCreate(20);
    return lpd;
}

static void
pdfdataDestroy(L_PDF_DATA  **plpd)
{
l_int32       i;
L_COMP_DATA  *cid;
L_PDF_DATA   *lpd;

    if (plpd== NULL) {
        L_WARNING("ptr address is null!\n", __func__);
        return;
    }
    if ((lpd = *plpd) == NULL)
        return;

    if (lpd->title) LEPT_FREE(lpd->title);
    for (i = 0; i < lpd->n; i++) {
        cid = (L_COMP_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION);
        l_CIDataDestroy(&cid);
    }

    ptraDestroy(&lpd->cida, 0, 0);
    if (lpd->id) LEPT_FREE(lpd->id);
    if (lpd->obj1) LEPT_FREE(lpd->obj1);
    if (lpd->obj2) LEPT_FREE(lpd->obj2);
    if (lpd->obj3) LEPT_FREE(lpd->obj3);
    if (lpd->obj4) LEPT_FREE(lpd->obj4);
    if (lpd->obj5) LEPT_FREE(lpd->obj5);
    if (lpd->poststream) LEPT_FREE(lpd->poststream);
    if (lpd->trailer) LEPT_FREE(lpd->trailer);
    if (lpd->xy) ptaDestroy(&lpd->xy);
    if (lpd->wh) ptaDestroy(&lpd->wh);
    if (lpd->mediabox) boxDestroy(&lpd->mediabox);
    if (lpd->saprex) sarrayDestroy(&lpd->saprex);
    if (lpd->sacmap) sarrayDestroy(&lpd->sacmap);
    if (lpd->objsize) l_dnaDestroy(&lpd->objsize);
    if (lpd->objloc) l_dnaDestroy(&lpd->objloc);
    LEPT_FREE(lpd);
    *plpd = NULL;
}


static L_COMP_DATA *
pdfdataGetCid(L_PDF_DATA  *lpd,
              l_int32      index)
{
    if (!lpd)
        return (L_COMP_DATA *)ERROR_PTR("lpd not defined", __func__, NULL);
    if (index < 0 || index >= lpd->n)
        return (L_COMP_DATA *)ERROR_PTR("invalid image index", __func__, NULL);

    return (L_COMP_DATA *)ptraGetPtrToItem(lpd->cida, index);
}


/*---------------------------------------------------------------------*
 *                     Find number of pages in a pdf                   *
 *---------------------------------------------------------------------*/
/*!
 * \brief   getPdfPageCount()
 *
 * \param[in]    fname      filename
 * \param[out]   pnpages    number of pages
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Looks for the argument of the first instance of /Count in the file.
 *      (2) This first reads 10000 bytes from the beginning of the file.
 *          If "/Count" is not in that string, it reads the entire file
 *          and looks for "/Count".
 *      (3) This will not work on encrypted pdf files or on files where
 *          the "/Count" field is binary compressed.  Not finding the
 *          "/Count" field is not an error, but a warning is given.
 * </pre>
 */
l_ok
getPdfPageCount(const char  *fname,
                l_int32     *pnpages)
{
l_uint8  *data;
l_int32   format, loc, ret, npages, found;
size_t    nread;

    if (!pnpages)
        return ERROR_INT("&npages not defined", __func__, 1);
    *pnpages = 0;
    if (!fname)
        return ERROR_INT("fname not defined", __func__, 1);

        /* Make sure this a pdf file */
    findFileFormat(fname, &format);
    if (format != IFF_LPDF)
        return ERROR_INT("file is not pdf", __func__, 1);

        /* Read 10000 bytes from the beginning of the file */
    if ((data = l_binaryReadSelect(fname, 0, 10000, &nread))
                 == NULL)
        return ERROR_INT("partial data not read", __func__, 1);

        /* Find the location of the first instance of "/Count".
         * If it is not found, try reading the entire file and
         * looking again. */
    arrayFindSequence(data, nread, (const l_uint8 *)"/Count",
          strlen("/Count"), &loc, &found);
    if (!found) {
        lept_stderr("Reading entire file looking for '/Count'\n");
        LEPT_FREE(data);
        if ((data = l_binaryRead(fname, &nread)) == NULL)
            return ERROR_INT("full data not read", __func__, 1);
        arrayFindSequence(data, nread, (const l_uint8 *)"/Count",
             strlen("/Count"), &loc, &found);
        if (!found) {
            LEPT_FREE(data);
            L_WARNING("/Count not found\n", __func__);
            return 0;
        }
    }

        /* Unlikely: make sure we can read the count field */
    if (nread - loc < 12)  { /* haven't read enough to capture page count */
        LEPT_FREE(data);
        return ERROR_INT("data may not include page count field", __func__, 1);
    }

        /* Read the page count; if not found, puts garbage in npages */
    ret = sscanf((char *)&data[loc], "/Count %d", &npages);
    LEPT_FREE(data);
    if (ret != 1)
        return ERROR_INT("npages not found", __func__, 1);
    *pnpages = npages;
/*    lept_stderr("bytes read = %d, loc = %d, npages = %d\n",
                nread, loc, *pnpages);  */
    return 0;
}


/*---------------------------------------------------------------------*
 *      Find widths and heights of pages and media boxes in a pdf      *
 *---------------------------------------------------------------------*/
/*!
 * \brief   getPdfPageSizes()
 *
 * \param[in]    fname        filename
 * \param[out]   pnaw         [optional] array of page widths
 * \param[out]   pnah         [optional] array of page heights
 * \param[out]   pmedw        [optional] median page width
 * \param[out]   pmedh        [optional] median page height
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Finds the arguments of each instance of '/Width' and '/Height'
 *          in the file.
 *      (2) This will not work on encrypted pdf files or on files where
 *          the "/Width" and "/Height" fields are binary compressed.
 *          Not finding the "/Width" and /Height" fields is not an error,
 *          but a warning is given.
 * </pre>
 */
l_ok
getPdfPageSizes(const char  *fname,
                NUMA       **pnaw,
                NUMA       **pnah,
                l_int32     *pmedw,
                l_int32     *pmedh)
{
l_uint8   *data;
l_int32    i, nw, nh, format, ret, loc, width, height;
l_float32  fval;
size_t     nread;
L_DNA     *dnaw;  /* width locations */
L_DNA     *dnah;  /* height locations */
NUMA      *naw;   /* widths */
NUMA      *nah;   /* heights */

    if (pnaw) *pnaw = NULL;
    if (pnah) *pnah = NULL;
    if (pmedw) *pmedw = 0;
    if (pmedh) *pmedh = 0;
    if (!pnaw && !pnah && !pmedw && !pmedh)
        return ERROR_INT("no output requested", __func__, 1);
    if (!fname)
        return ERROR_INT("fname not defined", __func__, 1);

        /* Make sure this a pdf file */
    findFileFormat(fname, &format);
    if (format != IFF_LPDF)
        return ERROR_INT("file is not pdf", __func__, 1);

        /* Read the file into memory and find all locations of
         * '/Width' and '/Height' */
    if ((data = l_binaryRead(fname, &nread)) == NULL)
        return ERROR_INT("full data not read", __func__, 1);
    dnaw = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Width",
                                 strlen("/Width"));
    dnah = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Height",
                                 strlen("/Height"));
    if (!dnaw)
        L_WARNING("unable to find widths\n", __func__);
    if (!dnah)
        L_WARNING("unable to find heights\n", __func__);
    if (!dnaw && !dnah) {
        LEPT_FREE(data);
        L_WARNING("no fields found\n", __func__);
        return 0;
    }

        /* Find the page widths and heights */
    nw = l_dnaGetCount(dnaw);
    naw = numaCreate(nw);
    for (i = 0; i < nw; i++) {
        l_dnaGetIValue(dnaw, i, &loc);
        ret = sscanf((char *)&data[loc], "/Width %d", &width);
        if (ret != 1) {
            L_ERROR("width not found for item %d at loc %d\n",
                    __func__, i, loc);
            continue;
        }
        numaAddNumber(naw, width);
    }
    nh = l_dnaGetCount(dnah);
    nah = numaCreate(nh);
    for (i = 0; i < nh; i++) {
        l_dnaGetIValue(dnah, i, &loc);
        ret = sscanf((char *)&data[loc], "/Height %d", &height);
        if (ret != 1) {
            L_ERROR("height not found for item %d at loc %d\n",
                    __func__, i, loc);
            continue;
        }
        numaAddNumber(nah, height);
    }

    LEPT_FREE(data);
    l_dnaDestroy(&dnaw);
    l_dnaDestroy(&dnah);
    if (pmedw) {
        numaGetMedian(naw, &fval);
        *pmedw = lept_roundftoi(fval);
    }
    if (pnaw)
        *pnaw = naw;
    else
        numaDestroy(&naw);
    if (pmedh) {
        numaGetMedian(nah, &fval);
        *pmedh = lept_roundftoi(fval);
    }
    if (pnah)
        *pnah = nah;
    else
        numaDestroy(&nah);
    return 0;
}


/*!
 * \brief   getPdfMediaBoxSizes()
 *
 * \param[in]    fname        filename
 * \param[out]   pnaw         [optional] array of mediabox widths
 * \param[out]   pnah         [optional] array of mediabox heights
 * \param[out]   pmedw        [optional] median mediabox width
 * \param[out]   pmedh        [optional] median mediabox height
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Finds the arguments of each instance of '/MediaBox' in the file.
 *      (2) This will not work on encrypted pdf files or on files where
 *          the "/MediaBoxes" field is binary compressed.  Not finding
 *          the "/MediaBoxes" field is not an error, but a warning is given.
 *      (3) This is useful for determining if the media boxes are
 *          incorrectly assigned, such as assuming the resolution is 72 ppi.
 *          If that happens and the input the the renderer assumes the
 *          resolution is 300 ppi, the rendered images will be over 4x too
 *          large in each dimension.
 *      (4) An image dimension of 11 inches corresponds to a MediaBox
 *          parameter of 792.  We consider a value > 850 to be oversized
 *          and not to be taken literally.
 * </pre>
 */
l_ok
getPdfMediaBoxSizes(const char  *fname,
                    NUMA       **pnaw,
                    NUMA       **pnah,
                    l_int32     *pmedw,
                    l_int32     *pmedh)
{
l_uint8   *data;
l_int32    i, n, format, ret, loc;
l_float32  fval, ignore1, ignore2, w, h;
size_t     nread;
L_DNA     *dna;   /* mediabox locations */
NUMA      *naw;   /* mediabox widths */
NUMA      *nah;   /* mediabox heights */

    if (pnaw) *pnaw = NULL;
    if (pnah) *pnah = NULL;
    if (pmedw) *pmedw = 0;
    if (pmedh) *pmedh = 0;
    if (!pnaw && !pnah && !pmedw && !pmedh)
        return ERROR_INT("no output requested", __func__, 1);
    if (!fname)
        return ERROR_INT("fname not defined", __func__, 1);

        /* Make sure this a pdf file */
    findFileFormat(fname, &format);
    if (format != IFF_LPDF)
        return ERROR_INT("file is not pdf", __func__, 1);

        /* Read the file into memory and find all locations of '/MediaBox' */
    if ((data = l_binaryRead(fname, &nread)) == NULL)
        return ERROR_INT("full data not read", __func__, 1);
    dna = arrayFindEachSequence(data, nread, (const l_uint8 *)"/MediaBox",
                                strlen("/MediaBox"));
    if (!dna) {
        LEPT_FREE(data);
        L_WARNING("no mediaboxes found\n", __func__);
        return 1;
    }

        /* Find the mediabox widths and heights */
    n = l_dnaGetCount(dna);
    naw = numaCreate(n);
    nah = numaCreate(n);
    for (i = 0; i < n; i++) {
        l_dnaGetIValue(dna, i, &loc);
        ret = sscanf((char *)&data[loc], "/MediaBox [ %f %f %f %f",
                     &ignore1, &ignore2, &w, &h);
        if (ret != 4) {
            L_ERROR("mediabox sizes not found for item %d at loc %d\n",
                    __func__, i, loc);
            continue;
        }
        numaAddNumber(naw, w);
        numaAddNumber(nah, h);
    }
    LEPT_FREE(data);
    l_dnaDestroy(&dna);

    if (pmedw) {
        numaGetMedian(naw, &fval);
        *pmedw = lept_roundftoi(fval);
        if (*pmedw > 850) lept_stderr("oversize width: %d\n", *pmedw);
    }
    if (pnaw)
        *pnaw = naw;
    else
        numaDestroy(&naw);
    if (pmedh) {
        numaGetMedian(nah, &fval);
        *pmedh = lept_roundftoi(fval);
        if (*pmedh > 850) lept_stderr("oversize height: %d\n", *pmedh);
    }
    if (pnah)
        *pnah = nah;
    else
        numaDestroy(&nah);
    return 0;
}


/*---------------------------------------------------------------------*
 *       Find effective resolution of images rendered from a pdf       *
 *---------------------------------------------------------------------*/
/*!
 * \brief   getPdfRendererResolution()
 *
 * \param[in]    infile       filename of input pdf file
 * \param[in]    outdir       directory of rendered output images
 * \param[out]   pres         desired resolution to use with renderer
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Finds the input resolution to pdftoppm that will generate
 *          images with a maximum dimension of about 3300 pixels,
 *          representing a full page at 300 ppi.
 *      (2) It is most important is to make sure the renderer does
 *          not make huge images because of an error in /MediaBox.
 *          An image dimension of 11 inches corresponds to a MediaBox
 *          parameter of 792.  We consider a value > 850 to be oversized
 *          and not to be taken literally.  If the mediaboxes are
 *          oversized, choose an appropriate lower resolution.
 *      (3) If the mediaboxes are not accessible, render an image at
 *          a low known resolution (say, 72 ppi) and based on the image
 *          size, determine the resolution necessary to make an image
 *          with 3300 pixels in the largest dimension.
 *      (4) Requires pdftoppm, so this is disabled on windows for now.
 *      (5) Requires the ability to call an external program, so it is
 *          necessary to call setLeptDebugOK(1) before this function.
 * </pre>
 */
l_ok
getPdfRendererResolution(const char  *infile,
                         const char  *outdir,
                         l_int32     *pres)
{
char      buf[256];
char     *tail, *basename, *fname;
l_int32   ret, res, medw, medh, medmax, npages, pageno, w, h;
SARRAY   *sa;

    if (!pres)
        return ERROR_INT("&res not defined", __func__, 1);
    *pres = 300;  /* default */

#ifdef _WIN32
    L_INFO("Requires pdftoppm, so this is disabled on windows.\n"
           "Returns default resolution 300 ppi", __func__);
    return 0;
#endif  /* _WIN32 */

    if (!LeptDebugOK) {
        L_INFO("Running pdftoppm is disabled; "
               "use setLeptDebugOK(1) to enable\n"
               "returns default resolution 300 ppi\n", __func__);
        return 1;
    }

    if (!infile)
        return ERROR_INT("infile not defined", __func__, 1);
    if (!outdir)
        return ERROR_INT("outdir not defined", __func__, 1);

    res = 300;  /* default value */
    ret = getPdfMediaBoxSizes(infile, NULL, NULL, &medw, &medh);
    if (ret == 0) {  /* Check for oversize mediaboxes */
        lept_stderr("Media Box medians: medw = %d, medh = %d\n", medw, medh);
        medmax = L_MAX(medw, medh);
        if (medmax > 850) {
            res = 300 * ((l_float32)792 / (l_float32)medmax);
            lept_stderr(" Oversize media box; use resolution = %d\n", res);
            *pres = res;
        }
        return 0;
    }

        /* No mediaboxes; render one page and measure the max dimension */
    lept_stderr("Media Box dimensions not found\n");
    getPdfPageCount(infile, &npages);
    pageno = (npages > 0) ? (npages + 1) / 2 : 1;
    splitPathAtDirectory(infile, NULL, &tail);
    splitPathAtExtension(tail, &basename, NULL);
    snprintf(buf, sizeof(buf), "pdftoppm -f %d -l %d -r 72 %s %s/%s",
             pageno, pageno, infile, outdir, basename);
    LEPT_FREE(tail);
    LEPT_FREE(basename);
    callSystemDebug(buf);  /* pdftoppm */

        /* Get the page size */
    sa = getSortedPathnamesInDirectory(outdir, NULL, 0, 0);
    fname = sarrayGetString(sa, 0, L_NOCOPY);
    pixReadHeader(fname, NULL, &w, &h, NULL, NULL, NULL);
    sarrayDestroy(&sa);
    if (w > 0 && h > 0) {
        res = L_MIN((72 * 3300 / L_MAX(w, h)), 600);
        *pres = res;
        lept_stderr("Use resolution = %d\n", res);
    } else {
        L_ERROR("page size not found; assuming res = 300\n", __func__);
    }

    return 0;
}


/*---------------------------------------------------------------------*
 *                      Set flags for special modes                    *
 *---------------------------------------------------------------------*/
/*!
 * \brief   l_pdfSetG4ImageMask()
 *
 * \param[in]    flag    1 for writing g4 data as fg only through a mask;
 *                       0 for writing fg and bg
 * \return  void
 *
 * <pre>
 * Notes:
 *      (1) The default is for writing only the fg (through the mask).
 *          That way when you write a 1 bpp image, the bg is transparent,
 *          so any previously written image remains visible behind it.
 * </pre>
 */
void
l_pdfSetG4ImageMask(l_int32  flag)
{
    var_WRITE_G4_IMAGE_MASK = flag;
}


/*!
 * \brief   l_pdfSetDateAndVersion()
 *
 * \param[in]    flag    1 for writing date/time and leptonica version;
 *                       0 for omitting this from the metadata
 * \return  void
 *
 * <pre>
 * Notes:
 *      (1) The default is for writing this data.  For regression tests
 *          that compare output against golden files, it is useful to omit.
 * </pre>
 */
void
l_pdfSetDateAndVersion(l_int32  flag)
{
    var_WRITE_DATE_AND_VERSION = flag;
}

/* --------------------------------------------*/
#endif  /* USE_PDFIO */
/* --------------------------------------------*/
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children