view mupdf-source/thirdparty/leptonica/src/pdfapp.c @ 20:eb3dd22fef2c

FIX: the new "sdist" build target is PHONY also
author Franz Glasner <fzglas.hg@dom66.de>
date Thu, 18 Sep 2025 22:04:13 +0200
parents b50eed0cc0ef
children
line wrap: on
line source

/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -
 -  Redistribution and use in source and binary forms, with or without
 -  modification, are permitted provided that the following conditions
 -  are met:
 -  1. Redistributions of source code must retain the above copyright
 -     notice, this list of conditions and the following disclaimer.
 -  2. Redistributions in binary form must reproduce the above
 -     copyright notice, this list of conditions and the following
 -     disclaimer in the documentation and/or other materials
 -     provided with the distribution.
 -
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *====================================================================*/

/*!
 * \file pdfapp.c
 * <pre>
 *
 *    Image processing operations on multiple images followed by wrapping
 *    them into a pdf.
 *
 *    There are two possible ways to specify the set of images:
 *    (1) an array of pathnames
 *    (2) a directory, typically with an additional pattern for selection.
 *    We use (1) because it is both simpler and more general.
 *
 *    Corresponding to each function here is:
 *    (1) the image processing function that is carried out on each image
 *    (2) a program in prog that extracts images from a pdf and calls this
 *        function with an array of their pathnames.
 *
 *    |=============================================================|
 *    |                        Important notes                      |
 *    |=============================================================|
 *    | Some of these functions require I/O libraries such as       |
 *    | libtiff, libjpeg, libpng and libz.  If you do not have      |
 *    | these libraries, some calls will fail.  For example,        |
 *    | if you do not have libtiff, you cannot write a pdf that     |
 *    | uses libtiff to encode bilevel images in tiffg4.            |
 *    |                                                             |
 *    | You can manually deactivate all pdf writing by setting      |
 *    | this in environ.h:                                          |
 *    | \code                                                       |
 *    |      #define  USE_PDFIO     0                               |
 *    | \endcode                                                    |
 *    | This will link the stub file pdfappstub.c.                  |
 *    |=============================================================|
 *
 *     The images in the pdf file can be rendered using a pdf viewer,
 *     such as evince, gv, xpdf or acroread.
 *
 *     Compression of images for prog/compresspdf
 *          l_int32          compressFilesToPdf()
 *
 *     Crop images for prog/croppdf
 *          l_int32          cropFilesToPdf()
 *
 *     Cleanup and binarization of images for prog/cleanpdf
 *          l_int32          cleanTo1bppFilesToPdf()
 * </pre>
 */

#ifdef HAVE_CONFIG_H
#include <config_auto.h>
#endif  /* HAVE_CONFIG_H */

#include <string.h>
#include "allheaders.h"


/* --------------------------------------------*/
#if  USE_PDFIO   /* defined in environ.h */
 /* --------------------------------------------*/

/*---------------------------------------------------------------------*
 *              Compression of images for prog/compresspdf             *
 *---------------------------------------------------------------------*/
/*!
 * \brief   compressFilesToPdf()
 *
 * \param[in]    sa            sorted full pathnames of images
 * \param[in]    onebit        set to 1 to enforce 1 bpp tiffg4 encoding
 * \param[in]    savecolor     if %onebit == 1, set to 1 to save color
 * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
 * \param[in]    quality       for jpeg: 0 for default (50; otherwise 25 - 95.
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *    (1) This function is designed to optionally scale and compress a set of
 *        images, wrapping them in a pdf in the order given in the input %sa.
 *    (2) It does the image processing for prog/compresspdf.c.
 *    (3) Images in the output pdf are encoded with either tiffg4 or jpeg (DCT),
 *        or a mixture of them depending on parameters %onebit and %savecolor.
 *    (4) Parameters %onebit and %savecolor work as follows:
 *        %onebit = 0: no depth conversion, default encoding depends on depth
 *        %onebit = 1, %savecolor = 0: all images converted to 1 bpp
 *        %onebit = 1, %savecolor = 1: images without color are converted
 *           to 1 bpp; images with color have the color preserved.
 *    (5) In use, if most of the pages are 1 bpp but some have color that needs
 *        to be preserved, %onebit and %savecolor should both be 1.  This
 *        causes DCT compression of color images and tiffg4 compression
 *        of monochrome images.
 *    (6) The images will be concatenated in the order given in %sa.
 *    (7) Typically, %scalefactor <= 1.0.  It is applied to each image
 *        before encoding.  If you enter a value <= 0.0, it will be set to 1.0.
 *        The maximum allowed value is 2.0.
 *    (8) Default jpeg %quality is 50; otherwise, quality factors between
 *        25 and 95 are enforced.
 *    (9) Page images at 300 ppi are about 8 Mpixels.  RGB(A) rasters are
 *        then about 32 MB (1 bpp images are about 1 MB).  If there are
 *        more than 25 images, store the images after processing as an
 *        array of compressed images (a Pixac); otherwise, use a Pixa.
 * </pre>
 */
l_ok
compressFilesToPdf(SARRAY      *sa,
                   l_int32      onebit,
                   l_int32      savecolor,
                   l_float32    scalefactor,
                   l_int32      quality,
                   const char  *title,
                   const char  *fileout)
{
char      *fname;
l_int32    n, i, res;
l_int32    maxsmallset = 25;  /* max num images kept uncompressed in array */
l_float32  colorfract;
PIX       *pixs, *pix1, *pix2;
PIXA      *pixa1 = NULL;
PIXAC     *pixac1 = NULL;

    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);
    if (scalefactor <= 0) scalefactor = 1.0;
    if (scalefactor > 2.0) {
        L_WARNING("scalefactor %f too big; setting to 2.0\n", __func__,
                  scalefactor);
        scalefactor = 2.0;
    }
    if (quality <= 0) quality = 50;  /* default value */
    if (quality < 25) {
        L_WARNING("quality %d too low; setting to 25\n", __func__, quality);
        quality = 25;
    }
    if (quality > 95) {
        L_WARNING("quality %d too high; setting to 95\n", __func__, quality);
        quality = 95;
    }
    if ((n = sarrayGetCount(sa)) == 0)
        return ERROR_INT("sa is empty", __func__, 1);

    if (n <= maxsmallset)
        pixa1 = pixaCreate(n);
    else
        pixac1 = pixacompCreate(n);
    for (i = 0; i < n; i++) {
        if (i == 0)
            lept_stderr("page: ");
        else if (i % 10 == 0)
            lept_stderr("%d . ", i);
        fname = sarrayGetString(sa, i, L_NOCOPY);
        pixs = pixRead(fname);
        if (onebit) {
            if (savecolor) {
                pixColorFraction(pixs, 40, 224, 80, 4, NULL, &colorfract);
                if (colorfract > 0.01)  /* save the color; DCT encoding */
                    pix1 = pixClone(pixs);
                else
                    pix1 = pixConvertTo1(pixs, 180);
            } else {  /* do not save any color; tiffg4 encoding */
                pix1 = pixConvertTo1(pixs, 180);
            }
        } else {  /* default encoding: tiffg4 for 1 bpp; DCT for all else */
            pix1 = pixClone(pixs);
        }
        if (scalefactor == 1.0)
            pix2 = pixClone(pix1);
        else
            pix2 = pixScale(pix1, scalefactor, scalefactor);
        if (n <= maxsmallset) {
            pixaAddPix(pixa1, pix2, L_INSERT);
        } else {
            pixacompAddPix(pixac1, pix2, IFF_DEFAULT);
            pixDestroy(&pix2);
        }
        pixDestroy(&pixs);
        pixDestroy(&pix1);
    }

        /* Generate the pdf.  Compute the actual input resolution from
         * the pixel dimensions of the first image.  This will cause each
         * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
    lept_stderr("\nWrite output to %s\n", fileout);
    if (n <= maxsmallset)
        pix1 = pixaGetPix(pixa1, 0, L_CLONE);
    else
        pix1 = pixacompGetPix(pixac1, 0);
    pixInferResolution(pix1, 11.0, &res);
    pixDestroy(&pix1);
    if (strcmp(title, "none") == 0)
        title = NULL;
    if (n <= maxsmallset) {
        pixaConvertToPdf(pixa1, res, 1.0, L_DEFAULT_ENCODE, quality,
                         title, fileout);
        pixaDestroy(&pixa1);
    } else {
        pixacompConvertToPdf(pixac1, res, 1.0, L_DEFAULT_ENCODE, quality,
                             title, fileout);
        pixacompDestroy(&pixac1);
    }
    return 0;
}


/*---------------------------------------------------------------------*
 *                       Crop images for prog/croppdf                  *
 *---------------------------------------------------------------------*/
/*!
 * \brief   cropFilesToPdf()
 *
 * \param[in]    sa            sorted full pathnames of images
 * \param[in]    lr_clear      full res pixels cleared at left and right sides
 * \param[in]    tb_clear      full res pixels cleared at top and bottom sides
 * \param[in]    edgeclean     parameter for removing edge noise (-1 to 15)
 *                             default = 0 (no removal);
 *                             15 is maximally aggressive for random noise
 *                             -1 for aggressively removing side noise
 *                             -2 to extract page embedded in black background
 * \param[in]    lr_border     full res final "added" pixels on left and right
 * \param[in]    tb_border     full res final "added" pixels on top and bottom
 * \param[in]    maxwiden      max fractional horizontal stretch allowed
 * \param[in]    printwiden    0 to skip, 1 for 8.5x11, 2 for A4
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *    (1) This function is designed to optionally remove white space from
 *        around the page images, and generate a pdf that prints with
 *        foreground occupying much of the full page.
 *    (2) It does the image processing for prog/croppdf.c.
 *    (3) Images in the output pdf are 1 bpp and encoded with tiffg4.
 *    (4) See documentation in pixCropImage() for details on the processing.
 *    (5) The images will be concatenated in the order given in %safiles.
 *    (6) Output page images are at 300 ppi and are stored in memory.
 *        They are about 1 Mpixel when uncompressed.  For up to 200 pages,
 *        the images are stored uncompressed; otherwise, the stored
 *        images are compressed with tiffg4.
 * </pre>
 */
l_ok
cropFilesToPdf(SARRAY      *sa,
               l_int32      lr_clear,
               l_int32      tb_clear,
               l_int32      edgeclean,
               l_int32      lr_border,
               l_int32      tb_border,
               l_float32    maxwiden,
               l_int32      printwiden,
               const char  *title,
               const char  *fileout)
{
char      *fname;
l_int32    n, i, res;
l_int32    maxsmallset = 200;  /* max num images kept uncompressed in array */
PIX       *pixs, *pix1;
PIXA      *pixa1 = NULL;
PIXAC     *pixac1 = NULL;

    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);
    if ((n = sarrayGetCount(sa)) == 0)
        return ERROR_INT("sa is empty", __func__, 1);

    if (n <= maxsmallset)
        pixa1 = pixaCreate(n);
    else
        pixac1 = pixacompCreate(n);
    for (i = 0; i < n; i++) {
        if (i == 0)
            lept_stderr("page: ");
        else if (i % 10 == 0)
            lept_stderr("%d . ", i);
        fname = sarrayGetString(sa, i, L_NOCOPY);
        pixs = pixRead(fname);
        pix1 = pixCropImage(pixs, lr_clear, tb_clear, edgeclean,
                            lr_border, tb_border, maxwiden, printwiden,
                            NULL, NULL);
        pixDestroy(&pixs);
        if (!pix1) {
            L_ERROR("pix1 not made for i = %d\n", __func__, i);
            continue;
        }
        if (n <= maxsmallset)
            pixaAddPix(pixa1, pix1, L_INSERT);
        else
            pixacompAddPix(pixac1, pix1, IFF_TIFF_G4);
    }

        /* Generate the pdf.  Compute the actual input resolution from
         * the pixel dimensions of the first image.  This will cause each
         * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
    lept_stderr("\nWrite output to %s\n", fileout);
    if (n <= maxsmallset)
        pix1 = pixaGetPix(pixa1, 0, L_CLONE);
    else
        pix1 = pixacompGetPix(pixac1, 0);
    pixInferResolution(pix1, 11.0, &res);
    pixDestroy(&pix1);
    if (strcmp(title, "none") == 0)
        title = NULL;
    if (n <= maxsmallset) {
        pixaConvertToPdf(pixa1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
        pixaDestroy(&pixa1);
    } else {
        pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
        pixacompDestroy(&pixac1);
    }
    return 0;
}


/*---------------------------------------------------------------------*
 *        Cleanup and binarization of images for prog/cleanpdf         *
 *---------------------------------------------------------------------*/
/*!
 * \brief   cleanTo1bppFilesToPdf()
 *
 * \param[in]    sa            sorted full pathnames of images
 * \param[in]    res           either 300 or 600 ppi for output
 * \param[in]    contrast      vary contrast: 1 = lightest; 10 = darkest;
 *                             suggest 1 unless light features are being lost
 * \param[in]    rotation      cw by 90 degrees: {0,1,2,3} represent
 *                             0, 90, 180 and 270 degree cw rotations
 * \param[in]    opensize      opening size of structuring element for noise
 *                             removal: {0 or 1to skip; 2, 3 for opening}
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *    (1) This deskews, optionally rotates and darkens, cleans background
 *        to white, binarizes and optionally removes small noise, and
 *        put the images into the pdf in the order given in %sa.
 *    (2) All images in the pdf are tiffg4 encoded.
 *    (3) For color and grayscale input, local background normalization is
 *        done to 200, and a threshold of 180 sets the maximum foreground
 *        value in the normalized image.
 *    (4) The %res parameter can be either 300 or 600 ppi.  If the input
 *        is gray or color and %res = 600, this does an interpolated 2x
 *        expansion before binarizing.
 *    (5) The %contrast parameter adjusts the binarization to avoid losing
 *        lighter input pixels.  Contrast is increased as %contrast increases
 *        from 1 to 10.
 *    (6) The #opensize parameter is the size of a square SEL used with
 *        opening to remove small speckle noise.  Allowed open sizes are 2,3.
 *        If this is to be used, try 2 before 3.
 *    (7) If there are more than 200 images, store the images after processing
 *        as an array of compressed images (a Pixac); otherwise, use a Pixa.
 * </pre>
 */
l_ok
cleanTo1bppFilesToPdf(SARRAY      *sa,
                      l_int32      res,
                      l_int32      contrast,
                      l_int32      rotation,
                      l_int32      opensize,
                      const char  *title,
                      const char  *fileout)
{
char      *fname;
l_int32    n, i, scale;
l_int32    maxsmallset = 200;  /* max num images kept uncompressed in array */
PIX       *pixs, *pix1;
PIXA      *pixa1 = NULL;
PIXAC     *pixac1 = NULL;

    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);
    if (res == 0) res = 300;
    if (res != 300 && res != 600) {
        L_ERROR("invalid res = %d; res must be in {0, 300, 600}\n",
                __func__, res);
        return 1;
    }
    if (contrast < 1 || contrast > 10) {
        L_ERROR("invalid contrast = %d; contrast must be in [1...10]\n",
                __func__, contrast);
        return 1;
    }
    if (rotation < 0 || rotation > 3) {
        L_ERROR("invalid rotation = %d; rotation must be in  {0,1,2,3}\n",
                __func__, rotation);
        return 1;
    }
    if (opensize > 3) {
        L_ERROR("invalid opensize = %d; opensize must be <= 3\n",
                __func__, opensize);
        return 1;
    }
    scale = (res == 300) ? 1 : 2;
    if ((n = sarrayGetCount(sa)) == 0)
        return ERROR_INT("sa is empty", __func__, 1);

    if (n <= maxsmallset)
        pixa1 = pixaCreate(n);
    else
        pixac1 = pixacompCreate(n);
    for (i = 0; i < n; i++) {
        if (i == 0)
            lept_stderr("page: ");
        else if (i % 10 == 0)
            lept_stderr("%d . ", i);
        fname = sarrayGetString(sa, i, L_NOCOPY);
        if ((pixs = pixRead(fname)) == NULL) {
            L_ERROR("pixs not read from %s\n", __func__, fname);
            continue;
        }

        pix1 = pixCleanImage(pixs, contrast, rotation, scale, opensize);
        if (n <= maxsmallset) {
            pixaAddPix(pixa1, pix1, L_INSERT);
        } else {
            pixacompAddPix(pixac1, pix1, IFF_TIFF_G4);
            pixDestroy(&pix1);
        }
        pixDestroy(&pixs);
    }

        /* Generate the pdf.  Compute the actual input resolution from
         * the pixel dimensions of the first image.  This will cause each
         * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
    lept_stderr("Write output to %s\n", fileout);
    if (n <= maxsmallset)
        pix1 = pixaGetPix(pixa1, 0, L_CLONE);
    else
        pix1 = pixacompGetPix(pixac1, 0);
    pixInferResolution(pix1, 11.0, &res);
    pixDestroy(&pix1);
    if (strcmp(title, "none") == 0)
        title = NULL;

    if (n <= maxsmallset) {
        pixaConvertToPdf(pixa1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
        pixaDestroy(&pixa1);
    } else {
        pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
        pixacompDestroy(&pixac1);
    }
    return 0;
}

/* --------------------------------------------*/
#endif  /* USE_PDFIO */
/* --------------------------------------------*/