diff mupdf-source/thirdparty/leptonica/src/pdfapp.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/leptonica/src/pdfapp.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,478 @@
+/*====================================================================*
+ -  Copyright (C) 2001 Leptonica.  All rights reserved.
+ -
+ -  Redistribution and use in source and binary forms, with or without
+ -  modification, are permitted provided that the following conditions
+ -  are met:
+ -  1. Redistributions of source code must retain the above copyright
+ -     notice, this list of conditions and the following disclaimer.
+ -  2. Redistributions in binary form must reproduce the above
+ -     copyright notice, this list of conditions and the following
+ -     disclaimer in the documentation and/or other materials
+ -     provided with the distribution.
+ -
+ -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
+ -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *====================================================================*/
+
+/*!
+ * \file pdfapp.c
+ * <pre>
+ *
+ *    Image processing operations on multiple images followed by wrapping
+ *    them into a pdf.
+ *
+ *    There are two possible ways to specify the set of images:
+ *    (1) an array of pathnames
+ *    (2) a directory, typically with an additional pattern for selection.
+ *    We use (1) because it is both simpler and more general.
+ *
+ *    Corresponding to each function here is:
+ *    (1) the image processing function that is carried out on each image
+ *    (2) a program in prog that extracts images from a pdf and calls this
+ *        function with an array of their pathnames.
+ *
+ *    |=============================================================|
+ *    |                        Important notes                      |
+ *    |=============================================================|
+ *    | Some of these functions require I/O libraries such as       |
+ *    | libtiff, libjpeg, libpng and libz.  If you do not have      |
+ *    | these libraries, some calls will fail.  For example,        |
+ *    | if you do not have libtiff, you cannot write a pdf that     |
+ *    | uses libtiff to encode bilevel images in tiffg4.            |
+ *    |                                                             |
+ *    | You can manually deactivate all pdf writing by setting      |
+ *    | this in environ.h:                                          |
+ *    | \code                                                       |
+ *    |      #define  USE_PDFIO     0                               |
+ *    | \endcode                                                    |
+ *    | This will link the stub file pdfappstub.c.                  |
+ *    |=============================================================|
+ *
+ *     The images in the pdf file can be rendered using a pdf viewer,
+ *     such as evince, gv, xpdf or acroread.
+ *
+ *     Compression of images for prog/compresspdf
+ *          l_int32          compressFilesToPdf()
+ *
+ *     Crop images for prog/croppdf
+ *          l_int32          cropFilesToPdf()
+ *
+ *     Cleanup and binarization of images for prog/cleanpdf
+ *          l_int32          cleanTo1bppFilesToPdf()
+ * </pre>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config_auto.h>
+#endif  /* HAVE_CONFIG_H */
+
+#include <string.h>
+#include "allheaders.h"
+
+
+/* --------------------------------------------*/
+#if  USE_PDFIO   /* defined in environ.h */
+ /* --------------------------------------------*/
+
+/*---------------------------------------------------------------------*
+ *              Compression of images for prog/compresspdf             *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   compressFilesToPdf()
+ *
+ * \param[in]    sa            sorted full pathnames of images
+ * \param[in]    onebit        set to 1 to enforce 1 bpp tiffg4 encoding
+ * \param[in]    savecolor     if %onebit == 1, set to 1 to save color
+ * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
+ * \param[in]    quality       for jpeg: 0 for default (50; otherwise 25 - 95.
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *    (1) This function is designed to optionally scale and compress a set of
+ *        images, wrapping them in a pdf in the order given in the input %sa.
+ *    (2) It does the image processing for prog/compresspdf.c.
+ *    (3) Images in the output pdf are encoded with either tiffg4 or jpeg (DCT),
+ *        or a mixture of them depending on parameters %onebit and %savecolor.
+ *    (4) Parameters %onebit and %savecolor work as follows:
+ *        %onebit = 0: no depth conversion, default encoding depends on depth
+ *        %onebit = 1, %savecolor = 0: all images converted to 1 bpp
+ *        %onebit = 1, %savecolor = 1: images without color are converted
+ *           to 1 bpp; images with color have the color preserved.
+ *    (5) In use, if most of the pages are 1 bpp but some have color that needs
+ *        to be preserved, %onebit and %savecolor should both be 1.  This
+ *        causes DCT compression of color images and tiffg4 compression
+ *        of monochrome images.
+ *    (6) The images will be concatenated in the order given in %sa.
+ *    (7) Typically, %scalefactor <= 1.0.  It is applied to each image
+ *        before encoding.  If you enter a value <= 0.0, it will be set to 1.0.
+ *        The maximum allowed value is 2.0.
+ *    (8) Default jpeg %quality is 50; otherwise, quality factors between
+ *        25 and 95 are enforced.
+ *    (9) Page images at 300 ppi are about 8 Mpixels.  RGB(A) rasters are
+ *        then about 32 MB (1 bpp images are about 1 MB).  If there are
+ *        more than 25 images, store the images after processing as an
+ *        array of compressed images (a Pixac); otherwise, use a Pixa.
+ * </pre>
+ */
+l_ok
+compressFilesToPdf(SARRAY      *sa,
+                   l_int32      onebit,
+                   l_int32      savecolor,
+                   l_float32    scalefactor,
+                   l_int32      quality,
+                   const char  *title,
+                   const char  *fileout)
+{
+char      *fname;
+l_int32    n, i, res;
+l_int32    maxsmallset = 25;  /* max num images kept uncompressed in array */
+l_float32  colorfract;
+PIX       *pixs, *pix1, *pix2;
+PIXA      *pixa1 = NULL;
+PIXAC     *pixac1 = NULL;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", __func__, 1);
+    if (scalefactor <= 0) scalefactor = 1.0;
+    if (scalefactor > 2.0) {
+        L_WARNING("scalefactor %f too big; setting to 2.0\n", __func__,
+                  scalefactor);
+        scalefactor = 2.0;
+    }
+    if (quality <= 0) quality = 50;  /* default value */
+    if (quality < 25) {
+        L_WARNING("quality %d too low; setting to 25\n", __func__, quality);
+        quality = 25;
+    }
+    if (quality > 95) {
+        L_WARNING("quality %d too high; setting to 95\n", __func__, quality);
+        quality = 95;
+    }
+    if ((n = sarrayGetCount(sa)) == 0)
+        return ERROR_INT("sa is empty", __func__, 1);
+
+    if (n <= maxsmallset)
+        pixa1 = pixaCreate(n);
+    else
+        pixac1 = pixacompCreate(n);
+    for (i = 0; i < n; i++) {
+        if (i == 0)
+            lept_stderr("page: ");
+        else if (i % 10 == 0)
+            lept_stderr("%d . ", i);
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        pixs = pixRead(fname);
+        if (onebit) {
+            if (savecolor) {
+                pixColorFraction(pixs, 40, 224, 80, 4, NULL, &colorfract);
+                if (colorfract > 0.01)  /* save the color; DCT encoding */
+                    pix1 = pixClone(pixs);
+                else
+                    pix1 = pixConvertTo1(pixs, 180);
+            } else {  /* do not save any color; tiffg4 encoding */
+                pix1 = pixConvertTo1(pixs, 180);
+            }
+        } else {  /* default encoding: tiffg4 for 1 bpp; DCT for all else */
+            pix1 = pixClone(pixs);
+        }
+        if (scalefactor == 1.0)
+            pix2 = pixClone(pix1);
+        else
+            pix2 = pixScale(pix1, scalefactor, scalefactor);
+        if (n <= maxsmallset) {
+            pixaAddPix(pixa1, pix2, L_INSERT);
+        } else {
+            pixacompAddPix(pixac1, pix2, IFF_DEFAULT);
+            pixDestroy(&pix2);
+        }
+        pixDestroy(&pixs);
+        pixDestroy(&pix1);
+    }
+
+        /* Generate the pdf.  Compute the actual input resolution from
+         * the pixel dimensions of the first image.  This will cause each
+         * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
+    lept_stderr("\nWrite output to %s\n", fileout);
+    if (n <= maxsmallset)
+        pix1 = pixaGetPix(pixa1, 0, L_CLONE);
+    else
+        pix1 = pixacompGetPix(pixac1, 0);
+    pixInferResolution(pix1, 11.0, &res);
+    pixDestroy(&pix1);
+    if (strcmp(title, "none") == 0)
+        title = NULL;
+    if (n <= maxsmallset) {
+        pixaConvertToPdf(pixa1, res, 1.0, L_DEFAULT_ENCODE, quality,
+                         title, fileout);
+        pixaDestroy(&pixa1);
+    } else {
+        pixacompConvertToPdf(pixac1, res, 1.0, L_DEFAULT_ENCODE, quality,
+                             title, fileout);
+        pixacompDestroy(&pixac1);
+    }
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *                       Crop images for prog/croppdf                  *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   cropFilesToPdf()
+ *
+ * \param[in]    sa            sorted full pathnames of images
+ * \param[in]    lr_clear      full res pixels cleared at left and right sides
+ * \param[in]    tb_clear      full res pixels cleared at top and bottom sides
+ * \param[in]    edgeclean     parameter for removing edge noise (-1 to 15)
+ *                             default = 0 (no removal);
+ *                             15 is maximally aggressive for random noise
+ *                             -1 for aggressively removing side noise
+ *                             -2 to extract page embedded in black background
+ * \param[in]    lr_border     full res final "added" pixels on left and right
+ * \param[in]    tb_border     full res final "added" pixels on top and bottom
+ * \param[in]    maxwiden      max fractional horizontal stretch allowed
+ * \param[in]    printwiden    0 to skip, 1 for 8.5x11, 2 for A4
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *    (1) This function is designed to optionally remove white space from
+ *        around the page images, and generate a pdf that prints with
+ *        foreground occupying much of the full page.
+ *    (2) It does the image processing for prog/croppdf.c.
+ *    (3) Images in the output pdf are 1 bpp and encoded with tiffg4.
+ *    (4) See documentation in pixCropImage() for details on the processing.
+ *    (5) The images will be concatenated in the order given in %safiles.
+ *    (6) Output page images are at 300 ppi and are stored in memory.
+ *        They are about 1 Mpixel when uncompressed.  For up to 200 pages,
+ *        the images are stored uncompressed; otherwise, the stored
+ *        images are compressed with tiffg4.
+ * </pre>
+ */
+l_ok
+cropFilesToPdf(SARRAY      *sa,
+               l_int32      lr_clear,
+               l_int32      tb_clear,
+               l_int32      edgeclean,
+               l_int32      lr_border,
+               l_int32      tb_border,
+               l_float32    maxwiden,
+               l_int32      printwiden,
+               const char  *title,
+               const char  *fileout)
+{
+char      *fname;
+l_int32    n, i, res;
+l_int32    maxsmallset = 200;  /* max num images kept uncompressed in array */
+PIX       *pixs, *pix1;
+PIXA      *pixa1 = NULL;
+PIXAC     *pixac1 = NULL;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", __func__, 1);
+    if ((n = sarrayGetCount(sa)) == 0)
+        return ERROR_INT("sa is empty", __func__, 1);
+
+    if (n <= maxsmallset)
+        pixa1 = pixaCreate(n);
+    else
+        pixac1 = pixacompCreate(n);
+    for (i = 0; i < n; i++) {
+        if (i == 0)
+            lept_stderr("page: ");
+        else if (i % 10 == 0)
+            lept_stderr("%d . ", i);
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        pixs = pixRead(fname);
+        pix1 = pixCropImage(pixs, lr_clear, tb_clear, edgeclean,
+                            lr_border, tb_border, maxwiden, printwiden,
+                            NULL, NULL);
+        pixDestroy(&pixs);
+        if (!pix1) {
+            L_ERROR("pix1 not made for i = %d\n", __func__, i);
+            continue;
+        }
+        if (n <= maxsmallset)
+            pixaAddPix(pixa1, pix1, L_INSERT);
+        else
+            pixacompAddPix(pixac1, pix1, IFF_TIFF_G4);
+    }
+
+        /* Generate the pdf.  Compute the actual input resolution from
+         * the pixel dimensions of the first image.  This will cause each
+         * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
+    lept_stderr("\nWrite output to %s\n", fileout);
+    if (n <= maxsmallset)
+        pix1 = pixaGetPix(pixa1, 0, L_CLONE);
+    else
+        pix1 = pixacompGetPix(pixac1, 0);
+    pixInferResolution(pix1, 11.0, &res);
+    pixDestroy(&pix1);
+    if (strcmp(title, "none") == 0)
+        title = NULL;
+    if (n <= maxsmallset) {
+        pixaConvertToPdf(pixa1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
+        pixaDestroy(&pixa1);
+    } else {
+        pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
+        pixacompDestroy(&pixac1);
+    }
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *        Cleanup and binarization of images for prog/cleanpdf         *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   cleanTo1bppFilesToPdf()
+ *
+ * \param[in]    sa            sorted full pathnames of images
+ * \param[in]    res           either 300 or 600 ppi for output
+ * \param[in]    contrast      vary contrast: 1 = lightest; 10 = darkest;
+ *                             suggest 1 unless light features are being lost
+ * \param[in]    rotation      cw by 90 degrees: {0,1,2,3} represent
+ *                             0, 90, 180 and 270 degree cw rotations
+ * \param[in]    opensize      opening size of structuring element for noise
+ *                             removal: {0 or 1to skip; 2, 3 for opening}
+ * \param[in]    title         [optional] pdf title; can be null
+ * \param[in]    fileout       pdf file of all images
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *    (1) This deskews, optionally rotates and darkens, cleans background
+ *        to white, binarizes and optionally removes small noise, and
+ *        put the images into the pdf in the order given in %sa.
+ *    (2) All images in the pdf are tiffg4 encoded.
+ *    (3) For color and grayscale input, local background normalization is
+ *        done to 200, and a threshold of 180 sets the maximum foreground
+ *        value in the normalized image.
+ *    (4) The %res parameter can be either 300 or 600 ppi.  If the input
+ *        is gray or color and %res = 600, this does an interpolated 2x
+ *        expansion before binarizing.
+ *    (5) The %contrast parameter adjusts the binarization to avoid losing
+ *        lighter input pixels.  Contrast is increased as %contrast increases
+ *        from 1 to 10.
+ *    (6) The #opensize parameter is the size of a square SEL used with
+ *        opening to remove small speckle noise.  Allowed open sizes are 2,3.
+ *        If this is to be used, try 2 before 3.
+ *    (7) If there are more than 200 images, store the images after processing
+ *        as an array of compressed images (a Pixac); otherwise, use a Pixa.
+ * </pre>
+ */
+l_ok
+cleanTo1bppFilesToPdf(SARRAY      *sa,
+                      l_int32      res,
+                      l_int32      contrast,
+                      l_int32      rotation,
+                      l_int32      opensize,
+                      const char  *title,
+                      const char  *fileout)
+{
+char      *fname;
+l_int32    n, i, scale;
+l_int32    maxsmallset = 200;  /* max num images kept uncompressed in array */
+PIX       *pixs, *pix1;
+PIXA      *pixa1 = NULL;
+PIXAC     *pixac1 = NULL;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", __func__, 1);
+    if (res == 0) res = 300;
+    if (res != 300 && res != 600) {
+        L_ERROR("invalid res = %d; res must be in {0, 300, 600}\n",
+                __func__, res);
+        return 1;
+    }
+    if (contrast < 1 || contrast > 10) {
+        L_ERROR("invalid contrast = %d; contrast must be in [1...10]\n",
+                __func__, contrast);
+        return 1;
+    }
+    if (rotation < 0 || rotation > 3) {
+        L_ERROR("invalid rotation = %d; rotation must be in  {0,1,2,3}\n",
+                __func__, rotation);
+        return 1;
+    }
+    if (opensize > 3) {
+        L_ERROR("invalid opensize = %d; opensize must be <= 3\n",
+                __func__, opensize);
+        return 1;
+    }
+    scale = (res == 300) ? 1 : 2;
+    if ((n = sarrayGetCount(sa)) == 0)
+        return ERROR_INT("sa is empty", __func__, 1);
+
+    if (n <= maxsmallset)
+        pixa1 = pixaCreate(n);
+    else
+        pixac1 = pixacompCreate(n);
+    for (i = 0; i < n; i++) {
+        if (i == 0)
+            lept_stderr("page: ");
+        else if (i % 10 == 0)
+            lept_stderr("%d . ", i);
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        if ((pixs = pixRead(fname)) == NULL) {
+            L_ERROR("pixs not read from %s\n", __func__, fname);
+            continue;
+        }
+
+        pix1 = pixCleanImage(pixs, contrast, rotation, scale, opensize);
+        if (n <= maxsmallset) {
+            pixaAddPix(pixa1, pix1, L_INSERT);
+        } else {
+            pixacompAddPix(pixac1, pix1, IFF_TIFF_G4);
+            pixDestroy(&pix1);
+        }
+        pixDestroy(&pixs);
+    }
+
+        /* Generate the pdf.  Compute the actual input resolution from
+         * the pixel dimensions of the first image.  This will cause each
+         * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
+    lept_stderr("Write output to %s\n", fileout);
+    if (n <= maxsmallset)
+        pix1 = pixaGetPix(pixa1, 0, L_CLONE);
+    else
+        pix1 = pixacompGetPix(pixac1, 0);
+    pixInferResolution(pix1, 11.0, &res);
+    pixDestroy(&pix1);
+    if (strcmp(title, "none") == 0)
+        title = NULL;
+
+    if (n <= maxsmallset) {
+        pixaConvertToPdf(pixa1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
+        pixaDestroy(&pixa1);
+    } else {
+        pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
+        pixacompDestroy(&pixac1);
+    }
+    return 0;
+}
+
+/* --------------------------------------------*/
+#endif  /* USE_PDFIO */
+/* --------------------------------------------*/