Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/leptonica/src/pdfapp.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/leptonica/src/pdfapp.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,478 @@ +/*====================================================================* + - Copyright (C) 2001 Leptonica. All rights reserved. + - + - Redistribution and use in source and binary forms, with or without + - modification, are permitted provided that the following conditions + - are met: + - 1. Redistributions of source code must retain the above copyright + - notice, this list of conditions and the following disclaimer. + - 2. Redistributions in binary form must reproduce the above + - copyright notice, this list of conditions and the following + - disclaimer in the documentation and/or other materials + - provided with the distribution. + - + - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY + - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *====================================================================*/ + +/*! + * \file pdfapp.c + * <pre> + * + * Image processing operations on multiple images followed by wrapping + * them into a pdf. + * + * There are two possible ways to specify the set of images: + * (1) an array of pathnames + * (2) a directory, typically with an additional pattern for selection. 
+ * We use (1) because it is both simpler and more general. + * + * Corresponding to each function here is: + * (1) the image processing function that is carried out on each image + * (2) a program in prog that extracts images from a pdf and calls this + * function with an array of their pathnames. + * + * |=============================================================| + * | Important notes | + * |=============================================================| + * | Some of these functions require I/O libraries such as | + * | libtiff, libjpeg, libpng and libz. If you do not have | + * | these libraries, some calls will fail. For example, | + * | if you do not have libtiff, you cannot write a pdf that | + * | uses libtiff to encode bilevel images in tiffg4. | + * | | + * | You can manually deactivate all pdf writing by setting | + * | this in environ.h: | + * | \code | + * | #define USE_PDFIO 0 | + * | \endcode | + * | This will link the stub file pdfappstub.c. | + * |=============================================================| + * + * The images in the pdf file can be rendered using a pdf viewer, + * such as evince, gv, xpdf or acroread. + * + * Compression of images for prog/compresspdf + * l_int32 compressFilesToPdf() + * + * Crop images for prog/croppdf + * l_int32 cropFilesToPdf() + * + * Cleanup and binarization of images for prog/cleanpdf + * l_int32 cleanTo1bppFilesToPdf() + * </pre> + */ + +#ifdef HAVE_CONFIG_H +#include <config_auto.h> +#endif /* HAVE_CONFIG_H */ + +#include <string.h> +#include "allheaders.h" + + +/* --------------------------------------------*/ +#if USE_PDFIO /* defined in environ.h */ + /* --------------------------------------------*/ + +/*---------------------------------------------------------------------* + * Compression of images for prog/compresspdf * + *---------------------------------------------------------------------*/ +/*! 
+ * \brief compressFilesToPdf() + * + * \param[in] sa sorted full pathnames of images + * \param[in] onebit set to 1 to enforce 1 bpp tiffg4 encoding + * \param[in] savecolor if %onebit == 1, set to 1 to save color + * \param[in] scalefactor scaling factor applied to each image; > 0.0 + * \param[in] quality for jpeg: 0 for default (50; otherwise 25 - 95. + * \param[in] title [optional] pdf title; can be null + * \param[in] fileout pdf file of all images + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This function is designed to optionally scale and compress a set of + * images, wrapping them in a pdf in the order given in the input %sa. + * (2) It does the image processing for prog/compresspdf.c. + * (3) Images in the output pdf are encoded with either tiffg4 or jpeg (DCT), + * or a mixture of them depending on parameters %onebit and %savecolor. + * (4) Parameters %onebit and %savecolor work as follows: + * %onebit = 0: no depth conversion, default encoding depends on depth + * %onebit = 1, %savecolor = 0: all images converted to 1 bpp + * %onebit = 1, %savecolor = 1: images without color are converted + * to 1 bpp; images with color have the color preserved. + * (5) In use, if most of the pages are 1 bpp but some have color that needs + * to be preserved, %onebit and %savecolor should both be 1. This + * causes DCT compression of color images and tiffg4 compression + * of monochrome images. + * (6) The images will be concatenated in the order given in %sa. + * (7) Typically, %scalefactor <= 1.0. It is applied to each image + * before encoding. If you enter a value <= 0.0, it will be set to 1.0. + * The maximum allowed value is 2.0. + * (8) Default jpeg %quality is 50; otherwise, quality factors between + * 25 and 95 are enforced. + * (9) Page images at 300 ppi are about 8 Mpixels. RGB(A) rasters are + * then about 32 MB (1 bpp images are about 1 MB). 
If there are + * more than 25 images, store the images after processing as an + * array of compressed images (a Pixac); otherwise, use a Pixa. + * </pre> + */ +l_ok +compressFilesToPdf(SARRAY *sa, + l_int32 onebit, + l_int32 savecolor, + l_float32 scalefactor, + l_int32 quality, + const char *title, + const char *fileout) +{ +char *fname; +l_int32 n, i, res; +l_int32 maxsmallset = 25; /* max num images kept uncompressed in array */ +l_float32 colorfract; +PIX *pixs, *pix1, *pix2; +PIXA *pixa1 = NULL; +PIXAC *pixac1 = NULL; + + if (!sa) + return ERROR_INT("sa not defined", __func__, 1); + if (!fileout) + return ERROR_INT("fileout not defined", __func__, 1); + if (scalefactor <= 0) scalefactor = 1.0; + if (scalefactor > 2.0) { + L_WARNING("scalefactor %f too big; setting to 2.0\n", __func__, + scalefactor); + scalefactor = 2.0; + } + if (quality <= 0) quality = 50; /* default value */ + if (quality < 25) { + L_WARNING("quality %d too low; setting to 25\n", __func__, quality); + quality = 25; + } + if (quality > 95) { + L_WARNING("quality %d too high; setting to 95\n", __func__, quality); + quality = 95; + } + if ((n = sarrayGetCount(sa)) == 0) + return ERROR_INT("sa is empty", __func__, 1); + + if (n <= maxsmallset) + pixa1 = pixaCreate(n); + else + pixac1 = pixacompCreate(n); + for (i = 0; i < n; i++) { + if (i == 0) + lept_stderr("page: "); + else if (i % 10 == 0) + lept_stderr("%d . 
", i); + fname = sarrayGetString(sa, i, L_NOCOPY); + pixs = pixRead(fname); + if (onebit) { + if (savecolor) { + pixColorFraction(pixs, 40, 224, 80, 4, NULL, &colorfract); + if (colorfract > 0.01) /* save the color; DCT encoding */ + pix1 = pixClone(pixs); + else + pix1 = pixConvertTo1(pixs, 180); + } else { /* do not save any color; tiffg4 encoding */ + pix1 = pixConvertTo1(pixs, 180); + } + } else { /* default encoding: tiffg4 for 1 bpp; DCT for all else */ + pix1 = pixClone(pixs); + } + if (scalefactor == 1.0) + pix2 = pixClone(pix1); + else + pix2 = pixScale(pix1, scalefactor, scalefactor); + if (n <= maxsmallset) { + pixaAddPix(pixa1, pix2, L_INSERT); + } else { + pixacompAddPix(pixac1, pix2, IFF_DEFAULT); + pixDestroy(&pix2); + } + pixDestroy(&pixs); + pixDestroy(&pix1); + } + + /* Generate the pdf. Compute the actual input resolution from + * the pixel dimensions of the first image. This will cause each + * page to be printed to cover an 8.5 x 11 inch sheet of paper. */ + lept_stderr("\nWrite output to %s\n", fileout); + if (n <= maxsmallset) + pix1 = pixaGetPix(pixa1, 0, L_CLONE); + else + pix1 = pixacompGetPix(pixac1, 0); + pixInferResolution(pix1, 11.0, &res); + pixDestroy(&pix1); + if (strcmp(title, "none") == 0) + title = NULL; + if (n <= maxsmallset) { + pixaConvertToPdf(pixa1, res, 1.0, L_DEFAULT_ENCODE, quality, + title, fileout); + pixaDestroy(&pixa1); + } else { + pixacompConvertToPdf(pixac1, res, 1.0, L_DEFAULT_ENCODE, quality, + title, fileout); + pixacompDestroy(&pixac1); + } + return 0; +} + + +/*---------------------------------------------------------------------* + * Crop images for prog/croppdf * + *---------------------------------------------------------------------*/ +/*! 
+ * \brief cropFilesToPdf() + * + * \param[in] sa sorted full pathnames of images + * \param[in] lr_clear full res pixels cleared at left and right sides + * \param[in] tb_clear full res pixels cleared at top and bottom sides + * \param[in] edgeclean parameter for removing edge noise (-1 to 15) + * default = 0 (no removal); + * 15 is maximally aggressive for random noise + * -1 for aggressively removing side noise + * -2 to extract page embedded in black background + * \param[in] lr_border full res final "added" pixels on left and right + * \param[in] tb_border full res final "added" pixels on top and bottom + * \param[in] maxwiden max fractional horizontal stretch allowed + * \param[in] printwiden 0 to skip, 1 for 8.5x11, 2 for A4 + * \param[in] title [optional] pdf title; can be null + * \param[in] fileout pdf file of all images + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This function is designed to optionally remove white space from + * around the page images, and generate a pdf that prints with + * foreground occupying much of the full page. + * (2) It does the image processing for prog/croppdf.c. + * (3) Images in the output pdf are 1 bpp and encoded with tiffg4. + * (4) See documentation in pixCropImage() for details on the processing. + * (5) The images will be concatenated in the order given in %safiles. + * (6) Output page images are at 300 ppi and are stored in memory. + * They are about 1 Mpixel when uncompressed. For up to 200 pages, + * the images are stored uncompressed; otherwise, the stored + * images are compressed with tiffg4. 
+ * </pre> + */ +l_ok +cropFilesToPdf(SARRAY *sa, + l_int32 lr_clear, + l_int32 tb_clear, + l_int32 edgeclean, + l_int32 lr_border, + l_int32 tb_border, + l_float32 maxwiden, + l_int32 printwiden, + const char *title, + const char *fileout) +{ +char *fname; +l_int32 n, i, res; +l_int32 maxsmallset = 200; /* max num images kept uncompressed in array */ +PIX *pixs, *pix1; +PIXA *pixa1 = NULL; +PIXAC *pixac1 = NULL; + + if (!sa) + return ERROR_INT("sa not defined", __func__, 1); + if (!fileout) + return ERROR_INT("fileout not defined", __func__, 1); + if ((n = sarrayGetCount(sa)) == 0) + return ERROR_INT("sa is empty", __func__, 1); + + if (n <= maxsmallset) + pixa1 = pixaCreate(n); + else + pixac1 = pixacompCreate(n); + for (i = 0; i < n; i++) { + if (i == 0) + lept_stderr("page: "); + else if (i % 10 == 0) + lept_stderr("%d . ", i); + fname = sarrayGetString(sa, i, L_NOCOPY); + pixs = pixRead(fname); + pix1 = pixCropImage(pixs, lr_clear, tb_clear, edgeclean, + lr_border, tb_border, maxwiden, printwiden, + NULL, NULL); + pixDestroy(&pixs); + if (!pix1) { + L_ERROR("pix1 not made for i = %d\n", __func__, i); + continue; + } + if (n <= maxsmallset) + pixaAddPix(pixa1, pix1, L_INSERT); + else + pixacompAddPix(pixac1, pix1, IFF_TIFF_G4); + } + + /* Generate the pdf. Compute the actual input resolution from + * the pixel dimensions of the first image. This will cause each + * page to be printed to cover an 8.5 x 11 inch sheet of paper. 
*/ + lept_stderr("\nWrite output to %s\n", fileout); + if (n <= maxsmallset) + pix1 = pixaGetPix(pixa1, 0, L_CLONE); + else + pix1 = pixacompGetPix(pixac1, 0); + pixInferResolution(pix1, 11.0, &res); + pixDestroy(&pix1); + if (strcmp(title, "none") == 0) + title = NULL; + if (n <= maxsmallset) { + pixaConvertToPdf(pixa1, res, 1.0, L_G4_ENCODE, 0, title, fileout); + pixaDestroy(&pixa1); + } else { + pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout); + pixacompDestroy(&pixac1); + } + return 0; +} + + +/*---------------------------------------------------------------------* + * Cleanup and binarization of images for prog/cleanpdf * + *---------------------------------------------------------------------*/ +/*! + * \brief cleanTo1bppFilesToPdf() + * + * \param[in] sa sorted full pathnames of images + * \param[in] res either 300 or 600 ppi for output + * \param[in] contrast vary contrast: 1 = lightest; 10 = darkest; + * suggest 1 unless light features are being lost + * \param[in] rotation cw by 90 degrees: {0,1,2,3} represent + * 0, 90, 180 and 270 degree cw rotations + * \param[in] opensize opening size of structuring element for noise + * removal: {0 or 1to skip; 2, 3 for opening} + * \param[in] title [optional] pdf title; can be null + * \param[in] fileout pdf file of all images + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This deskews, optionally rotates and darkens, cleans background + * to white, binarizes and optionally removes small noise, and + * put the images into the pdf in the order given in %sa. + * (2) All images in the pdf are tiffg4 encoded. + * (3) For color and grayscale input, local background normalization is + * done to 200, and a threshold of 180 sets the maximum foreground + * value in the normalized image. + * (4) The %res parameter can be either 300 or 600 ppi. If the input + * is gray or color and %res = 600, this does an interpolated 2x + * expansion before binarizing. 
+ * (5) The %contrast parameter adjusts the binarization to avoid losing + * lighter input pixels. Contrast is increased as %contrast increases + * from 1 to 10. + * (6) The #opensize parameter is the size of a square SEL used with + * opening to remove small speckle noise. Allowed open sizes are 2,3. + * If this is to be used, try 2 before 3. + * (7) If there are more than 200 images, store the images after processing + * as an array of compressed images (a Pixac); otherwise, use a Pixa. + * </pre> + */ +l_ok +cleanTo1bppFilesToPdf(SARRAY *sa, + l_int32 res, + l_int32 contrast, + l_int32 rotation, + l_int32 opensize, + const char *title, + const char *fileout) +{ +char *fname; +l_int32 n, i, scale; +l_int32 maxsmallset = 200; /* max num images kept uncompressed in array */ +PIX *pixs, *pix1; +PIXA *pixa1 = NULL; +PIXAC *pixac1 = NULL; + + if (!sa) + return ERROR_INT("sa not defined", __func__, 1); + if (!fileout) + return ERROR_INT("fileout not defined", __func__, 1); + if (res == 0) res = 300; + if (res != 300 && res != 600) { + L_ERROR("invalid res = %d; res must be in {0, 300, 600}\n", + __func__, res); + return 1; + } + if (contrast < 1 || contrast > 10) { + L_ERROR("invalid contrast = %d; contrast must be in [1...10]\n", + __func__, contrast); + return 1; + } + if (rotation < 0 || rotation > 3) { + L_ERROR("invalid rotation = %d; rotation must be in {0,1,2,3}\n", + __func__, rotation); + return 1; + } + if (opensize > 3) { + L_ERROR("invalid opensize = %d; opensize must be <= 3\n", + __func__, opensize); + return 1; + } + scale = (res == 300) ? 1 : 2; + if ((n = sarrayGetCount(sa)) == 0) + return ERROR_INT("sa is empty", __func__, 1); + + if (n <= maxsmallset) + pixa1 = pixaCreate(n); + else + pixac1 = pixacompCreate(n); + for (i = 0; i < n; i++) { + if (i == 0) + lept_stderr("page: "); + else if (i % 10 == 0) + lept_stderr("%d . 
", i); + fname = sarrayGetString(sa, i, L_NOCOPY); + if ((pixs = pixRead(fname)) == NULL) { + L_ERROR("pixs not read from %s\n", __func__, fname); + continue; + } + + pix1 = pixCleanImage(pixs, contrast, rotation, scale, opensize); + if (n <= maxsmallset) { + pixaAddPix(pixa1, pix1, L_INSERT); + } else { + pixacompAddPix(pixac1, pix1, IFF_TIFF_G4); + pixDestroy(&pix1); + } + pixDestroy(&pixs); + } + + /* Generate the pdf. Compute the actual input resolution from + * the pixel dimensions of the first image. This will cause each + * page to be printed to cover an 8.5 x 11 inch sheet of paper. */ + lept_stderr("Write output to %s\n", fileout); + if (n <= maxsmallset) + pix1 = pixaGetPix(pixa1, 0, L_CLONE); + else + pix1 = pixacompGetPix(pixac1, 0); + pixInferResolution(pix1, 11.0, &res); + pixDestroy(&pix1); + if (strcmp(title, "none") == 0) + title = NULL; + + if (n <= maxsmallset) { + pixaConvertToPdf(pixa1, res, 1.0, L_G4_ENCODE, 0, title, fileout); + pixaDestroy(&pixa1); + } else { + pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout); + pixacompDestroy(&pixac1); + } + return 0; +} + +/* --------------------------------------------*/ +#endif /* USE_PDFIO */ +/* --------------------------------------------*/
