comparison mupdf-source/thirdparty/leptonica/src/pdfapp.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file pdfapp.c
29 * <pre>
30 *
31 * Image processing operations on multiple images followed by wrapping
32 * them into a pdf.
33 *
34 * There are two possible ways to specify the set of images:
35 * (1) an array of pathnames
36 * (2) a directory, typically with an additional pattern for selection.
37 * We use (1) because it is both simpler and more general.
38 *
39 * Corresponding to each function here is:
40 * (1) the image processing function that is carried out on each image
41 * (2) a program in prog that extracts images from a pdf and calls this
42 * function with an array of their pathnames.
43 *
44 * |=============================================================|
45 * | Important notes |
46 * |=============================================================|
47 * | Some of these functions require I/O libraries such as |
48 * | libtiff, libjpeg, libpng and libz. If you do not have |
49 * | these libraries, some calls will fail. For example, |
50 * | if you do not have libtiff, you cannot write a pdf that |
51 * | uses libtiff to encode bilevel images in tiffg4. |
52 * | |
53 * | You can manually deactivate all pdf writing by setting |
54 * | this in environ.h: |
55 * | \code |
56 * | #define USE_PDFIO 0 |
57 * | \endcode |
58 * | This will link the stub file pdfappstub.c. |
59 * |=============================================================|
60 *
61 * The images in the pdf file can be rendered using a pdf viewer,
62 * such as evince, gv, xpdf or acroread.
63 *
64 * Compression of images for prog/compresspdf
65 * l_int32 compressFilesToPdf()
66 *
67 * Crop images for prog/croppdf
68 * l_int32 cropFilesToPdf()
69 *
70 * Cleanup and binarization of images for prog/cleanpdf
71 * l_int32 cleanTo1bppFilesToPdf()
72 * </pre>
73 */
74
75 #ifdef HAVE_CONFIG_H
76 #include <config_auto.h>
77 #endif /* HAVE_CONFIG_H */
78
79 #include <string.h>
80 #include "allheaders.h"
81
82
83 /* --------------------------------------------*/
84 #if USE_PDFIO /* defined in environ.h */
85 /* --------------------------------------------*/
86
87 /*---------------------------------------------------------------------*
88 * Compression of images for prog/compresspdf *
89 *---------------------------------------------------------------------*/
90 /*!
91 * \brief compressFilesToPdf()
92 *
93 * \param[in] sa sorted full pathnames of images
94 * \param[in] onebit set to 1 to enforce 1 bpp tiffg4 encoding
95 * \param[in] savecolor if %onebit == 1, set to 1 to save color
96 * \param[in] scalefactor scaling factor applied to each image; > 0.0
97 * \param[in] quality for jpeg: 0 for default (50; otherwise 25 - 95.
98 * \param[in] title [optional] pdf title; can be null
99 * \param[in] fileout pdf file of all images
100 * \return 0 if OK, 1 on error
101 *
102 * <pre>
103 * Notes:
104 * (1) This function is designed to optionally scale and compress a set of
105 * images, wrapping them in a pdf in the order given in the input %sa.
106 * (2) It does the image processing for prog/compresspdf.c.
107 * (3) Images in the output pdf are encoded with either tiffg4 or jpeg (DCT),
108 * or a mixture of them depending on parameters %onebit and %savecolor.
109 * (4) Parameters %onebit and %savecolor work as follows:
110 * %onebit = 0: no depth conversion, default encoding depends on depth
111 * %onebit = 1, %savecolor = 0: all images converted to 1 bpp
112 * %onebit = 1, %savecolor = 1: images without color are converted
113 * to 1 bpp; images with color have the color preserved.
114 * (5) In use, if most of the pages are 1 bpp but some have color that needs
115 * to be preserved, %onebit and %savecolor should both be 1. This
116 * causes DCT compression of color images and tiffg4 compression
117 * of monochrome images.
118 * (6) The images will be concatenated in the order given in %sa.
119 * (7) Typically, %scalefactor <= 1.0. It is applied to each image
120 * before encoding. If you enter a value <= 0.0, it will be set to 1.0.
121 * The maximum allowed value is 2.0.
122 * (8) Default jpeg %quality is 50; otherwise, quality factors between
123 * 25 and 95 are enforced.
124 * (9) Page images at 300 ppi are about 8 Mpixels. RGB(A) rasters are
125 * then about 32 MB (1 bpp images are about 1 MB). If there are
126 * more than 25 images, store the images after processing as an
127 * array of compressed images (a Pixac); otherwise, use a Pixa.
128 * </pre>
129 */
130 l_ok
131 compressFilesToPdf(SARRAY *sa,
132 l_int32 onebit,
133 l_int32 savecolor,
134 l_float32 scalefactor,
135 l_int32 quality,
136 const char *title,
137 const char *fileout)
138 {
139 char *fname;
140 l_int32 n, i, res;
141 l_int32 maxsmallset = 25; /* max num images kept uncompressed in array */
142 l_float32 colorfract;
143 PIX *pixs, *pix1, *pix2;
144 PIXA *pixa1 = NULL;
145 PIXAC *pixac1 = NULL;
146
147 if (!sa)
148 return ERROR_INT("sa not defined", __func__, 1);
149 if (!fileout)
150 return ERROR_INT("fileout not defined", __func__, 1);
151 if (scalefactor <= 0) scalefactor = 1.0;
152 if (scalefactor > 2.0) {
153 L_WARNING("scalefactor %f too big; setting to 2.0\n", __func__,
154 scalefactor);
155 scalefactor = 2.0;
156 }
157 if (quality <= 0) quality = 50; /* default value */
158 if (quality < 25) {
159 L_WARNING("quality %d too low; setting to 25\n", __func__, quality);
160 quality = 25;
161 }
162 if (quality > 95) {
163 L_WARNING("quality %d too high; setting to 95\n", __func__, quality);
164 quality = 95;
165 }
166 if ((n = sarrayGetCount(sa)) == 0)
167 return ERROR_INT("sa is empty", __func__, 1);
168
169 if (n <= maxsmallset)
170 pixa1 = pixaCreate(n);
171 else
172 pixac1 = pixacompCreate(n);
173 for (i = 0; i < n; i++) {
174 if (i == 0)
175 lept_stderr("page: ");
176 else if (i % 10 == 0)
177 lept_stderr("%d . ", i);
178 fname = sarrayGetString(sa, i, L_NOCOPY);
179 pixs = pixRead(fname);
180 if (onebit) {
181 if (savecolor) {
182 pixColorFraction(pixs, 40, 224, 80, 4, NULL, &colorfract);
183 if (colorfract > 0.01) /* save the color; DCT encoding */
184 pix1 = pixClone(pixs);
185 else
186 pix1 = pixConvertTo1(pixs, 180);
187 } else { /* do not save any color; tiffg4 encoding */
188 pix1 = pixConvertTo1(pixs, 180);
189 }
190 } else { /* default encoding: tiffg4 for 1 bpp; DCT for all else */
191 pix1 = pixClone(pixs);
192 }
193 if (scalefactor == 1.0)
194 pix2 = pixClone(pix1);
195 else
196 pix2 = pixScale(pix1, scalefactor, scalefactor);
197 if (n <= maxsmallset) {
198 pixaAddPix(pixa1, pix2, L_INSERT);
199 } else {
200 pixacompAddPix(pixac1, pix2, IFF_DEFAULT);
201 pixDestroy(&pix2);
202 }
203 pixDestroy(&pixs);
204 pixDestroy(&pix1);
205 }
206
207 /* Generate the pdf. Compute the actual input resolution from
208 * the pixel dimensions of the first image. This will cause each
209 * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
210 lept_stderr("\nWrite output to %s\n", fileout);
211 if (n <= maxsmallset)
212 pix1 = pixaGetPix(pixa1, 0, L_CLONE);
213 else
214 pix1 = pixacompGetPix(pixac1, 0);
215 pixInferResolution(pix1, 11.0, &res);
216 pixDestroy(&pix1);
217 if (strcmp(title, "none") == 0)
218 title = NULL;
219 if (n <= maxsmallset) {
220 pixaConvertToPdf(pixa1, res, 1.0, L_DEFAULT_ENCODE, quality,
221 title, fileout);
222 pixaDestroy(&pixa1);
223 } else {
224 pixacompConvertToPdf(pixac1, res, 1.0, L_DEFAULT_ENCODE, quality,
225 title, fileout);
226 pixacompDestroy(&pixac1);
227 }
228 return 0;
229 }
230
231
232 /*---------------------------------------------------------------------*
233 * Crop images for prog/croppdf *
234 *---------------------------------------------------------------------*/
235 /*!
236 * \brief cropFilesToPdf()
237 *
238 * \param[in] sa sorted full pathnames of images
239 * \param[in] lr_clear full res pixels cleared at left and right sides
240 * \param[in] tb_clear full res pixels cleared at top and bottom sides
241 * \param[in] edgeclean parameter for removing edge noise (-1 to 15)
242 * default = 0 (no removal);
243 * 15 is maximally aggressive for random noise
244 * -1 for aggressively removing side noise
245 * -2 to extract page embedded in black background
246 * \param[in] lr_border full res final "added" pixels on left and right
247 * \param[in] tb_border full res final "added" pixels on top and bottom
248 * \param[in] maxwiden max fractional horizontal stretch allowed
249 * \param[in] printwiden 0 to skip, 1 for 8.5x11, 2 for A4
250 * \param[in] title [optional] pdf title; can be null
251 * \param[in] fileout pdf file of all images
252 * \return 0 if OK, 1 on error
253 *
254 * <pre>
255 * Notes:
256 * (1) This function is designed to optionally remove white space from
257 * around the page images, and generate a pdf that prints with
258 * foreground occupying much of the full page.
259 * (2) It does the image processing for prog/croppdf.c.
260 * (3) Images in the output pdf are 1 bpp and encoded with tiffg4.
261 * (4) See documentation in pixCropImage() for details on the processing.
262 * (5) The images will be concatenated in the order given in %safiles.
263 * (6) Output page images are at 300 ppi and are stored in memory.
264 * They are about 1 Mpixel when uncompressed. For up to 200 pages,
265 * the images are stored uncompressed; otherwise, the stored
266 * images are compressed with tiffg4.
267 * </pre>
268 */
269 l_ok
270 cropFilesToPdf(SARRAY *sa,
271 l_int32 lr_clear,
272 l_int32 tb_clear,
273 l_int32 edgeclean,
274 l_int32 lr_border,
275 l_int32 tb_border,
276 l_float32 maxwiden,
277 l_int32 printwiden,
278 const char *title,
279 const char *fileout)
280 {
281 char *fname;
282 l_int32 n, i, res;
283 l_int32 maxsmallset = 200; /* max num images kept uncompressed in array */
284 PIX *pixs, *pix1;
285 PIXA *pixa1 = NULL;
286 PIXAC *pixac1 = NULL;
287
288 if (!sa)
289 return ERROR_INT("sa not defined", __func__, 1);
290 if (!fileout)
291 return ERROR_INT("fileout not defined", __func__, 1);
292 if ((n = sarrayGetCount(sa)) == 0)
293 return ERROR_INT("sa is empty", __func__, 1);
294
295 if (n <= maxsmallset)
296 pixa1 = pixaCreate(n);
297 else
298 pixac1 = pixacompCreate(n);
299 for (i = 0; i < n; i++) {
300 if (i == 0)
301 lept_stderr("page: ");
302 else if (i % 10 == 0)
303 lept_stderr("%d . ", i);
304 fname = sarrayGetString(sa, i, L_NOCOPY);
305 pixs = pixRead(fname);
306 pix1 = pixCropImage(pixs, lr_clear, tb_clear, edgeclean,
307 lr_border, tb_border, maxwiden, printwiden,
308 NULL, NULL);
309 pixDestroy(&pixs);
310 if (!pix1) {
311 L_ERROR("pix1 not made for i = %d\n", __func__, i);
312 continue;
313 }
314 if (n <= maxsmallset)
315 pixaAddPix(pixa1, pix1, L_INSERT);
316 else
317 pixacompAddPix(pixac1, pix1, IFF_TIFF_G4);
318 }
319
320 /* Generate the pdf. Compute the actual input resolution from
321 * the pixel dimensions of the first image. This will cause each
322 * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
323 lept_stderr("\nWrite output to %s\n", fileout);
324 if (n <= maxsmallset)
325 pix1 = pixaGetPix(pixa1, 0, L_CLONE);
326 else
327 pix1 = pixacompGetPix(pixac1, 0);
328 pixInferResolution(pix1, 11.0, &res);
329 pixDestroy(&pix1);
330 if (strcmp(title, "none") == 0)
331 title = NULL;
332 if (n <= maxsmallset) {
333 pixaConvertToPdf(pixa1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
334 pixaDestroy(&pixa1);
335 } else {
336 pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
337 pixacompDestroy(&pixac1);
338 }
339 return 0;
340 }
341
342
343 /*---------------------------------------------------------------------*
344 * Cleanup and binarization of images for prog/cleanpdf *
345 *---------------------------------------------------------------------*/
346 /*!
347 * \brief cleanTo1bppFilesToPdf()
348 *
349 * \param[in] sa sorted full pathnames of images
350 * \param[in] res either 300 or 600 ppi for output
351 * \param[in] contrast vary contrast: 1 = lightest; 10 = darkest;
352 * suggest 1 unless light features are being lost
353 * \param[in] rotation cw by 90 degrees: {0,1,2,3} represent
354 * 0, 90, 180 and 270 degree cw rotations
355 * \param[in] opensize opening size of structuring element for noise
356 * removal: {0 or 1to skip; 2, 3 for opening}
357 * \param[in] title [optional] pdf title; can be null
358 * \param[in] fileout pdf file of all images
359 * \return 0 if OK, 1 on error
360 *
361 * <pre>
362 * Notes:
363 * (1) This deskews, optionally rotates and darkens, cleans background
364 * to white, binarizes and optionally removes small noise, and
365 * put the images into the pdf in the order given in %sa.
366 * (2) All images in the pdf are tiffg4 encoded.
367 * (3) For color and grayscale input, local background normalization is
368 * done to 200, and a threshold of 180 sets the maximum foreground
369 * value in the normalized image.
370 * (4) The %res parameter can be either 300 or 600 ppi. If the input
371 * is gray or color and %res = 600, this does an interpolated 2x
372 * expansion before binarizing.
373 * (5) The %contrast parameter adjusts the binarization to avoid losing
374 * lighter input pixels. Contrast is increased as %contrast increases
375 * from 1 to 10.
376 * (6) The #opensize parameter is the size of a square SEL used with
377 * opening to remove small speckle noise. Allowed open sizes are 2,3.
378 * If this is to be used, try 2 before 3.
379 * (7) If there are more than 200 images, store the images after processing
380 * as an array of compressed images (a Pixac); otherwise, use a Pixa.
381 * </pre>
382 */
383 l_ok
384 cleanTo1bppFilesToPdf(SARRAY *sa,
385 l_int32 res,
386 l_int32 contrast,
387 l_int32 rotation,
388 l_int32 opensize,
389 const char *title,
390 const char *fileout)
391 {
392 char *fname;
393 l_int32 n, i, scale;
394 l_int32 maxsmallset = 200; /* max num images kept uncompressed in array */
395 PIX *pixs, *pix1;
396 PIXA *pixa1 = NULL;
397 PIXAC *pixac1 = NULL;
398
399 if (!sa)
400 return ERROR_INT("sa not defined", __func__, 1);
401 if (!fileout)
402 return ERROR_INT("fileout not defined", __func__, 1);
403 if (res == 0) res = 300;
404 if (res != 300 && res != 600) {
405 L_ERROR("invalid res = %d; res must be in {0, 300, 600}\n",
406 __func__, res);
407 return 1;
408 }
409 if (contrast < 1 || contrast > 10) {
410 L_ERROR("invalid contrast = %d; contrast must be in [1...10]\n",
411 __func__, contrast);
412 return 1;
413 }
414 if (rotation < 0 || rotation > 3) {
415 L_ERROR("invalid rotation = %d; rotation must be in {0,1,2,3}\n",
416 __func__, rotation);
417 return 1;
418 }
419 if (opensize > 3) {
420 L_ERROR("invalid opensize = %d; opensize must be <= 3\n",
421 __func__, opensize);
422 return 1;
423 }
424 scale = (res == 300) ? 1 : 2;
425 if ((n = sarrayGetCount(sa)) == 0)
426 return ERROR_INT("sa is empty", __func__, 1);
427
428 if (n <= maxsmallset)
429 pixa1 = pixaCreate(n);
430 else
431 pixac1 = pixacompCreate(n);
432 for (i = 0; i < n; i++) {
433 if (i == 0)
434 lept_stderr("page: ");
435 else if (i % 10 == 0)
436 lept_stderr("%d . ", i);
437 fname = sarrayGetString(sa, i, L_NOCOPY);
438 if ((pixs = pixRead(fname)) == NULL) {
439 L_ERROR("pixs not read from %s\n", __func__, fname);
440 continue;
441 }
442
443 pix1 = pixCleanImage(pixs, contrast, rotation, scale, opensize);
444 if (n <= maxsmallset) {
445 pixaAddPix(pixa1, pix1, L_INSERT);
446 } else {
447 pixacompAddPix(pixac1, pix1, IFF_TIFF_G4);
448 pixDestroy(&pix1);
449 }
450 pixDestroy(&pixs);
451 }
452
453 /* Generate the pdf. Compute the actual input resolution from
454 * the pixel dimensions of the first image. This will cause each
455 * page to be printed to cover an 8.5 x 11 inch sheet of paper. */
456 lept_stderr("Write output to %s\n", fileout);
457 if (n <= maxsmallset)
458 pix1 = pixaGetPix(pixa1, 0, L_CLONE);
459 else
460 pix1 = pixacompGetPix(pixac1, 0);
461 pixInferResolution(pix1, 11.0, &res);
462 pixDestroy(&pix1);
463 if (strcmp(title, "none") == 0)
464 title = NULL;
465
466 if (n <= maxsmallset) {
467 pixaConvertToPdf(pixa1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
468 pixaDestroy(&pixa1);
469 } else {
470 pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout);
471 pixacompDestroy(&pixac1);
472 }
473 return 0;
474 }
475
476 /* --------------------------------------------*/
477 #endif /* USE_PDFIO */
478 /* --------------------------------------------*/