comparison mupdf-source/thirdparty/leptonica/src/renderpdf.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file renderpdf.c
29 * <pre>
30 *
31 * Rendering pdf files using an external library
32 * l_ok l_pdfRenderFile()
33 * l_ok l_pdfRenderFiles()
34 *
35 * Utility for rendering a set of pdf files as page images.
36 * The images are rendered for full page images at a specified
37 * resolution between 50 and 300 ppi, in the directory
38 * /tmp/lept/renderpdf/
39 *
40 * An application like cleanpdf performs a sequence of:
41 * (1) rendering the pdfs into a set of images,
42 * (2) doing image processing on each image to generate new images, and
43 * (3) wrapping the new images up in a single pdf file.
44 * Typically, the processed images made by step (2) are stored compressed
45 * in memory in a PixaComp, before wrapping them up in step (3).
46 *
47 * This requires the Poppler package of pdf utilities, in particular
48 * the program pdftoppm. For non-unix systems, this requires
49 * installation of the cygwin Poppler package:
50 * https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/
51 * poppler-0.26.5-1
52 *
53 * For the rasterizer, use pdftoppm:
54 * pdftoppm -r res fname outroot ('-r res' renders output at res ppi)
55 * This works on all pdf pages, both wrapped images and pages that
56 * were made orthographically. The default output resolution for
57 * pdftoppm is 150 ppi, but we typically use 300 ppi. This makes large
58 * uncompressed RGB image files (e.g., a standard size RGB page image
59 * at 300 ppi is 25 MB), but it is very fast.
60 *
61 * The size of the resulting images does not depend on the resolution
62 * of the images stored in the input pdf. We compute the value of the
63 * resolution parameter (render_res) that when input to pdftoppm
64 * will generate a page-size image (612 x 792 pts) at the requested
65 * output resolution.
66 *
67 * We do NOT use pdfimages:
68 * pdfimages -j fname outroot (-j outputs jpeg if input is dct)
69 * pdfimages only works when all pages are pdf wrappers around images.
70 * Further, in some cases, it scrambles the order of the output pages
71 * and inserts extra images.
72
73 * By default, this function will not run, because it makes a call
74 * to system(3). To render pdfs as a set of images in a directory,
75 * three things are required:
76 * (1) To have poppler installed.
77 * (2) To enable debug operations using setLeptDebugOK(1).
78 * (3) To link the functions that generate pdf files in the library
79 * (in pdfio1.c, pdfio2.c).
80 * </pre>
81 */
82
83 #ifdef HAVE_CONFIG_H
84 #include <config_auto.h>
85 #endif /* HAVE_CONFIG_H */
86
87 #include "allheaders.h"
88
89 /* --------------------------------------------*/
90 #if USE_PDFIO /* defined in environ.h */
91 /* --------------------------------------------*/
92
93 /*-----------------------------------------------------------------*
94 * Rendering pdf files using an external library *
95 *-----------------------------------------------------------------*/
96 /*!
97 * \brief l_pdfRenderFile()
98 *
99 * \param[in] filename input pdf file
100 * \param[in] res output resolution (0, [50 ... 300]) ppi
101 * \param[out] psaout sarray of filenames of rasterized images
102 * \return 0 if OK, 1 on error
103 *
104 * <pre>
105 * Notes:
106 * (1) Wrapper to l_padfRenderFiles() for a single input pdf file.
107 * </pre>
108 */
109 l_ok
110 l_pdfRenderFile(const char *filename,
111 l_int32 res,
112 SARRAY **psaout)
113 {
114 l_int32 ret;
115 SARRAY *sain;
116
117 if (!psaout)
118 return ERROR_INT("&saout not defined", __func__, 1);
119 *psaout = NULL;
120 if (!filename)
121 return ERROR_INT("filename not defined", __func__, 1);
122
123 sain = sarrayCreate(1);
124 sarrayAddString(sain, filename, L_COPY);
125 ret = l_pdfRenderFiles(NULL, sain, res, psaout);
126 sarrayDestroy(&sain);
127 return ret;
128 }
129
130
131 /*!
132 * \brief l_pdfRenderFiles()
133 *
134 * \param[in] dir directory of input pdf files
135 * \param[in] sain sarray of input pdf filenames
136 * \param[in] res output resolution (0, [50 ... 300]) ppi
137 * \param[out] psaout sarray of output filenames of rendered images
138 * \return 0 if OK, 1 on error
139 *
140 * <pre>
141 * Notes:
142 * (1) Because this uses the "system" call, it is disabled by default
143 * on all platforms. It is not supported and therefor3 disabled
144 * on iOS 11.
145 * (2) Input pdf file(s) are specified either by an input directory
146 * or an sarray with the paths. Use the sarray if it is given;
147 * otherwise, use all files in the directory with extention "pdf",
148 * and name the rendered images in lexical order of the filenames.
149 * (3) The allowed output rendering resolutions are between 50 ppi
150 * and 300 ppi. Typical resolutions are 150 and 300 ppi.
151 * Default input value of 0 can be used for 300 ppi resolution.
152 * (4) Images are rendered in ppm format in directory /tmp/lept/renderpdf
153 * and named in lexical order of the input filenames. On invocation,
154 * any existing files in this directory are removed.
155 * (5) This requires pdftoppm from the Poppler package of pdf utilities.
156 * </pre>
157 */
158 l_ok
159 l_pdfRenderFiles(const char *dir,
160 SARRAY *sain,
161 l_int32 res,
162 SARRAY **psaout)
163 {
164 char buf[256];
165 char *imagedir, *firstfile, *fname, *basename, *tail;
166 l_int32 i, nfiles, render_res;
167 SARRAY *sa;
168
169 if (!LeptDebugOK) {
170 L_INFO("running pdftoppm is disabled; "
171 "use setLeptDebugOK(1) to enable\n", __func__);
172 return 0;
173 }
174
175 #ifdef OS_IOS /* iOS 11 does not support system() */
176 return ERROR_INT("iOS 11 does not support system()", __func__, 0);
177 #endif /* OS_IOS */
178
179 if (!psaout)
180 return ERROR_INT("&saout not defined", __func__, 1);
181 *psaout = NULL;
182 if (res == 0) res = 300;
183 if (res < 50 || res > 300)
184 return ERROR_INT("res not in range [50 ... 300]", __func__, 1);
185 if (!dir && !sain)
186 return ERROR_INT("neither dir or sain are defined", __func__, 1);
187 if (sain) {
188 sa = sarrayCopy(sain);
189 } else {
190 sa = getSortedPathnamesInDirectory(dir, "pdf", 0, 0);
191 if (!sa)
192 return ERROR_INT("no files found in dir", __func__, 1);
193 }
194 nfiles = sarrayGetCount(sa);
195
196 /* Set up directory for rendered page images. */
197 lept_rmdir("lept/renderpdf");
198 lept_mkdir("lept/renderpdf");
199 imagedir = genPathname("/tmp/lept/renderpdf", NULL);
200
201 /* Figure out the resolution to use with the image renderer.
202 This first checks the media box sizes, which give the output
203 image size in printer points (1/72 inch). The largest expected
204 output image has a max dimension of about 11 inches, corresponding
205 to 792 points. At a resolution of 300 ppi, the max image size
206 is then 3300. For robustness, use the median of media box sizes.
207 If the max dimension of this median is significantly larger than
208 792, reduce the input resolution to the renderer. Specifically:
209 * Calculate the median of the MediaBox widths and heights.
210 * If the max exceeds 850, reduce the resolution so that the max
211 dimension of the rendered image is 3300. The new resolution
212 input to the renderer is reduced from 300 by the factor:
213 (792 / medmax)
214 If the media boxes are not found, render a page using a small
215 given resolution (72) and use the max dimension to find the
216 resolution, render_res, that will produce an out with
217 3300 pixels in the largest dimension. */
218 firstfile = sarrayGetString(sa, 0, L_NOCOPY);
219 getPdfRendererResolution(firstfile, imagedir, &render_res);
220
221 /* The input %res gives the actual resolution at which the page is
222 to be rendered. If this is less than 300 ppi, reduce render_res,
223 the resolution input to pdftoppm, by the factor:
224 (res / 300) */
225 render_res = (render_res * res) / 300;
226
227 /* Rasterize: '-r res' renders output at res ppi
228 * pdftoppm -r res fname outroot */
229 for (i = 0; i < nfiles; i++) {
230 fname = sarrayGetString(sa, i, L_NOCOPY);
231 splitPathAtDirectory(fname, NULL, &tail);
232 splitPathAtExtension(tail, &basename, NULL);
233 snprintf(buf, sizeof(buf), "pdftoppm -r %d %s %s/%s",
234 render_res, fname, imagedir, basename);
235 lept_free(tail);
236 lept_free(basename);
237 lept_stderr("%s\n", buf);
238 callSystemDebug(buf); /* pdftoppm */
239 }
240 sarrayDestroy(&sa);
241
242 /* Generate the output array of image file names */
243 *psaout = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0);
244 lept_free(imagedir);
245 return 0;
246 }
247
248
249 /* --------------------------------------------*/
250 #endif /* USE_PDFIO */
251 /* --------------------------------------------*/
252
253
254
255 /* ------------------------------------------------------------------------- *
256 * Stubs if pdf is not supported *
257 * ------------------------------------------------------------------------- */
258
259 /* -----------------------------------------------------------------*/
260 #if !USE_PDFIO
261 /* -----------------------------------------------------------------*/
262
263 l_ok l_pdfRenderFile(const char *filename, l_int32 res, SARRAY **psaout)
264 {
265 return ERROR_INT("function not present", __func__, 1);
266 }
267
268 /* -----------------------------------------------------------*/
269
270 l_ok l_pdfRenderFiles(const char *dir, SARRAY *sain, l_int32 res,
271 SARRAY **psaout)
272 {
273 return ERROR_INT("function not present", __func__, 1);
274 }
275
276 /* -----------------------------------------------------------------*/
277 #endif /* !USE_PDFIO */
278 /* -----------------------------------------------------------------*/
279