comparison mupdf-source/thirdparty/leptonica/src/finditalic.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*
28 * \file finditalic.c
29 * <pre>
30 *
31 * l_int32 pixItalicWords()
32 *
33 * Locate italic words. This is an example of the use of
34 * hit-miss binary morphology with binary reconstruction
35 * (filling from a seed into a mask).
36 *
37 * To see how this works, run with prog/italic.png.
38 * </pre>
39 */
40
41 #ifdef HAVE_CONFIG_H
42 #include <config_auto.h>
43 #endif /* HAVE_CONFIG_H */
44
45 #include "allheaders.h"
46
47 /* --------------------------------------------------------------- *
48 * These hit-miss sels match the slanted edge of italic characters *
49 * --------------------------------------------------------------- */
/* Hit-miss pattern for the long italic edge detector: 13 row literals of
 * exactly 6 characters each, concatenated row-major into one 78-character
 * string.  pixItalicWords() passes this to selCreateFromString() with the
 * dimensions 13 and 6, so every row must keep its full width of 6;
 * stripping or collapsing the blanks changes the string length and the
 * pattern geometry.  See selCreateFromString() for the meaning of the
 * 'o', 'x', 'C' and blank characters. */
static const char *str_ital1 = "   o x"
                               "      "
                               "      "
                               "      "
                               "  o x "
                               "      "
                               "  C   "
                               "      "
                               " o x  "
                               "      "
                               "      "
                               "      "
                               "o x   ";
63
/* Shorter variant of the italic edge pattern: 10 row literals of exactly
 * 6 characters each, concatenated row-major into one 60-character string.
 * pixItalicWords() passes this to selCreateFromString() with the
 * dimensions 10 and 6, so every row must keep its full width of 6.
 * See selCreateFromString() for the meaning of the 'o', 'x', 'C' and
 * blank characters. */
static const char *str_ital2 = "   o x"
                               "      "
                               "      "
                               "  o x "
                               "  C   "
                               "      "
                               " o x  "
                               "      "
                               "      "
                               "o x   ";
74
75 /* ------------------------------------------------------------- *
76 * This sel removes noise that is not oriented as a slanted edge *
77 * ------------------------------------------------------------- */
/* Noise-removal pattern: 4 row literals of 2 characters each, concatenated
 * row-major into one 8-character string.  pixItalicWords() passes it to
 * selCreateFromString() with the dimensions 4 and 2. */
static const char *str_ital3 =
    " x"    /* row 0 */
    "Cx"    /* row 1: holds the 'C' marker */
    "x "    /* row 2 */
    "x ";   /* row 3 */
82
83 /*!
84 * \brief pixItalicWords()
85 *
86 * \param[in] pixs 1 bpp
87 * \param[in] boxaw [optional] word bounding boxes; can be NULL
88 * \param[in] pixw [optional] word box mask; can be NULL
89 * \param[out] pboxa boxa of italic words
90 * \param[in] debugflag 1 for debug output; 0 otherwise
91 * \return 0 if OK, 1 on error
92 *
93 * <pre>
94 * Notes:
95 * (1) You can input the bounding boxes for the words in one of
96 * two forms: as bounding boxes (%boxaw) or as a word mask with
97 * the word bounding boxes filled (%pixw). For example,
98 * to compute %pixw, you can use pixWordMaskByDilation().
99 * (2) Alternatively, you can set both of these inputs to NULL,
100 * in which case the word mask is generated here. This is
101 * done by dilating and closing the input image to connect
102 * letters within a word, while leaving the words separated.
103 * The parameters are chosen under the assumption that the
104 * input is 10 to 12 pt text, scanned at about 300 ppi.
105 * (3) sel_ital1 and sel_ital2 detect the right edges that are
106 * nearly vertical, at approximately the angle of italic
107 * strokes. We use the right edge to avoid getting seeds
108 * from lower-case 'y'. The typical italic slant has a smaller
109 * angle with the vertical than the 'W', so in most cases we
110 * will not trigger on the slanted lines in the 'W'.
111 * (4) Note that sel_ital2 is shorter than sel_ital1. It is
112 * more appropriate for a typical font scanned at 200 ppi.
113 * </pre>
114 */
115 l_ok
116 pixItalicWords(PIX *pixs,
117 BOXA *boxaw,
118 PIX *pixw,
119 BOXA **pboxa,
120 l_int32 debugflag)
121 {
122 char opstring[32], buf[32];
123 l_int32 size, type;
124 BOXA *boxa;
125 PIX *pixsd, *pixm, *pixd;
126 SEL *sel_ital1, *sel_ital2, *sel_ital3;
127
128 if (!pboxa)
129 return ERROR_INT("&boxa not defined", __func__, 1);
130 *pboxa = NULL;
131 if (!pixs)
132 return ERROR_INT("pixs not defined", __func__, 1);
133 if (boxaw && pixw)
134 return ERROR_INT("both boxaw and pixw are defined", __func__, 1);
135
136 sel_ital1 = selCreateFromString(str_ital1, 13, 6, NULL);
137 sel_ital2 = selCreateFromString(str_ital2, 10, 6, NULL);
138 sel_ital3 = selCreateFromString(str_ital3, 4, 2, NULL);
139
140 /* Make the italic seed: extract with HMT; remove noise.
141 * The noise removal close/open is important to exclude
142 * situations where a small slanted line accidentally
143 * matches sel_ital1. */
144 pixsd = pixHMT(NULL, pixs, sel_ital1);
145 pixClose(pixsd, pixsd, sel_ital3);
146 pixOpen(pixsd, pixsd, sel_ital3);
147
148 /* Make the word mask. Use input boxes or mask if given. */
149 size = 0; /* init */
150 if (boxaw) {
151 pixm = pixCreateTemplate(pixs);
152 pixMaskBoxa(pixm, pixm, boxaw, L_SET_PIXELS);
153 type = 1;
154 } else if (pixw) {
155 pixm = pixClone(pixw);
156 type = 2;
157 } else {
158 pixWordMaskByDilation(pixs, NULL, &size, NULL);
159 L_INFO("dilation size = %d\n", __func__, size);
160 snprintf(opstring, sizeof(opstring), "d1.5 + c%d.1", size);
161 pixm = pixMorphSequence(pixs, opstring, 0);
162 type = 3;
163 }
164
165 /* Binary reconstruction to fill in those word mask
166 * components for which there is at least one seed pixel. */
167 pixd = pixSeedfillBinary(NULL, pixsd, pixm, 8);
168 boxa = pixConnComp(pixd, NULL, 8);
169 *pboxa = boxa;
170
171 if (debugflag) {
172 /* Save results at at 2x reduction */
173 l_int32 res, upper;
174 lept_mkdir("lept/ital");
175 BOXA *boxat;
176 GPLOT *gplot;
177 NUMA *na;
178 PIXA *pixa1;
179 PIX *pix1, *pix2, *pix3;
180 pixa1 = pixaCreate(0);
181 boxat = pixConnComp(pixm, NULL, 8);
182 boxaWriteDebug("/tmp/lept/ital/ital.ba", boxat);
183 pixaAddPix(pixa1, pixs, L_COPY); /* orig */
184 pixaAddPix(pixa1, pixsd, L_COPY); /* seed */
185 pix1 = pixConvertTo32(pixm);
186 pixRenderBoxaArb(pix1, boxat, 3, 255, 0, 0);
187 pixaAddPix(pixa1, pix1, L_INSERT); /* mask + outline */
188 pixaAddPix(pixa1, pixd, L_COPY); /* ital mask */
189 pix1 = pixConvertTo32(pixs);
190 pixRenderBoxaArb(pix1, boxa, 3, 255, 0, 0);
191 pixaAddPix(pixa1, pix1, L_INSERT); /* orig + outline */
192 pix1 = pixCreateTemplate(pixs);
193 pix2 = pixSetBlackOrWhiteBoxa(pix1, boxa, L_SET_BLACK);
194 pixCopy(pix1, pixs);
195 pix3 = pixDilateBrick(NULL, pixs, 3, 3);
196 pixCombineMasked(pix1, pix3, pix2);
197 pixaAddPix(pixa1, pix1, L_INSERT); /* ital bolded */
198 pixDestroy(&pix2);
199 pixDestroy(&pix3);
200 pix2 = pixaDisplayTiledInColumns(pixa1, 1, 0.5, 20, 2);
201 snprintf(buf, sizeof(buf), "/tmp/lept/ital/ital.%d.png", type);
202 pixWriteDebug(buf, pix2, IFF_PNG);
203 pixDestroy(&pix2);
204
205 /* Assuming the image represents 6 inches of actual page width,
206 * the pixs resolution is approximately
207 * (width of pixs in pixels) / 6
208 * and the images have been saved at half this resolution. */
209 res = pixGetWidth(pixs) / 12;
210 L_INFO("resolution = %d\n", __func__, res);
211 l_pdfSetDateAndVersion(0);
212 snprintf(buf, sizeof(buf), "/tmp/lept/ital/ital.%d.pdf", type);
213 pixaConvertToPdf(pixa1, res, 1.0, L_FLATE_ENCODE, 75, "Italic Finder",
214 buf);
215 l_pdfSetDateAndVersion(1);
216 pixaDestroy(&pixa1);
217 boxaDestroy(&boxat);
218
219 /* Plot histogram of horizontal white run sizes. A small
220 * initial vertical dilation removes most runs that are neither
221 * inter-character nor inter-word. The larger first peak is
222 * from inter-character runs, and the smaller second peak is
223 * from inter-word runs. */
224 pix1 = pixDilateBrick(NULL, pixs, 1, 15);
225 upper = L_MAX(30, 3 * size);
226 na = pixRunHistogramMorph(pix1, L_RUN_OFF, L_HORIZ, upper);
227 pixDestroy(&pix1);
228 gplot = gplotCreate("/tmp/lept/ital/runhisto", GPLOT_PNG,
229 "Histogram of horizontal runs of white pixels, vs length",
230 "run length", "number of runs");
231 gplotAddPlot(gplot, NULL, na, GPLOT_LINES, "plot1");
232 gplotMakeOutput(gplot);
233 gplotDestroy(&gplot);
234 numaDestroy(&na);
235 }
236
237 selDestroy(&sel_ital1);
238 selDestroy(&sel_ital2);
239 selDestroy(&sel_ital3);
240 pixDestroy(&pixsd);
241 pixDestroy(&pixm);
242 pixDestroy(&pixd);
243 return 0;
244 }