Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/leptonica/src/finditalic.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /*====================================================================* | |
| 2 - Copyright (C) 2001 Leptonica. All rights reserved. | |
| 3 - | |
| 4 - Redistribution and use in source and binary forms, with or without | |
| 5 - modification, are permitted provided that the following conditions | |
| 6 - are met: | |
| 7 - 1. Redistributions of source code must retain the above copyright | |
| 8 - notice, this list of conditions and the following disclaimer. | |
| 9 - 2. Redistributions in binary form must reproduce the above | |
| 10 - copyright notice, this list of conditions and the following | |
| 11 - disclaimer in the documentation and/or other materials | |
| 12 - provided with the distribution. | |
| 13 - | |
| 14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY | |
| 18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| 19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
| 20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
| 21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
| 22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
| 23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| 24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 25 *====================================================================*/ | |
| 26 | |
| 27 /* | |
| 28 * \file finditalic.c | |
| 29 * <pre> | |
| 30 * | |
| 31 * l_int32 pixItalicWords() | |
| 32 * | |
| 33 * Locate italic words. This is an example of the use of | |
| 34 * hit-miss binary morphology with binary reconstruction | |
| 35 * (filling from a seed into a mask). | |
| 36 * | |
| 37 * To see how this works, run with prog/italic.png. | |
| 38 * </pre> | |
| 39 */ | |
| 40 | |
| 41 #ifdef HAVE_CONFIG_H | |
| 42 #include <config_auto.h> | |
| 43 #endif /* HAVE_CONFIG_H */ | |
| 44 | |
| 45 #include "allheaders.h" | |
| 46 | |
/* --------------------------------------------------------------- *
 * These hit-miss sels match the slanted edge of italic characters *
 * --------------------------------------------------------------- */
    /* Each quoted row below is one row of the sel, and every row must
     * be exactly as wide as the sel: the concatenated string has to
     * contain height * width characters to match the dimensions given
     * to selCreateFromString() in pixItalicWords() (13 x 6 here).
     * In the selCreateFromString() text format, 'x' is a hit, 'o' is
     * a miss, ' ' is a don't-care and 'C' marks the origin. */
static const char *str_ital1 = "   o x"
                               "      "
                               "      "
                               "      "
                               "  o x "
                               "      "
                               "  C   "
                               "      "
                               " o x  "
                               "      "
                               "      "
                               "      "
                               "o x   ";

    /* Shorter variant of the same slanted edge: 10 rows x 6 columns */
static const char *str_ital2 = "   o x"
                               "      "
                               "      "
                               "  o x "
                               "  C   "
                               "      "
                               " o x  "
                               "      "
                               "      "
                               "o x   ";

/* ------------------------------------------------------------- *
 * This sel removes noise that is not oriented as a slanted edge *
 * ------------------------------------------------------------- */
    /* 4 rows x 2 columns */
static const char *str_ital3 = " x"
                               "Cx"
                               "x "
                               "x ";
| 82 | |
| 83 /*! | |
| 84 * \brief pixItalicWords() | |
| 85 * | |
| 86 * \param[in] pixs 1 bpp | |
| 87 * \param[in] boxaw [optional] word bounding boxes; can be NULL | |
| 88 * \param[in] pixw [optional] word box mask; can be NULL | |
| 89 * \param[out] pboxa boxa of italic words | |
| 90 * \param[in] debugflag 1 for debug output; 0 otherwise | |
| 91 * \return 0 if OK, 1 on error | |
| 92 * | |
| 93 * <pre> | |
| 94 * Notes: | |
| 95 * (1) You can input the bounding boxes for the words in one of | |
| 96 * two forms: as bounding boxes (%boxaw) or as a word mask with | |
| 97 * the word bounding boxes filled (%pixw). For example, | |
| 98 * to compute %pixw, you can use pixWordMaskByDilation(). | |
| 99 * (2) Alternatively, you can set both of these inputs to NULL, | |
| 100 * in which case the word mask is generated here. This is | |
| 101 * done by dilating and closing the input image to connect | |
| 102 * letters within a word, while leaving the words separated. | |
| 103 * The parameters are chosen under the assumption that the | |
| 104 * input is 10 to 12 pt text, scanned at about 300 ppi. | |
| 105 * (3) sel_ital1 and sel_ital2 detect the right edges that are | |
| 106 * nearly vertical, at approximately the angle of italic | |
| 107 * strokes. We use the right edge to avoid getting seeds | |
| 108 * from lower-case 'y'. The typical italic slant has a smaller | |
| 109 * angle with the vertical than the 'W', so in most cases we | |
| 110 * will not trigger on the slanted lines in the 'W'. | |
| 111 * (4) Note that sel_ital2 is shorter than sel_ital1. It is | |
| 112 * more appropriate for a typical font scanned at 200 ppi. | |
| 113 * </pre> | |
| 114 */ | |
| 115 l_ok | |
| 116 pixItalicWords(PIX *pixs, | |
| 117 BOXA *boxaw, | |
| 118 PIX *pixw, | |
| 119 BOXA **pboxa, | |
| 120 l_int32 debugflag) | |
| 121 { | |
| 122 char opstring[32], buf[32]; | |
| 123 l_int32 size, type; | |
| 124 BOXA *boxa; | |
| 125 PIX *pixsd, *pixm, *pixd; | |
| 126 SEL *sel_ital1, *sel_ital2, *sel_ital3; | |
| 127 | |
| 128 if (!pboxa) | |
| 129 return ERROR_INT("&boxa not defined", __func__, 1); | |
| 130 *pboxa = NULL; | |
| 131 if (!pixs) | |
| 132 return ERROR_INT("pixs not defined", __func__, 1); | |
| 133 if (boxaw && pixw) | |
| 134 return ERROR_INT("both boxaw and pixw are defined", __func__, 1); | |
| 135 | |
| 136 sel_ital1 = selCreateFromString(str_ital1, 13, 6, NULL); | |
| 137 sel_ital2 = selCreateFromString(str_ital2, 10, 6, NULL); | |
| 138 sel_ital3 = selCreateFromString(str_ital3, 4, 2, NULL); | |
| 139 | |
| 140 /* Make the italic seed: extract with HMT; remove noise. | |
| 141 * The noise removal close/open is important to exclude | |
| 142 * situations where a small slanted line accidentally | |
| 143 * matches sel_ital1. */ | |
| 144 pixsd = pixHMT(NULL, pixs, sel_ital1); | |
| 145 pixClose(pixsd, pixsd, sel_ital3); | |
| 146 pixOpen(pixsd, pixsd, sel_ital3); | |
| 147 | |
| 148 /* Make the word mask. Use input boxes or mask if given. */ | |
| 149 size = 0; /* init */ | |
| 150 if (boxaw) { | |
| 151 pixm = pixCreateTemplate(pixs); | |
| 152 pixMaskBoxa(pixm, pixm, boxaw, L_SET_PIXELS); | |
| 153 type = 1; | |
| 154 } else if (pixw) { | |
| 155 pixm = pixClone(pixw); | |
| 156 type = 2; | |
| 157 } else { | |
| 158 pixWordMaskByDilation(pixs, NULL, &size, NULL); | |
| 159 L_INFO("dilation size = %d\n", __func__, size); | |
| 160 snprintf(opstring, sizeof(opstring), "d1.5 + c%d.1", size); | |
| 161 pixm = pixMorphSequence(pixs, opstring, 0); | |
| 162 type = 3; | |
| 163 } | |
| 164 | |
| 165 /* Binary reconstruction to fill in those word mask | |
| 166 * components for which there is at least one seed pixel. */ | |
| 167 pixd = pixSeedfillBinary(NULL, pixsd, pixm, 8); | |
| 168 boxa = pixConnComp(pixd, NULL, 8); | |
| 169 *pboxa = boxa; | |
| 170 | |
| 171 if (debugflag) { | |
| 172 /* Save results at at 2x reduction */ | |
| 173 l_int32 res, upper; | |
| 174 lept_mkdir("lept/ital"); | |
| 175 BOXA *boxat; | |
| 176 GPLOT *gplot; | |
| 177 NUMA *na; | |
| 178 PIXA *pixa1; | |
| 179 PIX *pix1, *pix2, *pix3; | |
| 180 pixa1 = pixaCreate(0); | |
| 181 boxat = pixConnComp(pixm, NULL, 8); | |
| 182 boxaWriteDebug("/tmp/lept/ital/ital.ba", boxat); | |
| 183 pixaAddPix(pixa1, pixs, L_COPY); /* orig */ | |
| 184 pixaAddPix(pixa1, pixsd, L_COPY); /* seed */ | |
| 185 pix1 = pixConvertTo32(pixm); | |
| 186 pixRenderBoxaArb(pix1, boxat, 3, 255, 0, 0); | |
| 187 pixaAddPix(pixa1, pix1, L_INSERT); /* mask + outline */ | |
| 188 pixaAddPix(pixa1, pixd, L_COPY); /* ital mask */ | |
| 189 pix1 = pixConvertTo32(pixs); | |
| 190 pixRenderBoxaArb(pix1, boxa, 3, 255, 0, 0); | |
| 191 pixaAddPix(pixa1, pix1, L_INSERT); /* orig + outline */ | |
| 192 pix1 = pixCreateTemplate(pixs); | |
| 193 pix2 = pixSetBlackOrWhiteBoxa(pix1, boxa, L_SET_BLACK); | |
| 194 pixCopy(pix1, pixs); | |
| 195 pix3 = pixDilateBrick(NULL, pixs, 3, 3); | |
| 196 pixCombineMasked(pix1, pix3, pix2); | |
| 197 pixaAddPix(pixa1, pix1, L_INSERT); /* ital bolded */ | |
| 198 pixDestroy(&pix2); | |
| 199 pixDestroy(&pix3); | |
| 200 pix2 = pixaDisplayTiledInColumns(pixa1, 1, 0.5, 20, 2); | |
| 201 snprintf(buf, sizeof(buf), "/tmp/lept/ital/ital.%d.png", type); | |
| 202 pixWriteDebug(buf, pix2, IFF_PNG); | |
| 203 pixDestroy(&pix2); | |
| 204 | |
| 205 /* Assuming the image represents 6 inches of actual page width, | |
| 206 * the pixs resolution is approximately | |
| 207 * (width of pixs in pixels) / 6 | |
| 208 * and the images have been saved at half this resolution. */ | |
| 209 res = pixGetWidth(pixs) / 12; | |
| 210 L_INFO("resolution = %d\n", __func__, res); | |
| 211 l_pdfSetDateAndVersion(0); | |
| 212 snprintf(buf, sizeof(buf), "/tmp/lept/ital/ital.%d.pdf", type); | |
| 213 pixaConvertToPdf(pixa1, res, 1.0, L_FLATE_ENCODE, 75, "Italic Finder", | |
| 214 buf); | |
| 215 l_pdfSetDateAndVersion(1); | |
| 216 pixaDestroy(&pixa1); | |
| 217 boxaDestroy(&boxat); | |
| 218 | |
| 219 /* Plot histogram of horizontal white run sizes. A small | |
| 220 * initial vertical dilation removes most runs that are neither | |
| 221 * inter-character nor inter-word. The larger first peak is | |
| 222 * from inter-character runs, and the smaller second peak is | |
| 223 * from inter-word runs. */ | |
| 224 pix1 = pixDilateBrick(NULL, pixs, 1, 15); | |
| 225 upper = L_MAX(30, 3 * size); | |
| 226 na = pixRunHistogramMorph(pix1, L_RUN_OFF, L_HORIZ, upper); | |
| 227 pixDestroy(&pix1); | |
| 228 gplot = gplotCreate("/tmp/lept/ital/runhisto", GPLOT_PNG, | |
| 229 "Histogram of horizontal runs of white pixels, vs length", | |
| 230 "run length", "number of runs"); | |
| 231 gplotAddPlot(gplot, NULL, na, GPLOT_LINES, "plot1"); | |
| 232 gplotMakeOutput(gplot); | |
| 233 gplotDestroy(&gplot); | |
| 234 numaDestroy(&na); | |
| 235 } | |
| 236 | |
| 237 selDestroy(&sel_ital1); | |
| 238 selDestroy(&sel_ital2); | |
| 239 selDestroy(&sel_ital3); | |
| 240 pixDestroy(&pixsd); | |
| 241 pixDestroy(&pixm); | |
| 242 pixDestroy(&pixd); | |
| 243 return 0; | |
| 244 } |
