diff mupdf-source/thirdparty/leptonica/src/finditalic.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/leptonica/src/finditalic.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,244 @@
+/*====================================================================*
+ -  Copyright (C) 2001 Leptonica.  All rights reserved.
+ -
+ -  Redistribution and use in source and binary forms, with or without
+ -  modification, are permitted provided that the following conditions
+ -  are met:
+ -  1. Redistributions of source code must retain the above copyright
+ -     notice, this list of conditions and the following disclaimer.
+ -  2. Redistributions in binary form must reproduce the above
+ -     copyright notice, this list of conditions and the following
+ -     disclaimer in the documentation and/or other materials
+ -     provided with the distribution.
+ -
+ -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
+ -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *====================================================================*/
+
+/*
+ * \file  finditalic.c
+ * <pre>
+ *
+ *      l_int32   pixItalicWords()
+ *
+ *    Locate italic words.  This is an example of the use of
+ *    hit-miss binary morphology with binary reconstruction
+ *    (filling from a seed into a mask).
+ *
+ *    To see how this works, run with prog/italic.png.
+ * </pre>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config_auto.h>
+#endif  /* HAVE_CONFIG_H */
+
+#include "allheaders.h"
+
+    /* ---------------------------------------------------------------  *
+     * These hit-miss sels match the slanted edge of italic characters  *
+     * ---------------------------------------------------------------  */
+static const char *str_ital1 = "   o x"
+                               "      "
+                               "      "
+                               "      "
+                               "  o x "
+                               "      "
+                               "  C   "
+                               "      "
+                               " o x  "
+                               "      "
+                               "      "
+                               "      "
+                               "o x   ";
+
+static const char *str_ital2 = "   o x"
+                               "      "
+                               "      "
+                               "  o x "
+                               "  C   "
+                               "      "
+                               " o x  "
+                               "      "
+                               "      "
+                               "o x   ";
+
+    /* ------------------------------------------------------------- *
+     * This sel removes noise that is not oriented as a slanted edge *
+     * ------------------------------------------------------------- */
+static const char *str_ital3 = " x"
+                               "Cx"
+                               "x "
+                               "x ";
+
+/*!
+ * \brief   pixItalicWords()
+ *
+ * \param[in]    pixs       1 bpp
+ * \param[in]    boxaw      [optional] word bounding boxes; can be NULL
+ * \param[in]    pixw       [optional] word box mask; can be NULL
+ * \param[out]   pboxa      boxa of italic words
+ * \param[in]    debugflag  1 for debug output; 0 otherwise
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) You can input the bounding boxes for the words in one of
+ *          two forms: as bounding boxes (%boxaw) or as a word mask with
+ *          the word bounding boxes filled (%pixw).  For example,
+ *          to compute %pixw, you can use pixWordMaskByDilation().
+ *      (2) Alternatively, you can set both of these inputs to NULL,
+ *          in which case the word mask is generated here.  This is
+ *          done by dilating and closing the input image to connect
+ *          letters within a word, while leaving the words separated.
+ *          The parameters are chosen under the assumption that the
+ *          input is 10 to 12 pt text, scanned at about 300 ppi.
+ *      (3) sel_ital1 and sel_ital2 detect the right edges that are
+ *          nearly vertical, at approximately the angle of italic
+ *          strokes.  We use the right edge to avoid getting seeds
+ *          from lower-case 'y'.  The typical italic slant has a smaller
+ *          angle with the vertical than the 'W', so in most cases we
+ *          will not trigger on the slanted lines in the 'W'.
+ *      (4) Note that sel_ital2 is shorter than sel_ital1.  It is
+ *          more appropriate for a typical font scanned at 200 ppi.
+ * </pre>
+ */
+l_ok
+pixItalicWords(PIX     *pixs,
+               BOXA    *boxaw,
+               PIX     *pixw,
+               BOXA   **pboxa,
+               l_int32  debugflag)
+{
+char     opstring[32], buf[32];
+l_int32  size, type;
+BOXA    *boxa;
+PIX     *pixsd, *pixm, *pixd;
+SEL     *sel_ital1, *sel_ital2, *sel_ital3;
+
+    if (!pboxa)
+        return ERROR_INT("&boxa not defined", __func__, 1);
+    *pboxa = NULL;
+    if (!pixs)
+        return ERROR_INT("pixs not defined", __func__, 1);
+    if (boxaw && pixw)
+        return ERROR_INT("both boxaw and pixw are defined", __func__, 1);
+
+    sel_ital1 = selCreateFromString(str_ital1, 13, 6, NULL);
+    sel_ital2 = selCreateFromString(str_ital2, 10, 6, NULL);
+    sel_ital3 = selCreateFromString(str_ital3, 4, 2, NULL);
+
+        /* Make the italic seed: extract with HMT; remove noise.
+         * The noise removal close/open is important to exclude
+         * situations where a small slanted line accidentally
+         * matches sel_ital1. */
+    pixsd = pixHMT(NULL, pixs, sel_ital1);
+    pixClose(pixsd, pixsd, sel_ital3);
+    pixOpen(pixsd, pixsd, sel_ital3);
+
+        /* Make the word mask.  Use input boxes or mask if given. */
+    size = 0;  /* init */
+    if (boxaw) {
+        pixm = pixCreateTemplate(pixs);
+        pixMaskBoxa(pixm, pixm, boxaw, L_SET_PIXELS);
+        type = 1;
+    } else if (pixw) {
+        pixm = pixClone(pixw);
+        type = 2;
+    } else {
+        pixWordMaskByDilation(pixs, NULL, &size, NULL);
+        L_INFO("dilation size = %d\n", __func__, size);
+        snprintf(opstring, sizeof(opstring), "d1.5 + c%d.1", size);
+        pixm = pixMorphSequence(pixs, opstring, 0);
+        type = 3;
+    }
+
+        /* Binary reconstruction to fill in those word mask
+         * components for which there is at least one seed pixel. */
+    pixd = pixSeedfillBinary(NULL, pixsd, pixm, 8);
+    boxa = pixConnComp(pixd, NULL, 8);
+    *pboxa = boxa;
+
+    if (debugflag) {
+            /* Save results at at 2x reduction */
+        l_int32  res, upper;
+        lept_mkdir("lept/ital");
+        BOXA  *boxat;
+        GPLOT *gplot;
+        NUMA  *na;
+        PIXA  *pixa1;
+        PIX   *pix1, *pix2, *pix3;
+        pixa1 = pixaCreate(0);
+        boxat = pixConnComp(pixm, NULL, 8);
+        boxaWriteDebug("/tmp/lept/ital/ital.ba", boxat);
+        pixaAddPix(pixa1, pixs, L_COPY);  /* orig */
+        pixaAddPix(pixa1, pixsd, L_COPY);  /* seed */
+        pix1 = pixConvertTo32(pixm);
+        pixRenderBoxaArb(pix1, boxat, 3, 255, 0, 0);
+        pixaAddPix(pixa1, pix1, L_INSERT);  /* mask + outline */
+        pixaAddPix(pixa1, pixd, L_COPY);  /* ital mask */
+        pix1 = pixConvertTo32(pixs);
+        pixRenderBoxaArb(pix1, boxa, 3, 255, 0, 0);
+        pixaAddPix(pixa1, pix1, L_INSERT);  /* orig + outline */
+        pix1 = pixCreateTemplate(pixs);
+        pix2 = pixSetBlackOrWhiteBoxa(pix1, boxa, L_SET_BLACK);
+        pixCopy(pix1, pixs);
+        pix3 = pixDilateBrick(NULL, pixs, 3, 3);
+        pixCombineMasked(pix1, pix3, pix2);
+        pixaAddPix(pixa1, pix1, L_INSERT);  /* ital bolded */
+        pixDestroy(&pix2);
+        pixDestroy(&pix3);
+        pix2 = pixaDisplayTiledInColumns(pixa1, 1, 0.5, 20, 2);
+        snprintf(buf, sizeof(buf), "/tmp/lept/ital/ital.%d.png", type);
+        pixWriteDebug(buf, pix2, IFF_PNG);
+        pixDestroy(&pix2);
+
+            /* Assuming the image represents 6 inches of actual page width,
+             * the pixs resolution is approximately
+             *    (width of pixs in pixels) / 6
+             * and the images have been saved at half this resolution.   */
+        res = pixGetWidth(pixs) / 12;
+        L_INFO("resolution = %d\n", __func__, res);
+        l_pdfSetDateAndVersion(0);
+        snprintf(buf, sizeof(buf), "/tmp/lept/ital/ital.%d.pdf", type);
+        pixaConvertToPdf(pixa1, res, 1.0, L_FLATE_ENCODE, 75, "Italic Finder",
+                         buf);
+        l_pdfSetDateAndVersion(1);
+        pixaDestroy(&pixa1);
+        boxaDestroy(&boxat);
+
+            /* Plot histogram of horizontal white run sizes.  A small
+             * initial vertical dilation removes most runs that are neither
+             * inter-character nor inter-word.  The larger first peak is
+             * from inter-character runs, and the smaller second peak is
+             * from inter-word runs. */
+        pix1 = pixDilateBrick(NULL, pixs, 1, 15);
+        upper = L_MAX(30, 3 * size);
+        na = pixRunHistogramMorph(pix1, L_RUN_OFF, L_HORIZ, upper);
+        pixDestroy(&pix1);
+        gplot = gplotCreate("/tmp/lept/ital/runhisto", GPLOT_PNG,
+                "Histogram of horizontal runs of white pixels, vs length",
+                "run length", "number of runs");
+        gplotAddPlot(gplot, NULL, na, GPLOT_LINES, "plot1");
+        gplotMakeOutput(gplot);
+        gplotDestroy(&gplot);
+        numaDestroy(&na);
+    }
+
+    selDestroy(&sel_ital1);
+    selDestroy(&sel_ital2);
+    selDestroy(&sel_ital3);
+    pixDestroy(&pixsd);
+    pixDestroy(&pixm);
+    pixDestroy(&pixd);
+    return 0;
+}