Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/leptonica/src/classapp.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/leptonica/src/classapp.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1037 @@ +/*====================================================================* + - Copyright (C) 2001 Leptonica. All rights reserved. + - + - Redistribution and use in source and binary forms, with or without + - modification, are permitted provided that the following conditions + - are met: + - 1. Redistributions of source code must retain the above copyright + - notice, this list of conditions and the following disclaimer. + - 2. Redistributions in binary form must reproduce the above + - copyright notice, this list of conditions and the following + - disclaimer in the documentation and/or other materials + - provided with the distribution. + - + - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY + - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *====================================================================*/ + +/*! + * \file classapp.c + * <pre> + * + * Top-level jb2 correlation and rank-hausdorff + * l_int32 jbCorrelation() + * l_int32 jbRankHaus() + * + * Extract and classify words in textline order + * JBCLASSER *jbWordsInTextlines() + * l_int32 pixGetWordsInTextlines() + * l_int32 pixGetWordBoxesInTextlines() + * + * Extract word and character bounding boxes + * l_int32 pixFindWordAndCharacterBoxes() + * + * Use word bounding boxes to compare page images + * NUMAA *boxaExtractSortedPattern() + * l_int32 numaaCompareImagesByBoxes() + * static l_int32 testLineAlignmentX() + * static l_int32 countAlignedMatches() + * static void printRowIndices() + * </pre> + */ + +#ifdef HAVE_CONFIG_H +#include <config_auto.h> +#endif /* HAVE_CONFIG_H */ + +#include <string.h> +#include "allheaders.h" + +#define L_BUF_SIZE 512 /*!< size of filename buffer */ +static const l_int32 JB_WORDS_MIN_WIDTH = 5; /*!< min. word width in pixels */ +static const l_int32 JB_WORDS_MIN_HEIGHT = 3; /*!< min. word height in pixels */ + + /* Static comparison functions */ +static l_int32 testLineAlignmentX(NUMA *na1, NUMA *na2, l_int32 shiftx, + l_int32 delx, l_int32 nperline); +static l_int32 countAlignedMatches(NUMA *nai1, NUMA *nai2, NUMA *nasx, + NUMA *nasy, l_int32 n1, l_int32 n2, + l_int32 delx, l_int32 dely, + l_int32 nreq, l_int32 *psame, + l_int32 debugflag); +static void printRowIndices(l_int32 *index1, l_int32 n1, + l_int32 *index2, l_int32 n2); + +/*------------------------------------------------------------------* + * Top-level jb2 correlation and rank-hausdorff * + *------------------------------------------------------------------*/ +/*! + * \brief jbCorrelation() + * + * \param[in] dirin directory of input images + * \param[in] thresh typically ~0.8 + * \param[in] weight typically ~0.6 + * \param[in] components JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS + * \param[in] rootname for output files + * \param[in] firstpage 0-based + * \param[in] npages use 0 for all pages in dirin + * \param[in] renderflag 1 to render from templates; 0 to skip + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) The images must be 1 bpp. If they are not, you can convert + * them using convertFilesTo1bpp(). + * (2) See prog/jbcorrelation for generating more output (e.g., + * for debugging) + * </pre> + */ +l_ok +jbCorrelation(const char *dirin, + l_float32 thresh, + l_float32 weight, + l_int32 components, + const char *rootname, + l_int32 firstpage, + l_int32 npages, + l_int32 renderflag) +{ +char filename[L_BUF_SIZE]; +l_int32 nfiles, i, numpages; +JBDATA *data; +JBCLASSER *classer; +PIX *pix; +PIXA *pixa; +SARRAY *safiles; + + if (!dirin) + return ERROR_INT("dirin not defined", __func__, 1); + if (!rootname) + return ERROR_INT("rootname not defined", __func__, 1); + if (components != JB_CONN_COMPS && components != JB_CHARACTERS && + components != JB_WORDS) + return ERROR_INT("components invalid", __func__, 1); + + safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages); + nfiles = sarrayGetCount(safiles); + + /* Classify components */ + classer = jbCorrelationInit(components, 0, 0, thresh, weight); + jbAddPages(classer, safiles); + + /* Save data */ + data = jbDataSave(classer); + jbDataWrite(rootname, data); + + /* Optionally, render pages using class templates */ + if (renderflag) { + pixa = jbDataRender(data, FALSE); + numpages = pixaGetCount(pixa); + if (numpages != nfiles) + lept_stderr("numpages = %d, nfiles = %d, not equal!\n", + numpages, nfiles); + for (i = 0; i < numpages; i++) { + pix = pixaGetPix(pixa, i, L_CLONE); + snprintf(filename, L_BUF_SIZE, "%s.%04d", rootname, i); + lept_stderr("filename: %s\n", filename); + pixWrite(filename, pix, IFF_PNG); + pixDestroy(&pix); + } + pixaDestroy(&pixa); + } + + sarrayDestroy(&safiles); + jbClasserDestroy(&classer); + jbDataDestroy(&data); + return 0; +} + + +/*! + * \brief jbRankHaus() + * + * \param[in] dirin directory of input images + * \param[in] size of Sel used for dilation; typ. 2 + * \param[in] rank rank value of match; typ. 0.97 + * \param[in] components JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS + * \param[in] rootname for output files + * \param[in] firstpage 0-based + * \param[in] npages use 0 for all pages in dirin + * \param[in] renderflag 1 to render from templates; 0 to skip + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) See prog/jbrankhaus for generating more output (e.g., + * for debugging) + * </pre> + */ +l_ok +jbRankHaus(const char *dirin, + l_int32 size, + l_float32 rank, + l_int32 components, + const char *rootname, + l_int32 firstpage, + l_int32 npages, + l_int32 renderflag) +{ +char filename[L_BUF_SIZE]; +l_int32 nfiles, i, numpages; +JBDATA *data; +JBCLASSER *classer; +PIX *pix; +PIXA *pixa; +SARRAY *safiles; + + if (!dirin) + return ERROR_INT("dirin not defined", __func__, 1); + if (!rootname) + return ERROR_INT("rootname not defined", __func__, 1); + if (components != JB_CONN_COMPS && components != JB_CHARACTERS && + components != JB_WORDS) + return ERROR_INT("components invalid", __func__, 1); + + safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages); + nfiles = sarrayGetCount(safiles); + + /* Classify components */ + classer = jbRankHausInit(components, 0, 0, size, rank); + jbAddPages(classer, safiles); + + /* Save data */ + data = jbDataSave(classer); + jbDataWrite(rootname, data); + + /* Optionally, render pages using class templates */ + if (renderflag) { + pixa = jbDataRender(data, FALSE); + numpages = pixaGetCount(pixa); + if (numpages != nfiles) + lept_stderr("numpages = %d, nfiles = %d, not equal!\n", + numpages, nfiles); + for (i = 0; i < numpages; i++) { + pix = pixaGetPix(pixa, i, L_CLONE); + snprintf(filename, L_BUF_SIZE, "%s.%04d", rootname, i); + lept_stderr("filename: %s\n", filename); + pixWrite(filename, pix, IFF_PNG); + pixDestroy(&pix); + } + pixaDestroy(&pixa); + } + + sarrayDestroy(&safiles); + jbClasserDestroy(&classer); + jbDataDestroy(&data); + return 0; +} + + + +/*------------------------------------------------------------------* + * Extract and classify words in textline order * + *------------------------------------------------------------------*/ +/*! + * \brief jbWordsInTextlines() + * + * \param[in] dirin directory of input pages + * \param[in] reduction 1 for full res; 2 for half-res + * \param[in] maxwidth of word mask components, to be kept + * \param[in] maxheight of word mask components, to be kept + * \param[in] thresh on correlation; 0.80 is reasonable + * \param[in] weight for handling thick text; 0.6 is reasonable + * \param[out] pnatl numa with textline index for each component + * \param[in] firstpage 0-based + * \param[in] npages use 0 for all pages in dirin + * \return classer for the set of pages + * + * <pre> + * Notes: + * (1) This is a high-level function. See prog/jbwords for example + * of usage. + * (2) Typically, use input of 75 - 150 ppi for finding words. + * </pre> + */ +JBCLASSER * +jbWordsInTextlines(const char *dirin, + l_int32 reduction, + l_int32 maxwidth, + l_int32 maxheight, + l_float32 thresh, + l_float32 weight, + NUMA **pnatl, + l_int32 firstpage, + l_int32 npages) +{ +char *fname; +l_int32 nfiles, i, w, h; +BOXA *boxa; +JBCLASSER *classer; +NUMA *nai, *natl; +PIX *pix1, *pix2; +PIXA *pixa; +SARRAY *safiles; + + if (!pnatl) + return (JBCLASSER *)ERROR_PTR("&natl not defined", __func__, NULL); + *pnatl = NULL; + if (!dirin) + return (JBCLASSER *)ERROR_PTR("dirin not defined", __func__, NULL); + if (reduction != 1 && reduction != 2) + return (JBCLASSER *)ERROR_PTR("reduction not in {1,2}", __func__, NULL); + + safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages); + nfiles = sarrayGetCount(safiles); + + /* Classify components */ + classer = jbCorrelationInit(JB_WORDS, maxwidth, maxheight, thresh, weight); + classer->safiles = sarrayCopy(safiles); + natl = numaCreate(0); + *pnatl = natl; + for (i = 0; i < nfiles; i++) { + fname = sarrayGetString(safiles, i, L_NOCOPY); + if ((pix1 = pixRead(fname)) == NULL) { + L_WARNING("image file %d not read\n", __func__, i); + continue; + } + if (reduction == 1) + pix2 = pixClone(pix1); + else /* reduction == 2 */ + pix2 = pixReduceRankBinaryCascade(pix1, 1, 0, 0, 0); + pixGetWordsInTextlines(pix2, JB_WORDS_MIN_WIDTH, + JB_WORDS_MIN_HEIGHT, maxwidth, maxheight, + &boxa, &pixa, &nai); + pixGetDimensions(pix2, &w, &h, NULL); + classer->w = w; + classer->h = h; + jbAddPageComponents(classer, pix2, boxa, pixa); + numaJoin(natl, nai, 0, -1); + pixDestroy(&pix1); + pixDestroy(&pix2); + numaDestroy(&nai); + boxaDestroy(&boxa); + pixaDestroy(&pixa); + } + + sarrayDestroy(&safiles); + return classer; +} + + +/*! + * \brief pixGetWordsInTextlines() + * + * \param[in] pixs 1 bpp, typ. 75 - 150 ppi + * \param[in] minwidth of saved components; smaller are discarded + * \param[in] minheight of saved components; smaller are discarded + * \param[in] maxwidth of saved components; larger are discarded + * \param[in] maxheight of saved components; larger are discarded + * \param[out] pboxad word boxes sorted in textline line order + * \param[out] ppixad word images sorted in textline line order + * \param[out] pnai index of textline for each word + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) The input should be at a resolution of between 75 and 150 ppi. + * (2) The four size constraints on saved components are all + * scaled by %reduction. + * (3) The result are word images (and their b.b.), extracted in + * textline order, at either full res or 2x reduction, + * and with a numa giving the textline index for each word. + * (4) The pixa and boxa interfaces should make this type of + * application simple to put together. The steps are: + * ~ generate first estimate of word masks + * ~ get b.b. of these, and remove the small and big ones + * ~ extract pixa of the word images, using the b.b. + * ~ sort actual word images in textline order (2d) + * ~ flatten them to a pixa (1d), saving the textline index + * for each pix + * (5) In an actual application, it may be desirable to pre-filter + * the input image to remove large components, to extract + * single columns of text, and to deskew them. For example, + * to remove both large components and small noisy components + * that can interfere with the statistics used to estimate + * parameters for segmenting by words, but still retain text lines, + * the following image preprocessing can be done: + * Pix *pixt = pixMorphSequence(pixs, "c40.1", 0); + * Pix *pixf = pixSelectBySize(pixt, 0, 60, 8, + * L_SELECT_HEIGHT, L_SELECT_IF_LT, NULL); + * pixAnd(pixf, pixf, pixs); // the filtered image + * The closing turns text lines into long blobs, but does not + * significantly increase their height. But if there are many + * small connected components in a dense texture, this is likely + * to generate tall components that will be eliminated in pixf. + * </pre> + */ +l_ok +pixGetWordsInTextlines(PIX *pixs, + l_int32 minwidth, + l_int32 minheight, + l_int32 maxwidth, + l_int32 maxheight, + BOXA **pboxad, + PIXA **ppixad, + NUMA **pnai) +{ +BOXA *boxa1, *boxad; +BOXAA *baa; +NUMA *nai; +NUMAA *naa; +PIXA *pixa1, *pixad; +PIXAA *paa; + + if (!pboxad || !ppixad || !pnai) + return ERROR_INT("&boxad, &pixad, &nai not all defined", __func__, 1); + *pboxad = NULL; + *ppixad = NULL; + *pnai = NULL; + if (!pixs) + return ERROR_INT("pixs not defined", __func__, 1); + + /* Get the bounding boxes of the words from the word mask. */ + pixWordBoxesByDilation(pixs, minwidth, minheight, maxwidth, maxheight, + &boxa1, NULL, NULL); + + /* Generate a pixa of the word images */ + pixa1 = pixaCreateFromBoxa(pixs, boxa1, 0, 0, NULL); + + /* Sort the bounding boxes of these words by line. We use the + * index mapping to allow identical sorting of the pixa. */ + baa = boxaSort2d(boxa1, &naa, -1, -1, 4); + paa = pixaSort2dByIndex(pixa1, naa, L_CLONE); + + /* Flatten the word paa */ + pixad = pixaaFlattenToPixa(paa, &nai, L_CLONE); + boxad = pixaGetBoxa(pixad, L_COPY); + + *pnai = nai; + *pboxad = boxad; + *ppixad = pixad; + + pixaDestroy(&pixa1); + boxaDestroy(&boxa1); + boxaaDestroy(&baa); + pixaaDestroy(&paa); + numaaDestroy(&naa); + return 0; +} + + +/*! + * \brief pixGetWordBoxesInTextlines() + * + * \param[in] pixs 1 bpp, typ. 75 - 150 ppi + * \param[in] minwidth of saved components; smaller are discarded + * \param[in] minheight of saved components; smaller are discarded + * \param[in] maxwidth of saved components; larger are discarded + * \param[in] maxheight of saved components; larger are discarded + * \param[out] pboxad word boxes sorted in textline line order + * \param[out] pnai [optional] index of textline for each word + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) The input should be at a resolution of between 75 and 150 ppi. + * (2) This is a special version of pixGetWordsInTextlines(), that + * just finds the word boxes in line order, with a numa + * giving the textline index for each word. + * See pixGetWordsInTextlines() for more details. + * </pre> + */ +l_ok +pixGetWordBoxesInTextlines(PIX *pixs, + l_int32 minwidth, + l_int32 minheight, + l_int32 maxwidth, + l_int32 maxheight, + BOXA **pboxad, + NUMA **pnai) +{ +BOXA *boxa1; +BOXAA *baa; +NUMA *nai; + + if (pnai) *pnai = NULL; + if (!pboxad) + return ERROR_INT("&boxad and &nai not both defined", __func__, 1); + *pboxad = NULL; + if (!pixs) + return ERROR_INT("pixs not defined", __func__, 1); + + /* Get the bounding boxes of the words from the word mask. */ + pixWordBoxesByDilation(pixs, minwidth, minheight, maxwidth, maxheight, + &boxa1, NULL, NULL); + + /* 2D sort the bounding boxes of these words. */ + baa = boxaSort2d(boxa1, NULL, 3, -5, 5); + + /* Flatten the boxaa, saving the boxa index for each box */ + *pboxad = boxaaFlattenToBoxa(baa, &nai, L_CLONE); + + if (pnai) + *pnai = nai; + else + numaDestroy(&nai); + boxaDestroy(&boxa1); + boxaaDestroy(&baa); + return 0; +} + + +/*------------------------------------------------------------------* + * Extract word and character bounding boxes * + *------------------------------------------------------------------*/ +/*! + * \brief pixFindWordAndCharacterBoxes() + * + * \param[in] pixs 2, 4, 8 or 32 bpp; colormap OK; typ. 300 ppi + * \param[in] boxs [optional] region to select in pixs + * \param[in] thresh binarization threshold (typ. 100 - 150) + * \param[out] pboxaw return the word boxes + * \param[out] pboxaac return the character boxes + * \param[in] debugdir [optional] for debug images; use NULL to skip + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) If %boxs == NULL, the entire input image is used. + * (2) Having an input pix that is not 1bpp is necessary to reduce + * touching characters by using a low binarization threshold. + * Suggested thresholds are between 100 and 150. + * (3) The coordinates in the output boxes are global, with respect + * to the input image. + * </pre> + */ +l_ok +pixFindWordAndCharacterBoxes(PIX *pixs, + BOX *boxs, + l_int32 thresh, + BOXA **pboxaw, + BOXAA **pboxaac, + const char *debugdir) +{ +char *debugfile, *subdir; +l_int32 i, xs, ys, xb, yb, nb, loc; +l_float32 scalefact; +BOX *box1, *box2; +BOXA *boxa1, *boxa1a, *boxa2, *boxa3, *boxa4, *boxa5, *boxaw; +BOXAA *boxaac; +PIX *pix1, *pix2, *pix3, *pix3a, *pix4, *pix5; + + if (pboxaw) *pboxaw = NULL; + if (pboxaac) *pboxaac = NULL; + if (!pboxaw || !pboxaac) + return ERROR_INT("&boxaw and &boxaac not defined", __func__, 1); + if (!pixs || pixGetDepth(pixs) == 1) + return ERROR_INT("pixs not defined or 1 bpp", __func__, 1); + if (thresh > 150) + L_WARNING("threshold is %d; may be too high\n", __func__, thresh); + + if (boxs) { + if ((pix1 = pixClipRectangle(pixs, boxs, NULL)) == NULL) + return ERROR_INT("pix1 not made", __func__, 1); + boxGetGeometry(boxs, &xs, &ys, NULL, NULL); + } else { + pix1 = pixClone(pixs); + xs = ys = 0; + } + + /* Convert pix1 to 8 bpp gray if necessary */ + pix2 = pixConvertTo8(pix1, FALSE); + + /* To find the words and letters, work with 1 bpp images and use + * a low threshold to reduce the number of touching characters. */ + pix3 = pixConvertTo1(pix2, thresh); + + /* Work at about 120 ppi to find the word bounding boxes. */ + pix3a = pixScaleToResolution(pix3, 120.0, 300.0, &scalefact); + + /* First find the words, removing the very small things like + * dots over the 'i' that weren't included in word boxes. */ + pixGetWordBoxesInTextlines(pix3a, 1, 4, 150, 40, &boxa1a, NULL); + boxa1 = boxaTransform(boxa1a, 0, 0, 1.0 / scalefact, 1.0 / scalefact); + if (debugdir) { + loc = 0; + subdir = stringReplaceSubstr(debugdir, "/tmp/", "", &loc, NULL); + lept_mkdir(subdir); + LEPT_FREE(subdir); + pix4 = pixConvertTo32(pix2); + pixRenderBoxaArb(pix4, boxa1, 2, 255, 0, 0); + debugfile = stringJoin(debugdir, "/words.png"); + pixWrite(debugfile, pix4, IFF_PNG); + pixDestroy(&pix4); + LEPT_FREE(debugfile); + } + + /* Now find the letters at 300 ppi */ + nb = boxaGetCount(boxa1); + boxaw = boxaCreate(nb); + boxaac = boxaaCreate(nb); + *pboxaw = boxaw; + *pboxaac = boxaac; + for (i = 0; i < nb; i++) { + box1 = boxaGetBox(boxa1, i, L_COPY); + boxGetGeometry(box1, &xb, &yb, NULL, NULL); + pix4 = pixClipRectangle(pix3, box1, NULL); + /* Join detached parts of characters vertically */ + pix5 = pixMorphSequence(pix4, "c1.10", 0); + /* The connected components should mostly be characters */ + boxa2 = pixConnCompBB(pix5, 4); + /* Remove very small pieces */ + boxa3 = boxaSelectBySize(boxa2, 2, 5, L_SELECT_IF_BOTH, + L_SELECT_IF_GTE, NULL); + /* Order left to right */ + boxa4 = boxaSort(boxa3, L_SORT_BY_X, L_SORT_INCREASING, NULL); + /* Express locations with reference to the full input image */ + boxa5 = boxaTransform(boxa4, xs + xb, ys + yb, 1.0, 1.0); + box2 = boxTransform(box1, xs, ys, 1.0, 1.0); + + /* Ignore any boxa with no boxes after size filtering */ + if (boxaGetCount(boxa5) > 0) { + boxaAddBox(boxaw, box2, L_INSERT); + boxaaAddBoxa(boxaac, boxa5, L_INSERT); + } else { + boxDestroy(&box2); + boxaDestroy(&boxa5); + } + boxDestroy(&box1); + pixDestroy(&pix4); + pixDestroy(&pix5); + boxaDestroy(&boxa2); + boxaDestroy(&boxa3); + boxaDestroy(&boxa4); + } + pixDestroy(&pix1); + pixDestroy(&pix2); + pixDestroy(&pix3); + pixDestroy(&pix3a); + boxaDestroy(&boxa1); + boxaDestroy(&boxa1a); + if (debugdir) { + pix4 = pixConvertTo32(pixs); + boxa2 = boxaaFlattenToBoxa(boxaac, NULL, L_COPY); + pixRenderBoxaArb(pix4, boxa2, 2, 255, 0, 0); + boxa3 = boxaAdjustSides(boxaw, -2, 2, -2, 2); + pixRenderBoxaArb(pix4, boxa3, 2, 0, 255, 0); + debugfile = stringJoin(debugdir, "/chars.png"); + pixWrite(debugfile, pix4, IFF_PNG); + pixDestroy(&pix4); + boxaDestroy(&boxa2); + boxaDestroy(&boxa3); + LEPT_FREE(debugfile); + } + return 0; +} + + +/*------------------------------------------------------------------* + * Use word bounding boxes to compare page images * + *------------------------------------------------------------------*/ +/*! + * \brief boxaExtractSortedPattern() + * + * \param[in] boxa typ. of word bounding boxes, in textline order + * \param[in] na index of textline for each box in boxa + * \return naa NUMAA, where each numa represents one textline, + * or NULL on error + * + * <pre> + * Notes: + * (1) The input is expected to come from pixGetWordBoxesInTextlines(). + * (2) Each numa in the output consists of an average y coordinate + * of the first box in the textline, followed by pairs of + * x coordinates representing the left and right edges of each + * of the boxes in the textline. + * </pre> + */ +NUMAA * +boxaExtractSortedPattern(BOXA *boxa, + NUMA *na) +{ +l_int32 index, nbox, row, prevrow, x, y, w, h; +BOX *box; +NUMA *nad = NULL; +NUMAA *naa; + + if (!boxa) + return (NUMAA *)ERROR_PTR("boxa not defined", __func__, NULL); + if (!na) + return (NUMAA *)ERROR_PTR("na not defined", __func__, NULL); + + naa = numaaCreate(0); + nbox = boxaGetCount(boxa); + if (nbox == 0) + return naa; + + prevrow = -1; + for (index = 0; index < nbox; index++) { + box = boxaGetBox(boxa, index, L_CLONE); + numaGetIValue(na, index, &row); + if (row > prevrow) { + if (index > 0) + numaaAddNuma(naa, nad, L_INSERT); + nad = numaCreate(0); + prevrow = row; + boxGetGeometry(box, NULL, &y, NULL, &h); + numaAddNumber(nad, y + h / 2); + } + boxGetGeometry(box, &x, NULL, &w, NULL); + numaAddNumber(nad, x); + numaAddNumber(nad, x + w - 1); + boxDestroy(&box); + } + numaaAddNuma(naa, nad, L_INSERT); + + return naa; +} + + +/*! + * \brief numaaCompareImagesByBoxes() + * + * \param[in] naa1 for image 1, formatted by boxaExtractSortedPattern() + * \param[in] naa2 for image 2, formatted by boxaExtractSortedPattern() + * \param[in] nperline number of box regions to be used in each textline + * \param[in] nreq number of complete row matches required + * \param[in] maxshiftx max allowed x shift between two patterns, in pixels + * \param[in] maxshifty max allowed y shift between two patterns, in pixels + * \param[in] delx max allowed difference in x data, after alignment + * \param[in] dely max allowed difference in y data, after alignment + * \param[out] psame 1 if %nreq row matches are found; 0 otherwise + * \param[in] debugflag 1 for debug output + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) Each input numaa describes a set of sorted bounding boxes + * (sorted by textline and, within each textline, from + * left to right) in the images from which they are derived. + * See boxaExtractSortedPattern() for a description of the data + * format in each of the input numaa. + * (2) This function does an alignment between the input + * descriptions of bounding boxes for two images. The + * input parameter %nperline specifies the number of boxes + * to consider in each line when testing for a match, and + * %nreq is the required number of lines that must be well-aligned + * to get a match. + * (3) Testing by alignment has 3 steps: + * (a) Generating the location of word bounding boxes from the + * images (prior to calling this function). + * (b) Listing all possible pairs of aligned rows, based on + * tolerances in horizontal and vertical positions of + * the boxes. Specifically, all pairs of rows are enumerated + * whose first %nperline boxes can be brought into close + * alignment, based on the delx parameter for boxes in the + * line and within the overall the %maxshiftx and %maxshifty + * constraints. + * (c) Each pair, starting with the first, is used to search + * for a set of %nreq - 1 other pairs that can all be aligned + * with a difference in global translation of not more + * than (%delx, %dely). + * </pre> + */ +l_ok +numaaCompareImagesByBoxes(NUMAA *naa1, + NUMAA *naa2, + l_int32 nperline, + l_int32 nreq, + l_int32 maxshiftx, + l_int32 maxshifty, + l_int32 delx, + l_int32 dely, + l_int32 *psame, + l_int32 debugflag) +{ +l_int32 n1, n2, i, j, nbox, y1, y2, xl1, xl2; +l_int32 shiftx, shifty, match; +l_int32 *line1, *line2; /* indicator for sufficient boxes in a line */ +l_int32 *yloc1, *yloc2; /* arrays of y value for first box in a line */ +l_int32 *xleft1, *xleft2; /* arrays of x value for left side of first box */ +NUMA *na1, *na2, *nai1, *nai2, *nasx, *nasy; + + if (!psame) + return ERROR_INT("&same not defined", __func__, 1); + *psame = 0; + if (!naa1) + return ERROR_INT("naa1 not defined", __func__, 1); + if (!naa2) + return ERROR_INT("naa2 not defined", __func__, 1); + if (nperline < 1) + return ERROR_INT("nperline < 1", __func__, 1); + if (nreq < 1) + return ERROR_INT("nreq < 1", __func__, 1); + + n1 = numaaGetCount(naa1); + n2 = numaaGetCount(naa2); + if (n1 < nreq || n2 < nreq) + return 0; + + /* Find the lines in naa1 and naa2 with sufficient boxes. + * Also, find the y-values for each of the lines, and the + * LH x-values of the first box in each line. */ + line1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32)); + line2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32)); + yloc1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32)); + yloc2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32)); + xleft1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32)); + xleft2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32)); + if (!line1 || !line2 || !yloc1 || !yloc2 || !xleft1 || !xleft2) { + LEPT_FREE(line1); + LEPT_FREE(line2); + LEPT_FREE(yloc1); + LEPT_FREE(yloc2); + LEPT_FREE(xleft1); + LEPT_FREE(xleft2); + return ERROR_INT("calloc failure for an array", __func__, 1); + } + for (i = 0; i < n1; i++) { + na1 = numaaGetNuma(naa1, i, L_CLONE); + numaGetIValue(na1, 0, yloc1 + i); + numaGetIValue(na1, 1, xleft1 + i); + nbox = (numaGetCount(na1) - 1) / 2; + if (nbox >= nperline) + line1[i] = 1; + numaDestroy(&na1); + } + for (i = 0; i < n2; i++) { + na2 = numaaGetNuma(naa2, i, L_CLONE); + numaGetIValue(na2, 0, yloc2 + i); + numaGetIValue(na2, 1, xleft2 + i); + nbox = (numaGetCount(na2) - 1) / 2; + if (nbox >= nperline) + line2[i] = 1; + numaDestroy(&na2); + } + + /* Enumerate all possible line matches. A 'possible' line + * match is one where the x and y shifts for the first box + * in each line are within the maxshiftx and maxshifty + * constraints, and the left and right sides of the remaining + * (nperline - 1) successive boxes are within delx of each other. + * The result is a set of four numas giving parameters of + * each set of matching lines. */ + nai1 = numaCreate(0); /* line index 1 of match */ + nai2 = numaCreate(0); /* line index 2 of match */ + nasx = numaCreate(0); /* shiftx for match */ + nasy = numaCreate(0); /* shifty for match */ + for (i = 0; i < n1; i++) { + if (line1[i] == 0) continue; + y1 = yloc1[i]; + xl1 = xleft1[i]; + na1 = numaaGetNuma(naa1, i, L_CLONE); + for (j = 0; j < n2; j++) { + if (line2[j] == 0) continue; + y2 = yloc2[j]; + if (L_ABS(y1 - y2) > maxshifty) continue; + xl2 = xleft2[j]; + if (L_ABS(xl1 - xl2) > maxshiftx) continue; + shiftx = xl1 - xl2; /* shift to add to x2 values */ + shifty = y1 - y2; /* shift to add to y2 values */ + na2 = numaaGetNuma(naa2, j, L_CLONE); + + /* Now check if 'nperline' boxes in the two lines match */ + match = testLineAlignmentX(na1, na2, shiftx, delx, nperline); + if (match) { + numaAddNumber(nai1, i); + numaAddNumber(nai2, j); + numaAddNumber(nasx, shiftx); + numaAddNumber(nasy, shifty); + } + numaDestroy(&na2); + } + numaDestroy(&na1); + } + + /* Determine if there are a sufficient number of mutually + * aligned matches. Mutually aligned matches place an additional + * constraint on the 'possible' matches, where the relative + * shifts must not exceed the (delx, dely) distances. */ + countAlignedMatches(nai1, nai2, nasx, nasy, n1, n2, delx, dely, + nreq, psame, debugflag); + + LEPT_FREE(line1); + LEPT_FREE(line2); + LEPT_FREE(yloc1); + LEPT_FREE(yloc2); + LEPT_FREE(xleft1); + LEPT_FREE(xleft2); + numaDestroy(&nai1); + numaDestroy(&nai2); + numaDestroy(&nasx); + numaDestroy(&nasy); + return 0; +} + + +static l_int32 +testLineAlignmentX(NUMA *na1, + NUMA *na2, + l_int32 shiftx, + l_int32 delx, + l_int32 nperline) +{ +l_int32 i, xl1, xr1, xl2, xr2, diffl, diffr; + + if (!na1) + return ERROR_INT("na1 not defined", __func__, 1); + if (!na2) + return ERROR_INT("na2 not defined", __func__, 1); + + for (i = 0; i < nperline; i++) { + numaGetIValue(na1, i + 1, &xl1); + numaGetIValue(na1, i + 2, &xr1); + numaGetIValue(na2, i + 1, &xl2); + numaGetIValue(na2, i + 2, &xr2); + diffl = L_ABS(xl1 - xl2 - shiftx); + diffr = L_ABS(xr1 - xr2 - shiftx); + if (diffl > delx || diffr > delx) + return 0; + } + + return 1; +} + + +/* + * \brief countAlignedMatches() + * + * \param[in] nai1, nai2 numas of row pairs for matches + * \param[in] nasx, nasy numas of x and y shifts for the matches + * \param[in] n1, n2 number of rows in images 1 and 2 + * \param[in] delx, dely allowed difference in shifts of the match, + * compared to the reference match + * \param[in] nre1 number of required aligned matches + * \param[out] psame return 1 if %nreq row matches are found; + * 0 otherwise + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This takes 4 input arrays giving parameters of all the + * line matches. It looks for the maximum set of aligned + * matches (matches with approximately the same overall shifts) + * that do not use rows from either image more than once. + * </pre> + */ +static l_ok +countAlignedMatches(NUMA *nai1, + NUMA *nai2, + NUMA *nasx, + NUMA *nasy, + l_int32 n1, + l_int32 n2, + l_int32 delx, + l_int32 dely, + l_int32 nreq, + l_int32 *psame, + l_int32 debugflag) +{ +l_int32 i, j, nm, shiftx, shifty, nmatch, diffx, diffy; +l_int32 *ia1, *ia2, *iasx, *iasy, *index1, *index2; + + if (!nai1 || !nai2 || !nasx || !nasy) + return ERROR_INT("4 input numas not defined", __func__, 1); + if (!psame) + return ERROR_INT("&same not defined", __func__, 1); + *psame = 0; + + /* Check for sufficient aligned matches, doing a double iteration + * over the set of raw matches. The row index arrays + * are used to verify that the same rows in either image + * are not used in more than one match. Whenever there + * is a match that is properly aligned, those rows are + * marked in the index arrays. */ + nm = numaGetCount(nai1); /* number of matches */ + if (nm < nreq) + return 0; + + ia1 = numaGetIArray(nai1); + ia2 = numaGetIArray(nai2); + iasx = numaGetIArray(nasx); + iasy = numaGetIArray(nasy); + index1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32)); /* watch rows */ + index2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32)); + if (!index1 || !index2) + return ERROR_INT("calloc fail for array", __func__, 1); + for (i = 0; i < nm; i++) { + if (*psame == 1) + break; + + /* Reset row index arrays */ + memset(index1, 0, 4 * n1); + memset(index2, 0, 4 * n2); + nmatch = 1; + index1[ia1[i]] = nmatch; /* mark these rows as taken */ + index2[ia2[i]] = nmatch; + shiftx = iasx[i]; /* reference shift between two rows */ + shifty = iasy[i]; /* ditto */ + if (nreq == 1) { + *psame = 1; + break; + } + for (j = 0; j < nm; j++) { + if (j == i) continue; + /* Rows must both be different from any previously seen */ + if (index1[ia1[j]] > 0 || index2[ia2[j]] > 0) continue; + /* Check the shift for this match */ + diffx = L_ABS(shiftx - iasx[j]); + diffy = L_ABS(shifty - iasy[j]); + if (diffx > delx || diffy > dely) continue; + /* We have a match */ + nmatch++; + index1[ia1[j]] = nmatch; /* mark the rows */ + index2[ia2[j]] = nmatch; + if (nmatch >= nreq) { + *psame = 1; + if (debugflag) + printRowIndices(index1, n1, index2, n2); + break; + } + } + } + + LEPT_FREE(ia1); + LEPT_FREE(ia2); + LEPT_FREE(iasx); + LEPT_FREE(iasy); + LEPT_FREE(index1); + LEPT_FREE(index2); + return 0; +} + + +static void +printRowIndices(l_int32 *index1, + l_int32 n1, + l_int32 *index2, + l_int32 n2) +{ +l_int32 i; + + lept_stderr("Index1: "); + for (i = 0; i < n1; i++) { + if (i && (i % 20 == 0)) + lept_stderr("\n "); + lept_stderr("%3d", index1[i]); + } + lept_stderr("\n"); + + lept_stderr("Index2: "); + for (i = 0; i < n2; i++) { + if (i && (i % 20 == 0)) + lept_stderr("\n "); + lept_stderr("%3d", index2[i]); + } + lept_stderr("\n"); + return; +}
