Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/leptonica/src/partition.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/leptonica/src/partition.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,648 @@ +/*====================================================================* + - Copyright (C) 2001 Leptonica. All rights reserved. + - + - Redistribution and use in source and binary forms, with or without + - modification, are permitted provided that the following conditions + - are met: + - 1. Redistributions of source code must retain the above copyright + - notice, this list of conditions and the following disclaimer. + - 2. Redistributions in binary form must reproduce the above + - copyright notice, this list of conditions and the following + - disclaimer in the documentation and/or other materials + - provided with the distribution. + - + - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY + - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *====================================================================*/ + +/*! + * \file partition.c + * <pre> + * + * Whitespace block extraction + * BOXA *boxaGetWhiteblocks() + * + * Helpers + * static PARTEL *partelCreate() + * static void partelDestroy() + * static l_int32 partelSetSize() + * static BOXA *boxaGenerateSubboxes() + * static BOX *boxaSelectPivotBox() + * static l_int32 boxaCheckIfOverlapIsSmall() + * BOXA *boxaPruneSortedOnOverlap() + * </pre> + */ + +#ifdef HAVE_CONFIG_H +#include <config_auto.h> +#endif /* HAVE_CONFIG_H */ + +#include "allheaders.h" + +/*! Partition element */ +struct PartitionElement { + l_float32 size; /* sorting key */ + BOX *box; /* region of the element */ + BOXA *boxa; /* set of intersecting boxes */ +}; +typedef struct PartitionElement PARTEL; + +static PARTEL * partelCreate(BOX *box); +static void partelDestroy(PARTEL **ppartel); +static l_int32 partelSetSize(PARTEL *partel, l_int32 sortflag); +static BOXA * boxaGenerateSubboxes(BOX *box, BOXA *boxa, l_int32 maxperim, + l_float32 fract); +static BOX * boxaSelectPivotBox(BOX *box, BOXA *boxa, l_int32 maxperim, + l_float32 fract); +static l_int32 boxCheckIfOverlapIsBig(BOX *box, BOXA *boxa, + l_float32 maxoverlap); + +static const l_int32 DefaultMaxPops = 20000; + + +#ifndef NO_CONSOLE_IO +#define OUTPUT_HEAP_STATS 0 +#endif /* ~NO_CONSOLE_IO */ + +/*------------------------------------------------------------------* + * Whitespace block extraction * + *------------------------------------------------------------------*/ +/*! + * \brief boxaGetWhiteblocks() + * + * \param[in] boxas typ. a set of bounding boxes of fg components + * \param[in] box initial region; typically including all boxes + * in boxas; if null, it computes the region to + * include all boxes in boxas + * \param[in] sortflag L_SORT_BY_WIDTH, L_SORT_BY_HEIGHT, + * L_SORT_BY_MIN_DIMENSION, L_SORT_BY_MAX_DIMENSION, + * L_SORT_BY_PERIMETER, L_SORT_BY_AREA + * \param[in] maxboxes max number of output whitespace boxes; e.g., 100 + * \param[in] maxoverlap maximum fractional overlap of a box by any + * of the larger boxes; e.g., 0.2 + * \param[in] maxperim maximum half-perimeter, in pixels, for which + * pivot is selected by proximity to box centroid; + * e.g., 200 + * \param[in] fract fraction of box diagonal that is an acceptable + * distance from the box centroid to select + * the pivot; e.g., 0.2 + * \param[in] maxpops max number of pops from the heap; use 0 as default + * \return boxa of sorted whitespace boxes, or NULL on error + * + * <pre> + * Notes: + * (1) This uses the elegant Breuel algorithm, found in "Two + * Geometric Algorithms for Layout Analysis", 2002, + * url: "citeseer.ist.psu.edu/breuel02two.html". + * It starts with the bounding boxes (b.b.) of the connected + * components (c.c.) in a region, along with the rectangle + * representing that region. It repeatedly divides the + * rectangle into four maximal rectangles that exclude a + * pivot rectangle, sorting them in a priority queue + * according to one of the six sort flags. It returns a boxa + * of the "largest" set that have no intersection with boxes + * from the input boxas. + * (2) If box == NULL, the initial region is the minimal region + * that includes the origin and every box in boxas. + * (3) maxboxes is the maximum number of whitespace boxes that will + * be returned. The actual number will depend on the image + * and the values chosen for maxoverlap and maxpops. In many + * cases, the actual number will be 'maxboxes'. + * (4) maxoverlap allows pruning of whitespace boxes depending on + * the overlap. To avoid all pruning, use maxoverlap = 1.0. + * To select only boxes that have no overlap with each other + * (maximal pruning), choose maxoverlap = 0.0. + * Otherwise, no box can have more than the 'maxoverlap' fraction + * of its area overlapped by any larger (in the sense of the + * sortflag) box. + * (5) Choose maxperim (actually, maximum half-perimeter) to + * represent a c.c. that is small enough so that you don't care + * about the white space that could be inside of it. For all such + * c.c., the pivot for 'quadfurcation' of a rectangle is selected + * as having a reasonable proximity to the rectangle centroid. + * (6) Use fract in the range [0.0 ... 1.0]. Set fract = 0.0 + * to choose the small box nearest the centroid as the pivot. + * If you choose fract > 0.0, it is suggested that you call + * boxaPermuteRandom() first, to permute the boxes (see usage below). + * This should reduce the search time for each of the pivot boxes. + * (7) Choose maxpops to be the maximum number of rectangles that + * are popped from the heap. This is an indirect way to limit the + * execution time. Use 0 for default (a fairly large number). + * At any time, you can expect the heap to contain about + * 2.5 times as many boxes as have been popped off. + * (8) The output result is a sorted set of overlapping + * boxes, constrained by 'maxboxes', 'maxoverlap' and 'maxpops'. + * (9) The main defect of the method is that it abstracts out the + * actual components, retaining only the b.b. for analysis. + * Consider a component with a large b.b. If this is chosen + * as a pivot, all white space inside is immediately taken + * out of consideration. Furthermore, even if it is never chosen + * as a pivot, as the partitioning continues, at no time will + * any of the whitespace inside this component be part of a + * rectangle with zero overlapping boxes. Thus, the interiors + * of all boxes are necessarily excluded from the union of + * the returned whitespace boxes. + * (10) It should be noted that the algorithm puts a large number + * of partels on the queue. Setting a limit of X partels to + * remove from the queue, one typically finds that there will be + * several times that number (say, 2X - 3X) left on the queue. + * For an efficient algorithm to find the largest white or + * or black rectangles, without permitting them to overlap, + * see pixFindLargeRectangles(). + * (11) USAGE: One way to accommodate to this weakness is to remove such + * large b.b. before starting the computation. For example, + * if 'box' is an input image region containing 'boxa' b.b. of c.c.: + * + * // Faster pivot choosing + * boxaPermuteRandom(boxa, boxa); + * + * // Remove anything either large width or height + * boxat = boxaSelectBySize(boxa, maxwidth, maxheight, + * L_SELECT_IF_BOTH, L_SELECT_IF_LT, + * NULL); + * + * boxad = boxaGetWhiteblocks(boxat, box, type, maxboxes, + * maxoverlap, maxperim, fract, + * maxpops); + * + * The result will be rectangular regions of "white space" that + * extend into (and often through) the excluded components. + * (11) As a simple example, suppose you wish to find the columns on a page. + * First exclude large c.c. that may block the columns, and then call: + * + * boxad = boxaGetWhiteblocks(boxa, box, L_SORT_BY_HEIGHT, + * 20, 0.15, 200, 0.2, 2000); + * + * to get the 20 tallest boxes with no more than 0.15 overlap + * between a box and any of the taller ones, and avoiding the + * use of any c.c. with a b.b. half perimeter greater than 200 + * as a pivot. + * </pre> + */ +BOXA * +boxaGetWhiteblocks(BOXA *boxas, + BOX *box, + l_int32 sortflag, + l_int32 maxboxes, + l_float32 maxoverlap, + l_int32 maxperim, + l_float32 fract, + l_int32 maxpops) +{ +l_int32 i, w, h, n, nsub, npush, npop; +BOX *boxsub; +BOXA *boxa, *boxa4, *boxasub, *boxad; +PARTEL *partel; +L_HEAP *lh; + + if (!boxas) + return (BOXA *)ERROR_PTR("boxas not defined", __func__, NULL); + if (sortflag != L_SORT_BY_WIDTH && sortflag != L_SORT_BY_HEIGHT && + sortflag != L_SORT_BY_MIN_DIMENSION && + sortflag != L_SORT_BY_MAX_DIMENSION && + sortflag != L_SORT_BY_PERIMETER && sortflag != L_SORT_BY_AREA) + return (BOXA *)ERROR_PTR("invalid sort flag", __func__, NULL); + if (maxboxes < 1) { + maxboxes = 1; + L_WARNING("setting maxboxes = 1\n", __func__); + } + if (maxoverlap < 0.0 || maxoverlap > 1.0) + return (BOXA *)ERROR_PTR("invalid maxoverlap", __func__, NULL); + if (maxpops == 0) + maxpops = DefaultMaxPops; + + if (!box) { + boxaGetExtent(boxas, &w, &h, NULL); + box = boxCreate(0, 0, w, h); + } + + /* Prime the heap */ + lh = lheapCreate(20, L_SORT_DECREASING); + partel = partelCreate(box); + partel->boxa = boxaCopy(boxas, L_CLONE); + partelSetSize(partel, sortflag); + lheapAdd(lh, partel); + + npush = 1; + npop = 0; + boxad = boxaCreate(0); + while (1) { + if ((partel = (PARTEL *)lheapRemove(lh)) == NULL) /* we're done */ + break; + + npop++; /* How many boxes have we retrieved from the queue? */ + if (npop > maxpops) { + partelDestroy(&partel); + break; + } + + /* Extract the contents */ + boxa = boxaCopy(partel->boxa, L_CLONE); + box = boxClone(partel->box); + partelDestroy(&partel); + + /* Can we output this one? */ + n = boxaGetCount(boxa); + if (n == 0) { + if (boxCheckIfOverlapIsBig(box, boxad, maxoverlap) == 0) + boxaAddBox(boxad, box, L_INSERT); + else + boxDestroy(&box); + boxaDestroy(&boxa); + if (boxaGetCount(boxad) >= maxboxes) /* we're done */ + break; + continue; + } + + + /* Generate up to 4 subboxes and put them on the heap */ + boxa4 = boxaGenerateSubboxes(box, boxa, maxperim, fract); + boxDestroy(&box); + nsub = boxaGetCount(boxa4); + for (i = 0; i < nsub; i++) { + boxsub = boxaGetBox(boxa4, i, L_CLONE); + boxasub = boxaIntersectsBox(boxa, boxsub); + partel = partelCreate(boxsub); + partel->boxa = boxasub; + partelSetSize(partel, sortflag); + lheapAdd(lh, partel); + boxDestroy(&boxsub); + } + npush += nsub; /* How many boxes have we put on the queue? */ + +/* boxaWriteStderr(boxa4); */ + + boxaDestroy(&boxa4); + boxaDestroy(&boxa); + } + +#if OUTPUT_HEAP_STATS + lept_stderr("Heap statistics:\n"); + lept_stderr(" Number of boxes pushed: %d\n", npush); + lept_stderr(" Number of boxes popped: %d\n", npop); + lept_stderr(" Number of boxes on heap: %d\n", lheapGetCount(lh)); +#endif /* OUTPUT_HEAP_STATS */ + + /* Clean up the heap */ + while ((partel = (PARTEL *)lheapRemove(lh)) != NULL) + partelDestroy(&partel); + lheapDestroy(&lh, FALSE); + + return boxad; +} + + +/*------------------------------------------------------------------* + * Helpers * + *------------------------------------------------------------------*/ +/*! + * \brief partelCreate() + * + * \param[in] box region; inserts a copy + * \return partel, or NULL on error + */ +static PARTEL * +partelCreate(BOX *box) +{ +PARTEL *partel; + + partel = (PARTEL *)LEPT_CALLOC(1, sizeof(PARTEL)); + partel->box = boxCopy(box); + return partel; +} + + +/*! + * \brief partelDestroy() + * + * \param[in,out] ppartel contents will be set to null before returning + * \return void + */ +static void +partelDestroy(PARTEL **ppartel) +{ +PARTEL *partel; + + if (ppartel == NULL) { + L_WARNING("ptr address is null!\n", __func__); + return; + } + + if ((partel = *ppartel) == NULL) + return; + + boxDestroy(&partel->box); + boxaDestroy(&partel->boxa); + LEPT_FREE(partel); + *ppartel = NULL; + return; +} + + +/*! + * \brief partelSetSize() + * + * \param[in] partel + * \param[in] sortflag L_SORT_BY_WIDTH, L_SORT_BY_HEIGHT, + * L_SORT_BY_MIN_DIMENSION, L_SORT_BY_MAX_DIMENSION, + * L_SORT_BY_PERIMETER, L_SORT_BY_AREA + * \return 0 if OK, 1 on error + */ +static l_int32 +partelSetSize(PARTEL *partel, + l_int32 sortflag) +{ +l_int32 w, h; + + if (!partel) + return ERROR_INT("partel not defined", __func__, 1); + + boxGetGeometry(partel->box, NULL, NULL, &w, &h); + if (sortflag == L_SORT_BY_WIDTH) + partel->size = (l_float32)w; + else if (sortflag == L_SORT_BY_HEIGHT) + partel->size = (l_float32)h; + else if (sortflag == L_SORT_BY_MIN_DIMENSION) + partel->size = (l_float32)L_MIN(w, h); + else if (sortflag == L_SORT_BY_MAX_DIMENSION) + partel->size = (l_float32)L_MAX(w, h); + else if (sortflag == L_SORT_BY_PERIMETER) + partel->size = (l_float32)(w + h); + else if (sortflag == L_SORT_BY_AREA) + partel->size = (l_float32)(w * h); + else + return ERROR_INT("invalid sortflag", __func__, 1); + return 0; +} + + +/*! + * \brief boxaGenerateSubboxes() + * + * \param[in] box region to be split into up to four overlapping + * subregions + * \param[in] boxa boxes of rectangles intersecting the box + * \param[in] maxperim maximum half-perimeter for which pivot + * is selected by proximity to box centroid + * \param[in] fract fraction of box diagonal that is an acceptable + * distance from the box centroid to select the pivot + * \return boxa of four or less overlapping subrectangles of + * the box, or NULL on error + */ +static BOXA * +boxaGenerateSubboxes(BOX *box, + BOXA *boxa, + l_int32 maxperim, + l_float32 fract) +{ +l_int32 x, y, w, h, xp, yp, wp, hp; +BOX *boxp; /* pivot box */ +BOX *boxsub; +BOXA *boxa4; + + if (!box) + return (BOXA *)ERROR_PTR("box not defined", __func__, NULL); + if (!boxa) + return (BOXA *)ERROR_PTR("boxa not defined", __func__, NULL); + + boxa4 = boxaCreate(4); + boxp = boxaSelectPivotBox(box, boxa, maxperim, fract); + boxGetGeometry(box, &x, &y, &w, &h); + boxGetGeometry(boxp, &xp, &yp, &wp, &hp); + boxDestroy(&boxp); + if (xp > x) { /* left sub-box */ + boxsub = boxCreate(x, y, xp - x, h); + boxaAddBox(boxa4, boxsub, L_INSERT); + } + if (yp > y) { /* top sub-box */ + boxsub = boxCreate(x, y, w, yp - y); + boxaAddBox(boxa4, boxsub, L_INSERT); + } + if (xp + wp < x + w) { /* right sub-box */ + boxsub = boxCreate(xp + wp, y, x + w - xp - wp, h); + boxaAddBox(boxa4, boxsub, L_INSERT); + } + if (yp + hp < y + h) { /* bottom sub-box */ + boxsub = boxCreate(x, yp + hp, w, y + h - yp - hp); + boxaAddBox(boxa4, boxsub, L_INSERT); + } + + return boxa4; +} + + +/*! + * \brief boxaSelectPivotBox() + * + * \param[in] box containing box; to be split by the pivot box + * \param[in] boxa boxes of rectangles, from which 1 is to be chosen + * \param[in] maxperim maximum half-perimeter for which pivot + * is selected by proximity to box centroid + * \param[in] fract fraction of box diagonal that is an acceptable + * distance from the box centroid to select the pivot + * \return box pivot box for subdivision into 4 rectangles, + * or NULL on error + * + * <pre> + * Notes: + * (1) This is a tricky piece that wasn't discussed in the + * Breuel's 2002 paper. + * (2) Selects a box from boxa whose centroid is reasonably close to + * the centroid of the containing box (xc, yc) and whose + * half-perimeter does not exceed the maxperim value. + * (3) If there are no boxes in the boxa that are small enough, + * then it selects the smallest of the larger boxes, + * without reference to its location in the containing box. + * (4) If a small box has a centroid at a distance from the + * centroid of the containing box that is not more than + * the fraction 'fract' of the diagonal of the containing + * box, that box is chosen as the pivot, terminating the + * search for the nearest small box. + * (5) Use fract in the range [0.0 ... 1.0]. Set fract = 0.0 + * to choose the small box nearest the centroid. + * (6) Choose maxperim to represent a connected component that is + * small enough so that you don't care about the white space + * that could be inside of it. + * </pre> + */ +static BOX * +boxaSelectPivotBox(BOX *box, + BOXA *boxa, + l_int32 maxperim, + l_float32 fract) +{ +l_int32 i, n, bw, bh, w, h; +l_int32 smallfound, minindex, perim, minsize; +l_float32 delx, dely, mindist, threshdist, dist, x, y, cx, cy; +BOX *boxt; + + if (!box) + return (BOX *)ERROR_PTR("box not defined", __func__, NULL); + if (!boxa) + return (BOX *)ERROR_PTR("boxa not defined", __func__, NULL); + n = boxaGetCount(boxa); + if (n == 0) + return (BOX *)ERROR_PTR("no boxes in boxa", __func__, NULL); + if (fract < 0.0 || fract > 1.0) { + L_WARNING("fract out of bounds; using 0.0\n", __func__); + fract = 0.0; + } + + boxGetGeometry(box, NULL, NULL, &w, &h); + boxGetCenter(box, &x, &y); + threshdist = fract * (w * w + h * h); + mindist = 1000000000.; + minindex = 0; + smallfound = FALSE; + for (i = 0; i < n; i++) { + boxt = boxaGetBox(boxa, i, L_CLONE); + boxGetGeometry(boxt, NULL, NULL, &bw, &bh); + boxGetCenter(boxt, &cx, &cy); + boxDestroy(&boxt); + if (bw + bh > maxperim) + continue; + smallfound = TRUE; + delx = cx - x; + dely = cy - y; + dist = delx * delx + dely * dely; + if (dist <= threshdist) + return boxaGetBox(boxa, i, L_COPY); + if (dist < mindist) { + minindex = i; + mindist = dist; + } + } + + /* If there are small boxes but none are within 'fract' of the + * centroid, return the nearest one. */ + if (smallfound == TRUE) + return boxaGetBox(boxa, minindex, L_COPY); + + /* No small boxes; return the smallest of the large boxes */ + minsize = 1000000000; + minindex = 0; + for (i = 0; i < n; i++) { + boxaGetBoxGeometry(boxa, i, NULL, NULL, &bw, &bh); + perim = bw + bh; + if (perim < minsize) { + minsize = perim; + minindex = i; + } + } + return boxaGetBox(boxa, minindex, L_COPY); +} + + +/*! + * \brief boxCheckIfOverlapIsBig() + * + * \param[in] box to be tested + * \param[in] boxa of boxes already stored + * \param[in] maxoverlap maximum fractional overlap of the input box + * by any of the boxes in boxa + * \return 0 if box has small overlap with every box in boxa; + * 1 otherwise or on error + */ +static l_int32 +boxCheckIfOverlapIsBig(BOX *box, + BOXA *boxa, + l_float32 maxoverlap) +{ +l_int32 i, n, bigoverlap; +l_float32 fract; +BOX *boxt; + + if (!box) + return ERROR_INT("box not defined", __func__, 1); + if (!boxa) + return ERROR_INT("boxa not defined", __func__, 1); + if (maxoverlap < 0.0 || maxoverlap > 1.0) + return ERROR_INT("invalid maxoverlap", __func__, 1); + + n = boxaGetCount(boxa); + if (n == 0 || maxoverlap == 1.0) + return 0; + + bigoverlap = 0; + for (i = 0; i < n; i++) { + boxt = boxaGetBox(boxa, i, L_CLONE); + boxOverlapFraction(boxt, box, &fract); + boxDestroy(&boxt); + if (fract > maxoverlap) { + bigoverlap = 1; + break; + } + } + + return bigoverlap; +} + + +/*! + * \brief boxaPruneSortedOnOverlap() + * + * \param[in] boxas sorted by size in decreasing order + * \param[in] maxoverlap maximum fractional overlap of a box by any + * of the larger boxes + * \return boxad pruned, or NULL on error + * + * <pre> + * Notes: + * (1) This selectively removes smaller boxes when they are overlapped + * by any larger box by more than the input 'maxoverlap' fraction. + * (2) To avoid all pruning, use maxoverlap = 1.0. To select only + * boxes that have no overlap with each other (maximal pruning), + * set maxoverlap = 0.0. + * (3) If there are no boxes in boxas, returns an empty boxa. + * </pre> + */ +BOXA * +boxaPruneSortedOnOverlap(BOXA *boxas, + l_float32 maxoverlap) +{ +l_int32 i, j, n, remove; +l_float32 fract; +BOX *box1, *box2; +BOXA *boxad; + + if (!boxas) + return (BOXA *)ERROR_PTR("boxas not defined", __func__, NULL); + if (maxoverlap < 0.0 || maxoverlap > 1.0) + return (BOXA *)ERROR_PTR("invalid maxoverlap", __func__, NULL); + + n = boxaGetCount(boxas); + if (n == 0 || maxoverlap == 1.0) + return boxaCopy(boxas, L_COPY); + + boxad = boxaCreate(0); + box2 = boxaGetBox(boxas, 0, L_COPY); + boxaAddBox(boxad, box2, L_INSERT); + for (j = 1; j < n; j++) { /* prune on j */ + box2 = boxaGetBox(boxas, j, L_COPY); + remove = FALSE; + for (i = 0; i < j; i++) { /* test on i */ + box1 = boxaGetBox(boxas, i, L_CLONE); + boxOverlapFraction(box1, box2, &fract); + boxDestroy(&box1); + if (fract > maxoverlap) { + remove = TRUE; + break; + } + } + if (remove == TRUE) + boxDestroy(&box2); + else + boxaAddBox(boxad, box2, L_INSERT); + } + + return boxad; +}
