Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/leptonica/src/numafunc2.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/leptonica/src/numafunc2.c Mon Sep 15 11:44:09 2025 +0200 @@ -0,0 +1,3263 @@ +/*====================================================================* + - Copyright (C) 2001 Leptonica. All rights reserved. + - + - Redistribution and use in source and binary forms, with or without + - modification, are permitted provided that the following conditions + - are met: + - 1. Redistributions of source code must retain the above copyright + - notice, this list of conditions and the following disclaimer. + - 2. Redistributions in binary form must reproduce the above + - copyright notice, this list of conditions and the following + - disclaimer in the documentation and/or other materials + - provided with the distribution. + - + - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY + - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *====================================================================*/ + +/*! 
+ * \file numafunc2.c + * <pre> + * + * -------------------------------------- + * This file has these Numa utilities: + * - morphological operations + * - arithmetic transforms + * - windowed statistical operations + * - histogram extraction + * - histogram comparison + * - extrema finding + * - frequency and crossing analysis + * -------------------------------------- + + * Morphological (min/max) operations + * NUMA *numaErode() + * NUMA *numaDilate() + * NUMA *numaOpen() + * NUMA *numaClose() + * + * Other transforms + * NUMA *numaTransform() + * l_int32 numaSimpleStats() + * l_int32 numaWindowedStats() + * NUMA *numaWindowedMean() + * NUMA *numaWindowedMeanSquare() + * l_int32 numaWindowedVariance() + * NUMA *numaWindowedMedian() + * NUMA *numaConvertToInt() + * + * Histogram generation and statistics + * NUMA *numaMakeHistogram() + * NUMA *numaMakeHistogramAuto() + * NUMA *numaMakeHistogramClipped() + * NUMA *numaRebinHistogram() + * NUMA *numaNormalizeHistogram() + * l_int32 numaGetStatsUsingHistogram() + * l_int32 numaGetHistogramStats() + * l_int32 numaGetHistogramStatsOnInterval() + * l_int32 numaMakeRankFromHistogram() + * l_int32 numaHistogramGetRankFromVal() + * l_int32 numaHistogramGetValFromRank() + * l_int32 numaDiscretizeSortedInBins() + * l_int32 numaDiscretizeHistoInBins() + * l_int32 numaGetRankBinValues() + * NUMA *numaGetUniformBinSizes() + * + * Splitting a distribution + * l_int32 numaSplitDistribution() + * + * Comparing histograms + * l_int32 grayHistogramsToEMD() + * l_int32 numaEarthMoverDistance() + * l_int32 grayInterHistogramStats() + * + * Extrema finding + * NUMA *numaFindPeaks() + * NUMA *numaFindExtrema() + * NUMA *numaFindLocForThreshold() + * l_int32 *numaCountReversals() + * + * Threshold crossings and frequency analysis + * l_int32 numaSelectCrossingThreshold() + * NUMA *numaCrossingsByThreshold() + * NUMA *numaCrossingsByPeaks() + * NUMA *numaEvalBestHaarParameters() + * l_int32 numaEvalHaarSum() + * + * Generating numbers in 
a range under constraints + * NUMA *genConstrainedNumaInRange() + * + * Things to remember when using the Numa: + * + * (1) The numa is a struct, not an array. Always use accessors + * (see numabasic.c), never the fields directly. + * + * (2) The number array holds l_float32 values. It can also + * be used to store l_int32 values. See numabasic.c for + * details on using the accessors. Integers larger than + * about 10M will lose accuracy due on retrieval due to round-off. + * For large integers, use the dna (array of l_float64) instead. + * + * (3) Occasionally, in the comments we denote the i-th element of a + * numa by na[i]. This is conceptual only -- the numa is not an array! + * + * Some general comments on histograms: + * + * (1) Histograms are the generic statistical representation of + * the data about some attribute. Typically they're not + * normalized -- they simply give the number of occurrences + * within each range of values of the attribute. This range + * of values is referred to as a 'bucket'. For example, + * the histogram could specify how many connected components + * are found for each value of their width; in that case, + * the bucket size is 1. + * + * (2) In leptonica, all buckets have the same size. Histograms + * are therefore specified by a numa of occurrences, along + * with two other numbers: the 'value' associated with the + * occupants of the first bucket and the size (i.e., 'width') + * of each bucket. These two numbers then allow us to calculate + * the value associated with the occupants of each bucket. + * These numbers are fields in the numa, initialized to + * a startx value of 0.0 and a binsize of 1.0. Accessors for + * these fields are functions numa*Parameters(). All histograms + * must have these two numbers properly set. 
+ * </pre> + */ + +#ifdef HAVE_CONFIG_H +#include <config_auto.h> +#endif /* HAVE_CONFIG_H */ + +#include <math.h> +#include "allheaders.h" + + /* bin sizes in numaMakeHistogram() */ +static const l_int32 BinSizeArray[] = {2, 5, 10, 20, 50, 100, 200, 500, 1000,\ + 2000, 5000, 10000, 20000, 50000, 100000, 200000,\ + 500000, 1000000, 2000000, 5000000, 10000000,\ + 200000000, 50000000, 100000000}; +static const l_int32 NBinSizes = 24; + + +#ifndef NO_CONSOLE_IO +#define DEBUG_HISTO 0 +#define DEBUG_CROSSINGS 0 +#define DEBUG_FREQUENCY 0 +#endif /* ~NO_CONSOLE_IO */ + +/*----------------------------------------------------------------------* + * Morphological operations * + *----------------------------------------------------------------------*/ +/*! + * \brief numaErode() + * + * \param[in] nas + * \param[in] size of sel; greater than 0, odd. The origin + * is implicitly in the center. + * \return nad eroded, or NULL on error + * + * <pre> + * Notes: + * (1) The structuring element (sel) is linear, all "hits" + * (2) If size == 1, this returns a copy + * (3) General comment. The morphological operations are equivalent + * to those that would be performed on a 1-dimensional fpix. + * However, because we have not implemented morphological + * operations on fpix, we do this here. Because it is only + * 1 dimensional, there is no reason to use the more + * complicated van Herk/Gil-Werman algorithm, and we do it + * by brute force. 
+ * </pre> + */ +NUMA * +numaErode(NUMA *nas, + l_int32 size) +{ +l_int32 i, j, n, hsize, len; +l_float32 minval; +l_float32 *fa, *fas, *fad; +NUMA *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + if (size <= 0) + return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL); + if ((size & 1) == 0 ) { + L_WARNING("sel size must be odd; increasing by 1\n", __func__); + size++; + } + + if (size == 1) + return numaCopy(nas); + + /* Make a source fa (fas) that has an added (size / 2) boundary + * on left and right, contains a copy of nas in the interior region + * (between 'size' and 'size + n', and has large values + * inserted in the boundary (because it is an erosion). */ + n = numaGetCount(nas); + hsize = size / 2; + len = n + 2 * hsize; + if ((fas = (l_float32 *)LEPT_CALLOC(len, sizeof(l_float32))) == NULL) + return (NUMA *)ERROR_PTR("fas not made", __func__, NULL); + for (i = 0; i < hsize; i++) + fas[i] = 1.0e37f; + for (i = hsize + n; i < len; i++) + fas[i] = 1.0e37f; + fa = numaGetFArray(nas, L_NOCOPY); + for (i = 0; i < n; i++) + fas[hsize + i] = fa[i]; + + nad = numaMakeConstant(0, n); + numaCopyParameters(nad, nas); + fad = numaGetFArray(nad, L_NOCOPY); + for (i = 0; i < n; i++) { + minval = 1.0e37f; /* start big */ + for (j = 0; j < size; j++) + minval = L_MIN(minval, fas[i + j]); + fad[i] = minval; + } + + LEPT_FREE(fas); + return nad; +} + + +/*! + * \brief numaDilate() + * + * \param[in] nas + * \param[in] size of sel; greater than 0, odd. The origin + * is implicitly in the center. 
+ * \return nad dilated, or NULL on error + * + * <pre> + * Notes: + * (1) The structuring element (sel) is linear, all "hits" + * (2) If size == 1, this returns a copy + * </pre> + */ +NUMA * +numaDilate(NUMA *nas, + l_int32 size) +{ +l_int32 i, j, n, hsize, len; +l_float32 maxval; +l_float32 *fa, *fas, *fad; +NUMA *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + if (size <= 0) + return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL); + if ((size & 1) == 0 ) { + L_WARNING("sel size must be odd; increasing by 1\n", __func__); + size++; + } + + if (size == 1) + return numaCopy(nas); + + /* Make a source fa (fas) that has an added (size / 2) boundary + * on left and right, contains a copy of nas in the interior region + * (between 'size' and 'size + n', and has small values + * inserted in the boundary (because it is a dilation). */ + n = numaGetCount(nas); + hsize = size / 2; + len = n + 2 * hsize; + if ((fas = (l_float32 *)LEPT_CALLOC(len, sizeof(l_float32))) == NULL) + return (NUMA *)ERROR_PTR("fas not made", __func__, NULL); + for (i = 0; i < hsize; i++) + fas[i] = -1.0e37f; + for (i = hsize + n; i < len; i++) + fas[i] = -1.0e37f; + fa = numaGetFArray(nas, L_NOCOPY); + for (i = 0; i < n; i++) + fas[hsize + i] = fa[i]; + + nad = numaMakeConstant(0, n); + numaCopyParameters(nad, nas); + fad = numaGetFArray(nad, L_NOCOPY); + for (i = 0; i < n; i++) { + maxval = -1.0e37f; /* start small */ + for (j = 0; j < size; j++) + maxval = L_MAX(maxval, fas[i + j]); + fad[i] = maxval; + } + + LEPT_FREE(fas); + return nad; +} + + +/*! + * \brief numaOpen() + * + * \param[in] nas + * \param[in] size of sel; greater than 0, odd. The origin + * is implicitly in the center. 
+ * \return nad opened, or NULL on error + * + * <pre> + * Notes: + * (1) The structuring element (sel) is linear, all "hits" + * (2) If size == 1, this returns a copy + * </pre> + */ +NUMA * +numaOpen(NUMA *nas, + l_int32 size) +{ +NUMA *nat, *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + if (size <= 0) + return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL); + if ((size & 1) == 0 ) { + L_WARNING("sel size must be odd; increasing by 1\n", __func__); + size++; + } + + if (size == 1) + return numaCopy(nas); + + nat = numaErode(nas, size); + nad = numaDilate(nat, size); + numaDestroy(&nat); + return nad; +} + + +/*! + * \brief numaClose() + * + * \param[in] nas + * \param[in] size of sel; greater than 0, odd. The origin + * is implicitly in the center. + * \return nad closed, or NULL on error + * + * <pre> + * Notes: + * (1) The structuring element (sel) is linear, all "hits" + * (2) If size == 1, this returns a copy + * (3) We add a border before doing this operation, for the same + * reason that we add a border to a pix before doing a safe closing. + * Without the border, a small component near the border gets + * clipped at the border on dilation, and can be entirely removed + * by the following erosion, violating the basic extensivity + * property of closing. 
+ * </pre> + */ +NUMA * +numaClose(NUMA *nas, + l_int32 size) +{ +NUMA *nab, *nat1, *nat2, *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + if (size <= 0) + return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL); + if ((size & 1) == 0 ) { + L_WARNING("sel size must be odd; increasing by 1\n", __func__); + size++; + } + + if (size == 1) + return numaCopy(nas); + + nab = numaAddBorder(nas, size, size, 0); /* to preserve extensivity */ + nat1 = numaDilate(nab, size); + nat2 = numaErode(nat1, size); + nad = numaRemoveBorder(nat2, size, size); + numaDestroy(&nab); + numaDestroy(&nat1); + numaDestroy(&nat2); + return nad; +} + + +/*----------------------------------------------------------------------* + * Other transforms * + *----------------------------------------------------------------------*/ +/*! + * \brief numaTransform() + * + * \param[in] nas + * \param[in] shift add this to each number + * \param[in] scale multiply each number by this + * \return nad with all values shifted and scaled, or NULL on error + * + * <pre> + * Notes: + * (1) Each number is shifted before scaling. + * </pre> + */ +NUMA * +numaTransform(NUMA *nas, + l_float32 shift, + l_float32 scale) +{ +l_int32 i, n; +l_float32 val; +NUMA *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + n = numaGetCount(nas); + if ((nad = numaCreate(n)) == NULL) + return (NUMA *)ERROR_PTR("nad not made", __func__, NULL); + numaCopyParameters(nad, nas); + for (i = 0; i < n; i++) { + numaGetFValue(nas, i, &val); + val = scale * (val + shift); + numaAddNumber(nad, val); + } + return nad; +} + + +/*! 
+ * \brief numaSimpleStats() + * + * \param[in] na input numa + * \param[in] first first element to use + * \param[in] last last element to use; -1 to go to the end + * \param[out] pmean [optional] mean value + * \param[out] pvar [optional] variance + * \param[out] prvar [optional] rms deviation from the mean + * \return 0 if OK, 1 on error + */ +l_ok +numaSimpleStats(NUMA *na, + l_int32 first, + l_int32 last, + l_float32 *pmean, + l_float32 *pvar, + l_float32 *prvar) +{ +l_int32 i, n, ni; +l_float32 sum, sumsq, val, mean, var; + + if (pmean) *pmean = 0.0; + if (pvar) *pvar = 0.0; + if (prvar) *prvar = 0.0; + if (!pmean && !pvar && !prvar) + return ERROR_INT("nothing requested", __func__, 1); + if (!na) + return ERROR_INT("na not defined", __func__, 1); + if ((n = numaGetCount(na)) == 0) + return ERROR_INT("na is empty", __func__, 1); + first = L_MAX(0, first); + if (last < 0) last = n - 1; + if (first >= n) + return ERROR_INT("invalid first", __func__, 1); + if (last >= n) { + L_WARNING("last = %d is beyond max index = %d; adjusting\n", + __func__, last, n - 1); + last = n - 1; + } + if (first > last) + return ERROR_INT("first > last\n", __func__, 1); + ni = last - first + 1; + sum = sumsq = 0.0; + for (i = first; i <= last; i++) { + numaGetFValue(na, i, &val); + sum += val; + sumsq += val * val; + } + + mean = sum / ni; + if (pmean) + *pmean = mean; + if (pvar || prvar) { + var = sumsq / ni - mean * mean; + if (pvar) *pvar = var; + if (prvar) *prvar = sqrtf(var); + } + + return 0; +} + + +/*! 
+ * \brief numaWindowedStats() + * + * \param[in] nas input numa + * \param[in] wc half width of the window + * \param[out] pnam [optional] mean value in window + * \param[out] pnams [optional] mean square value in window + * \param[out] pnav [optional] variance in window + * \param[out] pnarv [optional] rms deviation from the mean + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This is a high-level convenience function for calculating + * any or all of these derived arrays. + * (2) These statistical measures over the values in the + * rectangular window are: + * ~ average value: [x] (nam) + * ~ average squared value: [x*x] (nams) + * ~ variance: [(x - [x])*(x - [x])] = [x*x] - [x]*[x] (nav) + * ~ square-root of variance: (narv) + * where the brackets [ .. ] indicate that the average value is + * to be taken over the window. + * (3) Note that the variance is just the mean square difference from + * the mean value; and the square root of the variance is the + * root mean square difference from the mean, sometimes also + * called the 'standard deviation'. + * (4) Internally, use mirrored borders to handle values near the + * end of each array. + * </pre> + */ +l_ok +numaWindowedStats(NUMA *nas, + l_int32 wc, + NUMA **pnam, + NUMA **pnams, + NUMA **pnav, + NUMA **pnarv) +{ +NUMA *nam, *nams; + + if (!nas) + return ERROR_INT("nas not defined", __func__, 1); + if (2 * wc + 1 > numaGetCount(nas)) + L_WARNING("filter wider than input array!\n", __func__); + + if (!pnav && !pnarv) { + if (pnam) *pnam = numaWindowedMean(nas, wc); + if (pnams) *pnams = numaWindowedMeanSquare(nas, wc); + return 0; + } + + nam = numaWindowedMean(nas, wc); + nams = numaWindowedMeanSquare(nas, wc); + numaWindowedVariance(nam, nams, pnav, pnarv); + if (pnam) + *pnam = nam; + else + numaDestroy(&nam); + if (pnams) + *pnams = nams; + else + numaDestroy(&nams); + return 0; +} + + +/*! 
+ * \brief numaWindowedMean() + * + * \param[in] nas + * \param[in] wc half width of the convolution window + * \return nad after low-pass filtering, or NULL on error + * + * <pre> + * Notes: + * (1) This is a convolution. The window has width = 2 * %wc + 1. + * (2) We add a mirrored border of size %wc to each end of the array. + * </pre> + */ +NUMA * +numaWindowedMean(NUMA *nas, + l_int32 wc) +{ +l_int32 i, n, n1, width; +l_float32 sum, norm; +l_float32 *fa1, *fad, *suma; +NUMA *na1, *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + n = numaGetCount(nas); + width = 2 * wc + 1; /* filter width */ + if (width > n) + L_WARNING("filter wider than input array!\n", __func__); + + na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER); + n1 = n + 2 * wc; + fa1 = numaGetFArray(na1, L_NOCOPY); + nad = numaMakeConstant(0, n); + fad = numaGetFArray(nad, L_NOCOPY); + + /* Make sum array; note the indexing */ + if ((suma = (l_float32 *)LEPT_CALLOC(n1 + 1, sizeof(l_float32))) == NULL) { + numaDestroy(&na1); + numaDestroy(&nad); + return (NUMA *)ERROR_PTR("suma not made", __func__, NULL); + } + sum = 0.0; + suma[0] = 0.0; + for (i = 0; i < n1; i++) { + sum += fa1[i]; + suma[i + 1] = sum; + } + + norm = 1.f / (2 * wc + 1); + for (i = 0; i < n; i++) + fad[i] = norm * (suma[width + i] - suma[i]); + + LEPT_FREE(suma); + numaDestroy(&na1); + return nad; +} + + +/*! + * \brief numaWindowedMeanSquare() + * + * \param[in] nas + * \param[in] wc half width of the window + * \return nad containing windowed mean square values, or NULL on error + * + * <pre> + * Notes: + * (1) The window has width = 2 * %wc + 1. + * (2) We add a mirrored border of size %wc to each end of the array. 
+ * </pre> + */ +NUMA * +numaWindowedMeanSquare(NUMA *nas, + l_int32 wc) +{ +l_int32 i, n, n1, width; +l_float32 sum, norm; +l_float32 *fa1, *fad, *suma; +NUMA *na1, *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + n = numaGetCount(nas); + width = 2 * wc + 1; /* filter width */ + if (width > n) + L_WARNING("filter wider than input array!\n", __func__); + + na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER); + n1 = n + 2 * wc; + fa1 = numaGetFArray(na1, L_NOCOPY); + nad = numaMakeConstant(0, n); + fad = numaGetFArray(nad, L_NOCOPY); + + /* Make sum array; note the indexing */ + if ((suma = (l_float32 *)LEPT_CALLOC(n1 + 1, sizeof(l_float32))) == NULL) { + numaDestroy(&na1); + numaDestroy(&nad); + return (NUMA *)ERROR_PTR("suma not made", __func__, NULL); + } + sum = 0.0; + suma[0] = 0.0; + for (i = 0; i < n1; i++) { + sum += fa1[i] * fa1[i]; + suma[i + 1] = sum; + } + + norm = 1.f / (2 * wc + 1); + for (i = 0; i < n; i++) + fad[i] = norm * (suma[width + i] - suma[i]); + + LEPT_FREE(suma); + numaDestroy(&na1); + return nad; +} + + +/*! + * \brief numaWindowedVariance() + * + * \param[in] nam windowed mean values + * \param[in] nams windowed mean square values + * \param[out] pnav [optional] numa of variance -- the ms deviation + * from the mean + * \param[out] pnarv [optional] numa of rms deviation from the mean + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) The numas of windowed mean and mean square are precomputed, + * using numaWindowedMean() and numaWindowedMeanSquare(). 
+ * (2) Either or both of the variance and square-root of variance + * are returned, where the variance is the average over the + * window of the mean square difference of the pixel value + * from the mean: + * [(x - [x])*(x - [x])] = [x*x] - [x]*[x] + * </pre> + */ +l_ok +numaWindowedVariance(NUMA *nam, + NUMA *nams, + NUMA **pnav, + NUMA **pnarv) +{ +l_int32 i, nm, nms; +l_float32 var; +l_float32 *fam, *fams, *fav = NULL, *farv = NULL; +NUMA *nav, *narv; /* variance and square root of variance */ + + if (pnav) *pnav = NULL; + if (pnarv) *pnarv = NULL; + if (!pnav && !pnarv) + return ERROR_INT("neither &nav nor &narv are defined", __func__, 1); + if (!nam) + return ERROR_INT("nam not defined", __func__, 1); + if (!nams) + return ERROR_INT("nams not defined", __func__, 1); + nm = numaGetCount(nam); + nms = numaGetCount(nams); + if (nm != nms) + return ERROR_INT("sizes of nam and nams differ", __func__, 1); + + if (pnav) { + nav = numaMakeConstant(0, nm); + *pnav = nav; + fav = numaGetFArray(nav, L_NOCOPY); + } + if (pnarv) { + narv = numaMakeConstant(0, nm); + *pnarv = narv; + farv = numaGetFArray(narv, L_NOCOPY); + } + fam = numaGetFArray(nam, L_NOCOPY); + fams = numaGetFArray(nams, L_NOCOPY); + + for (i = 0; i < nm; i++) { + var = fams[i] - fam[i] * fam[i]; + if (pnav) + fav[i] = var; + if (pnarv) + farv[i] = sqrtf(var); + } + + return 0; +} + + +/*! + * \brief numaWindowedMedian() + * + * \param[in] nas + * \param[in] halfwin half width of window over which the median is found + * \return nad after windowed median filtering, or NULL on error + * + * <pre> + * Notes: + * (1) The requested window has width = 2 * %halfwin + 1. + * (2) If the input nas has less then 3 elements, return a copy. + * (3) If the filter is too small (%halfwin <= 0), return a copy. + * (4) If the filter is too large, it is reduced in size. + * (5) We add a mirrored border of size %halfwin to each end of + * the array to simplify the calculation by avoiding end-effects. 
+ * </pre> + */ +NUMA * +numaWindowedMedian(NUMA *nas, + l_int32 halfwin) +{ +l_int32 i, n; +l_float32 medval; +NUMA *na1, *na2, *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + if ((n = numaGetCount(nas)) < 3) + return numaCopy(nas); + if (halfwin <= 0) { + L_ERROR("filter too small; returning a copy\n", __func__); + return numaCopy(nas); + } + + if (halfwin > (n - 1) / 2) { + halfwin = (n - 1) / 2; + L_INFO("reducing filter to halfwin = %d\n", __func__, halfwin); + } + + /* Add a border to both ends */ + na1 = numaAddSpecifiedBorder(nas, halfwin, halfwin, L_MIRRORED_BORDER); + + /* Get the median value at the center of each window, corresponding + * to locations in the input nas. */ + nad = numaCreate(n); + for (i = 0; i < n; i++) { + na2 = numaClipToInterval(na1, i, i + 2 * halfwin); + numaGetMedian(na2, &medval); + numaAddNumber(nad, medval); + numaDestroy(&na2); + } + + numaDestroy(&na1); + return nad; +} + + +/*! + * \brief numaConvertToInt() + * + * \param[in] nas source numa + * \return na with all values rounded to nearest integer, or + * NULL on error + */ +NUMA * +numaConvertToInt(NUMA *nas) +{ +l_int32 i, n, ival; +NUMA *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + + n = numaGetCount(nas); + if ((nad = numaCreate(n)) == NULL) + return (NUMA *)ERROR_PTR("nad not made", __func__, NULL); + numaCopyParameters(nad, nas); + for (i = 0; i < n; i++) { + numaGetIValue(nas, i, &ival); + numaAddNumber(nad, ival); + } + return nad; +} + + +/*----------------------------------------------------------------------* + * Histogram generation and statistics * + *----------------------------------------------------------------------*/ +/*! 
 * \brief   numaMakeHistogram()
 *
 * \param[in]    na
 * \param[in]    maxbins     max number of histogram bins
 * \param[out]   pbinsize    [optional] size of histogram bins
 * \param[out]   pbinstart   [optional] start val of minimum bin;
 *                           input NULL to force start at 0
 * \return  na consisting of histogram of integerized values,
 *          or NULL on error.
 *
 * <pre>
 * Notes:
 *      (1) This simple interface is designed for integer data.
 *          The bins are of integer width and start on integer boundaries,
 *          so the results on float data will not have high precision.
 *      (2) Specify the max number of input bins.  Then %binsize,
 *          the size of bins necessary to accommodate the input data,
 *          is returned.  It is optionally returned and one of the sequence:
 *              {1, 2, 5, 10, 20, 50, ...}.
 *      (3) If &binstart is given, all values are accommodated,
 *          and the min value of the starting bin is returned.
 *          Otherwise, all negative values are discarded and
 *          the histogram bins start at 0.
 * </pre>
 */
NUMA *
numaMakeHistogram(NUMA     *na,
                  l_int32   maxbins,
                  l_int32  *pbinsize,
                  l_int32  *pbinstart)
{
l_int32    i, n, ival, hval;
l_int32    iminval, imaxval, range, binsize, nbins, ibin;
l_float32  val, ratio;
NUMA      *nai, *nahist;

    if (pbinsize) *pbinsize = 0;
    if (pbinstart) *pbinstart = 0;
    if (!na)
        return (NUMA *)ERROR_PTR("na not defined", __func__, NULL);
    if (maxbins < 1)
        return (NUMA *)ERROR_PTR("maxbins < 1", __func__, NULL);

        /* Determine input range, rounding the min and max to integers */
    numaGetMin(na, &val, NULL);
    iminval = (l_int32)(val + 0.5);
    numaGetMax(na, &val, NULL);
    imaxval = (l_int32)(val + 0.5);
    if (pbinstart == NULL) {  /* clip negative vals; start from 0 */
        iminval = 0;
        if (imaxval < 0)
            return (NUMA *)ERROR_PTR("all values < 0", __func__, NULL);
    }

        /* Determine binsize: the smallest entry of BinSizeArray that
         * keeps the number of bins at or under %maxbins */
    range = imaxval - iminval + 1;
    if (range > maxbins - 1) {
        ratio = (l_float32)range / (l_float32)maxbins;
        binsize = 0;
        for (i = 0; i < NBinSizes; i++) {
            if (ratio < BinSizeArray[i]) {
                binsize = BinSizeArray[i];
                break;
            }
        }
        if (binsize == 0)
            return (NUMA *)ERROR_PTR("numbers too large", __func__, NULL);
    } else {
        binsize = 1;
    }
    if (pbinsize) *pbinsize = binsize;
    nbins = 1 + range / binsize;  /* +1 seems to be sufficient */

        /* Redetermine iminval: snap it down to a multiple of binsize
         * so all bins start on binsize boundaries */
    if (pbinstart && binsize > 1) {
        if (iminval >= 0)
            iminval = binsize * (iminval / binsize);
        else
            iminval = binsize * ((iminval - binsize + 1) / binsize);
    }
    if (pbinstart) *pbinstart = iminval;

#if DEBUG_HISTO
    lept_stderr(" imaxval = %d, range = %d, nbins = %d\n",
                imaxval, range, nbins);
#endif  /* DEBUG_HISTO */

        /* Use integerized data for input */
    if ((nai = numaConvertToInt(na)) == NULL)
        return (NUMA *)ERROR_PTR("nai not made", __func__, NULL);
    n = numaGetCount(nai);

        /* Make histogram, converting value in input array
         * into a bin number for this histogram array. */
    if ((nahist = numaCreate(nbins)) == NULL) {
        numaDestroy(&nai);
        return (NUMA *)ERROR_PTR("nahist not made", __func__, NULL);
    }
    numaSetCount(nahist, nbins);
    numaSetParameters(nahist, iminval, binsize);
    for (i = 0; i < n; i++) {
        numaGetIValue(nai, i, &ival);
        ibin = (ival - iminval) / binsize;
        if (ibin >= 0 && ibin < nbins) {
            numaGetIValue(nahist, ibin, &hval);
            numaSetValue(nahist, ibin, hval + 1.0f);
        }
    }

    numaDestroy(&nai);
    return nahist;
}


/*!
 * \brief   numaMakeHistogramAuto()
 *
 * \param[in]    na        numa of floats; these may be integers
 * \param[in]    maxbins   max number of histogram bins; >= 1
 * \return  na consisting of histogram of quantized float values,
 *          or NULL on error.
 *
 * <pre>
 * Notes:
 *      (1) This simple interface is designed for accurate binning
 *          of both integer and float data.
 *      (2) If the array data is integers, and the range of integers
 *          is smaller than %maxbins, they are binned as they fall,
 *          with binsize = 1.
+ * (3) If the range of data, (maxval - minval), is larger than + * %maxbins, or if the data is floats, they are binned into + * exactly %maxbins bins. + * (4) Unlike numaMakeHistogram(), these bins in general have + * non-integer location and width, even for integer data. + * </pre> + */ +NUMA * +numaMakeHistogramAuto(NUMA *na, + l_int32 maxbins) +{ +l_int32 i, n, imin, imax, irange, ibin, ival, allints; +l_float32 minval, maxval, range, binsize, fval; +NUMA *nah; + + if (!na) + return (NUMA *)ERROR_PTR("na not defined", __func__, NULL); + maxbins = L_MAX(1, maxbins); + + /* Determine input range */ + numaGetMin(na, &minval, NULL); + numaGetMax(na, &maxval, NULL); + + /* Determine if values are all integers */ + n = numaGetCount(na); + numaHasOnlyIntegers(na, &allints); + + /* Do simple integer binning if possible */ + if (allints && (maxval - minval < maxbins)) { + imin = (l_int32)minval; + imax = (l_int32)maxval; + irange = imax - imin + 1; + nah = numaCreate(irange); + numaSetCount(nah, irange); /* init */ + numaSetParameters(nah, minval, 1.0); + for (i = 0; i < n; i++) { + numaGetIValue(na, i, &ival); + ibin = ival - imin; + numaGetIValue(nah, ibin, &ival); + numaSetValue(nah, ibin, ival + 1.0f); + } + + return nah; + } + + /* Do float binning, even if the data is integers. */ + range = maxval - minval; + binsize = range / (l_float32)maxbins; + if (range == 0.0) { + nah = numaCreate(1); + numaSetParameters(nah, minval, binsize); + numaAddNumber(nah, n); + return nah; + } + nah = numaCreate(maxbins); + numaSetCount(nah, maxbins); + numaSetParameters(nah, minval, binsize); + for (i = 0; i < n; i++) { + numaGetFValue(na, i, &fval); + ibin = (l_int32)((fval - minval) / binsize); + ibin = L_MIN(ibin, maxbins - 1); /* "edge" case; stay in bounds */ + numaGetIValue(nah, ibin, &ival); + numaSetValue(nah, ibin, ival + 1.0f); + } + + return nah; +} + + +/*! 
+ * \brief numaMakeHistogramClipped() + * + * \param[in] na + * \param[in] binsize typically 1.0 + * \param[in] maxsize of histogram ordinate + * \return na histogram of bins of size %binsize, starting with + * the na[0] (x = 0.0 and going up to a maximum of + * x = %maxsize, by increments of %binsize), or NULL on error + * + * <pre> + * Notes: + * (1) This simple function generates a histogram of values + * from na, discarding all values < 0.0 or greater than + * min(%maxsize, maxval), where maxval is the maximum value in na. + * The histogram data is put in bins of size delx = %binsize, + * starting at x = 0.0. We use as many bins as are + * needed to hold the data. + * </pre> + */ +NUMA * +numaMakeHistogramClipped(NUMA *na, + l_float32 binsize, + l_float32 maxsize) +{ +l_int32 i, n, nbins, ival, ibin; +l_float32 val, maxval; +NUMA *nad; + + if (!na) + return (NUMA *)ERROR_PTR("na not defined", __func__, NULL); + if (binsize <= 0.0) + return (NUMA *)ERROR_PTR("binsize must be > 0.0", __func__, NULL); + if (binsize > maxsize) + binsize = maxsize; /* just one bin */ + + numaGetMax(na, &maxval, NULL); + n = numaGetCount(na); + maxsize = L_MIN(maxsize, maxval); + nbins = (l_int32)(maxsize / binsize) + 1; + +/* lept_stderr("maxsize = %7.3f, nbins = %d\n", maxsize, nbins); */ + + if ((nad = numaCreate(nbins)) == NULL) + return (NUMA *)ERROR_PTR("nad not made", __func__, NULL); + numaSetParameters(nad, 0.0, binsize); + numaSetCount(nad, nbins); /* interpret zeroes in bins as data */ + for (i = 0; i < n; i++) { + numaGetFValue(na, i, &val); + ibin = (l_int32)(val / binsize); + if (ibin >= 0 && ibin < nbins) { + numaGetIValue(nad, ibin, &ival); + numaSetValue(nad, ibin, ival + 1.0f); + } + } + + return nad; +} + + +/*! 
+ * \brief numaRebinHistogram() + * + * \param[in] nas input histogram + * \param[in] newsize number of old bins contained in each new bin + * \return nad more coarsely re-binned histogram, or NULL on error + */ +NUMA * +numaRebinHistogram(NUMA *nas, + l_int32 newsize) +{ +l_int32 i, j, ns, nd, index, count, val; +l_float32 start, oldsize; +NUMA *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + if (newsize <= 1) + return (NUMA *)ERROR_PTR("newsize must be > 1", __func__, NULL); + if ((ns = numaGetCount(nas)) == 0) + return (NUMA *)ERROR_PTR("no bins in nas", __func__, NULL); + + nd = (ns + newsize - 1) / newsize; + if ((nad = numaCreate(nd)) == NULL) + return (NUMA *)ERROR_PTR("nad not made", __func__, NULL); + numaGetParameters(nad, &start, &oldsize); + numaSetParameters(nad, start, oldsize * newsize); + + for (i = 0; i < nd; i++) { /* new bins */ + count = 0; + index = i * newsize; + for (j = 0; j < newsize; j++) { + if (index < ns) { + numaGetIValue(nas, index, &val); + count += val; + index++; + } + } + numaAddNumber(nad, count); + } + + return nad; +} + + +/*! 
+ * \brief numaNormalizeHistogram() + * + * \param[in] nas input histogram + * \param[in] tsum target sum of all numbers in dest histogram; e.g., use + * %tsum= 1.0 if this represents a probability distribution + * \return nad normalized histogram, or NULL on error + */ +NUMA * +numaNormalizeHistogram(NUMA *nas, + l_float32 tsum) +{ +l_int32 i, ns; +l_float32 sum, factor, fval; +NUMA *nad; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + if (tsum <= 0.0) + return (NUMA *)ERROR_PTR("tsum must be > 0.0", __func__, NULL); + if ((ns = numaGetCount(nas)) == 0) + return (NUMA *)ERROR_PTR("no bins in nas", __func__, NULL); + + numaGetSum(nas, &sum); + factor = tsum / sum; + + if ((nad = numaCreate(ns)) == NULL) + return (NUMA *)ERROR_PTR("nad not made", __func__, NULL); + numaCopyParameters(nad, nas); + + for (i = 0; i < ns; i++) { + numaGetFValue(nas, i, &fval); + fval *= factor; + numaAddNumber(nad, fval); + } + + return nad; +} + + +/*! + * \brief numaGetStatsUsingHistogram() + * + * \param[in] na an arbitrary set of numbers; not ordered and not + * a histogram + * \param[in] maxbins the maximum number of bins to be allowed in + * the histogram; use an integer larger than the + * largest number in %na for consecutive integer bins + * \param[out] pmin [optional] min value of set + * \param[out] pmax [optional] max value of set + * \param[out] pmean [optional] mean value of set + * \param[out] pvariance [optional] variance + * \param[out] pmedian [optional] median value of set + * \param[in] rank in [0.0 ... 1.0]; median has a rank 0.5; + * ignored if &rval == NULL + * \param[out] prval [optional] value in na corresponding to %rank + * \param[out] phisto [optional] Numa histogram; use NULL to prevent + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This is a simple interface for gathering statistics + * from a numa, where a histogram is used 'under the covers' + * to avoid sorting if a rank value is requested. 
In that case, + * by using a histogram we are trading speed for accuracy, because + * the values in %na are quantized to the center of a set of bins. + * (2) If the median, other rank value, or histogram are not requested, + * the calculation is all performed on the input Numa. + * (3) The variance is the average of the square of the + * difference from the mean. The median is the value in na + * with rank 0.5. + * (4) There are two situations where this gives rank results with + * accuracy comparable to computing stastics directly on the input + * data, without binning into a histogram: + * (a) the data is integers and the range of data is less than + * %maxbins, and + * (b) the data is floats and the range is small compared to + * %maxbins, so that the binsize is much less than 1. + * (5) If a histogram is used and the numbers in the Numa extend + * over a large range, you can limit the required storage by + * specifying the maximum number of bins in the histogram. + * Use %maxbins == 0 to force the bin size to be 1. + * (6) This optionally returns the median and one arbitrary rank value. + * If you need several rank values, return the histogram and use + * numaHistogramGetValFromRank(nah, rank, &rval) + * multiple times. 
+ * </pre> + */ +l_ok +numaGetStatsUsingHistogram(NUMA *na, + l_int32 maxbins, + l_float32 *pmin, + l_float32 *pmax, + l_float32 *pmean, + l_float32 *pvariance, + l_float32 *pmedian, + l_float32 rank, + l_float32 *prval, + NUMA **phisto) +{ +l_int32 i, n; +l_float32 minval, maxval, fval, mean, sum; +NUMA *nah; + + if (pmin) *pmin = 0.0; + if (pmax) *pmax = 0.0; + if (pmean) *pmean = 0.0; + if (pvariance) *pvariance = 0.0; + if (pmedian) *pmedian = 0.0; + if (prval) *prval = 0.0; + if (phisto) *phisto = NULL; + if (!na) + return ERROR_INT("na not defined", __func__, 1); + if ((n = numaGetCount(na)) == 0) + return ERROR_INT("numa is empty", __func__, 1); + + numaGetMin(na, &minval, NULL); + numaGetMax(na, &maxval, NULL); + if (pmin) *pmin = minval; + if (pmax) *pmax = maxval; + if (pmean || pvariance) { + sum = 0.0; + for (i = 0; i < n; i++) { + numaGetFValue(na, i, &fval); + sum += fval; + } + mean = sum / (l_float32)n; + if (pmean) *pmean = mean; + } + if (pvariance) { + sum = 0.0; + for (i = 0; i < n; i++) { + numaGetFValue(na, i, &fval); + sum += fval * fval; + } + *pvariance = sum / (l_float32)n - mean * mean; + } + + if (!pmedian && !prval && !phisto) + return 0; + + nah = numaMakeHistogramAuto(na, maxbins); + if (pmedian) + numaHistogramGetValFromRank(nah, 0.5, pmedian); + if (prval) + numaHistogramGetValFromRank(nah, rank, prval); + if (phisto) + *phisto = nah; + else + numaDestroy(&nah); + return 0; +} + + +/*! + * \brief numaGetHistogramStats() + * + * \param[in] nahisto histogram: y(x(i)), i = 0 ... 
nbins - 1
 * \param[in]    startx      x value of first bin: x(0)
 * \param[in]    deltax      x increment between bins; the bin size;
 *                           x(1) - x(0)
 * \param[out]   pxmean      [optional] mean value of histogram
 * \param[out]   pxmedian    [optional] median value of histogram
 * \param[out]   pxmode      [optional] mode value of histogram:
 *                           xmode = x(imode), where y(xmode) >= y(x(i)) for
 *                           all i != imode
 * \param[out]   pxvariance  [optional] variance of x
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If the histogram represents the relation y(x), the
 *          computed values that are returned are the x values.
 *          These are NOT the bucket indices i; they are related to the
 *          bucket indices by
 *                x(i) = startx + i * deltax
 * </pre>
 */
l_ok
numaGetHistogramStats(NUMA       *nahisto,
                      l_float32   startx,
                      l_float32   deltax,
                      l_float32  *pxmean,
                      l_float32  *pxmedian,
                      l_float32  *pxmode,
                      l_float32  *pxvariance)
{
        /* Zero all optional outputs so they are defined on error */
    if (pxmean) *pxmean = 0.0;
    if (pxmedian) *pxmedian = 0.0;
    if (pxmode) *pxmode = 0.0;
    if (pxvariance) *pxvariance = 0.0;
    if (!nahisto)
        return ERROR_INT("nahisto not defined", __func__, 1);

        /* Delegate to the interval version over the full range:
         * ifirst = 0, ilast = -1 means "to the end" */
    return numaGetHistogramStatsOnInterval(nahisto, startx, deltax, 0, -1,
                                           pxmean, pxmedian, pxmode,
                                           pxvariance);
}


/*!
 * \brief   numaGetHistogramStatsOnInterval()
 *
 * \param[in]    nahisto     histogram: y(x(i)), i = 0 ... nbins - 1
 * \param[in]    startx      x value of first bin: x(0)
 * \param[in]    deltax      x increment between bins; the bin size;
 *                           x(1) - x(0)
 * \param[in]    ifirst      first bin to use for collecting stats
 * \param[in]    ilast       last bin for collecting stats; -1 to go to
 *                           the end
 * \param[out]   pxmean      [optional] mean value of histogram
 * \param[out]   pxmedian    [optional] median value of histogram
 * \param[out]   pxmode      [optional] mode value of histogram:
 *                           xmode = x(imode), where y(xmode) >= y(x(i)) for
 *                           all i != imode
 * \param[out]   pxvariance  [optional] variance of x
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If the histogram represents the relation y(x), the
 *          computed values that are returned are the x values.
 *          These are NOT the bucket indices i; they are related to the
 *          bucket indices by
 *                x(i) = startx + i * deltax
 * </pre>
 */
l_ok
numaGetHistogramStatsOnInterval(NUMA       *nahisto,
                                l_float32   startx,
                                l_float32   deltax,
                                l_int32     ifirst,
                                l_int32     ilast,
                                l_float32  *pxmean,
                                l_float32  *pxmedian,
                                l_float32  *pxmode,
                                l_float32  *pxvariance)
{
l_int32    i, n, imax;
l_float32  sum, sumval, halfsum, moment, var, x, y, ymax;

        /* Zero all optional outputs so they are defined on error */
    if (pxmean) *pxmean = 0.0;
    if (pxmedian) *pxmedian = 0.0;
    if (pxmode) *pxmode = 0.0;
    if (pxvariance) *pxvariance = 0.0;
    if (!nahisto)
        return ERROR_INT("nahisto not defined", __func__, 1);
    if (!pxmean && !pxmedian && !pxmode && !pxvariance)
        return ERROR_INT("nothing to compute", __func__, 1);

        /* Clip the requested interval to the valid index range */
    n = numaGetCount(nahisto);
    ifirst = L_MAX(0, ifirst);
    if (ilast < 0) ilast = n - 1;
    if (ifirst >= n)
        return ERROR_INT("invalid ifirst", __func__, 1);
    if (ilast >= n) {
        L_WARNING("ilast = %d is beyond max index = %d; adjusting\n",
                  __func__, ilast, n - 1);
        ilast = n - 1;
    }
    if (ifirst > ilast)
        return ERROR_INT("ifirst > ilast", __func__, 1);

        /* Accumulate the zeroth, first and second moments of x,
         * weighted by the histogram counts y */
    for (sum = 0.0, moment = 0.0, var = 0.0, i = ifirst; i <= ilast ; i++) {
        x = startx + i * deltax;
        numaGetFValue(nahisto, i, &y);
        sum += y;
        moment += x * y;
        var += x * x * y;
    }
    if (sum == 0.0) {  /* no counts in the interval; outputs stay 0 */
        L_INFO("sum is 0\n", __func__);
        return 0;
    }

    if (pxmean)
        *pxmean = moment / sum;
    if (pxvariance)
        *pxvariance = var / sum - moment * moment / (sum * sum);

        /* Median: first x at which the cumulative count reaches half */
    if (pxmedian) {
        halfsum = sum / 2.0f;
        for (sumval = 0.0, i = ifirst; i <= ilast; i++) {
            numaGetFValue(nahisto, i, &y);
            sumval += y;
            if (sumval >= halfsum) {
                *pxmedian = startx + i * deltax;
                break;
            }
        }
    }

        /* Mode: x of the bin with the largest count */
    if (pxmode) {
        imax = -1;
        ymax = -1.0e10;
        for (i = ifirst; i <= ilast; i++) {
            numaGetFValue(nahisto, i, &y);
            if (y > ymax) {
                ymax = y;
                imax = i;
            }
        }
        *pxmode = startx + imax * deltax;
    }

    return 0;
}


/*!
 * \brief   numaMakeRankFromHistogram()
 *
 * \param[in]    startx   xval corresponding to first element in nay
 * \param[in]    deltax   x increment between array elements in nay
 * \param[in]    nasy     input histogram, assumed equally spaced
 * \param[in]    npts     number of points to evaluate rank function
 * \param[out]   pnax     [optional] array of x values in range
 * \param[out]   pnay     rank array of specified npts
 * \return  0 if OK, 1 on error
 */
l_ok
numaMakeRankFromHistogram(l_float32  startx,
                          l_float32  deltax,
                          NUMA      *nasy,
                          l_int32    npts,
                          NUMA     **pnax,
                          NUMA     **pnay)
{
l_int32    i, n;
l_float32  sum, fval;
NUMA      *nan, *nar;

    if (pnax) *pnax = NULL;
    if (!pnay)
        return ERROR_INT("&nay not defined", __func__, 1);
    *pnay = NULL;
    if (!nasy)
        return ERROR_INT("nasy not defined", __func__, 1);
    if ((n = numaGetCount(nasy)) == 0)
        return ERROR_INT("no bins in nas", __func__, 1);

        /* Normalize and generate the rank array corresponding to
         * the binned histogram.  The rank array is the cumulative
         * sum of the normalized histogram, with n + 1 entries. */
    nan = numaNormalizeHistogram(nasy, 1.0);
    nar = numaCreate(n + 1);  /* rank numa corresponding to nan */
    sum = 0.0;
    numaAddNumber(nar, sum);  /* first element is 0.0 */
    for (i = 0; i < n; i++) {
        numaGetFValue(nan, i, &fval);
        sum += fval;
        numaAddNumber(nar, sum);
    }

        /* Compute rank array on full range with specified
         * number of points and correspondence to x-values. */
    numaInterpolateEqxInterval(startx, deltax, nar, L_LINEAR_INTERP,
                               startx, startx + n * deltax, npts,
                               pnax, pnay);
    numaDestroy(&nan);
    numaDestroy(&nar);
    return 0;
}


/*!
 * \brief   numaHistogramGetRankFromVal()
 *
 * \param[in]    na      histogram
 * \param[in]    rval    value of input sample for which we want the rank
 * \param[out]   prank   fraction of total samples below rval
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If we think of the histogram as a function y(x), normalized
 *          to 1, for a given input value of x, this computes the
 *          rank of x, which is the integral of y(x) from the start
 *          value of x to the input value.
 *      (2) This function only makes sense when applied to a Numa that
 *          is a histogram.  The values in the histogram can be ints and
 *          floats, and are computed as floats.  The rank is returned
 *          as a float between 0.0 and 1.0.
 *      (3) The numa parameters startx and binsize are used to
 *          compute x from the Numa index i.
+ * </pre> + */ +l_ok +numaHistogramGetRankFromVal(NUMA *na, + l_float32 rval, + l_float32 *prank) +{ +l_int32 i, ibinval, n; +l_float32 startval, binsize, binval, maxval, fractval, total, sum, val; + + if (!prank) + return ERROR_INT("prank not defined", __func__, 1); + *prank = 0.0; + if (!na) + return ERROR_INT("na not defined", __func__, 1); + numaGetParameters(na, &startval, &binsize); + n = numaGetCount(na); + if (rval < startval) + return 0; + maxval = startval + n * binsize; + if (rval > maxval) { + *prank = 1.0; + return 0; + } + + binval = (rval - startval) / binsize; + ibinval = (l_int32)binval; + if (ibinval >= n) { + *prank = 1.0; + return 0; + } + fractval = binval - (l_float32)ibinval; + + sum = 0.0; + for (i = 0; i < ibinval; i++) { + numaGetFValue(na, i, &val); + sum += val; + } + numaGetFValue(na, ibinval, &val); + sum += fractval * val; + numaGetSum(na, &total); + *prank = sum / total; + +/* lept_stderr("binval = %7.3f, rank = %7.3f\n", binval, *prank); */ + + return 0; +} + + +/*! + * \brief numaHistogramGetValFromRank() + * + * \param[in] na histogram + * \param[in] rank fraction of total samples + * \param[out] prval approx. to the bin value + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) If we think of the histogram as a function y(x), this returns + * the value x such that the integral of y(x) from the start + * value to x gives the fraction 'rank' of the integral + * of y(x) over all bins. + * (2) This function only makes sense when applied to a Numa that + * is a histogram. The values in the histogram can be ints and + * floats, and are computed as floats. The val is returned + * as a float, even though the buckets are of integer width. + * (3) The numa parameters startx and binsize are used to + * compute x from the Numa index i. 
+ * </pre> + */ +l_ok +numaHistogramGetValFromRank(NUMA *na, + l_float32 rank, + l_float32 *prval) +{ +l_int32 i, n; +l_float32 startval, binsize, rankcount, total, sum, fract, val; + + if (!prval) + return ERROR_INT("prval not defined", __func__, 1); + *prval = 0.0; + if (!na) + return ERROR_INT("na not defined", __func__, 1); + if (rank < 0.0) { + L_WARNING("rank < 0; setting to 0.0\n", __func__); + rank = 0.0; + } + if (rank > 1.0) { + L_WARNING("rank > 1.0; setting to 1.0\n", __func__); + rank = 1.0; + } + + n = numaGetCount(na); + numaGetParameters(na, &startval, &binsize); + numaGetSum(na, &total); + rankcount = rank * total; /* count that corresponds to rank */ + sum = 0.0; + val = 0.0; + for (i = 0; i < n; i++) { + numaGetFValue(na, i, &val); + if (sum + val >= rankcount) + break; + sum += val; + } + if (val <= 0.0) /* can == 0 if rank == 0.0 */ + fract = 0.0; + else /* sum + fract * val = rankcount */ + fract = (rankcount - sum) / val; + + /* The use of the fraction of a bin allows a simple calculation + * for the histogram value at the given rank. */ + *prval = startval + binsize * ((l_float32)i + fract); + +/* lept_stderr("rank = %7.3f, val = %7.3f\n", rank, *prval); */ + + return 0; +} + + +/*! + * \brief numaDiscretizeSortedInBins() + * + * \param[in] na sorted + * \param[in] nbins number of equal population bins (> 1) + * \param[out] pnabinval average "gray" values in each bin + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) The input %na is sorted in increasing value. + * (2) The output array has the following mapping: + * bin number --> average array value in bin (nabinval) + * (3) With %nbins == 100, nabinval is the average gray value in + * each of the 100 equally populated bins. It is the function + * gray[100 * rank]. + * Thus it is the inverse of + * rank[gray] + * (4) Contast with numaDiscretizeHistoInBins(), where the input %na + * is a histogram. 
+ * </pre> + */ +l_ok +numaDiscretizeSortedInBins(NUMA *na, + l_int32 nbins, + NUMA **pnabinval) +{ +NUMA *nabinval; /* average gray value in the bins */ +NUMA *naeach; +l_int32 i, ntot, bincount, binindex, binsize; +l_float32 sum, val, ave; + + if (!pnabinval) + return ERROR_INT("&nabinval not defined", __func__, 1); + *pnabinval = NULL; + if (!na) + return ERROR_INT("na not defined", __func__, 1); + if (nbins < 2) + return ERROR_INT("nbins must be > 1", __func__, 1); + + /* Get the number of items in each bin */ + ntot = numaGetCount(na); + if ((naeach = numaGetUniformBinSizes(ntot, nbins)) == NULL) + return ERROR_INT("naeach not made", __func__, 1); + + /* Get the average value in each bin */ + sum = 0.0; + bincount = 0; + binindex = 0; + numaGetIValue(naeach, 0, &binsize); + nabinval = numaCreate(nbins); + for (i = 0; i < ntot; i++) { + numaGetFValue(na, i, &val); + bincount++; + sum += val; + if (bincount == binsize) { /* add bin entry */ + ave = sum / binsize; + numaAddNumber(nabinval, ave); + sum = 0.0; + bincount = 0; + binindex++; + if (binindex == nbins) break; + numaGetIValue(naeach, binindex, &binsize); + } + } + *pnabinval = nabinval; + + numaDestroy(&naeach); + return 0; +} + + +/*! + * \brief numaDiscretizeHistoInBins() + * + * \param[in] na histogram + * \param[in] nbins number of equal population bins (> 1) + * \param[out] pnabinval average "gray" values in each bin + * \param[out] pnarank [optional] rank value of input histogram; + * this is a cumulative norm histogram. + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) With %nbins == 100, nabinval is the average gray value in + * each of the 100 equally populated bins. It is the function + * gray[100 * rank]. + * Thus it is the inverse of + * rank[gray] + * which is optionally returned in narank. + * (2) The "gray value" is the index into the input histogram. 
 *      (3) The two output arrays give the following mappings, where the
 *          input is an un-normalized histogram of array values:
 *             bin number --> average array value in bin (nabinval)
 *             array values --> cumulative normalized histogram (narank)
 * </pre>
 */
l_ok
numaDiscretizeHistoInBins(NUMA    *na,
                          l_int32  nbins,
                          NUMA   **pnabinval,
                          NUMA   **pnarank)
{
NUMA      *nabinval;  /* average gray value in the bins */
NUMA      *naeach, *nan;
l_int32    i, j, nxvals, occup, count, bincount, binindex, binsize;
l_float32  sum, ave, ntot;

        /* Zero outputs so they are defined on error */
    if (pnarank) *pnarank = NULL;
    if (!pnabinval)
        return ERROR_INT("&nabinval not defined", __func__, 1);
    *pnabinval = NULL;
    if (!na)
        return ERROR_INT("na not defined", __func__, 1);
    if (nbins < 2)
        return ERROR_INT("nbins must be > 1", __func__, 1);

    nxvals = numaGetCount(na);
    numaGetSum(na, &ntot);
    occup = ntot / nxvals;  /* average samples per histogram entry */
    if (occup < 1) L_INFO("average occupancy %d < 1\n", __func__, occup);

        /* Get the number of items in each bin */
    if ((naeach = numaGetUniformBinSizes(ntot, nbins)) == NULL)
        return ERROR_INT("naeach not made", __func__, 1);

        /* Get the average value in each bin.  Histogram entry i holds
         * %count samples whose value is i; distribute them one at a
         * time into the equal-population bins, emitting the bin mean
         * each time a bin fills. */
    sum = 0.0;
    bincount = 0;
    binindex = 0;
    numaGetIValue(naeach, 0, &binsize);
    nabinval = numaCreate(nbins);
    for (i = 0; i < nxvals; i++) {
        numaGetIValue(na, i, &count);
        for (j = 0; j < count; j++) {
            bincount++;
            sum += i;
            if (bincount == binsize) {  /* add bin entry */
                ave = sum / binsize;
                numaAddNumber(nabinval, ave);
                sum = 0.0;
                bincount = 0;
                binindex++;
                if (binindex == nbins) break;
                numaGetIValue(naeach, binindex, &binsize);
            }
        }
        if (binindex == nbins) break;  /* propagate the inner break */
    }
    *pnabinval = nabinval;
    if (binindex != nbins)
        L_ERROR("binindex = %d != nbins = %d\n", __func__, binindex, nbins);

        /* Get cumulative normalized histogram (rank[gray value]).
         * This is the partial sum operating on the normalized histogram. */
    if (pnarank) {
        nan = numaNormalizeHistogram(na, 1.0);
        *pnarank = numaGetPartialSums(nan);
        numaDestroy(&nan);
    }
    numaDestroy(&naeach);
    return 0;
}


/*!
 * \brief   numaGetRankBinValues()
 *
 * \param[in]    na      an array of values
 * \param[in]    nbins   number of bins at which the rank is divided
 * \param[out]   pnam    mean intensity in a bin vs rank bin value,
 *                       with %nbins of discretized rank values
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Simple interface for getting a binned rank representation
 *          of an input array of values.  This returns:
 *             rank bin number --> average array value in each rank bin (nam)
 *      (2) Uses bins either a sorted array or a histogram, depending on
 *          the values in the array and the size of the array.
 * </pre>
 */
l_ok
numaGetRankBinValues(NUMA    *na,
                     l_int32  nbins,
                     NUMA   **pnam)
{
NUMA      *na1;
l_int32    maxbins, type;
l_float32  maxval, delx;

    if (!pnam)
        return ERROR_INT("&pnam not defined", __func__, 1);
    *pnam = NULL;
    if (!na)
        return ERROR_INT("na not defined", __func__, 1);
    if (numaGetCount(na) == 0)
        return ERROR_INT("na is empty", __func__, 1);
    if (nbins < 2)
        return ERROR_INT("nbins must be > 1", __func__, 1);

        /* Choose between sorted array and a histogram.
         * If the input array has a small number of numbers with
         * a large maximum, we will sort it.  At the other extreme, if
         * the array has many numbers with a small maximum, such as the
         * values of pixels in an 8 bpp grayscale image, generate a
         * histogram.  If type comes back as L_BIN_SORT, use a histogram. */
    type = numaChooseSortType(na);
    if (type == L_SHELL_SORT) {  /* sort the array */
        L_INFO("sort the array: input size = %d\n", __func__,
               numaGetCount(na));
        na1 = numaSort(NULL, na, L_SORT_INCREASING);
        numaDiscretizeSortedInBins(na1, nbins, pnam);
        numaDestroy(&na1);
        return 0;
    }

        /* Make the histogram.  Assuming there are no negative values
         * in the array, if the max value in the array does not exceed
         * about 100000, the bin size for generating the histogram will
         * be 1; maxbins refers to the number of entries in the histogram. */
    L_INFO("use a histogram: input size = %d\n", __func__, numaGetCount(na));
    numaGetMax(na, &maxval, NULL);
    maxbins = L_MIN(100002, (l_int32)maxval + 2);
    na1 = numaMakeHistogram(na, maxbins, NULL, NULL);

        /* Warn if there is a scale change.  This shouldn't happen
         * unless the max value is above 100000. */
    numaGetParameters(na1, NULL, &delx);
    if (delx > 1.0)
        L_WARNING("scale change: delx = %6.2f\n", __func__, delx);

        /* Rank bin the results */
    numaDiscretizeHistoInBins(na1, nbins, pnam, NULL);
    numaDestroy(&na1);
    return 0;
}


/*!
 * \brief   numaGetUniformBinSizes()
 *
 * \param[in]    ntotal   number of values to be split up
 * \param[in]    nbins    number of bins
 * \return  naeach  number of values to go in each bin, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The numbers in the bins can differ by 1.  The sum of
 *          bin numbers in %naeach is %ntotal.
+ * </pre> + */ +NUMA * +numaGetUniformBinSizes(l_int32 ntotal, + l_int32 nbins) +{ +l_int32 i, start, end; +NUMA *naeach; + + if (ntotal <= 0) + return (NUMA *)ERROR_PTR("ntotal <= 0", __func__, NULL); + if (nbins <= 0) + return (NUMA *)ERROR_PTR("nbins <= 0", __func__, NULL); + + if ((naeach = numaCreate(nbins)) == NULL) + return (NUMA *)ERROR_PTR("naeach not made", __func__, NULL); + + if (ntotal < nbins) { /* put 1 in each of %ntotal bins */ + for (i = 0; i < ntotal; i++) + numaAddNumber(naeach, 1); + return naeach; + } + + start = 0; + for (i = 0; i < nbins; i++) { + end = ntotal * (i + 1) / nbins; + numaAddNumber(naeach, end - start); + start = end; + } + return naeach; +} + + +/*----------------------------------------------------------------------* + * Splitting a distribution * + *----------------------------------------------------------------------*/ +/*! + * \brief numaSplitDistribution() + * + * \param[in] na histogram + * \param[in] scorefract fraction of the max score, used to determine + * range over which the histogram min is searched + * \param[out] psplitindex [optional] index for splitting + * \param[out] pave1 [optional] average of lower distribution + * \param[out] pave2 [optional] average of upper distribution + * \param[out] pnum1 [optional] population of lower distribution + * \param[out] pnum2 [optional] population of upper distribution + * \param[out] pnascore [optional] for debugging; otherwise use NULL + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This function is intended to be used on a distribution of + * values that represent two sets, such as a histogram of + * pixel values for an image with a fg and bg, and the goal + * is to determine the averages of the two sets and the + * best splitting point. 
 *      (2) The Otsu method finds a split point that divides the distribution
 *          into two parts by maximizing a score function that is the
 *          product of two terms:
 *            (a) the square of the difference of centroids, (ave1 - ave2)^2
 *            (b) fract1 * (1 - fract1)
 *          where fract1 is the fraction in the lower distribution.
 *      (3) This works well for images where the fg and bg are
 *          each relatively homogeneous and well-separated in color.
 *          However, if the actual fg and bg sets are very different
 *          in size, and the bg is highly varied, as can occur in some
 *          scanned document images, this will bias the split point
 *          into the larger "bump" (i.e., toward the point where the
 *          (b) term reaches its maximum of 0.25 at fract1 = 0.5.
 *          To avoid this, we define a range of values near the
 *          maximum of the score function, and choose the value within
 *          this range such that the histogram itself has a minimum value.
 *          The range is determined by scorefract: we include all abscissa
 *          values to the left and right of the value that maximizes the
 *          score, such that the score stays above
 *          (1 - scorefract) * maxscore.
 *          The intuition behind this modification is to try to find
 *          a split point that both has a high variance score and is
 *          at or near a minimum in the histogram, so that the histogram
 *          slope is small at the split point.
 *      (4) We normalize the score so that if the two distributions
 *          were of equal size and at opposite ends of the numa, the
 *          score would be 1.0.
 * </pre>
 */
l_ok
numaSplitDistribution(NUMA       *na,
                      l_float32   scorefract,
                      l_int32    *psplitindex,
                      l_float32  *pave1,
                      l_float32  *pave2,
                      l_float32  *pnum1,
                      l_float32  *pnum2,
                      NUMA      **pnascore)
{
l_int32    i, n, bestsplit, minrange, maxrange, maxindex;
l_float32  ave1, ave2, ave1prev, ave2prev;
l_float32  num1, num2, num1prev, num2prev;
l_float32  val, minval, sum, fract1;
l_float32  norm, score, minscore, maxscore;
NUMA      *nascore, *naave1, *naave2, *nanum1, *nanum2;

        /* Zero all optional outputs so they are defined on error */
    if (psplitindex) *psplitindex = 0;
    if (pave1) *pave1 = 0.0;
    if (pave2) *pave2 = 0.0;
    if (pnum1) *pnum1 = 0.0;
    if (pnum2) *pnum2 = 0.0;
    if (pnascore) *pnascore = NULL;
    if (!na)
        return ERROR_INT("na not defined", __func__, 1);

    n = numaGetCount(na);
    if (n <= 1)
        return ERROR_INT("n = 1 in histogram", __func__, 1);
    numaGetSum(na, &sum);
    if (sum <= 0.0)
        return ERROR_INT("sum <= 0.0", __func__, 1);

        /* Normalization so the ideal two-spike split scores 1.0:
         * see note (4) above */
    norm = 4.0f / ((l_float32)(n - 1) * (n - 1));
    ave1prev = 0.0;
        /* The "upper" set initially holds the entire histogram, so its
         * running mean starts at the global histogram mean */
    numaGetHistogramStats(na, 0.0, 1.0, &ave2prev, NULL, NULL, NULL);
    num1prev = 0.0;
    num2prev = sum;
    maxindex = n / 2;  /* initialize with something */

        /* Split the histogram with [0 ... i] in the lower part
         * and [i+1 ... n-1] in upper part.  First, compute an otsu
         * score for each possible splitting.  The means and counts of
         * the two parts are maintained incrementally from the previous
         * split position, rather than recomputed from scratch. */
    if ((nascore = numaCreate(n)) == NULL)
        return ERROR_INT("nascore not made", __func__, 1);
    naave1 = (pave1) ? numaCreate(n) : NULL;
    naave2 = (pave2) ? numaCreate(n) : NULL;
    nanum1 = (pnum1) ? numaCreate(n) : NULL;
    nanum2 = (pnum2) ? numaCreate(n) : NULL;
    maxscore = 0.0;
    for (i = 0; i < n; i++) {
        numaGetFValue(na, i, &val);
        num1 = num1prev + val;
        if (num1 == 0)
            ave1 = ave1prev;
        else
            ave1 = (num1prev * ave1prev + i * val) / num1;
        num2 = num2prev - val;
        if (num2 == 0)
            ave2 = ave2prev;
        else
            ave2 = (num2prev * ave2prev - i * val) / num2;
        fract1 = num1 / sum;
        score = norm * (fract1 * (1 - fract1)) * (ave2 - ave1) * (ave2 - ave1);
        numaAddNumber(nascore, score);
        if (pave1) numaAddNumber(naave1, ave1);
        if (pave2) numaAddNumber(naave2, ave2);
        if (pnum1) numaAddNumber(nanum1, num1);
        if (pnum2) numaAddNumber(nanum2, num2);
        if (score > maxscore) {
            maxscore = score;
            maxindex = i;
        }
        num1prev = num1;
        num2prev = num2;
        ave1prev = ave1;
        ave2prev = ave2;
    }

        /* Next, for all contiguous scores within a specified fraction
         * of the max, choose the split point as the value with the
         * minimum in the histogram. */
    minscore = (1.f - scorefract) * maxscore;
    for (i = maxindex - 1; i >= 0; i--) {  /* expand left from the peak */
        numaGetFValue(nascore, i, &val);
        if (val < minscore)
            break;
    }
    minrange = i + 1;
    for (i = maxindex + 1; i < n; i++) {  /* expand right from the peak */
        numaGetFValue(nascore, i, &val);
        if (val < minscore)
            break;
    }
    maxrange = i - 1;
    numaGetFValue(na, minrange, &minval);
    bestsplit = minrange;
    for (i = minrange + 1; i <= maxrange; i++) {
        numaGetFValue(na, i, &val);
        if (val < minval) {
            minval = val;
            bestsplit = i;
        }
    }

        /* Add one to the bestsplit value to get the threshold value,
         * because when we take a threshold, as in pixThresholdToBinary(),
         * we always choose the set with values below the threshold. */
    bestsplit = L_MIN(255, bestsplit + 1);

    if (psplitindex) *psplitindex = bestsplit;
    if (pave1) numaGetFValue(naave1, bestsplit, pave1);
    if (pave2) numaGetFValue(naave2, bestsplit, pave2);
    if (pnum1) numaGetFValue(nanum1, bestsplit, pnum1);
    if (pnum2) numaGetFValue(nanum2, bestsplit, pnum2);

    if (pnascore) {  /* debug mode */
        lept_stderr("minrange = %d, maxrange = %d\n", minrange, maxrange);
        lept_stderr("minval = %10.0f\n", minval);
        gplotSimple1(nascore, GPLOT_PNG, "/tmp/lept/nascore",
                     "Score for split distribution");
        *pnascore = nascore;
    } else {
        numaDestroy(&nascore);
    }

    if (pave1) numaDestroy(&naave1);
    if (pave2) numaDestroy(&naave2);
    if (pnum1) numaDestroy(&nanum1);
    if (pnum2) numaDestroy(&nanum2);
    return 0;
}


/*----------------------------------------------------------------------*
 *                         Comparing histograms                         *
 *----------------------------------------------------------------------*/
/*!
 * \brief   grayHistogramsToEMD()
 *
 * \param[in]    naa1, naa2    two numaa, each with one or more 256-element
 *                             histograms
 * \param[out]   pnad          nad of EM distances for each histogram
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *     (1) The two numaas must be the same size and have corresponding
 *         256-element histograms.  Pairs do not need to be normalized
 *         to the same sum.
 *     (2) This is typically used on two sets of histograms from
 *         corresponding tiles of two images.  The similarity of two
 *         images can be found with the scoring function used in
 *         pixCompareGrayByHisto():
 *             score S = 1.0 - k * D, where
 *                 k is a constant, say in the range 5-10
 *                 D = EMD
 *             for each tile; for multiple tiles, take the Min(S) over
 *             the set of tiles to be the final score.
+ * </pre> + */ +l_ok +grayHistogramsToEMD(NUMAA *naa1, + NUMAA *naa2, + NUMA **pnad) +{ +l_int32 i, n, nt; +l_float32 dist; +NUMA *na1, *na2, *nad; + + if (!pnad) + return ERROR_INT("&nad not defined", __func__, 1); + *pnad = NULL; + if (!naa1 || !naa2) + return ERROR_INT("na1 and na2 not both defined", __func__, 1); + n = numaaGetCount(naa1); + if (n != numaaGetCount(naa2)) + return ERROR_INT("naa1 and naa2 numa counts differ", __func__, 1); + nt = numaaGetNumberCount(naa1); + if (nt != numaaGetNumberCount(naa2)) + return ERROR_INT("naa1 and naa2 number counts differ", __func__, 1); + if (256 * n != nt) /* good enough check */ + return ERROR_INT("na sizes must be 256", __func__, 1); + + nad = numaCreate(n); + *pnad = nad; + for (i = 0; i < n; i++) { + na1 = numaaGetNuma(naa1, i, L_CLONE); + na2 = numaaGetNuma(naa2, i, L_CLONE); + numaEarthMoverDistance(na1, na2, &dist); + numaAddNumber(nad, dist / 255.f); /* normalize to [0.0 - 1.0] */ + numaDestroy(&na1); + numaDestroy(&na2); + } + return 0; +} + + +/*! + * \brief numaEarthMoverDistance() + * + * \param[in] na1, na2 two numas of the same size, typically histograms + * \param[out] pdist earthmover distance + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) The two numas must have the same size. They do not need to be + * normalized to the same sum before applying the function. + * (2) For a 1D discrete function, the implementation of the EMD + * is trivial. Just keep filling or emptying buckets in one numa + * to match the amount in the other, moving sequentially along + * both arrays. + * (3) We divide the sum of the absolute value of everything moved + * (by 1 unit at a time) by the sum of the numa (amount of "earth") + * to get the average distance that the "earth" was moved. + * This is the value returned here. + * (4) The caller can do a further normalization, by the number of + * buckets (minus 1), to get the EM distance as a fraction of + * the maximum possible distance, which is n-1. 
This fraction + * is 1.0 for the situation where all the 'earth' in the first + * array is at one end, and all in the second array is at the + * other end. + * </pre> + */ +l_ok +numaEarthMoverDistance(NUMA *na1, + NUMA *na2, + l_float32 *pdist) +{ +l_int32 n, norm, i; +l_float32 sum1, sum2, diff, total; +l_float32 *array1, *array3; +NUMA *na3; + + if (!pdist) + return ERROR_INT("&dist not defined", __func__, 1); + *pdist = 0.0; + if (!na1 || !na2) + return ERROR_INT("na1 and na2 not both defined", __func__, 1); + n = numaGetCount(na1); + if (n != numaGetCount(na2)) + return ERROR_INT("na1 and na2 have different size", __func__, 1); + + /* Generate na3; normalize to na1 if necessary */ + numaGetSum(na1, &sum1); + numaGetSum(na2, &sum2); + norm = (L_ABS(sum1 - sum2) < 0.00001 * L_ABS(sum1)) ? 1 : 0; + if (!norm) + na3 = numaTransform(na2, 0, sum1 / sum2); + else + na3 = numaCopy(na2); + array1 = numaGetFArray(na1, L_NOCOPY); + array3 = numaGetFArray(na3, L_NOCOPY); + + /* Move earth in n3 from array elements, to match n1 */ + total = 0; + for (i = 1; i < n; i++) { + diff = array1[i - 1] - array3[i - 1]; + array3[i] -= diff; + total += L_ABS(diff); + } + *pdist = total / sum1; + + numaDestroy(&na3); + return 0; +} + + +/*! + * \brief grayInterHistogramStats() + * + * \param[in] naa numaa with two or more 256-element histograms + * \param[in] wc half-width of the smoothing window + * \param[out] pnam [optional] mean values + * \param[out] pnams [optional] mean square values + * \param[out] pnav [optional] variances + * \param[out] pnarv [optional] rms deviations from the mean + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) The %naa has two or more 256-element numa histograms, which + * are to be compared value-wise at each of the 256 gray levels. + * The result are stats (mean, mean square, variance, root variance) + * aggregated across the set of histograms, and each is output + * as a 256 entry numa. 
Think of these histograms as a matrix, + * where each histogram is one row of the array. The stats are + * then aggregated column-wise, between the histograms. + * (2) These stats are: + * ~ average value: <v> (nam) + * ~ average squared value: <v*v> (nams) + * ~ variance: <(v - <v>)*(v - <v>)> = <v*v> - <v>*<v> (nav) + * ~ square-root of variance: (narv) + * where the brackets < .. > indicate that the average value is + * to be taken over each column of the array. + * (3) The input histograms are optionally smoothed before these + * statistical operations. + * (4) The input histograms are normalized to a sum of 10000. By + * doing this, the resulting numbers are independent of the + * number of samples used in building the individual histograms. + * (5) A typical application is on a set of histograms from tiles + * of an image, to distinguish between text/tables and photo + * regions. If the tiles are much larger than the text line + * spacing, text/table regions typically have smaller variance + * across tiles than photo regions. For this application, it + * may be useful to ignore values near white, which are large for + * text and would magnify the variance due to variations in + * illumination. However, because the variance of a drawing or + * a light photo can be similar to that of grayscale text, this + * function is only a discriminator between darker photos/drawings + * and light photos/text/line-graphics. 
+ * </pre> + */ +l_ok +grayInterHistogramStats(NUMAA *naa, + l_int32 wc, + NUMA **pnam, + NUMA **pnams, + NUMA **pnav, + NUMA **pnarv) +{ +l_int32 i, j, n, nn; +l_float32 **arrays; +l_float32 mean, var, rvar; +NUMA *na1, *na2, *na3, *na4; + + if (pnam) *pnam = NULL; + if (pnams) *pnams = NULL; + if (pnav) *pnav = NULL; + if (pnarv) *pnarv = NULL; + if (!pnam && !pnams && !pnav && !pnarv) + return ERROR_INT("nothing requested", __func__, 1); + if (!naa) + return ERROR_INT("naa not defined", __func__, 1); + n = numaaGetCount(naa); + for (i = 0; i < n; i++) { + nn = numaaGetNumaCount(naa, i); + if (nn != 256) { + L_ERROR("%d numbers in numa[%d]\n", __func__, nn, i); + return 1; + } + } + + if (pnam) *pnam = numaCreate(256); + if (pnams) *pnams = numaCreate(256); + if (pnav) *pnav = numaCreate(256); + if (pnarv) *pnarv = numaCreate(256); + + /* First, use mean smoothing, normalize each histogram, + * and save all results in a 2D matrix. */ + arrays = (l_float32 **)LEPT_CALLOC(n, sizeof(l_float32 *)); + for (i = 0; i < n; i++) { + na1 = numaaGetNuma(naa, i, L_CLONE); + na2 = numaWindowedMean(na1, wc); + na3 = numaNormalizeHistogram(na2, 10000.); + arrays[i] = numaGetFArray(na3, L_COPY); + numaDestroy(&na1); + numaDestroy(&na2); + numaDestroy(&na3); + } + + /* Get stats between histograms */ + for (j = 0; j < 256; j++) { + na4 = numaCreate(n); + for (i = 0; i < n; i++) { + numaAddNumber(na4, arrays[i][j]); + } + numaSimpleStats(na4, 0, -1, &mean, &var, &rvar); + if (pnam) numaAddNumber(*pnam, mean); + if (pnams) numaAddNumber(*pnams, mean * mean); + if (pnav) numaAddNumber(*pnav, var); + if (pnarv) numaAddNumber(*pnarv, rvar); + numaDestroy(&na4); + } + + for (i = 0; i < n; i++) + LEPT_FREE(arrays[i]); + LEPT_FREE(arrays); + return 0; +} + + +/*----------------------------------------------------------------------* + * Extrema finding * + *----------------------------------------------------------------------*/ +/*! 
+ * \brief numaFindPeaks() + * + * \param[in] nas source numa + * \param[in] nmax max number of peaks to be found + * \param[in] fract1 min fraction of peak value + * \param[in] fract2 min slope + * \return peak na, or NULL on error. + * + * <pre> + * Notes: + * (1) The returned na consists of sets of four numbers representing + * the peak, in the following order: + * left edge; peak center; right edge; normalized peak area + * </pre> + */ +NUMA * +numaFindPeaks(NUMA *nas, + l_int32 nmax, + l_float32 fract1, + l_float32 fract2) +{ +l_int32 i, k, n, maxloc, lloc, rloc; +l_float32 fmaxval, sum, total, newtotal, val, lastval; +l_float32 peakfract; +NUMA *na, *napeak; + + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + n = numaGetCount(nas); + numaGetSum(nas, &total); + + /* We munge this copy */ + if ((na = numaCopy(nas)) == NULL) + return (NUMA *)ERROR_PTR("na not made", __func__, NULL); + if ((napeak = numaCreate(4 * nmax)) == NULL) { + numaDestroy(&na); + return (NUMA *)ERROR_PTR("napeak not made", __func__, NULL); + } + + for (k = 0; k < nmax; k++) { + numaGetSum(na, &newtotal); + if (newtotal == 0.0) /* sanity check */ + break; + numaGetMax(na, &fmaxval, &maxloc); + sum = fmaxval; + lastval = fmaxval; + lloc = 0; + for (i = maxloc - 1; i >= 0; --i) { + numaGetFValue(na, i, &val); + if (val == 0.0) { + lloc = i + 1; + break; + } + if (val > fract1 * fmaxval) { + sum += val; + lastval = val; + continue; + } + if (lastval - val > fract2 * lastval) { + sum += val; + lastval = val; + continue; + } + lloc = i; + break; + } + lastval = fmaxval; + rloc = n - 1; + for (i = maxloc + 1; i < n; ++i) { + numaGetFValue(na, i, &val); + if (val == 0.0) { + rloc = i - 1; + break; + } + if (val > fract1 * fmaxval) { + sum += val; + lastval = val; + continue; + } + if (lastval - val > fract2 * lastval) { + sum += val; + lastval = val; + continue; + } + rloc = i; + break; + } + peakfract = sum / total; + numaAddNumber(napeak, lloc); + numaAddNumber(napeak, 
maxloc); + numaAddNumber(napeak, rloc); + numaAddNumber(napeak, peakfract); + + for (i = lloc; i <= rloc; i++) + numaSetValue(na, i, 0.0); + } + + numaDestroy(&na); + return napeak; +} + + +/*! + * \brief numaFindExtrema() + * + * \param[in] nas input values + * \param[in] delta relative amount to resolve peaks and valleys + * \param[out] pnav [optional] values of extrema + * \return nad (locations of extrema, or NULL on error + * + * <pre> + * Notes: + * (1) This returns a sequence of extrema (peaks and valleys). + * (2) The algorithm is analogous to that for determining + * mountain peaks. Suppose we have a local peak, with + * bumps on the side. Under what conditions can we consider + * those 'bumps' to be actual peaks? The answer: if the + * bump is separated from the peak by a saddle that is at + * least 500 feet below the bump. + * (3) Operationally, suppose we are trying to identify a peak. + * We have a previous valley, and also the largest value that + * we have seen since that valley. We can identify this as + * a peak if we find a value that is delta BELOW it. When + * we find such a value, label the peak, use the current + * value to label the starting point for the search for + * a valley, and do the same operation in reverse. Namely, + * keep track of the lowest point seen, and look for a value + * that is delta ABOVE it. Once found, the lowest point is + * labeled the valley, and continue, looking for the next peak. 
+ * </pre> + */ +NUMA * +numaFindExtrema(NUMA *nas, + l_float32 delta, + NUMA **pnav) +{ +l_int32 i, n, found, loc, direction; +l_float32 startval, val, maxval, minval; +NUMA *nav, *nad; + + if (pnav) *pnav = NULL; + if (!nas) + return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL); + if (delta < 0.0) + return (NUMA *)ERROR_PTR("delta < 0", __func__, NULL); + + n = numaGetCount(nas); + nad = numaCreate(0); + nav = NULL; + if (pnav) { + nav = numaCreate(0); + *pnav = nav; + } + + /* We don't know if we'll find a peak or valley first, + * but use the first element of nas as the reference point. + * Break when we deviate by 'delta' from the first point. */ + numaGetFValue(nas, 0, &startval); + found = FALSE; + for (i = 1; i < n; i++) { + numaGetFValue(nas, i, &val); + if (L_ABS(val - startval) >= delta) { + found = TRUE; + break; + } + } + + if (!found) + return nad; /* it's empty */ + + /* Are we looking for a peak or a valley? */ + if (val > startval) { /* peak */ + direction = 1; + maxval = val; + } else { + direction = -1; + minval = val; + } + loc = i; + + /* Sweep through the rest of the array, recording alternating + * peak/valley extrema. 
*/ + for (i = i + 1; i < n; i++) { + numaGetFValue(nas, i, &val); + if (direction == 1 && val > maxval ) { /* new local max */ + maxval = val; + loc = i; + } else if (direction == -1 && val < minval ) { /* new local min */ + minval = val; + loc = i; + } else if (direction == 1 && (maxval - val >= delta)) { + numaAddNumber(nad, loc); /* save the current max location */ + if (nav) numaAddNumber(nav, maxval); + direction = -1; /* reverse: start looking for a min */ + minval = val; + loc = i; /* current min location */ + } else if (direction == -1 && (val - minval >= delta)) { + numaAddNumber(nad, loc); /* save the current min location */ + if (nav) numaAddNumber(nav, minval); + direction = 1; /* reverse: start looking for a max */ + maxval = val; + loc = i; /* current max location */ + } + } + + /* Save the final extremum */ +/* numaAddNumber(nad, loc); */ + return nad; +} + + +/*! + * \brief numaFindLocForThreshold() + * + * \param[in] nas input histogram + * \param[in] skip look-ahead distance to avoid false mininma; + * use 0 for default + * \param[out] pthresh threshold value + * \param[out] pfract [optional] fraction below or at threshold + * \return 0 if OK, 1 on error or if no threshold can be found + * + * <pre> + * Notes: + * (1) This finds a good place to set a threshold for a histogram + * of values that has two peaks. The peaks can differ greatly + * in area underneath them. The number of buckets in the + * histogram is expected to be 256 (e.g, from an 8 bpp gray image). + * (2) The input histogram should have been smoothed with a window + * to avoid false peak and valley detection due to noise. For + * example, see pixThresholdByHisto(). + * (3) A skip value can be input to determine the look-ahead distance + * to ignore a false peak on the rise or descent from the first peak. + * Input 0 to use the default value (it assumes a histo size of 256). + * (4) Optionally, the fractional area under the first peak can + * be returned. 
+ * </pre> + */ +l_ok +numaFindLocForThreshold(NUMA *na, + l_int32 skip, + l_int32 *pthresh, + l_float32 *pfract) +{ +l_int32 i, n, start, index, minloc, found; +l_float32 val, pval, jval, minval, maxval, sum, partsum; +l_float32 *fa; + + if (pfract) *pfract = 0.0; + if (!pthresh) + return ERROR_INT("&thresh not defined", __func__, 1); + *pthresh = 0; + if (!na) + return ERROR_INT("na not defined", __func__, 1); + if (skip <= 0) skip = 20; + + /* Test for constant value */ + numaGetMin(na, &minval, NULL); + numaGetMax(na, &maxval, NULL); + if (minval == maxval) + return ERROR_INT("all array values are the same", __func__, 1); + + /* Look for the top of the first peak */ + n = numaGetCount(na); + if (n < 256) + L_WARNING("array size %d < 256\n", __func__, n); + fa = numaGetFArray(na, L_NOCOPY); + pval = fa[0]; + for (i = 1; i < n; i++) { + val = fa[i]; + index = L_MIN(i + skip, n - 1); + jval = fa[index]; + if (val < pval && jval < pval) /* near the top if not there */ + break; + pval = val; + } + + if (i > n - 5) /* just an increasing function */ + return ERROR_INT("top of first peak not found", __func__, 1); + + /* Look for the low point in the valley */ + found = FALSE; + start = i; + pval = fa[start]; + for (i = start + 1; i < n; i++) { + val = fa[i]; + if (val <= pval) { /* appears to be going down */ + pval = val; + } else { /* appears to be going up */ + index = L_MIN(i + skip, n - 1); + jval = fa[index]; /* junp ahead by 'skip' */ + if (val > jval) { /* still going down; jump ahead */ + pval = jval; + i = index; + } else { /* really going up; passed the min */ + found = TRUE; + break; + } + } + } + if (!found) + return ERROR_INT("no minimum found", __func__, 1); + + /* Find the location of the minimum in the interval */ + minloc = index; /* likely passed the min; look backward */ + minval = fa[index]; + for (i = index - 1; i > index - skip; i--) { + if (fa[i] < minval) { + minval = fa[i]; + minloc = i; + } + } + + /* Is the minimum very near the end of the 
array? */ + if (minloc > n - 10) + return ERROR_INT("minimum at end of array; invalid", __func__, 1); + *pthresh = minloc; + + /* Find the fraction under the first peak */ + if (pfract) { + numaGetSumOnInterval(na, 0, minloc, &partsum); + numaGetSum(na, &sum); + if (sum > 0.0) + *pfract = partsum / sum; + } + return 0; +} + + +/*! + * \brief numaCountReversals() + * + * \param[in] nas input values + * \param[in] minreversal relative amount to resolve peaks and valleys + * \param[out] pnr [optional] number of reversals + * \param[out] prd [optional] reversal density: reversals/length + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) The input numa is can be generated from pixExtractAlongLine(). + * If so, the x parameters can be used to find the reversal + * frequency along a line. + * (2) If the input numa was generated from a 1 bpp pix, the + * values will be 0 and 1. Use %minreversal == 1 to get + * the number of pixel flips. If the only values are 0 and 1, + * but %minreversal > 1, set the reversal count to 0 and + * issue a warning. 
+ * </pre> + */ +l_ok +numaCountReversals(NUMA *nas, + l_float32 minreversal, + l_int32 *pnr, + l_float32 *prd) +{ +l_int32 i, n, nr, ival, binvals; +l_int32 *ia; +l_float32 fval, delx, len; +NUMA *nat; + + if (pnr) *pnr = 0; + if (prd) *prd = 0.0; + if (!pnr && !prd) + return ERROR_INT("neither &nr nor &rd are defined", __func__, 1); + if (!nas) + return ERROR_INT("nas not defined", __func__, 1); + if ((n = numaGetCount(nas)) == 0) { + L_INFO("nas is empty\n", __func__); + return 0; + } + if (minreversal < 0.0) + return ERROR_INT("minreversal < 0", __func__, 1); + + /* Decide if the only values are 0 and 1 */ + binvals = TRUE; + for (i = 0; i < n; i++) { + numaGetFValue(nas, i, &fval); + if (fval != 0.0 && fval != 1.0) { + binvals = FALSE; + break; + } + } + + nr = 0; + if (binvals) { + if (minreversal > 1.0) { + L_WARNING("binary values but minreversal > 1\n", __func__); + } else { + ia = numaGetIArray(nas); + ival = ia[0]; + for (i = 1; i < n; i++) { + if (ia[i] != ival) { + nr++; + ival = ia[i]; + } + } + LEPT_FREE(ia); + } + } else { + nat = numaFindExtrema(nas, minreversal, NULL); + nr = numaGetCount(nat); + numaDestroy(&nat); + } + if (pnr) *pnr = nr; + if (prd) { + numaGetParameters(nas, NULL, &delx); + len = delx * n; + *prd = (l_float32)nr / len; + } + + return 0; +} + + +/*----------------------------------------------------------------------* + * Threshold crossings and frequency analysis * + *----------------------------------------------------------------------*/ +/*! + * \brief numaSelectCrossingThreshold() + * + * \param[in] nax [optional] numa of abscissa values; can be NULL + * \param[in] nay signal + * \param[in] estthresh estimated pixel threshold for crossing: + * e.g., for images, white <--> black; typ. ~120 + * \param[out] pbestthresh robust estimate of threshold to use + * \return 0 if OK, 1 on error or warning + * + * <pre> + * Notes: + * (1) When a valid threshold is used, the number of crossings is + * a maximum, because none are missed. 
If no threshold intersects + * all the crossings, the crossings must be determined with + * numaCrossingsByPeaks(). + * (2) %estthresh is an input estimate of the threshold that should + * be used. We compute the crossings with 41 thresholds + * (20 below and 20 above). There is a range in which the + * number of crossings is a maximum. Return a threshold + * in the center of this stable plateau of crossings. + * This can then be used with numaCrossingsByThreshold() + * to get a good estimate of crossing locations. + * (3) If the count of nay is less than 2, a warning is issued. + * </pre> + */ +l_ok +numaSelectCrossingThreshold(NUMA *nax, + NUMA *nay, + l_float32 estthresh, + l_float32 *pbestthresh) +{ +l_int32 i, inrun, istart, iend, maxstart, maxend, runlen, maxrunlen; +l_int32 val, maxval, nmax, count; +l_float32 thresh, fmaxval, fmodeval; +NUMA *nat, *nac; + + if (!pbestthresh) + return ERROR_INT("&bestthresh not defined", __func__, 1); + *pbestthresh = 0.0; + if (!nay) + return ERROR_INT("nay not defined", __func__, 1); + if (numaGetCount(nay) < 2) { + L_WARNING("nay count < 2; no threshold crossing\n", __func__); + return 1; + } + + /* Compute the number of crossings for different thresholds */ + nat = numaCreate(41); + for (i = 0; i < 41; i++) { + thresh = estthresh - 80.0f + 4.0f * i; + nac = numaCrossingsByThreshold(nax, nay, thresh); + numaAddNumber(nat, numaGetCount(nac)); + numaDestroy(&nac); + } + + /* Find the center of the plateau of max crossings, which + * extends from thresh[istart] to thresh[iend]. 
*/ + numaGetMax(nat, &fmaxval, NULL); + maxval = (l_int32)fmaxval; + nmax = 0; + for (i = 0; i < 41; i++) { + numaGetIValue(nat, i, &val); + if (val == maxval) + nmax++; + } + if (nmax < 3) { /* likely accidental max; try the mode */ + numaGetMode(nat, &fmodeval, &count); + if (count > nmax && fmodeval > 0.5 * fmaxval) + maxval = (l_int32)fmodeval; /* use the mode */ + } + + inrun = FALSE; + iend = 40; + maxrunlen = 0, maxstart = 0, maxend = 0; + for (i = 0; i < 41; i++) { + numaGetIValue(nat, i, &val); + if (val == maxval) { + if (!inrun) { + istart = i; + inrun = TRUE; + } + continue; + } + if (inrun && (val != maxval)) { + iend = i - 1; + runlen = iend - istart + 1; + inrun = FALSE; + if (runlen > maxrunlen) { + maxstart = istart; + maxend = iend; + maxrunlen = runlen; + } + } + } + if (inrun) { + runlen = i - istart; + if (runlen > maxrunlen) { + maxstart = istart; + maxend = i - 1; + maxrunlen = runlen; + } + } + + *pbestthresh = estthresh - 80.0f + 2.0f * (l_float32)(maxstart + maxend); + +#if DEBUG_CROSSINGS + lept_stderr("\nCrossings attain a maximum at %d thresholds, between:\n" + " thresh[%d] = %5.1f and thresh[%d] = %5.1f\n", + nmax, maxstart, estthresh - 80.0 + 4.0 * maxstart, + maxend, estthresh - 80.0 + 4.0 * maxend); + lept_stderr("The best choice: %5.1f\n", *pbestthresh); + lept_stderr("Number of crossings at the 41 thresholds:"); + numaWriteStderr(nat); +#endif /* DEBUG_CROSSINGS */ + + numaDestroy(&nat); + return 0; +} + + +/*! + * \brief numaCrossingsByThreshold() + * + * \param[in] nax [optional] numa of abscissa values; can be NULL + * \param[in] nay numa of ordinate values, corresponding to nax + * \param[in] thresh threshold value for nay + * \return nad abscissa pts at threshold, or NULL on error + * + * <pre> + * Notes: + * (1) If nax == NULL, we use startx and delx from nay to compute + * the crossing values in nad. 
+ * </pre> + */ +NUMA * +numaCrossingsByThreshold(NUMA *nax, + NUMA *nay, + l_float32 thresh) +{ +l_int32 i, n; +l_float32 startx, delx; +l_float32 xval1, xval2, yval1, yval2, delta1, delta2, crossval, fract; +NUMA *nad; + + if (!nay) + return (NUMA *)ERROR_PTR("nay not defined", __func__, NULL); + n = numaGetCount(nay); + + if (nax && (numaGetCount(nax) != n)) + return (NUMA *)ERROR_PTR("nax and nay sizes differ", __func__, NULL); + + nad = numaCreate(0); + if (n < 2) return nad; + numaGetFValue(nay, 0, &yval1); + numaGetParameters(nay, &startx, &delx); + if (nax) + numaGetFValue(nax, 0, &xval1); + else + xval1 = startx; + for (i = 1; i < n; i++) { + numaGetFValue(nay, i, &yval2); + if (nax) + numaGetFValue(nax, i, &xval2); + else + xval2 = startx + i * delx; + delta1 = yval1 - thresh; + delta2 = yval2 - thresh; + if (delta1 == 0.0) { + numaAddNumber(nad, xval1); + } else if (delta2 == 0.0) { + numaAddNumber(nad, xval2); + } else if (delta1 * delta2 < 0.0) { /* crossing */ + fract = L_ABS(delta1) / L_ABS(yval1 - yval2); + crossval = xval1 + fract * (xval2 - xval1); + numaAddNumber(nad, crossval); + } + xval1 = xval2; + yval1 = yval2; + } + + return nad; +} + + +/*! + * \brief numaCrossingsByPeaks() + * + * \param[in] nax [optional] numa of abscissa values + * \param[in] nay numa of ordinate values, corresponding to nax + * \param[in] delta parameter used to identify when a new peak can be found + * \return nad abscissa pts at threshold, or NULL on error + * + * <pre> + * Notes: + * (1) If nax == NULL, we use startx and delx from nay to compute + * the crossing values in nad. 
+ * </pre> + */ +NUMA * +numaCrossingsByPeaks(NUMA *nax, + NUMA *nay, + l_float32 delta) +{ +l_int32 i, j, n, np, previndex, curindex; +l_float32 startx, delx; +l_float32 xval1, xval2, yval1, yval2, delta1, delta2; +l_float32 prevval, curval, thresh, crossval, fract; +NUMA *nap, *nad; + + if (!nay) + return (NUMA *)ERROR_PTR("nay not defined", __func__, NULL); + + n = numaGetCount(nay); + if (nax && (numaGetCount(nax) != n)) + return (NUMA *)ERROR_PTR("nax and nay sizes differ", __func__, NULL); + + /* Find the extrema. Also add last point in nay to get + * the last transition (from the last peak to the end). + * The number of crossings is 1 more than the number of extrema. */ + nap = numaFindExtrema(nay, delta, NULL); + numaAddNumber(nap, n - 1); + np = numaGetCount(nap); + L_INFO("Number of crossings: %d\n", __func__, np); + + /* Do all computation in index units of nax or the delx of nay */ + nad = numaCreate(np); /* output crossing locations, in nax units */ + previndex = 0; /* prime the search with 1st point */ + numaGetFValue(nay, 0, &prevval); /* prime the search with 1st point */ + numaGetParameters(nay, &startx, &delx); + for (i = 0; i < np; i++) { + numaGetIValue(nap, i, &curindex); + numaGetFValue(nay, curindex, &curval); + thresh = (prevval + curval) / 2.0f; + if (nax) + numaGetFValue(nax, previndex, &xval1); + else + xval1 = startx + previndex * delx; + numaGetFValue(nay, previndex, &yval1); + for (j = previndex + 1; j <= curindex; j++) { + if (nax) + numaGetFValue(nax, j, &xval2); + else + xval2 = startx + j * delx; + numaGetFValue(nay, j, &yval2); + delta1 = yval1 - thresh; + delta2 = yval2 - thresh; + if (delta1 == 0.0) { + numaAddNumber(nad, xval1); + break; + } else if (delta2 == 0.0) { + numaAddNumber(nad, xval2); + break; + } else if (delta1 * delta2 < 0.0) { /* crossing */ + fract = L_ABS(delta1) / L_ABS(yval1 - yval2); + crossval = xval1 + fract * (xval2 - xval1); + numaAddNumber(nad, crossval); + break; + } + xval1 = xval2; + yval1 = yval2; + 
} + previndex = curindex; + prevval = curval; + } + + numaDestroy(&nap); + return nad; +} + + +/*! + * \brief numaEvalBestHaarParameters() + * + * \param[in] nas numa of non-negative signal values + * \param[in] relweight relative weight of (-1 comb) / (+1 comb) + * contributions to the 'convolution'. In effect, + * the convolution kernel is a comb consisting of + * alternating +1 and -weight. + * \param[in] nwidth number of widths to consider + * \param[in] nshift number of shifts to consider for each width + * \param[in] minwidth smallest width to consider + * \param[in] maxwidth largest width to consider + * \param[out] pbestwidth width giving largest score + * \param[out] pbestshift shift giving largest score + * \param[out] pbestscore [optional] convolution with "Haar"-like comb + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This does a linear sweep of widths, evaluating at %nshift + * shifts for each width, computing the score from a convolution + * with a long comb, and finding the (width, shift) pair that + * gives the maximum score. The best width is the "half-wavelength" + * of the signal. + * (2) The convolving function is a comb of alternating values + * +1 and -1 * relweight, separated by the width and phased by + * the shift. This is similar to a Haar transform, except + * there the convolution is performed with a square wave. + * (3) The function is useful for finding the line spacing + * and strength of line signal from pixel sum projections. + * (4) The score is normalized to the size of nas divided by + * the number of half-widths. For image applications, the input is + * typically an array of pixel projections, so one should + * normalize by dividing the score by the image width in the + * pixel projection direction. 
+ * </pre> + */ +l_ok +numaEvalBestHaarParameters(NUMA *nas, + l_float32 relweight, + l_int32 nwidth, + l_int32 nshift, + l_float32 minwidth, + l_float32 maxwidth, + l_float32 *pbestwidth, + l_float32 *pbestshift, + l_float32 *pbestscore) +{ +l_int32 i, j; +l_float32 delwidth, delshift, width, shift, score; +l_float32 bestwidth, bestshift, bestscore; + + if (pbestscore) *pbestscore = 0.0; + if (pbestwidth) *pbestwidth = 0.0; + if (pbestshift) *pbestshift = 0.0; + if (!pbestwidth || !pbestshift) + return ERROR_INT("&bestwidth and &bestshift not defined", __func__, 1); + if (!nas) + return ERROR_INT("nas not defined", __func__, 1); + + bestscore = bestwidth = bestshift = 0.0; + delwidth = (maxwidth - minwidth) / (nwidth - 1.0f); + for (i = 0; i < nwidth; i++) { + width = minwidth + delwidth * i; + delshift = width / (l_float32)(nshift); + for (j = 0; j < nshift; j++) { + shift = j * delshift; + numaEvalHaarSum(nas, width, shift, relweight, &score); + if (score > bestscore) { + bestscore = score; + bestwidth = width; + bestshift = shift; +#if DEBUG_FREQUENCY + lept_stderr("width = %7.3f, shift = %7.3f, score = %7.3f\n", + width, shift, score); +#endif /* DEBUG_FREQUENCY */ + } + } + } + + *pbestwidth = bestwidth; + *pbestshift = bestshift; + if (pbestscore) + *pbestscore = bestscore; + return 0; +} + + +/*! + * \brief numaEvalHaarSum() + * + * \param[in] nas numa of non-negative signal values + * \param[in] width distance between +1 and -1 in convolution comb + * \param[in] shift phase of the comb: location of first +1 + * \param[in] relweight relative weight of (-1 comb) / (+1 comb) + * contributions to the 'convolution'. In effect, + * the convolution kernel is a comb consisting of + * alternating +1 and -weight. + * \param[out] pscore convolution with "Haar"-like comb + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) This does a convolution with a comb of alternating values + * +1 and -relweight, separated by the width and phased by the shift. 
+ * This is similar to a Haar transform, except that for Haar, + * (1) the convolution kernel is symmetric about 0, so the + * relweight is 1.0, and + * (2) the convolution is performed with a square wave. + * (2) The score is normalized to the size of nas divided by + * twice the "width". For image applications, the input is + * typically an array of pixel projections, so one should + * normalize by dividing the score by the image width in the + * pixel projection direction. + * (3) To get a Haar-like result, use relweight = 1.0. For detecting + * signals where you expect every other sample to be close to + * zero, as with barcodes or filtered text lines, you can + * use relweight > 1.0. + * </pre> + */ +l_ok +numaEvalHaarSum(NUMA *nas, + l_float32 width, + l_float32 shift, + l_float32 relweight, + l_float32 *pscore) +{ +l_int32 i, n, nsamp, index; +l_float32 score, weight, val; + + if (!pscore) + return ERROR_INT("&score not defined", __func__, 1); + *pscore = 0.0; + if (!nas) + return ERROR_INT("nas not defined", __func__, 1); + if ((n = numaGetCount(nas)) < 2 * width) + return ERROR_INT("nas size too small", __func__, 1); + + score = 0.0; + nsamp = (l_int32)((n - shift) / width); + for (i = 0; i < nsamp; i++) { + index = (l_int32)(shift + i * width); + weight = (i % 2) ? 1.0f : -1.0f * relweight; + numaGetFValue(nas, index, &val); + score += weight * val; + } + + *pscore = 2.0f * width * score / (l_float32)n; + return 0; +} + + +/*----------------------------------------------------------------------* + * Generating numbers in a range under constraints * + *----------------------------------------------------------------------*/ +/*! 
+ * \brief genConstrainedNumaInRange() + * + * \param[in] first first number to choose; >= 0 + * \param[in] last biggest possible number to reach; >= first + * \param[in] nmax maximum number of numbers to select; > 0 + * \param[in] use_pairs 1 = select pairs of adjacent numbers; + * 0 = select individual numbers + * \return 0 if OK, 1 on error + * + * <pre> + * Notes: + * (1) Selection is made uniformly in the range. This can be used + * to select pages distributed as uniformly as possible + * through a book, where you are constrained to: + * ~ choose between [first, ... biggest], + * ~ choose no more than nmax numbers, and + * and you have the option of requiring pairs of adjacent numbers. + * </pre> + */ +NUMA * +genConstrainedNumaInRange(l_int32 first, + l_int32 last, + l_int32 nmax, + l_int32 use_pairs) +{ +l_int32 i, nsets, val; +l_float32 delta; +NUMA *na; + + first = L_MAX(0, first); + if (last < first) + return (NUMA *)ERROR_PTR("last < first!", __func__, NULL); + if (nmax < 1) + return (NUMA *)ERROR_PTR("nmax < 1!", __func__, NULL); + + nsets = L_MIN(nmax, last - first + 1); + if (use_pairs == 1) + nsets = nsets / 2; + if (nsets == 0) + return (NUMA *)ERROR_PTR("nsets == 0", __func__, NULL); + + /* Select delta so that selection covers the full range if possible */ + if (nsets == 1) { + delta = 0.0; + } else { + if (use_pairs == 0) + delta = (l_float32)(last - first) / (nsets - 1); + else + delta = (l_float32)(last - first - 1) / (nsets - 1); + } + + na = numaCreate(nsets); + for (i = 0; i < nsets; i++) { + val = (l_int32)(first + i * delta + 0.5); + numaAddNumber(na, val); + if (use_pairs == 1) + numaAddNumber(na, val + 1); + } + + return na; +}
