comparison mupdf-source/thirdparty/leptonica/src/numafunc2.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: the expanded directory no longer includes a version number.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file numafunc2.c
29 * <pre>
30 *
31 * --------------------------------------
32 * This file has these Numa utilities:
33 * - morphological operations
34 * - arithmetic transforms
35 * - windowed statistical operations
36 * - histogram extraction
37 * - histogram comparison
38 * - extrema finding
39 * - frequency and crossing analysis
40 * --------------------------------------
41
42 * Morphological (min/max) operations
43 * NUMA *numaErode()
44 * NUMA *numaDilate()
45 * NUMA *numaOpen()
46 * NUMA *numaClose()
47 *
48 * Other transforms
49 * NUMA *numaTransform()
50 * l_int32 numaSimpleStats()
51 * l_int32 numaWindowedStats()
52 * NUMA *numaWindowedMean()
53 * NUMA *numaWindowedMeanSquare()
54 * l_int32 numaWindowedVariance()
55 * NUMA *numaWindowedMedian()
56 * NUMA *numaConvertToInt()
57 *
58 * Histogram generation and statistics
59 * NUMA *numaMakeHistogram()
60 * NUMA *numaMakeHistogramAuto()
61 * NUMA *numaMakeHistogramClipped()
62 * NUMA *numaRebinHistogram()
63 * NUMA *numaNormalizeHistogram()
64 * l_int32 numaGetStatsUsingHistogram()
65 * l_int32 numaGetHistogramStats()
66 * l_int32 numaGetHistogramStatsOnInterval()
67 * l_int32 numaMakeRankFromHistogram()
68 * l_int32 numaHistogramGetRankFromVal()
69 * l_int32 numaHistogramGetValFromRank()
70 * l_int32 numaDiscretizeSortedInBins()
71 * l_int32 numaDiscretizeHistoInBins()
72 * l_int32 numaGetRankBinValues()
73 * NUMA *numaGetUniformBinSizes()
74 *
75 * Splitting a distribution
76 * l_int32 numaSplitDistribution()
77 *
78 * Comparing histograms
79 * l_int32 grayHistogramsToEMD()
80 * l_int32 numaEarthMoverDistance()
81 * l_int32 grayInterHistogramStats()
82 *
83 * Extrema finding
84 * NUMA *numaFindPeaks()
85 * NUMA *numaFindExtrema()
86 * NUMA *numaFindLocForThreshold()
87 * l_int32 numaCountReversals()
88 *
89 * Threshold crossings and frequency analysis
90 * l_int32 numaSelectCrossingThreshold()
91 * NUMA *numaCrossingsByThreshold()
92 * NUMA *numaCrossingsByPeaks()
93 * NUMA *numaEvalBestHaarParameters()
94 * l_int32 numaEvalHaarSum()
95 *
96 * Generating numbers in a range under constraints
97 * NUMA *genConstrainedNumaInRange()
98 *
99 * Things to remember when using the Numa:
100 *
101 * (1) The numa is a struct, not an array. Always use accessors
102 * (see numabasic.c), never the fields directly.
103 *
104 * (2) The number array holds l_float32 values. It can also
105 * be used to store l_int32 values. See numabasic.c for
106 * details on using the accessors. Integers larger than
107 * about 10M will lose accuracy on retrieval due to round-off.
108 * For large integers, use the dna (array of l_float64) instead.
109 *
110 * (3) Occasionally, in the comments we denote the i-th element of a
111 * numa by na[i]. This is conceptual only -- the numa is not an array!
112 *
113 * Some general comments on histograms:
114 *
115 * (1) Histograms are the generic statistical representation of
116 * the data about some attribute. Typically they're not
117 * normalized -- they simply give the number of occurrences
118 * within each range of values of the attribute. This range
119 * of values is referred to as a 'bucket'. For example,
120 * the histogram could specify how many connected components
121 * are found for each value of their width; in that case,
122 * the bucket size is 1.
123 *
124 * (2) In leptonica, all buckets have the same size. Histograms
125 * are therefore specified by a numa of occurrences, along
126 * with two other numbers: the 'value' associated with the
127 * occupants of the first bucket and the size (i.e., 'width')
128 * of each bucket. These two numbers then allow us to calculate
129 * the value associated with the occupants of each bucket.
130 * These numbers are fields in the numa, initialized to
131 * a startx value of 0.0 and a binsize of 1.0. Accessors for
132 * these fields are functions numa*Parameters(). All histograms
133 * must have these two numbers properly set.
134 * </pre>
135 */
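/*
 * For illustration: the two histogram fields described above relate a
 * bin index i to its x value by
 *     x(i) = startx + i * binsize
 * so with the default startx = 0.0 and binsize = 1.0, bin 37 counts the
 * occurrences of values falling in [37.0, 38.0).
 */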
136
137 #ifdef HAVE_CONFIG_H
138 #include <config_auto.h>
139 #endif /* HAVE_CONFIG_H */
140
141 #include <math.h>
142 #include "allheaders.h"
143
144 /* bin sizes in numaMakeHistogram() */
145 static const l_int32 BinSizeArray[] = {2, 5, 10, 20, 50, 100, 200, 500, 1000,\
146 2000, 5000, 10000, 20000, 50000, 100000, 200000,\
147 500000, 1000000, 2000000, 5000000, 10000000,\
148 20000000, 50000000, 100000000};
149 static const l_int32 NBinSizes = 24;
150
151
152 #ifndef NO_CONSOLE_IO
153 #define DEBUG_HISTO 0
154 #define DEBUG_CROSSINGS 0
155 #define DEBUG_FREQUENCY 0
156 #endif /* ~NO_CONSOLE_IO */
157
158 /*----------------------------------------------------------------------*
159 * Morphological operations *
160 *----------------------------------------------------------------------*/
161 /*!
162 * \brief numaErode()
163 *
164 * \param[in] nas
165 * \param[in] size of sel; greater than 0, odd. The origin
166 * is implicitly in the center.
167 * \return nad eroded, or NULL on error
168 *
169 * <pre>
170 * Notes:
171 * (1) The structuring element (sel) is linear, all "hits"
172 * (2) If size == 1, this returns a copy
173 * (3) General comment. The morphological operations are equivalent
174 * to those that would be performed on a 1-dimensional fpix.
175 * However, because we have not implemented morphological
176 * operations on fpix, we do this here. Because it is only
177 * 1 dimensional, there is no reason to use the more
178 * complicated van Herk/Gil-Werman algorithm, and we do it
179 * by brute force.
180 * </pre>
181 */
182 NUMA *
183 numaErode(NUMA *nas,
184 l_int32 size)
185 {
186 l_int32 i, j, n, hsize, len;
187 l_float32 minval;
188 l_float32 *fa, *fas, *fad;
189 NUMA *nad;
190
191 if (!nas)
192 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
193 if (size <= 0)
194 return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL);
195 if ((size & 1) == 0 ) {
196 L_WARNING("sel size must be odd; increasing by 1\n", __func__);
197 size++;
198 }
199
200 if (size == 1)
201 return numaCopy(nas);
202
203 /* Make a source fa (fas) that has an added (size / 2) boundary
204 * on left and right, contains a copy of nas in the interior region
205 * (between 'size' and 'size + n'), and has large values
206 * inserted in the boundary (because it is an erosion). */
207 n = numaGetCount(nas);
208 hsize = size / 2;
209 len = n + 2 * hsize;
210 if ((fas = (l_float32 *)LEPT_CALLOC(len, sizeof(l_float32))) == NULL)
211 return (NUMA *)ERROR_PTR("fas not made", __func__, NULL);
212 for (i = 0; i < hsize; i++)
213 fas[i] = 1.0e37f;
214 for (i = hsize + n; i < len; i++)
215 fas[i] = 1.0e37f;
216 fa = numaGetFArray(nas, L_NOCOPY);
217 for (i = 0; i < n; i++)
218 fas[hsize + i] = fa[i];
219
220 nad = numaMakeConstant(0, n);
221 numaCopyParameters(nad, nas);
222 fad = numaGetFArray(nad, L_NOCOPY);
223 for (i = 0; i < n; i++) {
224 minval = 1.0e37f; /* start big */
225 for (j = 0; j < size; j++)
226 minval = L_MIN(minval, fas[i + j]);
227 fad[i] = minval;
228 }
229
230 LEPT_FREE(fas);
231 return nad;
232 }
233
234
235 /*!
236 * \brief numaDilate()
237 *
238 * \param[in] nas
239 * \param[in] size of sel; greater than 0, odd. The origin
240 * is implicitly in the center.
241 * \return nad dilated, or NULL on error
242 *
243 * <pre>
244 * Notes:
245 * (1) The structuring element (sel) is linear, all "hits"
246 * (2) If size == 1, this returns a copy
247 * </pre>
248 */
249 NUMA *
250 numaDilate(NUMA *nas,
251 l_int32 size)
252 {
253 l_int32 i, j, n, hsize, len;
254 l_float32 maxval;
255 l_float32 *fa, *fas, *fad;
256 NUMA *nad;
257
258 if (!nas)
259 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
260 if (size <= 0)
261 return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL);
262 if ((size & 1) == 0 ) {
263 L_WARNING("sel size must be odd; increasing by 1\n", __func__);
264 size++;
265 }
266
267 if (size == 1)
268 return numaCopy(nas);
269
270 /* Make a source fa (fas) that has an added (size / 2) boundary
271 * on left and right, contains a copy of nas in the interior region
272 * (between 'size' and 'size + n'), and has small values
273 * inserted in the boundary (because it is a dilation). */
274 n = numaGetCount(nas);
275 hsize = size / 2;
276 len = n + 2 * hsize;
277 if ((fas = (l_float32 *)LEPT_CALLOC(len, sizeof(l_float32))) == NULL)
278 return (NUMA *)ERROR_PTR("fas not made", __func__, NULL);
279 for (i = 0; i < hsize; i++)
280 fas[i] = -1.0e37f;
281 for (i = hsize + n; i < len; i++)
282 fas[i] = -1.0e37f;
283 fa = numaGetFArray(nas, L_NOCOPY);
284 for (i = 0; i < n; i++)
285 fas[hsize + i] = fa[i];
286
287 nad = numaMakeConstant(0, n);
288 numaCopyParameters(nad, nas);
289 fad = numaGetFArray(nad, L_NOCOPY);
290 for (i = 0; i < n; i++) {
291 maxval = -1.0e37f; /* start small */
292 for (j = 0; j < size; j++)
293 maxval = L_MAX(maxval, fas[i + j]);
294 fad[i] = maxval;
295 }
296
297 LEPT_FREE(fas);
298 return nad;
299 }
300
301
302 /*!
303 * \brief numaOpen()
304 *
305 * \param[in] nas
306 * \param[in] size of sel; greater than 0, odd. The origin
307 * is implicitly in the center.
308 * \return nad opened, or NULL on error
309 *
310 * <pre>
311 * Notes:
312 * (1) The structuring element (sel) is linear, all "hits"
313 * (2) If size == 1, this returns a copy
314 * </pre>
315 */
316 NUMA *
317 numaOpen(NUMA *nas,
318 l_int32 size)
319 {
320 NUMA *nat, *nad;
321
322 if (!nas)
323 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
324 if (size <= 0)
325 return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL);
326 if ((size & 1) == 0 ) {
327 L_WARNING("sel size must be odd; increasing by 1\n", __func__);
328 size++;
329 }
330
331 if (size == 1)
332 return numaCopy(nas);
333
334 nat = numaErode(nas, size);
335 nad = numaDilate(nat, size);
336 numaDestroy(&nat);
337 return nad;
338 }
339
340
341 /*!
342 * \brief numaClose()
343 *
344 * \param[in] nas
345 * \param[in] size of sel; greater than 0, odd. The origin
346 * is implicitly in the center.
347 * \return nad closed, or NULL on error
348 *
349 * <pre>
350 * Notes:
351 * (1) The structuring element (sel) is linear, all "hits"
352 * (2) If size == 1, this returns a copy
353 * (3) We add a border before doing this operation, for the same
354 * reason that we add a border to a pix before doing a safe closing.
355 * Without the border, a small component near the border gets
356 * clipped at the border on dilation, and can be entirely removed
357 * by the following erosion, violating the basic extensivity
358 * property of closing.
359 * </pre>
360 */
361 NUMA *
362 numaClose(NUMA *nas,
363 l_int32 size)
364 {
365 NUMA *nab, *nat1, *nat2, *nad;
366
367 if (!nas)
368 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
369 if (size <= 0)
370 return (NUMA *)ERROR_PTR("size must be > 0", __func__, NULL);
371 if ((size & 1) == 0 ) {
372 L_WARNING("sel size must be odd; increasing by 1\n", __func__);
373 size++;
374 }
375
376 if (size == 1)
377 return numaCopy(nas);
378
379 nab = numaAddBorder(nas, size, size, 0); /* to preserve extensivity */
380 nat1 = numaDilate(nab, size);
381 nat2 = numaErode(nat1, size);
382 nad = numaRemoveBorder(nat2, size, size);
383 numaDestroy(&nab);
384 numaDestroy(&nat1);
385 numaDestroy(&nat2);
386 return nad;
387 }
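/* Illustrative usage sketch (hypothetical function, not part of Leptonica):
 * a one-sample spike is removed by an opening of width 3, and a one-sample
 * dip is filled by a closing of width 3, while the rest of the signal is
 * left unchanged. */
static void
exampleNumaMorphology(void)
{
l_int32    i;
l_float32  v1, v2;
l_float32  vals[10] = {2, 2, 2, 9, 2, 2, 0, 2, 2, 2};
NUMA      *nas, *nao, *nac;

    nas = numaCreate(10);
    for (i = 0; i < 10; i++)
        numaAddNumber(nas, vals[i]);
    nao = numaOpen(nas, 3);    /* removes the spike at index 3 */
    nac = numaClose(nas, 3);   /* fills the dip at index 6 */
    for (i = 0; i < 10; i++) {
        numaGetFValue(nao, i, &v1);
        numaGetFValue(nac, i, &v2);
        lept_stderr("i = %d: opened = %3.0f, closed = %3.0f\n", i, v1, v2);
    }
    numaDestroy(&nas);
    numaDestroy(&nao);
    numaDestroy(&nac);
}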
388
389
390 /*----------------------------------------------------------------------*
391 * Other transforms *
392 *----------------------------------------------------------------------*/
393 /*!
394 * \brief numaTransform()
395 *
396 * \param[in] nas
397 * \param[in] shift add this to each number
398 * \param[in] scale multiply each number by this
399 * \return nad with all values shifted and scaled, or NULL on error
400 *
401 * <pre>
402 * Notes:
403 * (1) Each number is shifted before scaling.
404 * </pre>
405 */
406 NUMA *
407 numaTransform(NUMA *nas,
408 l_float32 shift,
409 l_float32 scale)
410 {
411 l_int32 i, n;
412 l_float32 val;
413 NUMA *nad;
414
415 if (!nas)
416 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
417 n = numaGetCount(nas);
418 if ((nad = numaCreate(n)) == NULL)
419 return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
420 numaCopyParameters(nad, nas);
421 for (i = 0; i < n; i++) {
422 numaGetFValue(nas, i, &val);
423 val = scale * (val + shift);
424 numaAddNumber(nad, val);
425 }
426 return nad;
427 }
428
429
430 /*!
431 * \brief numaSimpleStats()
432 *
433 * \param[in] na input numa
434 * \param[in] first first element to use
435 * \param[in] last last element to use; -1 to go to the end
436 * \param[out] pmean [optional] mean value
437 * \param[out] pvar [optional] variance
438 * \param[out] prvar [optional] rms deviation from the mean
439 * \return 0 if OK, 1 on error
440 */
441 l_ok
442 numaSimpleStats(NUMA *na,
443 l_int32 first,
444 l_int32 last,
445 l_float32 *pmean,
446 l_float32 *pvar,
447 l_float32 *prvar)
448 {
449 l_int32 i, n, ni;
450 l_float32 sum, sumsq, val, mean, var;
451
452 if (pmean) *pmean = 0.0;
453 if (pvar) *pvar = 0.0;
454 if (prvar) *prvar = 0.0;
455 if (!pmean && !pvar && !prvar)
456 return ERROR_INT("nothing requested", __func__, 1);
457 if (!na)
458 return ERROR_INT("na not defined", __func__, 1);
459 if ((n = numaGetCount(na)) == 0)
460 return ERROR_INT("na is empty", __func__, 1);
461 first = L_MAX(0, first);
462 if (last < 0) last = n - 1;
463 if (first >= n)
464 return ERROR_INT("invalid first", __func__, 1);
465 if (last >= n) {
466 L_WARNING("last = %d is beyond max index = %d; adjusting\n",
467 __func__, last, n - 1);
468 last = n - 1;
469 }
470 if (first > last)
471 return ERROR_INT("first > last\n", __func__, 1);
472 ni = last - first + 1;
473 sum = sumsq = 0.0;
474 for (i = first; i <= last; i++) {
475 numaGetFValue(na, i, &val);
476 sum += val;
477 sumsq += val * val;
478 }
479
480 mean = sum / ni;
481 if (pmean)
482 *pmean = mean;
483 if (pvar || prvar) {
484 var = sumsq / ni - mean * mean;
485 if (pvar) *pvar = var;
486 if (prvar) *prvar = sqrtf(var);
487 }
488
489 return 0;
490 }
491
492
493 /*!
494 * \brief numaWindowedStats()
495 *
496 * \param[in] nas input numa
497 * \param[in] wc half width of the window
498 * \param[out] pnam [optional] mean value in window
499 * \param[out] pnams [optional] mean square value in window
500 * \param[out] pnav [optional] variance in window
501 * \param[out] pnarv [optional] rms deviation from the mean
502 * \return 0 if OK, 1 on error
503 *
504 * <pre>
505 * Notes:
506 * (1) This is a high-level convenience function for calculating
507 * any or all of these derived arrays.
508 * (2) These statistical measures over the values in the
509 * rectangular window are:
510 * ~ average value: [x] (nam)
511 * ~ average squared value: [x*x] (nams)
512 * ~ variance: [(x - [x])*(x - [x])] = [x*x] - [x]*[x] (nav)
513 * ~ square-root of variance: (narv)
514 * where the brackets [ .. ] indicate that the average value is
515 * to be taken over the window.
516 * (3) Note that the variance is just the mean square difference from
517 * the mean value; and the square root of the variance is the
518 * root mean square difference from the mean, sometimes also
519 * called the 'standard deviation'.
520 * (4) Internally, use mirrored borders to handle values near the
521 * end of each array.
522 * </pre>
523 */
524 l_ok
525 numaWindowedStats(NUMA *nas,
526 l_int32 wc,
527 NUMA **pnam,
528 NUMA **pnams,
529 NUMA **pnav,
530 NUMA **pnarv)
531 {
532 NUMA *nam, *nams;
533
534 if (!nas)
535 return ERROR_INT("nas not defined", __func__, 1);
536 if (2 * wc + 1 > numaGetCount(nas))
537 L_WARNING("filter wider than input array!\n", __func__);
538
539 if (!pnav && !pnarv) {
540 if (pnam) *pnam = numaWindowedMean(nas, wc);
541 if (pnams) *pnams = numaWindowedMeanSquare(nas, wc);
542 return 0;
543 }
544
545 nam = numaWindowedMean(nas, wc);
546 nams = numaWindowedMeanSquare(nas, wc);
547 numaWindowedVariance(nam, nams, pnav, pnarv);
548 if (pnam)
549 *pnam = nam;
550 else
551 numaDestroy(&nam);
552 if (pnams)
553 *pnams = nams;
554 else
555 numaDestroy(&nams);
556 return 0;
557 }
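/* Illustrative usage sketch (hypothetical function, not part of Leptonica):
 * get the windowed mean and the windowed rms deviation (the "local
 * standard deviation") of a signal, using a window of width 2 * 3 + 1 = 7.
 * The unused mean-square and variance outputs are passed as NULL. */
static void
exampleWindowedStats(NUMA *nas)
{
l_int32    i, n;
l_float32  mean, rms;
NUMA      *nam, *narv;

    if (!nas) return;
    numaWindowedStats(nas, 3, &nam, NULL, NULL, &narv);
    n = numaGetCount(nam);
    for (i = 0; i < n; i++) {
        numaGetFValue(nam, i, &mean);
        numaGetFValue(narv, i, &rms);
        lept_stderr("i = %d: mean = %7.3f, rms dev = %7.3f\n", i, mean, rms);
    }
    numaDestroy(&nam);
    numaDestroy(&narv);
}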
558
559
560 /*!
561 * \brief numaWindowedMean()
562 *
563 * \param[in] nas
564 * \param[in] wc half width of the convolution window
565 * \return nad after low-pass filtering, or NULL on error
566 *
567 * <pre>
568 * Notes:
569 * (1) This is a convolution. The window has width = 2 * %wc + 1.
570 * (2) We add a mirrored border of size %wc to each end of the array.
571 * </pre>
572 */
573 NUMA *
574 numaWindowedMean(NUMA *nas,
575 l_int32 wc)
576 {
577 l_int32 i, n, n1, width;
578 l_float32 sum, norm;
579 l_float32 *fa1, *fad, *suma;
580 NUMA *na1, *nad;
581
582 if (!nas)
583 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
584 n = numaGetCount(nas);
585 width = 2 * wc + 1; /* filter width */
586 if (width > n)
587 L_WARNING("filter wider than input array!\n", __func__);
588
589 na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER);
590 n1 = n + 2 * wc;
591 fa1 = numaGetFArray(na1, L_NOCOPY);
592 nad = numaMakeConstant(0, n);
593 fad = numaGetFArray(nad, L_NOCOPY);
594
595 /* Make sum array; note the indexing */
596 if ((suma = (l_float32 *)LEPT_CALLOC(n1 + 1, sizeof(l_float32))) == NULL) {
597 numaDestroy(&na1);
598 numaDestroy(&nad);
599 return (NUMA *)ERROR_PTR("suma not made", __func__, NULL);
600 }
601 sum = 0.0;
602 suma[0] = 0.0;
603 for (i = 0; i < n1; i++) {
604 sum += fa1[i];
605 suma[i + 1] = sum;
606 }
607
608 norm = 1.f / (2 * wc + 1);
609 for (i = 0; i < n; i++)
610 fad[i] = norm * (suma[width + i] - suma[i]);
611
612 LEPT_FREE(suma);
613 numaDestroy(&na1);
614 return nad;
615 }
616
617
618 /*!
619 * \brief numaWindowedMeanSquare()
620 *
621 * \param[in] nas
622 * \param[in] wc half width of the window
623 * \return nad containing windowed mean square values, or NULL on error
624 *
625 * <pre>
626 * Notes:
627 * (1) The window has width = 2 * %wc + 1.
628 * (2) We add a mirrored border of size %wc to each end of the array.
629 * </pre>
630 */
631 NUMA *
632 numaWindowedMeanSquare(NUMA *nas,
633 l_int32 wc)
634 {
635 l_int32 i, n, n1, width;
636 l_float32 sum, norm;
637 l_float32 *fa1, *fad, *suma;
638 NUMA *na1, *nad;
639
640 if (!nas)
641 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
642 n = numaGetCount(nas);
643 width = 2 * wc + 1; /* filter width */
644 if (width > n)
645 L_WARNING("filter wider than input array!\n", __func__);
646
647 na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER);
648 n1 = n + 2 * wc;
649 fa1 = numaGetFArray(na1, L_NOCOPY);
650 nad = numaMakeConstant(0, n);
651 fad = numaGetFArray(nad, L_NOCOPY);
652
653 /* Make sum array; note the indexing */
654 if ((suma = (l_float32 *)LEPT_CALLOC(n1 + 1, sizeof(l_float32))) == NULL) {
655 numaDestroy(&na1);
656 numaDestroy(&nad);
657 return (NUMA *)ERROR_PTR("suma not made", __func__, NULL);
658 }
659 sum = 0.0;
660 suma[0] = 0.0;
661 for (i = 0; i < n1; i++) {
662 sum += fa1[i] * fa1[i];
663 suma[i + 1] = sum;
664 }
665
666 norm = 1.f / (2 * wc + 1);
667 for (i = 0; i < n; i++)
668 fad[i] = norm * (suma[width + i] - suma[i]);
669
670 LEPT_FREE(suma);
671 numaDestroy(&na1);
672 return nad;
673 }
674
675
676 /*!
677 * \brief numaWindowedVariance()
678 *
679 * \param[in] nam windowed mean values
680 * \param[in] nams windowed mean square values
681 * \param[out] pnav [optional] numa of variance -- the ms deviation
682 * from the mean
683 * \param[out] pnarv [optional] numa of rms deviation from the mean
684 * \return 0 if OK, 1 on error
685 *
686 * <pre>
687 * Notes:
688 * (1) The numas of windowed mean and mean square are precomputed,
689 * using numaWindowedMean() and numaWindowedMeanSquare().
690 * (2) Either or both of the variance and square-root of variance
691 * are returned, where the variance is the average over the
692 * window of the mean square difference of the pixel value
693 * from the mean:
694 * [(x - [x])*(x - [x])] = [x*x] - [x]*[x]
695 * </pre>
696 */
697 l_ok
698 numaWindowedVariance(NUMA *nam,
699 NUMA *nams,
700 NUMA **pnav,
701 NUMA **pnarv)
702 {
703 l_int32 i, nm, nms;
704 l_float32 var;
705 l_float32 *fam, *fams, *fav = NULL, *farv = NULL;
706 NUMA *nav, *narv; /* variance and square root of variance */
707
708 if (pnav) *pnav = NULL;
709 if (pnarv) *pnarv = NULL;
710 if (!pnav && !pnarv)
711 return ERROR_INT("neither &nav nor &narv are defined", __func__, 1);
712 if (!nam)
713 return ERROR_INT("nam not defined", __func__, 1);
714 if (!nams)
715 return ERROR_INT("nams not defined", __func__, 1);
716 nm = numaGetCount(nam);
717 nms = numaGetCount(nams);
718 if (nm != nms)
719 return ERROR_INT("sizes of nam and nams differ", __func__, 1);
720
721 if (pnav) {
722 nav = numaMakeConstant(0, nm);
723 *pnav = nav;
724 fav = numaGetFArray(nav, L_NOCOPY);
725 }
726 if (pnarv) {
727 narv = numaMakeConstant(0, nm);
728 *pnarv = narv;
729 farv = numaGetFArray(narv, L_NOCOPY);
730 }
731 fam = numaGetFArray(nam, L_NOCOPY);
732 fams = numaGetFArray(nams, L_NOCOPY);
733
734 for (i = 0; i < nm; i++) {
735 var = fams[i] - fam[i] * fam[i];
736 if (pnav)
737 fav[i] = var;
738 if (pnarv)
739 farv[i] = sqrtf(var);
740 }
741
742 return 0;
743 }
744
745
746 /*!
747 * \brief numaWindowedMedian()
748 *
749 * \param[in] nas
750 * \param[in] halfwin half width of window over which the median is found
751 * \return nad after windowed median filtering, or NULL on error
752 *
753 * <pre>
754 * Notes:
755 * (1) The requested window has width = 2 * %halfwin + 1.
756 * (2) If the input nas has fewer than 3 elements, return a copy.
757 * (3) If the filter is too small (%halfwin <= 0), return a copy.
758 * (4) If the filter is too large, it is reduced in size.
759 * (5) We add a mirrored border of size %halfwin to each end of
760 * the array to simplify the calculation by avoiding end-effects.
761 * </pre>
762 */
763 NUMA *
764 numaWindowedMedian(NUMA *nas,
765 l_int32 halfwin)
766 {
767 l_int32 i, n;
768 l_float32 medval;
769 NUMA *na1, *na2, *nad;
770
771 if (!nas)
772 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
773 if ((n = numaGetCount(nas)) < 3)
774 return numaCopy(nas);
775 if (halfwin <= 0) {
776 L_ERROR("filter too small; returning a copy\n", __func__);
777 return numaCopy(nas);
778 }
779
780 if (halfwin > (n - 1) / 2) {
781 halfwin = (n - 1) / 2;
782 L_INFO("reducing filter to halfwin = %d\n", __func__, halfwin);
783 }
784
785 /* Add a border to both ends */
786 na1 = numaAddSpecifiedBorder(nas, halfwin, halfwin, L_MIRRORED_BORDER);
787
788 /* Get the median value at the center of each window, corresponding
789 * to locations in the input nas. */
790 nad = numaCreate(n);
791 for (i = 0; i < n; i++) {
792 na2 = numaClipToInterval(na1, i, i + 2 * halfwin);
793 numaGetMedian(na2, &medval);
794 numaAddNumber(nad, medval);
795 numaDestroy(&na2);
796 }
797
798 numaDestroy(&na1);
799 return nad;
800 }
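/* Illustrative usage sketch (hypothetical function, not part of Leptonica):
 * despike a signal with a windowed median of width 2 * 2 + 1 = 5, then
 * smooth the result with a windowed mean of the same width. */
static NUMA *
exampleMedianThenMean(NUMA *nas)
{
NUMA  *na1, *nad;

    if (!nas) return NULL;
    na1 = numaWindowedMedian(nas, 2);
    nad = numaWindowedMean(na1, 2);
    numaDestroy(&na1);
    return nad;
}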
801
802
803 /*!
804 * \brief numaConvertToInt()
805 *
806 * \param[in] nas source numa
807 * \return na with all values rounded to nearest integer, or
808 * NULL on error
809 */
810 NUMA *
811 numaConvertToInt(NUMA *nas)
812 {
813 l_int32 i, n, ival;
814 NUMA *nad;
815
816 if (!nas)
817 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
818
819 n = numaGetCount(nas);
820 if ((nad = numaCreate(n)) == NULL)
821 return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
822 numaCopyParameters(nad, nas);
823 for (i = 0; i < n; i++) {
824 numaGetIValue(nas, i, &ival);
825 numaAddNumber(nad, ival);
826 }
827 return nad;
828 }
829
830
831 /*----------------------------------------------------------------------*
832 * Histogram generation and statistics *
833 *----------------------------------------------------------------------*/
834 /*!
835 * \brief numaMakeHistogram()
836 *
837 * \param[in] na
838 * \param[in] maxbins max number of histogram bins
839 * \param[out] pbinsize [optional] size of histogram bins
840 * \param[out] pbinstart [optional] start val of minimum bin;
841 * input NULL to force start at 0
842 * \return na consisting of a histogram of integerized values,
843 * or NULL on error.
844 *
845 * <pre>
846 * Notes:
847 * (1) This simple interface is designed for integer data.
848 * The bins are of integer width and start on integer boundaries,
849 * so the results on float data will not have high precision.
850 * (2) Specify the max number of input bins. Then %binsize,
851 * the size of bins necessary to accommodate the input data,
852 * is optionally returned. It is one of the sequence:
853 * {1, 2, 5, 10, 20, 50, ...}.
854 * (3) If &binstart is given, all values are accommodated,
855 * and the min value of the starting bin is returned.
856 * Otherwise, all negative values are discarded and
857 * the histogram bins start at 0.
858 * </pre>
859 */
860 NUMA *
861 numaMakeHistogram(NUMA *na,
862 l_int32 maxbins,
863 l_int32 *pbinsize,
864 l_int32 *pbinstart)
865 {
866 l_int32 i, n, ival, hval;
867 l_int32 iminval, imaxval, range, binsize, nbins, ibin;
868 l_float32 val, ratio;
869 NUMA *nai, *nahist;
870
871 if (pbinsize) *pbinsize = 0;
872 if (pbinstart) *pbinstart = 0;
873 if (!na)
874 return (NUMA *)ERROR_PTR("na not defined", __func__, NULL);
875 if (maxbins < 1)
876 return (NUMA *)ERROR_PTR("maxbins < 1", __func__, NULL);
877
878 /* Determine input range */
879 numaGetMin(na, &val, NULL);
880 iminval = (l_int32)(val + 0.5);
881 numaGetMax(na, &val, NULL);
882 imaxval = (l_int32)(val + 0.5);
883 if (pbinstart == NULL) { /* clip negative vals; start from 0 */
884 iminval = 0;
885 if (imaxval < 0)
886 return (NUMA *)ERROR_PTR("all values < 0", __func__, NULL);
887 }
888
889 /* Determine binsize */
890 range = imaxval - iminval + 1;
891 if (range > maxbins - 1) {
892 ratio = (l_float32)range / (l_float32)maxbins;
893 binsize = 0;
894 for (i = 0; i < NBinSizes; i++) {
895 if (ratio < BinSizeArray[i]) {
896 binsize = BinSizeArray[i];
897 break;
898 }
899 }
900 if (binsize == 0)
901 return (NUMA *)ERROR_PTR("numbers too large", __func__, NULL);
902 } else {
903 binsize = 1;
904 }
905 if (pbinsize) *pbinsize = binsize;
906 nbins = 1 + range / binsize; /* +1 seems to be sufficient */
907
908 /* Redetermine iminval */
909 if (pbinstart && binsize > 1) {
910 if (iminval >= 0)
911 iminval = binsize * (iminval / binsize);
912 else
913 iminval = binsize * ((iminval - binsize + 1) / binsize);
914 }
915 if (pbinstart) *pbinstart = iminval;
916
917 #if DEBUG_HISTO
918 lept_stderr(" imaxval = %d, range = %d, nbins = %d\n",
919 imaxval, range, nbins);
920 #endif /* DEBUG_HISTO */
921
922 /* Use integerized data for input */
923 if ((nai = numaConvertToInt(na)) == NULL)
924 return (NUMA *)ERROR_PTR("nai not made", __func__, NULL);
925 n = numaGetCount(nai);
926
927 /* Make histogram, converting value in input array
928 * into a bin number for this histogram array. */
929 if ((nahist = numaCreate(nbins)) == NULL) {
930 numaDestroy(&nai);
931 return (NUMA *)ERROR_PTR("nahist not made", __func__, NULL);
932 }
933 numaSetCount(nahist, nbins);
934 numaSetParameters(nahist, iminval, binsize);
935 for (i = 0; i < n; i++) {
936 numaGetIValue(nai, i, &ival);
937 ibin = (ival - iminval) / binsize;
938 if (ibin >= 0 && ibin < nbins) {
939 numaGetIValue(nahist, ibin, &hval);
940 numaSetValue(nahist, ibin, hval + 1.0f);
941 }
942 }
943
944 numaDestroy(&nai);
945 return nahist;
946 }
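/* Illustrative usage sketch (hypothetical function, not part of Leptonica):
 * histogram a set of grayscale-like integer values in [0, 255].  Because
 * maxbins = 300 exceeds the data range, each bin has width 1; with
 * &binstart passed, the first bin starts at the minimum data value. */
static NUMA *
exampleHistoFromValues(NUMA *na)
{
l_int32  binsize, binstart;
NUMA    *nahisto;

    if (!na) return NULL;
    nahisto = numaMakeHistogram(na, 300, &binsize, &binstart);
    if (nahisto)
        lept_stderr("binsize = %d, binstart = %d\n", binsize, binstart);
    return nahisto;
}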
947
948
949 /*!
950 * \brief numaMakeHistogramAuto()
951 *
952 * \param[in] na numa of floats; these may be integers
953 * \param[in] maxbins max number of histogram bins; >= 1
954 * \return na consisting of a histogram of quantized float values,
955 * or NULL on error.
956 *
957 * <pre>
958 * Notes:
959 * (1) This simple interface is designed for accurate binning
960 * of both integer and float data.
961 * (2) If the array data is integers, and the range of integers
962 * is smaller than %maxbins, they are binned as they fall,
963 * with binsize = 1.
964 * (3) If the range of data, (maxval - minval), is larger than
965 * %maxbins, or if the data is floats, they are binned into
966 * exactly %maxbins bins.
967 * (4) Unlike numaMakeHistogram(), these bins in general have
968 * non-integer location and width, even for integer data.
969 * </pre>
970 */
971 NUMA *
972 numaMakeHistogramAuto(NUMA *na,
973 l_int32 maxbins)
974 {
975 l_int32 i, n, imin, imax, irange, ibin, ival, allints;
976 l_float32 minval, maxval, range, binsize, fval;
977 NUMA *nah;
978
979 if (!na)
980 return (NUMA *)ERROR_PTR("na not defined", __func__, NULL);
981 maxbins = L_MAX(1, maxbins);
982
983 /* Determine input range */
984 numaGetMin(na, &minval, NULL);
985 numaGetMax(na, &maxval, NULL);
986
987 /* Determine if values are all integers */
988 n = numaGetCount(na);
989 numaHasOnlyIntegers(na, &allints);
990
991 /* Do simple integer binning if possible */
992 if (allints && (maxval - minval < maxbins)) {
993 imin = (l_int32)minval;
994 imax = (l_int32)maxval;
995 irange = imax - imin + 1;
996 nah = numaCreate(irange);
997 numaSetCount(nah, irange); /* init */
998 numaSetParameters(nah, minval, 1.0);
999 for (i = 0; i < n; i++) {
1000 numaGetIValue(na, i, &ival);
1001 ibin = ival - imin;
1002 numaGetIValue(nah, ibin, &ival);
1003 numaSetValue(nah, ibin, ival + 1.0f);
1004 }
1005
1006 return nah;
1007 }
1008
1009 /* Do float binning, even if the data is integers. */
1010 range = maxval - minval;
1011 binsize = range / (l_float32)maxbins;
1012 if (range == 0.0) {
1013 nah = numaCreate(1);
1014 numaSetParameters(nah, minval, binsize);
1015 numaAddNumber(nah, n);
1016 return nah;
1017 }
1018 nah = numaCreate(maxbins);
1019 numaSetCount(nah, maxbins);
1020 numaSetParameters(nah, minval, binsize);
1021 for (i = 0; i < n; i++) {
1022 numaGetFValue(na, i, &fval);
1023 ibin = (l_int32)((fval - minval) / binsize);
1024 ibin = L_MIN(ibin, maxbins - 1); /* "edge" case; stay in bounds */
1025 numaGetIValue(nah, ibin, &ival);
1026 numaSetValue(nah, ibin, ival + 1.0f);
1027 }
1028
1029 return nah;
1030 }
1031
1032
1033 /*!
1034 * \brief numaMakeHistogramClipped()
1035 *
1036 * \param[in] na
1037 * \param[in] binsize typically 1.0
1038 * \param[in] maxsize of histogram ordinate
1039 * \return na histogram of bins of size %binsize, starting with
1040 * na[0] (x = 0.0) and going up to a maximum of
1041 * x = %maxsize, by increments of %binsize, or NULL on error
1042 *
1043 * <pre>
1044 * Notes:
1045 * (1) This simple function generates a histogram of values
1046 * from na, discarding all values < 0.0 or greater than
1047 * min(%maxsize, maxval), where maxval is the maximum value in na.
1048 * The histogram data is put in bins of size delx = %binsize,
1049 * starting at x = 0.0. We use as many bins as are
1050 * needed to hold the data.
1051 * </pre>
1052 */
1053 NUMA *
1054 numaMakeHistogramClipped(NUMA *na,
1055 l_float32 binsize,
1056 l_float32 maxsize)
1057 {
1058 l_int32 i, n, nbins, ival, ibin;
1059 l_float32 val, maxval;
1060 NUMA *nad;
1061
1062 if (!na)
1063 return (NUMA *)ERROR_PTR("na not defined", __func__, NULL);
1064 if (binsize <= 0.0)
1065 return (NUMA *)ERROR_PTR("binsize must be > 0.0", __func__, NULL);
1066 if (binsize > maxsize)
1067 binsize = maxsize; /* just one bin */
1068
1069 numaGetMax(na, &maxval, NULL);
1070 n = numaGetCount(na);
1071 maxsize = L_MIN(maxsize, maxval);
1072 nbins = (l_int32)(maxsize / binsize) + 1;
1073
1074 /* lept_stderr("maxsize = %7.3f, nbins = %d\n", maxsize, nbins); */
1075
1076 if ((nad = numaCreate(nbins)) == NULL)
1077 return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
1078 numaSetParameters(nad, 0.0, binsize);
1079 numaSetCount(nad, nbins); /* interpret zeroes in bins as data */
1080 for (i = 0; i < n; i++) {
1081 numaGetFValue(na, i, &val);
1082 ibin = (l_int32)(val / binsize);
1083 if (ibin >= 0 && ibin < nbins) {
1084 numaGetIValue(nad, ibin, &ival);
1085 numaSetValue(nad, ibin, ival + 1.0f);
1086 }
1087 }
1088
1089 return nad;
1090 }
1091
1092
1093 /*!
1094 * \brief numaRebinHistogram()
1095 *
1096 * \param[in] nas input histogram
1097 * \param[in] newsize number of old bins contained in each new bin
1098 * \return nad more coarsely re-binned histogram, or NULL on error
1099 */
1100 NUMA *
1101 numaRebinHistogram(NUMA *nas,
1102 l_int32 newsize)
1103 {
1104 l_int32 i, j, ns, nd, index, count, val;
1105 l_float32 start, oldsize;
1106 NUMA *nad;
1107
1108 if (!nas)
1109 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
1110 if (newsize <= 1)
1111 return (NUMA *)ERROR_PTR("newsize must be > 1", __func__, NULL);
1112 if ((ns = numaGetCount(nas)) == 0)
1113 return (NUMA *)ERROR_PTR("no bins in nas", __func__, NULL);
1114
1115 nd = (ns + newsize - 1) / newsize;
1116 if ((nad = numaCreate(nd)) == NULL)
1117 return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
1118 numaGetParameters(nad, &start, &oldsize);
1119 numaSetParameters(nad, start, oldsize * newsize);
1120
1121 for (i = 0; i < nd; i++) { /* new bins */
1122 count = 0;
1123 index = i * newsize;
1124 for (j = 0; j < newsize; j++) {
1125 if (index < ns) {
1126 numaGetIValue(nas, index, &val);
1127 count += val;
1128 index++;
1129 }
1130 }
1131 numaAddNumber(nad, count);
1132 }
1133
1134 return nad;
1135 }
1136
1137
1138 /*!
1139 * \brief numaNormalizeHistogram()
1140 *
1141 * \param[in] nas input histogram
1142 * \param[in] tsum target sum of all numbers in dest histogram; e.g., use
1143 * %tsum = 1.0 if this represents a probability distribution
1144 * \return nad normalized histogram, or NULL on error
1145 */
1146 NUMA *
1147 numaNormalizeHistogram(NUMA *nas,
1148 l_float32 tsum)
1149 {
1150 l_int32 i, ns;
1151 l_float32 sum, factor, fval;
1152 NUMA *nad;
1153
1154 if (!nas)
1155 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
1156 if (tsum <= 0.0)
1157 return (NUMA *)ERROR_PTR("tsum must be > 0.0", __func__, NULL);
1158 if ((ns = numaGetCount(nas)) == 0)
1159 return (NUMA *)ERROR_PTR("no bins in nas", __func__, NULL);
1160
1161 numaGetSum(nas, &sum);
1162 factor = tsum / sum;
1163
1164 if ((nad = numaCreate(ns)) == NULL)
1165 return (NUMA *)ERROR_PTR("nad not made", __func__, NULL);
1166 numaCopyParameters(nad, nas);
1167
1168 for (i = 0; i < ns; i++) {
1169 numaGetFValue(nas, i, &fval);
1170 fval *= factor;
1171 numaAddNumber(nad, fval);
1172 }
1173
1174 return nad;
1175 }
1176
1177
1178 /*!
1179 * \brief numaGetStatsUsingHistogram()
1180 *
1181 * \param[in] na an arbitrary set of numbers; not ordered and not
1182 * a histogram
1183 * \param[in] maxbins the maximum number of bins to be allowed in
1184 * the histogram; use an integer larger than the
1185 * largest number in %na for consecutive integer bins
1186 * \param[out] pmin [optional] min value of set
1187 * \param[out] pmax [optional] max value of set
1188 * \param[out] pmean [optional] mean value of set
1189 * \param[out] pvariance [optional] variance
1190 * \param[out] pmedian [optional] median value of set
1191 * \param[in] rank in [0.0 ... 1.0]; median has a rank 0.5;
1192 * ignored if &rval == NULL
1193 * \param[out] prval [optional] value in na corresponding to %rank
1194 * \param[out] phisto [optional] Numa histogram; use NULL to prevent
1195 * \return 0 if OK, 1 on error
1196 *
1197 * <pre>
1198 * Notes:
1199 * (1) This is a simple interface for gathering statistics
1200 * from a numa, where a histogram is used 'under the covers'
1201 * to avoid sorting if a rank value is requested. In that case,
1202 * by using a histogram we are trading accuracy for speed, because
1203 * the values in %na are quantized to the center of a set of bins.
1204 * (2) If the median, other rank value, or histogram are not requested,
1205 * the calculation is all performed on the input Numa.
1206 * (3) The variance is the average of the square of the
1207 * difference from the mean. The median is the value in na
1208 * with rank 0.5.
1209 * (4) There are two situations where this gives rank results with
1210 * accuracy comparable to computing statistics directly on the input
1211 * data, without binning into a histogram:
1212 * (a) the data is integers and the range of data is less than
1213 * %maxbins, and
1214 * (b) the data is floats and the range is small compared to
1215 * %maxbins, so that the binsize is much less than 1.
1216 * (5) If a histogram is used and the numbers in the Numa extend
1217 * over a large range, you can limit the required storage by
1218 * specifying the maximum number of bins in the histogram.
1219 * Use %maxbins == 0 to force the bin size to be 1.
1220 * (6) This optionally returns the median and one arbitrary rank value.
1221 * If you need several rank values, return the histogram and use
1222 * numaHistogramGetValFromRank(nah, rank, &rval)
1223 * multiple times.
1224 * </pre>
1225 */
1226 l_ok
1227 numaGetStatsUsingHistogram(NUMA *na,
1228 l_int32 maxbins,
1229 l_float32 *pmin,
1230 l_float32 *pmax,
1231 l_float32 *pmean,
1232 l_float32 *pvariance,
1233 l_float32 *pmedian,
1234 l_float32 rank,
1235 l_float32 *prval,
1236 NUMA **phisto)
1237 {
1238 l_int32 i, n;
1239 l_float32 minval, maxval, fval, mean, sum;
1240 NUMA *nah;
1241
1242 if (pmin) *pmin = 0.0;
1243 if (pmax) *pmax = 0.0;
1244 if (pmean) *pmean = 0.0;
1245 if (pvariance) *pvariance = 0.0;
1246 if (pmedian) *pmedian = 0.0;
1247 if (prval) *prval = 0.0;
1248 if (phisto) *phisto = NULL;
1249 if (!na)
1250 return ERROR_INT("na not defined", __func__, 1);
1251 if ((n = numaGetCount(na)) == 0)
1252 return ERROR_INT("numa is empty", __func__, 1);
1253
1254 numaGetMin(na, &minval, NULL);
1255 numaGetMax(na, &maxval, NULL);
1256 if (pmin) *pmin = minval;
1257 if (pmax) *pmax = maxval;
1258 if (pmean || pvariance) {
1259 sum = 0.0;
1260 for (i = 0; i < n; i++) {
1261 numaGetFValue(na, i, &fval);
1262 sum += fval;
1263 }
1264 mean = sum / (l_float32)n;
1265 if (pmean) *pmean = mean;
1266 }
1267 if (pvariance) {
1268 sum = 0.0;
1269 for (i = 0; i < n; i++) {
1270 numaGetFValue(na, i, &fval);
1271 sum += fval * fval;
1272 }
1273 *pvariance = sum / (l_float32)n - mean * mean;
1274 }
1275
1276 if (!pmedian && !prval && !phisto)
1277 return 0;
1278
1279 nah = numaMakeHistogramAuto(na, maxbins);
1280 if (pmedian)
1281 numaHistogramGetValFromRank(nah, 0.5, pmedian);
1282 if (prval)
1283 numaHistogramGetValFromRank(nah, rank, prval);
1284 if (phisto)
1285 *phisto = nah;
1286 else
1287 numaDestroy(&nah);
1288 return 0;
1289 }
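/* Illustrative usage sketch (hypothetical function, not part of Leptonica):
 * one call that returns the mean, the median and the 0.9 rank value of an
 * arbitrary set of numbers; 256 is an arbitrary choice for the maximum
 * number of internal histogram bins. */
static void
exampleStatsFromNuma(NUMA *na)
{
l_float32  mean, median, val90;

    if (!na) return;
    numaGetStatsUsingHistogram(na, 256, NULL, NULL, &mean, NULL,
                               &median, 0.9, &val90, NULL);
    lept_stderr("mean = %f, median = %f, 0.9 rank value = %f\n",
                mean, median, val90);
}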
1290
1291
1292 /*!
1293 * \brief numaGetHistogramStats()
1294 *
1295 * \param[in] nahisto histogram: y(x(i)), i = 0 ... nbins - 1
1296 * \param[in] startx x value of first bin: x(0)
1297 * \param[in] deltax x increment between bins; the bin size; x(1) - x(0)
1298 * \param[out] pxmean [optional] mean value of histogram
1299 * \param[out] pxmedian [optional] median value of histogram
1300 * \param[out] pxmode [optional] mode value of histogram:
1301 * xmode = x(imode), where y(xmode) >= y(x(i)) for
1302 * all i != imode
1303 * \param[out] pxvariance [optional] variance of x
1304 * \return 0 if OK, 1 on error
1305 *
1306 * <pre>
1307 * Notes:
1308 * (1) If the histogram represents the relation y(x), the
1309 * computed values that are returned are the x values.
1310 * These are NOT the bucket indices i; they are related to the
1311 * bucket indices by
1312 * x(i) = startx + i * deltax
1313 * </pre>
1314 */
1315 l_ok
1316 numaGetHistogramStats(NUMA *nahisto,
1317 l_float32 startx,
1318 l_float32 deltax,
1319 l_float32 *pxmean,
1320 l_float32 *pxmedian,
1321 l_float32 *pxmode,
1322 l_float32 *pxvariance)
1323 {
1324 if (pxmean) *pxmean = 0.0;
1325 if (pxmedian) *pxmedian = 0.0;
1326 if (pxmode) *pxmode = 0.0;
1327 if (pxvariance) *pxvariance = 0.0;
1328 if (!nahisto)
1329 return ERROR_INT("nahisto not defined", __func__, 1);
1330
1331 return numaGetHistogramStatsOnInterval(nahisto, startx, deltax, 0, -1,
1332 pxmean, pxmedian, pxmode,
1333 pxvariance);
1334 }
1335
1336
1337 /*!
1338 * \brief numaGetHistogramStatsOnInterval()
1339 *
1340 * \param[in] nahisto histogram: y(x(i)), i = 0 ... nbins - 1
1341 * \param[in] startx x value of first bin: x(0)
1342 * \param[in] deltax x increment between bins; the bin size; x(1) - x(0)
1343 * \param[in] ifirst first bin to use for collecting stats
1344 * \param[in] ilast last bin for collecting stats; -1 to go to the end
1345 * \param[out] pxmean [optional] mean value of histogram
1346 * \param[out] pxmedian [optional] median value of histogram
1347 * \param[out] pxmode [optional] mode value of histogram:
1348 * xmode = x(imode), where y(xmode) >= y(x(i)) for
1349 * all i != imode
1350 * \param[out] pxvariance [optional] variance of x
1351 * \return 0 if OK, 1 on error
1352 *
1353 * <pre>
1354 * Notes:
1355 * (1) If the histogram represents the relation y(x), the
1356 * computed values that are returned are the x values.
1357 * These are NOT the bucket indices i; they are related to the
1358 * bucket indices by
1359 * x(i) = startx + i * deltax
1360 * </pre>
1361 */
1362 l_ok
1363 numaGetHistogramStatsOnInterval(NUMA *nahisto,
1364 l_float32 startx,
1365 l_float32 deltax,
1366 l_int32 ifirst,
1367 l_int32 ilast,
1368 l_float32 *pxmean,
1369 l_float32 *pxmedian,
1370 l_float32 *pxmode,
1371 l_float32 *pxvariance)
1372 {
1373 l_int32 i, n, imax;
1374 l_float32 sum, sumval, halfsum, moment, var, x, y, ymax;
1375
1376 if (pxmean) *pxmean = 0.0;
1377 if (pxmedian) *pxmedian = 0.0;
1378 if (pxmode) *pxmode = 0.0;
1379 if (pxvariance) *pxvariance = 0.0;
1380 if (!nahisto)
1381 return ERROR_INT("nahisto not defined", __func__, 1);
1382 if (!pxmean && !pxmedian && !pxmode && !pxvariance)
1383 return ERROR_INT("nothing to compute", __func__, 1);
1384
1385 n = numaGetCount(nahisto);
1386 ifirst = L_MAX(0, ifirst);
1387 if (ilast < 0) ilast = n - 1;
1388 if (ifirst >= n)
1389 return ERROR_INT("invalid ifirst", __func__, 1);
1390 if (ilast >= n) {
1391 L_WARNING("ilast = %d is beyond max index = %d; adjusting\n",
1392 __func__, ilast, n - 1);
1393 ilast = n - 1;
1394 }
1395 if (ifirst > ilast)
1396 return ERROR_INT("ifirst > ilast", __func__, 1);
1397 for (sum = 0.0, moment = 0.0, var = 0.0, i = ifirst; i <= ilast ; i++) {
1398 x = startx + i * deltax;
1399 numaGetFValue(nahisto, i, &y);
1400 sum += y;
1401 moment += x * y;
1402 var += x * x * y;
1403 }
1404 if (sum == 0.0) {
1405 L_INFO("sum is 0\n", __func__);
1406 return 0;
1407 }
1408
1409 if (pxmean)
1410 *pxmean = moment / sum;
1411 if (pxvariance)
1412 *pxvariance = var / sum - moment * moment / (sum * sum);
1413
1414 if (pxmedian) {
1415 halfsum = sum / 2.0f;
1416 for (sumval = 0.0, i = ifirst; i <= ilast; i++) {
1417 numaGetFValue(nahisto, i, &y);
1418 sumval += y;
1419 if (sumval >= halfsum) {
1420 *pxmedian = startx + i * deltax;
1421 break;
1422 }
1423 }
1424 }
1425
1426 if (pxmode) {
1427 imax = -1;
1428 ymax = -1.0e10;
1429 for (i = ifirst; i <= ilast; i++) {
1430 numaGetFValue(nahisto, i, &y);
1431 if (y > ymax) {
1432 ymax = y;
1433 imax = i;
1434 }
1435 }
1436 *pxmode = startx + imax * deltax;
1437 }
1438
1439 return 0;
1440 }
1441
1442
1443 /*!
1444 * \brief numaMakeRankFromHistogram()
1445 *
1446 * \param[in] startx xval corresponding to first element in nay
1447 * \param[in] deltax x increment between array elements in nay
1448 * \param[in] nasy input histogram, assumed equally spaced
1449 * \param[in] npts number of points to evaluate rank function
1450 * \param[out] pnax [optional] array of x values in range
1451 * \param[out] pnay rank array of specified npts
1452 * \return 0 if OK, 1 on error
1453 */
1454 l_ok
1455 numaMakeRankFromHistogram(l_float32 startx,
1456 l_float32 deltax,
1457 NUMA *nasy,
1458 l_int32 npts,
1459 NUMA **pnax,
1460 NUMA **pnay)
1461 {
1462 l_int32 i, n;
1463 l_float32 sum, fval;
1464 NUMA *nan, *nar;
1465
1466 if (pnax) *pnax = NULL;
1467 if (!pnay)
1468 return ERROR_INT("&nay not defined", __func__, 1);
1469 *pnay = NULL;
1470 if (!nasy)
1471 return ERROR_INT("nasy not defined", __func__, 1);
1472 if ((n = numaGetCount(nasy)) == 0)
1473 return ERROR_INT("no bins in nas", __func__, 1);
1474
1475 /* Normalize and generate the rank array corresponding to
1476 * the binned histogram. */
1477 nan = numaNormalizeHistogram(nasy, 1.0);
1478 nar = numaCreate(n + 1); /* rank numa corresponding to nan */
1479 sum = 0.0;
1480 numaAddNumber(nar, sum); /* first element is 0.0 */
1481 for (i = 0; i < n; i++) {
1482 numaGetFValue(nan, i, &fval);
1483 sum += fval;
1484 numaAddNumber(nar, sum);
1485 }
1486
1487 /* Compute rank array on full range with specified
1488 * number of points and correspondence to x-values. */
1489 numaInterpolateEqxInterval(startx, deltax, nar, L_LINEAR_INTERP,
1490 startx, startx + n * deltax, npts,
1491 pnax, pnay);
1492 numaDestroy(&nan);
1493 numaDestroy(&nar);
1494 return 0;
1495 }
1496
1497
1498 /*!
1499 * \brief numaHistogramGetRankFromVal()
1500 *
1501 * \param[in] na histogram
1502 * \param[in] rval value of input sample for which we want the rank
1503 * \param[out] prank fraction of total samples below rval
1504 * \return 0 if OK, 1 on error
1505 *
1506 * <pre>
1507 * Notes:
1508 * (1) If we think of the histogram as a function y(x), normalized
1509 * to 1, for a given input value of x, this computes the
1510 * rank of x, which is the integral of y(x) from the start
1511 * value of x to the input value.
1512 * (2) This function only makes sense when applied to a Numa that
1513 * is a histogram. The values in the histogram can be ints and
1514 * floats, and are computed as floats. The rank is returned
1515 * as a float between 0.0 and 1.0.
1516 * (3) The numa parameters startx and binsize are used to
1517 * compute x from the Numa index i.
1518 * </pre>
1519 */
1520 l_ok
1521 numaHistogramGetRankFromVal(NUMA *na,
1522 l_float32 rval,
1523 l_float32 *prank)
1524 {
1525 l_int32 i, ibinval, n;
1526 l_float32 startval, binsize, binval, maxval, fractval, total, sum, val;
1527
1528 if (!prank)
1529 return ERROR_INT("prank not defined", __func__, 1);
1530 *prank = 0.0;
1531 if (!na)
1532 return ERROR_INT("na not defined", __func__, 1);
1533 numaGetParameters(na, &startval, &binsize);
1534 n = numaGetCount(na);
1535 if (rval < startval)
1536 return 0;
1537 maxval = startval + n * binsize;
1538 if (rval > maxval) {
1539 *prank = 1.0;
1540 return 0;
1541 }
1542
1543 binval = (rval - startval) / binsize;
1544 ibinval = (l_int32)binval;
1545 if (ibinval >= n) {
1546 *prank = 1.0;
1547 return 0;
1548 }
1549 fractval = binval - (l_float32)ibinval;
1550
1551 sum = 0.0;
1552 for (i = 0; i < ibinval; i++) {
1553 numaGetFValue(na, i, &val);
1554 sum += val;
1555 }
1556 numaGetFValue(na, ibinval, &val);
1557 sum += fractval * val;
1558 numaGetSum(na, &total);
1559 *prank = sum / total;
1560
1561 /* lept_stderr("binval = %7.3f, rank = %7.3f\n", binval, *prank); */
1562
1563 return 0;
1564 }
1565
1566
1567 /*!
1568 * \brief numaHistogramGetValFromRank()
1569 *
1570 * \param[in] na histogram
1571 * \param[in] rank fraction of total samples
1572 * \param[out] prval approx. to the bin value
1573 * \return 0 if OK, 1 on error
1574 *
1575 * <pre>
1576 * Notes:
1577 * (1) If we think of the histogram as a function y(x), this returns
1578 * the value x such that the integral of y(x) from the start
1579 * value to x gives the fraction 'rank' of the integral
1580 * of y(x) over all bins.
1581 * (2) This function only makes sense when applied to a Numa that
1582 * is a histogram. The values in the histogram can be ints and
1583 * floats, and are computed as floats. The val is returned
1584 * as a float, even though the buckets are of integer width.
1585 * (3) The numa parameters startx and binsize are used to
1586 * compute x from the Numa index i.
1587 * </pre>
1588 */
1589 l_ok
1590 numaHistogramGetValFromRank(NUMA *na,
1591 l_float32 rank,
1592 l_float32 *prval)
1593 {
1594 l_int32 i, n;
1595 l_float32 startval, binsize, rankcount, total, sum, fract, val;
1596
1597 if (!prval)
1598 return ERROR_INT("prval not defined", __func__, 1);
1599 *prval = 0.0;
1600 if (!na)
1601 return ERROR_INT("na not defined", __func__, 1);
1602 if (rank < 0.0) {
1603 L_WARNING("rank < 0; setting to 0.0\n", __func__);
1604 rank = 0.0;
1605 }
1606 if (rank > 1.0) {
1607 L_WARNING("rank > 1.0; setting to 1.0\n", __func__);
1608 rank = 1.0;
1609 }
1610
1611 n = numaGetCount(na);
1612 numaGetParameters(na, &startval, &binsize);
1613 numaGetSum(na, &total);
1614 rankcount = rank * total; /* count that corresponds to rank */
1615 sum = 0.0;
1616 val = 0.0;
1617 for (i = 0; i < n; i++) {
1618 numaGetFValue(na, i, &val);
1619 if (sum + val >= rankcount)
1620 break;
1621 sum += val;
1622 }
1623 if (val <= 0.0) /* can == 0 if rank == 0.0 */
1624 fract = 0.0;
1625 else /* sum + fract * val = rankcount */
1626 fract = (rankcount - sum) / val;
1627
1628 /* The use of the fraction of a bin allows a simple calculation
1629 * for the histogram value at the given rank. */
1630 *prval = startval + binsize * ((l_float32)i + fract);
1631
1632 /* lept_stderr("rank = %7.3f, val = %7.3f\n", rank, *prval); */
1633
1634 return 0;
1635 }
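/* Illustrative usage sketch (hypothetical function, not part of Leptonica):
 * round-trip between a rank and a value on a histogram (e.g., one made by
 * numaMakeHistogram()): find the value at rank 0.5 (the median), then ask
 * for the rank of that value, which should be approximately 0.5 again,
 * up to bin quantization. */
static void
exampleRankValRoundTrip(NUMA *nahisto)
{
l_float32  median, rank;

    if (!nahisto) return;
    numaHistogramGetValFromRank(nahisto, 0.5, &median);
    numaHistogramGetRankFromVal(nahisto, median, &rank);
    lept_stderr("median = %f, recovered rank = %f\n", median, rank);
}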
1636
1637
1638 /*!
1639 * \brief numaDiscretizeSortedInBins()
1640 *
1641 * \param[in] na sorted
1642 * \param[in] nbins number of equal population bins (> 1)
1643 * \param[out] pnabinval average "gray" values in each bin
1644 * \return 0 if OK, 1 on error
1645 *
1646 * <pre>
1647 * Notes:
1648 * (1) The input %na is sorted in increasing value.
1649 * (2) The output array has the following mapping:
1650 * bin number --> average array value in bin (nabinval)
1651 * (3) With %nbins == 100, nabinval is the average gray value in
1652 * each of the 100 equally populated bins. It is the function
1653 * gray[100 * rank].
1654 * Thus it is the inverse of
1655 * rank[gray]
1656 * (4) Contrast with numaDiscretizeHistoInBins(), where the input %na
1657 * is a histogram.
1658 * </pre>
1659 */
1660 l_ok
1661 numaDiscretizeSortedInBins(NUMA *na,
1662 l_int32 nbins,
1663 NUMA **pnabinval)
1664 {
1665 NUMA *nabinval; /* average gray value in the bins */
1666 NUMA *naeach;
1667 l_int32 i, ntot, bincount, binindex, binsize;
1668 l_float32 sum, val, ave;
1669
1670 if (!pnabinval)
1671 return ERROR_INT("&nabinval not defined", __func__, 1);
1672 *pnabinval = NULL;
1673 if (!na)
1674 return ERROR_INT("na not defined", __func__, 1);
1675 if (nbins < 2)
1676 return ERROR_INT("nbins must be > 1", __func__, 1);
1677
1678 /* Get the number of items in each bin */
1679 ntot = numaGetCount(na);
1680 if ((naeach = numaGetUniformBinSizes(ntot, nbins)) == NULL)
1681 return ERROR_INT("naeach not made", __func__, 1);
1682
1683 /* Get the average value in each bin */
1684 sum = 0.0;
1685 bincount = 0;
1686 binindex = 0;
1687 numaGetIValue(naeach, 0, &binsize);
1688 nabinval = numaCreate(nbins);
1689 for (i = 0; i < ntot; i++) {
1690 numaGetFValue(na, i, &val);
1691 bincount++;
1692 sum += val;
1693 if (bincount == binsize) { /* add bin entry */
1694 ave = sum / binsize;
1695 numaAddNumber(nabinval, ave);
1696 sum = 0.0;
1697 bincount = 0;
1698 binindex++;
1699 if (binindex == nbins) break;
1700 numaGetIValue(naeach, binindex, &binsize);
1701 }
1702 }
1703 *pnabinval = nabinval;
1704
1705 numaDestroy(&naeach);
1706 return 0;
1707 }
1708
1709
1710 /*!
1711 * \brief numaDiscretizeHistoInBins()
1712 *
1713 * \param[in] na histogram
1714 * \param[in] nbins number of equal population bins (> 1)
1715 * \param[out] pnabinval average "gray" values in each bin
1716 * \param[out] pnarank [optional] rank value of input histogram;
1717 * this is a cumulative norm histogram.
1718 * \return 0 if OK, 1 on error
1719 *
1720 * <pre>
1721 * Notes:
1722 * (1) With %nbins == 100, nabinval is the average gray value in
1723 * each of the 100 equally populated bins. It is the function
1724 * gray[100 * rank].
1725 * Thus it is the inverse of
1726 * rank[gray]
1727 * which is optionally returned in narank.
1728 * (2) The "gray value" is the index into the input histogram.
1729 * (3) The two output arrays give the following mappings, where the
1730 * input is an un-normalized histogram of array values:
1731 * bin number --> average array value in bin (nabinval)
1732 * array values --> cumulative normalized histogram (narank)
1733 * </pre>
1734 */
1735 l_ok
1736 numaDiscretizeHistoInBins(NUMA *na,
1737 l_int32 nbins,
1738 NUMA **pnabinval,
1739 NUMA **pnarank)
1740 {
1741 NUMA *nabinval; /* average gray value in the bins */
1742 NUMA *naeach, *nan;
1743 l_int32 i, j, nxvals, occup, count, bincount, binindex, binsize;
1744 l_float32 sum, ave, ntot;
1745
1746 if (pnarank) *pnarank = NULL;
1747 if (!pnabinval)
1748 return ERROR_INT("&nabinval not defined", __func__, 1);
1749 *pnabinval = NULL;
1750 if (!na)
1751 return ERROR_INT("na not defined", __func__, 1);
1752 if (nbins < 2)
1753 return ERROR_INT("nbins must be > 1", __func__, 1);
1754
1755 nxvals = numaGetCount(na);
1756 numaGetSum(na, &ntot);
1757 occup = ntot / nxvals;
1758 if (occup < 1) L_INFO("average occupancy %d < 1\n", __func__, occup);
1759
1760 /* Get the number of items in each bin */
1761 if ((naeach = numaGetUniformBinSizes(ntot, nbins)) == NULL)
1762 return ERROR_INT("naeach not made", __func__, 1);
1763
1764 /* Get the average value in each bin */
1765 sum = 0.0;
1766 bincount = 0;
1767 binindex = 0;
1768 numaGetIValue(naeach, 0, &binsize);
1769 nabinval = numaCreate(nbins);
1770 for (i = 0; i < nxvals; i++) {
1771 numaGetIValue(na, i, &count);
1772 for (j = 0; j < count; j++) {
1773 bincount++;
1774 sum += i;
1775 if (bincount == binsize) { /* add bin entry */
1776 ave = sum / binsize;
1777 numaAddNumber(nabinval, ave);
1778 sum = 0.0;
1779 bincount = 0;
1780 binindex++;
1781 if (binindex == nbins) break;
1782 numaGetIValue(naeach, binindex, &binsize);
1783 }
1784 }
1785 if (binindex == nbins) break;
1786 }
1787 *pnabinval = nabinval;
1788 if (binindex != nbins)
1789 L_ERROR("binindex = %d != nbins = %d\n", __func__, binindex, nbins);
1790
1791 /* Get cumulative normalized histogram (rank[gray value]).
1792 * This is the partial sum operating on the normalized histogram. */
1793 if (pnarank) {
1794 nan = numaNormalizeHistogram(na, 1.0);
1795 *pnarank = numaGetPartialSums(nan);
1796 numaDestroy(&nan);
1797 }
1798 numaDestroy(&naeach);
1799 return 0;
1800 }
1801
1802
1803 /*!
1804 * \brief numaGetRankBinValues()
1805 *
1806 * \param[in] na an array of values
1807 * \param[in] nbins number of bins at which the rank is divided
1808 * \param[out] pnam mean intensity in a bin vs rank bin value,
1809 * with %nbins of discretized rank values
1810 * \return 0 if OK, 1 on error
1811 *
1812 * <pre>
1813 * Notes:
1814 * (1) Simple interface for getting a binned rank representation
1815 * of an input array of values. This returns:
1816 * rank bin number --> average array value in each rank bin (nam)
1817 * (2) Bins using either a sorted array or a histogram, depending on
1818 * the values in the array and the size of the array.
1819 * </pre>
1820 */
1821 l_ok
1822 numaGetRankBinValues(NUMA *na,
1823 l_int32 nbins,
1824 NUMA **pnam)
1825 {
1826 NUMA *na1;
1827 l_int32 maxbins, type;
1828 l_float32 maxval, delx;
1829
1830 if (!pnam)
1831 return ERROR_INT("&pnam not defined", __func__, 1);
1832 *pnam = NULL;
1833 if (!na)
1834 return ERROR_INT("na not defined", __func__, 1);
1835 if (numaGetCount(na) == 0)
1836 return ERROR_INT("na is empty", __func__, 1);
1837 if (nbins < 2)
1838 return ERROR_INT("nbins must be > 1", __func__, 1);
1839
1840 /* Choose between sorted array and a histogram.
1841 * If the input array has a small number of numbers with
1842 * a large maximum, we will sort it. At the other extreme, if
1843 * the array has many numbers with a small maximum, such as the
1844 * values of pixels in an 8 bpp grayscale image, generate a histogram.
1845 * If type comes back as L_BIN_SORT, use a histogram. */
1846 type = numaChooseSortType(na);
1847 if (type == L_SHELL_SORT) { /* sort the array */
1848 L_INFO("sort the array: input size = %d\n", __func__, numaGetCount(na));
1849 na1 = numaSort(NULL, na, L_SORT_INCREASING);
1850 numaDiscretizeSortedInBins(na1, nbins, pnam);
1851 numaDestroy(&na1);
1852 return 0;
1853 }
1854
1855 /* Make the histogram. Assuming there are no negative values
1856 * in the array, if the max value in the array does not exceed
1857 * about 100000, the bin size for generating the histogram will
1858 * be 1; maxbins refers to the number of entries in the histogram. */
1859 L_INFO("use a histogram: input size = %d\n", __func__, numaGetCount(na));
1860 numaGetMax(na, &maxval, NULL);
1861 maxbins = L_MIN(100002, (l_int32)maxval + 2);
1862 na1 = numaMakeHistogram(na, maxbins, NULL, NULL);
1863
1864 /* Warn if there is a scale change. This shouldn't happen
1865 * unless the max value is above 100000. */
1866 numaGetParameters(na1, NULL, &delx);
1867 if (delx > 1.0)
1868 L_WARNING("scale change: delx = %6.2f\n", __func__, delx);
1869
1870 /* Rank bin the results */
1871 numaDiscretizeHistoInBins(na1, nbins, pnam, NULL);
1872 numaDestroy(&na1);
1873 return 0;
1874 }
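
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleRankBinValues is invented).  numaGetRankBinValues() reduces an
 * array to %nbins rank-binned means.  For the ramp 0 ... 99 with nbins = 4,
 * each rank bin holds 25 values and the returned means are 12, 37, 62, 87. */
static void
exampleRankBinValues(void)
{
    l_int32    i;
    l_float32  mean;
    NUMA      *na, *nam;

    na = numaCreate(100);
    for (i = 0; i < 100; i++)
        numaAddNumber(na, i);
    if (numaGetRankBinValues(na, 4, &nam) == 0) {
        for (i = 0; i < numaGetCount(nam); i++) {
            numaGetFValue(nam, i, &mean);
            lept_stderr("rank bin %d: mean = %5.1f\n", i, mean);
        }
        numaDestroy(&nam);
    }
    numaDestroy(&na);
}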
1875
1876
1877 /*!
1878 * \brief numaGetUniformBinSizes()
1879 *
1880 * \param[in] ntotal number of values to be split up
1881 * \param[in] nbins number of bins
1882 * \return naeach number of values to go in each bin, or NULL on error
1883 *
1884 * <pre>
1885 * Notes:
1886 * (1) The numbers in the bins can differ by 1. The sum of
1887 * bin numbers in %naeach is %ntotal.
1888 * </pre>
1889 */
1890 NUMA *
1891 numaGetUniformBinSizes(l_int32 ntotal,
1892 l_int32 nbins)
1893 {
1894 l_int32 i, start, end;
1895 NUMA *naeach;
1896
1897 if (ntotal <= 0)
1898 return (NUMA *)ERROR_PTR("ntotal <= 0", __func__, NULL);
1899 if (nbins <= 0)
1900 return (NUMA *)ERROR_PTR("nbins <= 0", __func__, NULL);
1901
1902 if ((naeach = numaCreate(nbins)) == NULL)
1903 return (NUMA *)ERROR_PTR("naeach not made", __func__, NULL);
1904
1905 if (ntotal < nbins) { /* put 1 in each of %ntotal bins */
1906 for (i = 0; i < ntotal; i++)
1907 numaAddNumber(naeach, 1);
1908 return naeach;
1909 }
1910
1911 start = 0;
1912 for (i = 0; i < nbins; i++) {
1913 end = ntotal * (i + 1) / nbins;
1914 numaAddNumber(naeach, end - start);
1915 start = end;
1916 }
1917 return naeach;
1918 }
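
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleUniformBinSizes is invented).  For ntotal = 10 and nbins = 3 the
 * loop above computes end = 3, 6, 10, so the bin sizes are {3, 3, 4}: they
 * differ by at most 1 and sum to ntotal. */
static void
exampleUniformBinSizes(void)
{
    l_int32  i, size;
    NUMA    *naeach;

    naeach = numaGetUniformBinSizes(10, 3);
    for (i = 0; i < numaGetCount(naeach); i++) {
        numaGetIValue(naeach, i, &size);
        lept_stderr("bin %d holds %d values\n", i, size);   /* 3, 3, 4 */
    }
    numaDestroy(&naeach);
}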
1919
1920
1921 /*----------------------------------------------------------------------*
1922 * Splitting a distribution *
1923 *----------------------------------------------------------------------*/
1924 /*!
1925 * \brief numaSplitDistribution()
1926 *
1927 * \param[in] na histogram
1928 * \param[in] scorefract fraction of the max score, used to determine
1929 * range over which the histogram min is searched
1930 * \param[out] psplitindex [optional] index for splitting
1931 * \param[out] pave1 [optional] average of lower distribution
1932 * \param[out] pave2 [optional] average of upper distribution
1933 * \param[out] pnum1 [optional] population of lower distribution
1934 * \param[out] pnum2 [optional] population of upper distribution
1935 * \param[out] pnascore [optional] for debugging; otherwise use NULL
1936 * \return 0 if OK, 1 on error
1937 *
1938 * <pre>
1939 * Notes:
1940 * (1) This function is intended to be used on a distribution of
1941 * values that represent two sets, such as a histogram of
1942 * pixel values for an image with a fg and bg, and the goal
1943 * is to determine the averages of the two sets and the
1944 * best splitting point.
1945 * (2) The Otsu method finds a split point that divides the distribution
1946 * into two parts by maximizing a score function that is the
1947 * product of two terms:
1948 * (a) the square of the difference of centroids, (ave1 - ave2)^2
1949 * (b) fract1 * (1 - fract1)
1950 * where fract1 is the fraction in the lower distribution.
1951 * (3) This works well for images where the fg and bg are
1952 * each relatively homogeneous and well-separated in color.
1953 * However, if the actual fg and bg sets are very different
1954 * in size, and the bg is highly varied, as can occur in some
1955 * scanned document images, this will bias the split point
1956 * into the larger "bump" (i.e., toward the point where the
1957 * (b) term reaches its maximum of 0.25 at fract1 = 0.5).
1958 * To avoid this, we define a range of values near the
1959 * maximum of the score function, and choose the value within
1960 * this range such that the histogram itself has a minimum value.
1961 * The range is determined by scorefract: we include all abscissa
1962 * values to the left and right of the value that maximizes the
1963 * score, such that the score stays above (1 - scorefract) * maxscore.
1964 * The intuition behind this modification is to try to find
1965 * a split point that both has a high variance score and is
1966 * at or near a minimum in the histogram, so that the histogram
1967 * slope is small at the split point.
1968 * (4) We normalize the score so that if the two distributions
1969 * were of equal size and at opposite ends of the numa, the
1970 * score would be 1.0.
1971 * </pre>
1972 */
1973 l_ok
1974 numaSplitDistribution(NUMA *na,
1975 l_float32 scorefract,
1976 l_int32 *psplitindex,
1977 l_float32 *pave1,
1978 l_float32 *pave2,
1979 l_float32 *pnum1,
1980 l_float32 *pnum2,
1981 NUMA **pnascore)
1982 {
1983 l_int32 i, n, bestsplit, minrange, maxrange, maxindex;
1984 l_float32 ave1, ave2, ave1prev, ave2prev;
1985 l_float32 num1, num2, num1prev, num2prev;
1986 l_float32 val, minval, sum, fract1;
1987 l_float32 norm, score, minscore, maxscore;
1988 NUMA *nascore, *naave1, *naave2, *nanum1, *nanum2;
1989
1990 if (psplitindex) *psplitindex = 0;
1991 if (pave1) *pave1 = 0.0;
1992 if (pave2) *pave2 = 0.0;
1993 if (pnum1) *pnum1 = 0.0;
1994 if (pnum2) *pnum2 = 0.0;
1995 if (pnascore) *pnascore = NULL;
1996 if (!na)
1997 return ERROR_INT("na not defined", __func__, 1);
1998
1999 n = numaGetCount(na);
2000 if (n <= 1)
2001 return ERROR_INT("histogram size n < 2", __func__, 1);
2002 numaGetSum(na, &sum);
2003 if (sum <= 0.0)
2004 return ERROR_INT("sum <= 0.0", __func__, 1);
2005 norm = 4.0f / ((l_float32)(n - 1) * (n - 1));
2006 ave1prev = 0.0;
2007 numaGetHistogramStats(na, 0.0, 1.0, &ave2prev, NULL, NULL, NULL);
2008 num1prev = 0.0;
2009 num2prev = sum;
2010 maxindex = n / 2; /* initialize with something */
2011
2012 /* Split the histogram with [0 ... i] in the lower part
2013 * and [i+1 ... n-1] in upper part. First, compute an otsu
2014 * score for each possible splitting. */
2015 if ((nascore = numaCreate(n)) == NULL)
2016 return ERROR_INT("nascore not made", __func__, 1);
2017 naave1 = (pave1) ? numaCreate(n) : NULL;
2018 naave2 = (pave2) ? numaCreate(n) : NULL;
2019 nanum1 = (pnum1) ? numaCreate(n) : NULL;
2020 nanum2 = (pnum2) ? numaCreate(n) : NULL;
2021 maxscore = 0.0;
2022 for (i = 0; i < n; i++) {
2023 numaGetFValue(na, i, &val);
2024 num1 = num1prev + val;
2025 if (num1 == 0)
2026 ave1 = ave1prev;
2027 else
2028 ave1 = (num1prev * ave1prev + i * val) / num1;
2029 num2 = num2prev - val;
2030 if (num2 == 0)
2031 ave2 = ave2prev;
2032 else
2033 ave2 = (num2prev * ave2prev - i * val) / num2;
2034 fract1 = num1 / sum;
2035 score = norm * (fract1 * (1 - fract1)) * (ave2 - ave1) * (ave2 - ave1);
2036 numaAddNumber(nascore, score);
2037 if (pave1) numaAddNumber(naave1, ave1);
2038 if (pave2) numaAddNumber(naave2, ave2);
2039 if (pnum1) numaAddNumber(nanum1, num1);
2040 if (pnum2) numaAddNumber(nanum2, num2);
2041 if (score > maxscore) {
2042 maxscore = score;
2043 maxindex = i;
2044 }
2045 num1prev = num1;
2046 num2prev = num2;
2047 ave1prev = ave1;
2048 ave2prev = ave2;
2049 }
2050
2051 /* Next, for all contiguous scores within a specified fraction
2052 * of the max, choose the split point as the value with the
2053 * minimum in the histogram. */
2054 minscore = (1.f - scorefract) * maxscore;
2055 for (i = maxindex - 1; i >= 0; i--) {
2056 numaGetFValue(nascore, i, &val);
2057 if (val < minscore)
2058 break;
2059 }
2060 minrange = i + 1;
2061 for (i = maxindex + 1; i < n; i++) {
2062 numaGetFValue(nascore, i, &val);
2063 if (val < minscore)
2064 break;
2065 }
2066 maxrange = i - 1;
2067 numaGetFValue(na, minrange, &minval);
2068 bestsplit = minrange;
2069 for (i = minrange + 1; i <= maxrange; i++) {
2070 numaGetFValue(na, i, &val);
2071 if (val < minval) {
2072 minval = val;
2073 bestsplit = i;
2074 }
2075 }
2076
2077 /* Add one to the bestsplit value to get the threshold value,
2078 * because when we take a threshold, as in pixThresholdToBinary(),
2079 * we always choose the set with values below the threshold. */
2080 bestsplit = L_MIN(255, bestsplit + 1);
2081
2082 if (psplitindex) *psplitindex = bestsplit;
2083 if (pave1) numaGetFValue(naave1, bestsplit, pave1);
2084 if (pave2) numaGetFValue(naave2, bestsplit, pave2);
2085 if (pnum1) numaGetFValue(nanum1, bestsplit, pnum1);
2086 if (pnum2) numaGetFValue(nanum2, bestsplit, pnum2);
2087
2088 if (pnascore) { /* debug mode */
2089 lept_stderr("minrange = %d, maxrange = %d\n", minrange, maxrange);
2090 lept_stderr("minval = %10.0f\n", minval);
2091 gplotSimple1(nascore, GPLOT_PNG, "/tmp/lept/nascore",
2092 "Score for split distribution");
2093 *pnascore = nascore;
2094 } else {
2095 numaDestroy(&nascore);
2096 }
2097
2098 if (pave1) numaDestroy(&naave1);
2099 if (pave2) numaDestroy(&naave2);
2100 if (pnum1) numaDestroy(&nanum1);
2101 if (pnum2) numaDestroy(&nanum2);
2102 return 0;
2103 }
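
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleSplitDistribution and the synthetic histogram are invented).  The
 * histogram has a small foreground bump near 40 and a larger background
 * bump near 200; the returned split index should land in the valley between
 * them, with ave1 and ave2 near the two bump centers. */
static void
exampleSplitDistribution(void)
{
    l_int32    i, split;
    l_float32  ave1, ave2;
    NUMA      *na;

    na = numaCreate(256);
    for (i = 0; i < 256; i++) {
        if (i >= 30 && i <= 50)          /* foreground bump */
            numaAddNumber(na, 100);
        else if (i >= 180 && i <= 220)   /* background bump */
            numaAddNumber(na, 400);
        else
            numaAddNumber(na, 1);
    }
    numaSplitDistribution(na, 0.1, &split, &ave1, &ave2, NULL, NULL, NULL);
    lept_stderr("split = %d, ave1 = %6.1f, ave2 = %6.1f\n", split, ave1, ave2);
    numaDestroy(&na);
}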
2104
2105
2106 /*----------------------------------------------------------------------*
2107 * Comparing histograms *
2108 *----------------------------------------------------------------------*/
2109 /*!
2110 * \brief grayHistogramsToEMD()
2111 *
2112 * \param[in] naa1, naa2 two numaa, each with one or more 256-element
2113 * histograms
2114 * \param[out] pnad nad of EM distances for each histogram
2115 * \return 0 if OK, 1 on error
2116 *
2117 * <pre>
2118 * Notes:
2119 * (1) The two numaas must be the same size and have corresponding
2120 * 256-element histograms. Pairs do not need to be normalized
2121 * to the same sum.
2122 * (2) This is typically used on two sets of histograms from
2123 * corresponding tiles of two images. The similarity of two
2124 * images can be found with the scoring function used in
2125 * pixCompareGrayByHisto():
2126 * score S = 1.0 - k * D, where
2127 * k is a constant, say in the range 5-10
2128 * D = EMD
2129 * for each tile; for multiple tiles, take the Min(S) over
2130 * the set of tiles to be the final score.
2131 * </pre>
2132 */
2133 l_ok
2134 grayHistogramsToEMD(NUMAA *naa1,
2135 NUMAA *naa2,
2136 NUMA **pnad)
2137 {
2138 l_int32 i, n, nt;
2139 l_float32 dist;
2140 NUMA *na1, *na2, *nad;
2141
2142 if (!pnad)
2143 return ERROR_INT("&nad not defined", __func__, 1);
2144 *pnad = NULL;
2145 if (!naa1 || !naa2)
2146 return ERROR_INT("naa1 and naa2 not both defined", __func__, 1);
2147 n = numaaGetCount(naa1);
2148 if (n != numaaGetCount(naa2))
2149 return ERROR_INT("naa1 and naa2 numa counts differ", __func__, 1);
2150 nt = numaaGetNumberCount(naa1);
2151 if (nt != numaaGetNumberCount(naa2))
2152 return ERROR_INT("naa1 and naa2 number counts differ", __func__, 1);
2153 if (256 * n != nt) /* good enough check */
2154 return ERROR_INT("na sizes must be 256", __func__, 1);
2155
2156 nad = numaCreate(n);
2157 *pnad = nad;
2158 for (i = 0; i < n; i++) {
2159 na1 = numaaGetNuma(naa1, i, L_CLONE);
2160 na2 = numaaGetNuma(naa2, i, L_CLONE);
2161 numaEarthMoverDistance(na1, na2, &dist);
2162 numaAddNumber(nad, dist / 255.f); /* normalize to [0.0 - 1.0] */
2163 numaDestroy(&na1);
2164 numaDestroy(&na2);
2165 }
2166 return 0;
2167 }
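
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleHistogramEMD, the synthetic histograms, and the constant k = 10
 * are invented).  It builds one pair of 256-bin histograms, computes the
 * normalized EMD, and forms the similarity score S = 1.0 - k * D from
 * note (2) above. */
static void
exampleHistogramEMD(void)
{
    l_int32    i;
    l_float32  dist, score;
    NUMA      *na1, *na2, *nad;
    NUMAA     *naa1, *naa2;

    na1 = numaCreate(256);
    na2 = numaCreate(256);
    for (i = 0; i < 256; i++) {   /* two slightly shifted gray histograms */
        numaAddNumber(na1, (i >= 100 && i < 140) ? 50 : 1);
        numaAddNumber(na2, (i >= 110 && i < 150) ? 50 : 1);
    }
    naa1 = numaaCreate(1);
    naa2 = numaaCreate(1);
    numaaAddNuma(naa1, na1, L_INSERT);
    numaaAddNuma(naa2, na2, L_INSERT);
    grayHistogramsToEMD(naa1, naa2, &nad);
    numaGetFValue(nad, 0, &dist);       /* EMD, normalized to [0.0 - 1.0] */
    score = 1.0f - 10.0f * dist;
    lept_stderr("D = %6.3f, S = %6.3f\n", dist, score);
    numaDestroy(&nad);
    numaaDestroy(&naa1);
    numaaDestroy(&naa2);
}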
2168
2169
2170 /*!
2171 * \brief numaEarthMoverDistance()
2172 *
2173 * \param[in] na1, na2 two numas of the same size, typically histograms
2174 * \param[out] pdist earthmover distance
2175 * \return 0 if OK, 1 on error
2176 *
2177 * <pre>
2178 * Notes:
2179 * (1) The two numas must have the same size. They do not need to be
2180 * normalized to the same sum before applying the function.
2181 * (2) For a 1D discrete function, the implementation of the EMD
2182 * is trivial. Just keep filling or emptying buckets in one numa
2183 * to match the amount in the other, moving sequentially along
2184 * both arrays.
2185 * (3) We divide the sum of the absolute value of everything moved
2186 * (by 1 unit at a time) by the sum of the numa (amount of "earth")
2187 * to get the average distance that the "earth" was moved.
2188 * This is the value returned here.
2189 * (4) The caller can do a further normalization, by the number of
2190 * buckets (minus 1), to get the EM distance as a fraction of
2191 * the maximum possible distance, which is n-1. This fraction
2192 * is 1.0 for the situation where all the 'earth' in the first
2193 * array is at one end, and all in the second array is at the
2194 * other end.
2195 * </pre>
2196 */
2197 l_ok
2198 numaEarthMoverDistance(NUMA *na1,
2199 NUMA *na2,
2200 l_float32 *pdist)
2201 {
2202 l_int32 n, norm, i;
2203 l_float32 sum1, sum2, diff, total;
2204 l_float32 *array1, *array3;
2205 NUMA *na3;
2206
2207 if (!pdist)
2208 return ERROR_INT("&dist not defined", __func__, 1);
2209 *pdist = 0.0;
2210 if (!na1 || !na2)
2211 return ERROR_INT("na1 and na2 not both defined", __func__, 1);
2212 n = numaGetCount(na1);
2213 if (n != numaGetCount(na2))
2214 return ERROR_INT("na1 and na2 have different size", __func__, 1);
2215
2216 /* Generate na3; normalize to na1 if necessary */
2217 numaGetSum(na1, &sum1);
2218 numaGetSum(na2, &sum2);
2219 norm = (L_ABS(sum1 - sum2) < 0.00001 * L_ABS(sum1)) ? 1 : 0;
2220 if (!norm)
2221 na3 = numaTransform(na2, 0, sum1 / sum2);
2222 else
2223 na3 = numaCopy(na2);
2224 array1 = numaGetFArray(na1, L_NOCOPY);
2225 array3 = numaGetFArray(na3, L_NOCOPY);
2226
2227 /* Move earth in na3 between adjacent array elements, to match na1 */
2228 total = 0;
2229 for (i = 1; i < n; i++) {
2230 diff = array1[i - 1] - array3[i - 1];
2231 array3[i] -= diff;
2232 total += L_ABS(diff);
2233 }
2234 *pdist = total / sum1;
2235
2236 numaDestroy(&na3);
2237 return 0;
2238 }
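
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleEarthMover is invented).  With na1 = {1, 0, 0} and na2 = {0, 0, 1},
 * one unit of "earth" must be moved two buckets, so the returned distance
 * is 2.0.  Dividing by (n - 1) = 2, as in note (4), gives the maximum
 * possible normalized value of 1.0. */
static void
exampleEarthMover(void)
{
    l_float32  dist;
    NUMA      *na1, *na2;

    na1 = numaCreate(3);
    na2 = numaCreate(3);
    numaAddNumber(na1, 1);  numaAddNumber(na1, 0);  numaAddNumber(na1, 0);
    numaAddNumber(na2, 0);  numaAddNumber(na2, 0);  numaAddNumber(na2, 1);
    numaEarthMoverDistance(na1, na2, &dist);
    lept_stderr("EMD = %5.2f\n", dist);   /* 2.00 */
    numaDestroy(&na1);
    numaDestroy(&na2);
}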
2239
2240
2241 /*!
2242 * \brief grayInterHistogramStats()
2243 *
2244 * \param[in] naa numaa with two or more 256-element histograms
2245 * \param[in] wc half-width of the smoothing window
2246 * \param[out] pnam [optional] mean values
2247 * \param[out] pnams [optional] mean square values
2248 * \param[out] pnav [optional] variances
2249 * \param[out] pnarv [optional] rms deviations from the mean
2250 * \return 0 if OK, 1 on error
2251 *
2252 * <pre>
2253 * Notes:
2254 * (1) The %naa has two or more 256-element numa histograms, which
2255 * are to be compared value-wise at each of the 256 gray levels.
2256 * The results are stats (mean, mean square, variance, root variance)
2257 * aggregated across the set of histograms, and each is output
2258 * as a 256 entry numa. Think of these histograms as a matrix,
2259 * where each histogram is one row of the array. The stats are
2260 * then aggregated column-wise, between the histograms.
2261 * (2) These stats are:
2262 * ~ average value: <v> (nam)
2263 * ~ average squared value: <v*v> (nams)
2264 * ~ variance: <(v - <v>)*(v - <v>)> = <v*v> - <v>*<v> (nav)
2265 * ~ square-root of variance: (narv)
2266 * where the brackets < .. > indicate that the average value is
2267 * to be taken over each column of the array.
2268 * (3) The input histograms are optionally smoothed before these
2269 * statistical operations.
2270 * (4) The input histograms are normalized to a sum of 10000. By
2271 * doing this, the resulting numbers are independent of the
2272 * number of samples used in building the individual histograms.
2273 * (5) A typical application is on a set of histograms from tiles
2274 * of an image, to distinguish between text/tables and photo
2275 * regions. If the tiles are much larger than the text line
2276 * spacing, text/table regions typically have smaller variance
2277 * across tiles than photo regions. For this application, it
2278 * may be useful to ignore values near white, which are large for
2279 * text and would magnify the variance due to variations in
2280 * illumination. However, because the variance of a drawing or
2281 * a light photo can be similar to that of grayscale text, this
2282 * function is only a discriminator between darker photos/drawings
2283 * and light photos/text/line-graphics.
2284 * </pre>
2285 */
2286 l_ok
2287 grayInterHistogramStats(NUMAA *naa,
2288 l_int32 wc,
2289 NUMA **pnam,
2290 NUMA **pnams,
2291 NUMA **pnav,
2292 NUMA **pnarv)
2293 {
2294 l_int32 i, j, n, nn;
2295 l_float32 **arrays;
2296 l_float32 mean, var, rvar;
2297 NUMA *na1, *na2, *na3, *na4;
2298
2299 if (pnam) *pnam = NULL;
2300 if (pnams) *pnams = NULL;
2301 if (pnav) *pnav = NULL;
2302 if (pnarv) *pnarv = NULL;
2303 if (!pnam && !pnams && !pnav && !pnarv)
2304 return ERROR_INT("nothing requested", __func__, 1);
2305 if (!naa)
2306 return ERROR_INT("naa not defined", __func__, 1);
2307 n = numaaGetCount(naa);
2308 for (i = 0; i < n; i++) {
2309 nn = numaaGetNumaCount(naa, i);
2310 if (nn != 256) {
2311 L_ERROR("%d numbers in numa[%d]\n", __func__, nn, i);
2312 return 1;
2313 }
2314 }
2315
2316 if (pnam) *pnam = numaCreate(256);
2317 if (pnams) *pnams = numaCreate(256);
2318 if (pnav) *pnav = numaCreate(256);
2319 if (pnarv) *pnarv = numaCreate(256);
2320
2321 /* First, use mean smoothing, normalize each histogram,
2322 * and save all results in a 2D matrix. */
2323 arrays = (l_float32 **)LEPT_CALLOC(n, sizeof(l_float32 *));
2324 for (i = 0; i < n; i++) {
2325 na1 = numaaGetNuma(naa, i, L_CLONE);
2326 na2 = numaWindowedMean(na1, wc);
2327 na3 = numaNormalizeHistogram(na2, 10000.);
2328 arrays[i] = numaGetFArray(na3, L_COPY);
2329 numaDestroy(&na1);
2330 numaDestroy(&na2);
2331 numaDestroy(&na3);
2332 }
2333
2334 /* Get stats between histograms */
2335 for (j = 0; j < 256; j++) {
2336 na4 = numaCreate(n);
2337 for (i = 0; i < n; i++) {
2338 numaAddNumber(na4, arrays[i][j]);
2339 }
2340 numaSimpleStats(na4, 0, -1, &mean, &var, &rvar);
2341 if (pnam) numaAddNumber(*pnam, mean);
2342 if (pnams) numaAddNumber(*pnams, mean * mean);
2343 if (pnav) numaAddNumber(*pnav, var);
2344 if (pnarv) numaAddNumber(*pnarv, rvar);
2345 numaDestroy(&na4);
2346 }
2347
2348 for (i = 0; i < n; i++)
2349 LEPT_FREE(arrays[i]);
2350 LEPT_FREE(arrays);
2351 return 0;
2352 }
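
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleInterHistogramStats and the synthetic tile histograms are
 * invented).  Three 256-bin histograms with bumps at different gray levels
 * stand in for tiles of an image; the rms deviation output (narv) is small
 * where the tiles agree and large where they differ. */
static void
exampleInterHistogramStats(void)
{
    l_int32    i, j;
    l_float32  rv;
    NUMA      *na, *narv;
    NUMAA     *naa;

    naa = numaaCreate(3);
    for (i = 0; i < 3; i++) {
        na = numaCreate(256);
        for (j = 0; j < 256; j++)   /* bump location shifts with the tile */
            numaAddNumber(na, (j >= 100 + 20 * i && j < 140 + 20 * i) ? 50 : 1);
        numaaAddNuma(naa, na, L_INSERT);
    }
    grayInterHistogramStats(naa, 2, NULL, NULL, NULL, &narv);
    numaGetFValue(narv, 120, &rv);
    lept_stderr("rms deviation between tiles at level 120: %6.2f\n", rv);
    numaDestroy(&narv);
    numaaDestroy(&naa);
}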
2353
2354
2355 /*----------------------------------------------------------------------*
2356 * Extrema finding *
2357 *----------------------------------------------------------------------*/
2358 /*!
2359 * \brief numaFindPeaks()
2360 *
2361 * \param[in] nas source numa
2362 * \param[in] nmax max number of peaks to be found
2363 * \param[in] fract1 min fraction of peak value
2364 * \param[in] fract2 min slope
2365 * \return peak na, or NULL on error.
2366 *
2367 * <pre>
2368 * Notes:
2369 * (1) The returned na consists of sets of four numbers representing
2370 * the peak, in the following order:
2371 * left edge; peak center; right edge; normalized peak area
2372 * </pre>
2373 */
2374 NUMA *
2375 numaFindPeaks(NUMA *nas,
2376 l_int32 nmax,
2377 l_float32 fract1,
2378 l_float32 fract2)
2379 {
2380 l_int32 i, k, n, maxloc, lloc, rloc;
2381 l_float32 fmaxval, sum, total, newtotal, val, lastval;
2382 l_float32 peakfract;
2383 NUMA *na, *napeak;
2384
2385 if (!nas)
2386 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
2387 n = numaGetCount(nas);
2388 numaGetSum(nas, &total);
2389
2390 /* We munge this copy */
2391 if ((na = numaCopy(nas)) == NULL)
2392 return (NUMA *)ERROR_PTR("na not made", __func__, NULL);
2393 if ((napeak = numaCreate(4 * nmax)) == NULL) {
2394 numaDestroy(&na);
2395 return (NUMA *)ERROR_PTR("napeak not made", __func__, NULL);
2396 }
2397
2398 for (k = 0; k < nmax; k++) {
2399 numaGetSum(na, &newtotal);
2400 if (newtotal == 0.0) /* sanity check */
2401 break;
2402 numaGetMax(na, &fmaxval, &maxloc);
2403 sum = fmaxval;
2404 lastval = fmaxval;
2405 lloc = 0;
2406 for (i = maxloc - 1; i >= 0; --i) {
2407 numaGetFValue(na, i, &val);
2408 if (val == 0.0) {
2409 lloc = i + 1;
2410 break;
2411 }
2412 if (val > fract1 * fmaxval) {
2413 sum += val;
2414 lastval = val;
2415 continue;
2416 }
2417 if (lastval - val > fract2 * lastval) {
2418 sum += val;
2419 lastval = val;
2420 continue;
2421 }
2422 lloc = i;
2423 break;
2424 }
2425 lastval = fmaxval;
2426 rloc = n - 1;
2427 for (i = maxloc + 1; i < n; ++i) {
2428 numaGetFValue(na, i, &val);
2429 if (val == 0.0) {
2430 rloc = i - 1;
2431 break;
2432 }
2433 if (val > fract1 * fmaxval) {
2434 sum += val;
2435 lastval = val;
2436 continue;
2437 }
2438 if (lastval - val > fract2 * lastval) {
2439 sum += val;
2440 lastval = val;
2441 continue;
2442 }
2443 rloc = i;
2444 break;
2445 }
2446 peakfract = sum / total;
2447 numaAddNumber(napeak, lloc);
2448 numaAddNumber(napeak, maxloc);
2449 numaAddNumber(napeak, rloc);
2450 numaAddNumber(napeak, peakfract);
2451
2452 for (i = lloc; i <= rloc; i++)
2453 numaSetValue(na, i, 0.0);
2454 }
2455
2456 numaDestroy(&na);
2457 return napeak;
2458 }
2459
2460
2461 /*!
2462 * \brief numaFindExtrema()
2463 *
2464 * \param[in] nas input values
2465 * \param[in] delta relative amount to resolve peaks and valleys
2466 * \param[out] pnav [optional] values of extrema
2467 * \return nad locations of extrema, or NULL on error
2468 *
2469 * <pre>
2470 * Notes:
2471 * (1) This returns a sequence of extrema (peaks and valleys).
2472 * (2) The algorithm is analogous to that for determining
2473 * mountain peaks. Suppose we have a local peak, with
2474 * bumps on the side. Under what conditions can we consider
2475 * those 'bumps' to be actual peaks? The answer: if the
2476 * bump is separated from the peak by a saddle that is at
2477 * least 500 feet below the bump.
2478 * (3) Operationally, suppose we are trying to identify a peak.
2479 * We have a previous valley, and also the largest value that
2480 * we have seen since that valley. We can identify this as
2481 * a peak if we find a value that is delta BELOW it. When
2482 * we find such a value, label the peak, use the current
2483 * value to label the starting point for the search for
2484 * a valley, and do the same operation in reverse. Namely,
2485 * keep track of the lowest point seen, and look for a value
2486 * that is delta ABOVE it. Once found, the lowest point is
2487 * labeled the valley, and continue, looking for the next peak.
2488 * </pre>
2489 */
2490 NUMA *
2491 numaFindExtrema(NUMA *nas,
2492 l_float32 delta,
2493 NUMA **pnav)
2494 {
2495 l_int32 i, n, found, loc, direction;
2496 l_float32 startval, val, maxval, minval;
2497 NUMA *nav, *nad;
2498
2499 if (pnav) *pnav = NULL;
2500 if (!nas)
2501 return (NUMA *)ERROR_PTR("nas not defined", __func__, NULL);
2502 if (delta < 0.0)
2503 return (NUMA *)ERROR_PTR("delta < 0", __func__, NULL);
2504
2505 n = numaGetCount(nas);
2506 nad = numaCreate(0);
2507 nav = NULL;
2508 if (pnav) {
2509 nav = numaCreate(0);
2510 *pnav = nav;
2511 }
2512
2513 /* We don't know if we'll find a peak or valley first,
2514 * but use the first element of nas as the reference point.
2515 * Break when we deviate by 'delta' from the first point. */
2516 numaGetFValue(nas, 0, &startval);
2517 found = FALSE;
2518 for (i = 1; i < n; i++) {
2519 numaGetFValue(nas, i, &val);
2520 if (L_ABS(val - startval) >= delta) {
2521 found = TRUE;
2522 break;
2523 }
2524 }
2525
2526 if (!found)
2527 return nad; /* it's empty */
2528
2529 /* Are we looking for a peak or a valley? */
2530 if (val > startval) { /* peak */
2531 direction = 1;
2532 maxval = val;
2533 } else {
2534 direction = -1;
2535 minval = val;
2536 }
2537 loc = i;
2538
2539 /* Sweep through the rest of the array, recording alternating
2540 * peak/valley extrema. */
2541 for (i = i + 1; i < n; i++) {
2542 numaGetFValue(nas, i, &val);
2543 if (direction == 1 && val > maxval ) { /* new local max */
2544 maxval = val;
2545 loc = i;
2546 } else if (direction == -1 && val < minval ) { /* new local min */
2547 minval = val;
2548 loc = i;
2549 } else if (direction == 1 && (maxval - val >= delta)) {
2550 numaAddNumber(nad, loc); /* save the current max location */
2551 if (nav) numaAddNumber(nav, maxval);
2552 direction = -1; /* reverse: start looking for a min */
2553 minval = val;
2554 loc = i; /* current min location */
2555 } else if (direction == -1 && (val - minval >= delta)) {
2556 numaAddNumber(nad, loc); /* save the current min location */
2557 if (nav) numaAddNumber(nav, minval);
2558 direction = 1; /* reverse: start looking for a max */
2559 maxval = val;
2560 loc = i; /* current max location */
2561 }
2562 }
2563
2564 /* Save the final extremum */
2565 /* numaAddNumber(nad, loc); */
2566 return nad;
2567 }
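
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleFindExtrema is invented).  For the input {0, 5, 1, 6, 2} with
 * delta = 3.0, the alternating extrema are a peak at index 1 (value 5), a
 * valley at index 2 (value 1) and a peak at index 3 (value 6), so nad is
 * {1, 2, 3}.  The trailing value at index 4 is not recorded as an extremum
 * (see the commented-out line above). */
static void
exampleFindExtrema(void)
{
    l_int32    i, loc;
    l_float32  vals[5] = {0., 5., 1., 6., 2.};
    NUMA      *nas, *nad, *nav;

    nas = numaCreate(5);
    for (i = 0; i < 5; i++)
        numaAddNumber(nas, vals[i]);
    nad = numaFindExtrema(nas, 3.0, &nav);
    for (i = 0; i < numaGetCount(nad); i++) {
        numaGetIValue(nad, i, &loc);
        lept_stderr("extremum %d at index %d\n", i, loc);
    }
    numaDestroy(&nas);
    numaDestroy(&nad);
    numaDestroy(&nav);
}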
2568
2569
2570 /*!
2571 * \brief numaFindLocForThreshold()
2572 *
2573 * \param[in] na input histogram
2574 * \param[in] skip look-ahead distance to avoid false minima;
2575 * use 0 for default
2576 * \param[out] pthresh threshold value
2577 * \param[out] pfract [optional] fraction below or at threshold
2578 * \return 0 if OK, 1 on error or if no threshold can be found
2579 *
2580 * <pre>
2581 * Notes:
2582 * (1) This finds a good place to set a threshold for a histogram
2583 * of values that has two peaks. The peaks can differ greatly
2584 * in area underneath them. The number of buckets in the
2585 * histogram is expected to be 256 (e.g., from an 8 bpp gray image).
2586 * (2) The input histogram should have been smoothed with a window
2587 * to avoid false peak and valley detection due to noise. For
2588 * example, see pixThresholdByHisto().
2589 * (3) A skip value can be input to determine the look-ahead distance
2590 * to ignore a false peak on the rise or descent from the first peak.
2591 * Input 0 to use the default value (it assumes a histo size of 256).
2592 * (4) Optionally, the fractional area under the first peak can
2593 * be returned.
2594 * </pre>
2595 */
2596 l_ok
2597 numaFindLocForThreshold(NUMA *na,
2598 l_int32 skip,
2599 l_int32 *pthresh,
2600 l_float32 *pfract)
2601 {
2602 l_int32 i, n, start, index, minloc, found;
2603 l_float32 val, pval, jval, minval, maxval, sum, partsum;
2604 l_float32 *fa;
2605
2606 if (pfract) *pfract = 0.0;
2607 if (!pthresh)
2608 return ERROR_INT("&thresh not defined", __func__, 1);
2609 *pthresh = 0;
2610 if (!na)
2611 return ERROR_INT("na not defined", __func__, 1);
2612 if (skip <= 0) skip = 20;
2613
2614 /* Test for constant value */
2615 numaGetMin(na, &minval, NULL);
2616 numaGetMax(na, &maxval, NULL);
2617 if (minval == maxval)
2618 return ERROR_INT("all array values are the same", __func__, 1);
2619
2620 /* Look for the top of the first peak */
2621 n = numaGetCount(na);
2622 if (n < 256)
2623 L_WARNING("array size %d < 256\n", __func__, n);
2624 fa = numaGetFArray(na, L_NOCOPY);
2625 pval = fa[0];
2626 for (i = 1; i < n; i++) {
2627 val = fa[i];
2628 index = L_MIN(i + skip, n - 1);
2629 jval = fa[index];
2630 if (val < pval && jval < pval) /* near the top if not there */
2631 break;
2632 pval = val;
2633 }
2634
2635 if (i > n - 5) /* just an increasing function */
2636 return ERROR_INT("top of first peak not found", __func__, 1);
2637
2638 /* Look for the low point in the valley */
2639 found = FALSE;
2640 start = i;
2641 pval = fa[start];
2642 for (i = start + 1; i < n; i++) {
2643 val = fa[i];
2644 if (val <= pval) { /* appears to be going down */
2645 pval = val;
2646 } else { /* appears to be going up */
2647 index = L_MIN(i + skip, n - 1);
2648 jval = fa[index]; /* jump ahead by 'skip' */
2649 if (val > jval) { /* still going down; jump ahead */
2650 pval = jval;
2651 i = index;
2652 } else { /* really going up; passed the min */
2653 found = TRUE;
2654 break;
2655 }
2656 }
2657 }
2658 if (!found)
2659 return ERROR_INT("no minimum found", __func__, 1);
2660
2661 /* Find the location of the minimum in the interval */
2662 minloc = index; /* likely passed the min; look backward */
2663 minval = fa[index];
2664 for (i = index - 1; i > index - skip; i--) {
2665 if (fa[i] < minval) {
2666 minval = fa[i];
2667 minloc = i;
2668 }
2669 }
2670
2671 /* Is the minimum very near the end of the array? */
2672 if (minloc > n - 10)
2673 return ERROR_INT("minimum at end of array; invalid", __func__, 1);
2674 *pthresh = minloc;
2675
2676 /* Find the fraction under the first peak */
2677 if (pfract) {
2678 numaGetSumOnInterval(na, 0, minloc, &partsum);
2679 numaGetSum(na, &sum);
2680 if (sum > 0.0)
2681 *pfract = partsum / sum;
2682 }
2683 return 0;
2684 }
2685
2686
2687 /*!
2688 * \brief numaCountReversals()
2689 *
2690 * \param[in] nas input values
2691 * \param[in] minreversal relative amount to resolve peaks and valleys
2692 * \param[out] pnr [optional] number of reversals
2693 * \param[out] prd [optional] reversal density: reversals/length
2694 * \return 0 if OK, 1 on error
2695 *
2696 * <pre>
2697 * Notes:
2698 * (1) The input numa can be generated from pixExtractAlongLine().
2699 * If so, the x parameters can be used to find the reversal
2700 * frequency along a line.
2701 * (2) If the input numa was generated from a 1 bpp pix, the
2702 * values will be 0 and 1. Use %minreversal == 1 to get
2703 * the number of pixel flips. If the only values are 0 and 1,
2704 * but %minreversal > 1, set the reversal count to 0 and
2705 * issue a warning.
2706 * </pre>
2707 */
2708 l_ok
2709 numaCountReversals(NUMA *nas,
2710 l_float32 minreversal,
2711 l_int32 *pnr,
2712 l_float32 *prd)
2713 {
2714 l_int32 i, n, nr, ival, binvals;
2715 l_int32 *ia;
2716 l_float32 fval, delx, len;
2717 NUMA *nat;
2718
2719 if (pnr) *pnr = 0;
2720 if (prd) *prd = 0.0;
2721 if (!pnr && !prd)
2722 return ERROR_INT("neither &nr nor &rd are defined", __func__, 1);
2723 if (!nas)
2724 return ERROR_INT("nas not defined", __func__, 1);
2725 if ((n = numaGetCount(nas)) == 0) {
2726 L_INFO("nas is empty\n", __func__);
2727 return 0;
2728 }
2729 if (minreversal < 0.0)
2730 return ERROR_INT("minreversal < 0", __func__, 1);
2731
2732 /* Decide if the only values are 0 and 1 */
2733 binvals = TRUE;
2734 for (i = 0; i < n; i++) {
2735 numaGetFValue(nas, i, &fval);
2736 if (fval != 0.0 && fval != 1.0) {
2737 binvals = FALSE;
2738 break;
2739 }
2740 }
2741
2742 nr = 0;
2743 if (binvals) {
2744 if (minreversal > 1.0) {
2745 L_WARNING("binary values but minreversal > 1\n", __func__);
2746 } else {
2747 ia = numaGetIArray(nas);
2748 ival = ia[0];
2749 for (i = 1; i < n; i++) {
2750 if (ia[i] != ival) {
2751 nr++;
2752 ival = ia[i];
2753 }
2754 }
2755 LEPT_FREE(ia);
2756 }
2757 } else {
2758 nat = numaFindExtrema(nas, minreversal, NULL);
2759 nr = numaGetCount(nat);
2760 numaDestroy(&nat);
2761 }
2762 if (pnr) *pnr = nr;
2763 if (prd) {
2764 numaGetParameters(nas, NULL, &delx);
2765 len = delx * n;
2766 *prd = (l_float32)nr / len;
2767 }
2768
2769 return 0;
2770 }
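
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleCountReversals is invented).  The binary sequence {0, 1, 1, 0, 0, 1},
 * such as pixel values scanned along a line of a 1 bpp pix, has three value
 * flips; with %minreversal = 1.0 the count is 3 and, since the default delx
 * of a numa is 1.0, the density is 3/6 = 0.5. */
static void
exampleCountReversals(void)
{
    l_int32    i, nr;
    l_int32    vals[6] = {0, 1, 1, 0, 0, 1};
    l_float32  rd;
    NUMA      *nas;

    nas = numaCreate(6);
    for (i = 0; i < 6; i++)
        numaAddNumber(nas, vals[i]);
    numaCountReversals(nas, 1.0, &nr, &rd);
    lept_stderr("reversals = %d, density = %4.2f\n", nr, rd);  /* 3, 0.50 */
    numaDestroy(&nas);
}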
2771
2772
2773 /*----------------------------------------------------------------------*
2774 * Threshold crossings and frequency analysis *
2775 *----------------------------------------------------------------------*/
2776 /*!
2777 * \brief numaSelectCrossingThreshold()
2778 *
2779 * \param[in] nax [optional] numa of abscissa values; can be NULL
2780 * \param[in] nay signal
2781 * \param[in] estthresh estimated pixel threshold for crossing:
2782 * e.g., for images, white <--> black; typ. ~120
2783 * \param[out] pbestthresh robust estimate of threshold to use
2784 * \return 0 if OK, 1 on error or warning
2785 *
2786 * <pre>
2787 * Notes:
2788 * (1) When a valid threshold is used, the number of crossings is
2789 * a maximum, because none are missed. If no threshold intersects
2790 * all the crossings, the crossings must be determined with
2791 * numaCrossingsByPeaks().
2792 * (2) %estthresh is an input estimate of the threshold that should
2793 * be used. We compute the crossings with 41 thresholds
2794 * (20 below and 20 above). There is a range in which the
2795 * number of crossings is a maximum. Return a threshold
2796 * in the center of this stable plateau of crossings.
2797 * This can then be used with numaCrossingsByThreshold()
2798 * to get a good estimate of crossing locations.
2799 * (3) If the count of nay is less than 2, a warning is issued.
2800 * </pre>
2801 */
2802 l_ok
2803 numaSelectCrossingThreshold(NUMA *nax,
2804 NUMA *nay,
2805 l_float32 estthresh,
2806 l_float32 *pbestthresh)
2807 {
2808 l_int32 i, inrun, istart, iend, maxstart, maxend, runlen, maxrunlen;
2809 l_int32 val, maxval, nmax, count;
2810 l_float32 thresh, fmaxval, fmodeval;
2811 NUMA *nat, *nac;
2812
2813 if (!pbestthresh)
2814 return ERROR_INT("&bestthresh not defined", __func__, 1);
2815 *pbestthresh = 0.0;
2816 if (!nay)
2817 return ERROR_INT("nay not defined", __func__, 1);
2818 if (numaGetCount(nay) < 2) {
2819 L_WARNING("nay count < 2; no threshold crossing\n", __func__);
2820 return 1;
2821 }
2822
2823 /* Compute the number of crossings for different thresholds */
2824 nat = numaCreate(41);
2825 for (i = 0; i < 41; i++) {
2826 thresh = estthresh - 80.0f + 4.0f * i;
2827 nac = numaCrossingsByThreshold(nax, nay, thresh);
2828 numaAddNumber(nat, numaGetCount(nac));
2829 numaDestroy(&nac);
2830 }
2831
2832 /* Find the center of the plateau of max crossings, which
2833 * extends from thresh[istart] to thresh[iend]. */
2834 numaGetMax(nat, &fmaxval, NULL);
2835 maxval = (l_int32)fmaxval;
2836 nmax = 0;
2837 for (i = 0; i < 41; i++) {
2838 numaGetIValue(nat, i, &val);
2839 if (val == maxval)
2840 nmax++;
2841 }
2842 if (nmax < 3) { /* likely accidental max; try the mode */
2843 numaGetMode(nat, &fmodeval, &count);
2844 if (count > nmax && fmodeval > 0.5 * fmaxval)
2845 maxval = (l_int32)fmodeval; /* use the mode */
2846 }
2847
2848 inrun = FALSE;
2849 iend = 40;
2850 maxrunlen = 0; maxstart = 0; maxend = 0;
2851 for (i = 0; i < 41; i++) {
2852 numaGetIValue(nat, i, &val);
2853 if (val == maxval) {
2854 if (!inrun) {
2855 istart = i;
2856 inrun = TRUE;
2857 }
2858 continue;
2859 }
2860 if (inrun && (val != maxval)) {
2861 iend = i - 1;
2862 runlen = iend - istart + 1;
2863 inrun = FALSE;
2864 if (runlen > maxrunlen) {
2865 maxstart = istart;
2866 maxend = iend;
2867 maxrunlen = runlen;
2868 }
2869 }
2870 }
2871 if (inrun) {
2872 runlen = i - istart;
2873 if (runlen > maxrunlen) {
2874 maxstart = istart;
2875 maxend = i - 1;
2876 maxrunlen = runlen;
2877 }
2878 }
2879
2880 *pbestthresh = estthresh - 80.0f + 2.0f * (l_float32)(maxstart + maxend);
2881
2882 #if DEBUG_CROSSINGS
2883 lept_stderr("\nCrossings attain a maximum at %d thresholds, between:\n"
2884 " thresh[%d] = %5.1f and thresh[%d] = %5.1f\n",
2885 nmax, maxstart, estthresh - 80.0 + 4.0 * maxstart,
2886 maxend, estthresh - 80.0 + 4.0 * maxend);
2887 lept_stderr("The best choice: %5.1f\n", *pbestthresh);
2888 lept_stderr("Number of crossings at the 41 thresholds:");
2889 numaWriteStderr(nat);
2890 #endif /* DEBUG_CROSSINGS */
2891
2892 numaDestroy(&nat);
2893 return 0;
2894 }
2895
2896
2897 /*!
2898 * \brief numaCrossingsByThreshold()
2899 *
2900 * \param[in] nax [optional] numa of abscissa values; can be NULL
2901 * \param[in] nay numa of ordinate values, corresponding to nax
2902 * \param[in] thresh threshold value for nay
2903 * \return nad abscissa pts at threshold, or NULL on error
2904 *
2905 * <pre>
2906 * Notes:
2907 * (1) If nax == NULL, we use startx and delx from nay to compute
2908 * the crossing values in nad.
2909 * </pre>
2910 */
2911 NUMA *
2912 numaCrossingsByThreshold(NUMA *nax,
2913 NUMA *nay,
2914 l_float32 thresh)
2915 {
2916 l_int32 i, n;
2917 l_float32 startx, delx;
2918 l_float32 xval1, xval2, yval1, yval2, delta1, delta2, crossval, fract;
2919 NUMA *nad;
2920
2921 if (!nay)
2922 return (NUMA *)ERROR_PTR("nay not defined", __func__, NULL);
2923 n = numaGetCount(nay);
2924
2925 if (nax && (numaGetCount(nax) != n))
2926 return (NUMA *)ERROR_PTR("nax and nay sizes differ", __func__, NULL);
2927
2928 nad = numaCreate(0);
2929 if (n < 2) return nad;
2930 numaGetFValue(nay, 0, &yval1);
2931 numaGetParameters(nay, &startx, &delx);
2932 if (nax)
2933 numaGetFValue(nax, 0, &xval1);
2934 else
2935 xval1 = startx;
2936 for (i = 1; i < n; i++) {
2937 numaGetFValue(nay, i, &yval2);
2938 if (nax)
2939 numaGetFValue(nax, i, &xval2);
2940 else
2941 xval2 = startx + i * delx;
2942 delta1 = yval1 - thresh;
2943 delta2 = yval2 - thresh;
2944 if (delta1 == 0.0) {
2945 numaAddNumber(nad, xval1);
2946 } else if (delta2 == 0.0) {
2947 numaAddNumber(nad, xval2);
2948 } else if (delta1 * delta2 < 0.0) { /* crossing */
2949 fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
2950 crossval = xval1 + fract * (xval2 - xval1);
2951 numaAddNumber(nad, crossval);
2952 }
2953 xval1 = xval2;
2954 yval1 = yval2;
2955 }
2956
2957 return nad;
2958 }
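
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleCrossingsByThreshold is invented).  With nax == NULL the abscissa
 * defaults to startx = 0.0 and delx = 1.0.  For nay = {0, 10} and
 * thresh = 4.0 the interpolated crossing is at
 *     x = x1 + (|y1 - thresh| / |y1 - y2|) * (x2 - x1) = 0 + 0.4 * 1 = 0.4 */
static void
exampleCrossingsByThreshold(void)
{
    l_float32  xcross;
    NUMA      *nay, *nad;

    nay = numaCreate(2);
    numaAddNumber(nay, 0);
    numaAddNumber(nay, 10);
    nad = numaCrossingsByThreshold(NULL, nay, 4.0);
    numaGetFValue(nad, 0, &xcross);
    lept_stderr("crossing at x = %4.2f\n", xcross);   /* 0.40 */
    numaDestroy(&nay);
    numaDestroy(&nad);
}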
2959
2960
2961 /*!
2962 * \brief numaCrossingsByPeaks()
2963 *
2964 * \param[in] nax [optional] numa of abscissa values
2965 * \param[in] nay numa of ordinate values, corresponding to nax
2966 * \param[in] delta parameter used to identify when a new peak can be found
2967 * \return nad abscissa pts at threshold, or NULL on error
2968 *
2969 * <pre>
2970 * Notes:
2971 * (1) If nax == NULL, we use startx and delx from nay to compute
2972 * the crossing values in nad.
2973 * </pre>
2974 */
2975 NUMA *
2976 numaCrossingsByPeaks(NUMA *nax,
2977 NUMA *nay,
2978 l_float32 delta)
2979 {
2980 l_int32 i, j, n, np, previndex, curindex;
2981 l_float32 startx, delx;
2982 l_float32 xval1, xval2, yval1, yval2, delta1, delta2;
2983 l_float32 prevval, curval, thresh, crossval, fract;
2984 NUMA *nap, *nad;
2985
2986 if (!nay)
2987 return (NUMA *)ERROR_PTR("nay not defined", __func__, NULL);
2988
2989 n = numaGetCount(nay);
2990 if (nax && (numaGetCount(nax) != n))
2991 return (NUMA *)ERROR_PTR("nax and nay sizes differ", __func__, NULL);
2992
2993 /* Find the extrema. Also add last point in nay to get
2994 * the last transition (from the last peak to the end).
2995 * The number of crossings is 1 more than the number of extrema. */
2996 nap = numaFindExtrema(nay, delta, NULL);
2997 numaAddNumber(nap, n - 1);
2998 np = numaGetCount(nap);
2999 L_INFO("Number of crossings: %d\n", __func__, np);
3000
3001 /* Do all computation in index units of nax or the delx of nay */
3002 nad = numaCreate(np); /* output crossing locations, in nax units */
3003 previndex = 0; /* prime the search with 1st point */
3004 numaGetFValue(nay, 0, &prevval); /* prime the search with 1st point */
3005 numaGetParameters(nay, &startx, &delx);
3006 for (i = 0; i < np; i++) {
3007 numaGetIValue(nap, i, &curindex);
3008 numaGetFValue(nay, curindex, &curval);
3009 thresh = (prevval + curval) / 2.0f;
3010 if (nax)
3011 numaGetFValue(nax, previndex, &xval1);
3012 else
3013 xval1 = startx + previndex * delx;
3014 numaGetFValue(nay, previndex, &yval1);
3015 for (j = previndex + 1; j <= curindex; j++) {
3016 if (nax)
3017 numaGetFValue(nax, j, &xval2);
3018 else
3019 xval2 = startx + j * delx;
3020 numaGetFValue(nay, j, &yval2);
3021 delta1 = yval1 - thresh;
3022 delta2 = yval2 - thresh;
3023 if (delta1 == 0.0) {
3024 numaAddNumber(nad, xval1);
3025 break;
3026 } else if (delta2 == 0.0) {
3027 numaAddNumber(nad, xval2);
3028 break;
3029 } else if (delta1 * delta2 < 0.0) { /* crossing */
3030 fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
3031 crossval = xval1 + fract * (xval2 - xval1);
3032 numaAddNumber(nad, crossval);
3033 break;
3034 }
3035 xval1 = xval2;
3036 yval1 = yval2;
3037 }
3038 previndex = curindex;
3039 prevval = curval;
3040 }
3041
3042 numaDestroy(&nap);
3043 return nad;
3044 }
3045
3046
3047 /*!
3048 * \brief numaEvalBestHaarParameters()
3049 *
3050 * \param[in] nas numa of non-negative signal values
3051 * \param[in] relweight relative weight of (-1 comb) / (+1 comb)
3052 * contributions to the 'convolution'. In effect,
3053 * the convolution kernel is a comb consisting of
3054 * alternating +1 and -weight.
3055 * \param[in] nwidth number of widths to consider
3056 * \param[in] nshift number of shifts to consider for each width
3057 * \param[in] minwidth smallest width to consider
3058 * \param[in] maxwidth largest width to consider
3059 * \param[out] pbestwidth width giving largest score
3060 * \param[out] pbestshift shift giving largest score
3061 * \param[out] pbestscore [optional] convolution with "Haar"-like comb
3062 * \return 0 if OK, 1 on error
3063 *
3064 * <pre>
3065 * Notes:
3066 * (1) This does a linear sweep of widths, evaluating at %nshift
3067 * shifts for each width, computing the score from a convolution
3068 * with a long comb, and finding the (width, shift) pair that
3069 * gives the maximum score. The best width is the "half-wavelength"
3070 * of the signal.
3071 * (2) The convolving function is a comb of alternating values
3072 * +1 and -1 * relweight, separated by the width and phased by
3073 * the shift. This is similar to a Haar transform, except
3074 * there the convolution is performed with a square wave.
3075 * (3) The function is useful for finding the line spacing
3076 * and strength of line signal from pixel sum projections.
3077 * (4) The score is normalized to the size of nas divided by
3078 * the number of half-widths. For image applications, the input is
3079 * typically an array of pixel projections, so one should
3080 * normalize by dividing the score by the image width in the
3081 * pixel projection direction.
3082 * </pre>
3083 */
3084 l_ok
3085 numaEvalBestHaarParameters(NUMA *nas,
3086 l_float32 relweight,
3087 l_int32 nwidth,
3088 l_int32 nshift,
3089 l_float32 minwidth,
3090 l_float32 maxwidth,
3091 l_float32 *pbestwidth,
3092 l_float32 *pbestshift,
3093 l_float32 *pbestscore)
3094 {
3095 l_int32 i, j;
3096 l_float32 delwidth, delshift, width, shift, score;
3097 l_float32 bestwidth, bestshift, bestscore;
3098
3099 if (pbestscore) *pbestscore = 0.0;
3100 if (pbestwidth) *pbestwidth = 0.0;
3101 if (pbestshift) *pbestshift = 0.0;
3102 if (!pbestwidth || !pbestshift)
3103 return ERROR_INT("&bestwidth and &bestshift not defined", __func__, 1);
3104 if (!nas)
3105 return ERROR_INT("nas not defined", __func__, 1);
3106
3107 bestscore = bestwidth = bestshift = 0.0;
3108 delwidth = (maxwidth - minwidth) / (nwidth - 1.0f);
3109 for (i = 0; i < nwidth; i++) {
3110 width = minwidth + delwidth * i;
3111 delshift = width / (l_float32)(nshift);
3112 for (j = 0; j < nshift; j++) {
3113 shift = j * delshift;
3114 numaEvalHaarSum(nas, width, shift, relweight, &score);
3115 if (score > bestscore) {
3116 bestscore = score;
3117 bestwidth = width;
3118 bestshift = shift;
3119 #if DEBUG_FREQUENCY
3120 lept_stderr("width = %7.3f, shift = %7.3f, score = %7.3f\n",
3121 width, shift, score);
3122 #endif /* DEBUG_FREQUENCY */
3123 }
3124 }
3125 }
3126
3127 *pbestwidth = bestwidth;
3128 *pbestshift = bestshift;
3129 if (pbestscore)
3130 *pbestscore = bestscore;
3131 return 0;
3132 }
3133
3134
3135 /*!
3136 * \brief numaEvalHaarSum()
3137 *
3138 * \param[in] nas numa of non-negative signal values
3139 * \param[in] width distance between +1 and -1 in convolution comb
3140 * \param[in] shift phase of the comb: location of first +1
3141 * \param[in] relweight relative weight of (-1 comb) / (+1 comb)
3142 * contributions to the 'convolution'. In effect,
3143 * the convolution kernel is a comb consisting of
3144 * alternating +1 and -weight.
3145 * \param[out] pscore convolution with "Haar"-like comb
3146 * \return 0 if OK, 1 on error
3147 *
3148 * <pre>
3149 * Notes:
3150 * (1) This does a convolution with a comb of alternating values
3151 * +1 and -relweight, separated by the width and phased by the shift.
3152 * This is similar to a Haar transform, except that for Haar,
3153 * (1) the convolution kernel is symmetric about 0, so the
3154 * relweight is 1.0, and
3155 * (2) the convolution is performed with a square wave.
3156 * (2) The score is normalized to the size of nas divided by
3157 * twice the "width". For image applications, the input is
3158 * typically an array of pixel projections, so one should
3159 * normalize by dividing the score by the image width in the
3160 * pixel projection direction.
3161 * (3) To get a Haar-like result, use relweight = 1.0. For detecting
3162 * signals where you expect every other sample to be close to
3163 * zero, as with barcodes or filtered text lines, you can
3164 * use relweight > 1.0.
3165 * </pre>
3166 */
3167 l_ok
3168 numaEvalHaarSum(NUMA *nas,
3169 l_float32 width,
3170 l_float32 shift,
3171 l_float32 relweight,
3172 l_float32 *pscore)
3173 {
3174 l_int32 i, n, nsamp, index;
3175 l_float32 score, weight, val;
3176
3177 if (!pscore)
3178 return ERROR_INT("&score not defined", __func__, 1);
3179 *pscore = 0.0;
3180 if (!nas)
3181 return ERROR_INT("nas not defined", __func__, 1);
3182 if ((n = numaGetCount(nas)) < 2 * width)
3183 return ERROR_INT("nas size too small", __func__, 1);
3184
3185 score = 0.0;
3186 nsamp = (l_int32)((n - shift) / width);
3187 for (i = 0; i < nsamp; i++) {
3188 index = (l_int32)(shift + i * width);
3189 weight = (i % 2) ? 1.0f : -1.0f * relweight;
3190 numaGetFValue(nas, index, &val);
3191 score += weight * val;
3192 }
3193
3194 *pscore = 2.0f * width * score / (l_float32)n;
3195 return 0;
3196 }
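
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleHaarSum is invented).  For the strictly alternating signal
 * 0,1,0,1,... of length 8, a comb with width = 1.0, shift = 0.0 and
 * relweight = 1.0 places -1 on every 0 and +1 on every 1, so the raw sum
 * is 4 and the normalized score is 2 * width * 4 / n = 2 * 1 * 4 / 8 = 1.0. */
static void
exampleHaarSum(void)
{
    l_int32    i;
    l_float32  score;
    NUMA      *nas;

    nas = numaCreate(8);
    for (i = 0; i < 8; i++)
        numaAddNumber(nas, i % 2);   /* 0,1,0,1,0,1,0,1 */
    numaEvalHaarSum(nas, 1.0, 0.0, 1.0, &score);
    lept_stderr("Haar score = %4.2f\n", score);   /* 1.00 */
    numaDestroy(&nas);
}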
3197
3198
3199 /*----------------------------------------------------------------------*
3200 * Generating numbers in a range under constraints *
3201 *----------------------------------------------------------------------*/
3202 /*!
3203 * \brief genConstrainedNumaInRange()
3204 *
3205 * \param[in] first first number to choose; >= 0
3206 * \param[in] last biggest possible number to reach; >= first
3207 * \param[in] nmax maximum number of numbers to select; > 0
3208 * \param[in] use_pairs 1 = select pairs of adjacent numbers;
3209 * 0 = select individual numbers
3210 * \return na of selected numbers, or NULL on error
3211 *
3212 * <pre>
3213 * Notes:
3214 * (1) Selection is made uniformly in the range. This can be used
3215 * to select pages distributed as uniformly as possible
3216 * through a book, where you are constrained to:
3217 * ~ choose between [first, ... biggest],
3218 * ~ choose no more than nmax numbers,
3219 * and you have the option of requiring pairs of adjacent numbers.
3220 * </pre>
3221 */
3222 NUMA *
3223 genConstrainedNumaInRange(l_int32 first,
3224 l_int32 last,
3225 l_int32 nmax,
3226 l_int32 use_pairs)
3227 {
3228 l_int32 i, nsets, val;
3229 l_float32 delta;
3230 NUMA *na;
3231
3232 first = L_MAX(0, first);
3233 if (last < first)
3234 return (NUMA *)ERROR_PTR("last < first!", __func__, NULL);
3235 if (nmax < 1)
3236 return (NUMA *)ERROR_PTR("nmax < 1!", __func__, NULL);
3237
3238 nsets = L_MIN(nmax, last - first + 1);
3239 if (use_pairs == 1)
3240 nsets = nsets / 2;
3241 if (nsets == 0)
3242 return (NUMA *)ERROR_PTR("nsets == 0", __func__, NULL);
3243
3244 /* Select delta so that selection covers the full range if possible */
3245 if (nsets == 1) {
3246 delta = 0.0;
3247 } else {
3248 if (use_pairs == 0)
3249 delta = (l_float32)(last - first) / (nsets - 1);
3250 else
3251 delta = (l_float32)(last - first - 1) / (nsets - 1);
3252 }
3253
3254 na = numaCreate(nsets);
3255 for (i = 0; i < nsets; i++) {
3256 val = (l_int32)(first + i * delta + 0.5);
3257 numaAddNumber(na, val);
3258 if (use_pairs == 1)
3259 numaAddNumber(na, val + 1);
3260 }
3261
3262 return na;
3263 }
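
/* Illustrative usage sketch (not part of the upstream file; the helper name
 * exampleConstrainedRange is invented).  Selecting at most 5 individual
 * pages, as uniformly as possible, from a book with pages 1 ... 100 gives
 * delta = 99/4 = 24.75 and the selected pages {1, 26, 51, 75, 100}. */
static void
exampleConstrainedRange(void)
{
    l_int32  i, page;
    NUMA    *na;

    na = genConstrainedNumaInRange(1, 100, 5, 0);
    for (i = 0; i < numaGetCount(na); i++) {
        numaGetIValue(na, i, &page);
        lept_stderr("selected page %d\n", page);
    }
    numaDestroy(&na);
}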