diff mupdf-source/thirdparty/tesseract/src/classify/cluster.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer carries a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
```diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/classify/cluster.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,3352 @@
```

```cpp
/******************************************************************************
 ** Filename: cluster.cpp
 ** Purpose:  Routines for clustering points in N-D space
 ** Author:   Dan Johnson
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *****************************************************************************/

#define _USE_MATH_DEFINES // for M_PI

#include "cluster.h"

#include "genericheap.h"
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"

#include "helpers.h"

#include <cfloat> // for FLT_MAX
#include <cmath>  // for M_PI
#include <vector> // for std::vector

namespace tesseract {

#define HOTELLING 1  // If true use Hotelling's test to decide where to split.
#define FTABLE_X 10  // Size of FTable.
#define FTABLE_Y 100 // Size of FTable.

// Table of values approximating the cumulative F-distribution for a confidence
// of 1%.
const double FTable[FTABLE_Y][FTABLE_X] = {
    {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85},
    {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399},
    {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229},
    {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546},
    {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051},
    {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874},
    {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620},
    {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814},
    {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257},
    {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849},
    {9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539},
    {9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296},
    {9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100},
    {8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939},
    {8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805},
    {8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691},
    {8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593},
    {8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508},
    {8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434},
    {8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368},
    {8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310},
    {7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258},
    {7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211},
    {7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168},
    {7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129},
    {7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094},
    {7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062},
    {7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032},
    {7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005},
    {7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979},
    {7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955},
    {7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934},
    {7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913},
    {7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894},
    {7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876},
    {7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859},
    {7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843},
    {7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828},
    {7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814},
    {7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801},
    {7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788},
    {7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776},
    {7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764},
    {7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754},
    {7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743},
    {7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733},
    {7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724},
    {7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715},
    {7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706},
    {7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698},
    {7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690},
    {7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683},
    {7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675},
    {7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668},
    {7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662},
    {7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655},
    {7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649},
    {7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643},
    {7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637},
    {7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632},
    {7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626},
    {7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621},
    {7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616},
    {7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611},
    {7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607},
    {7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602},
    {7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598},
    {7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593},
    {7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589},
    {7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585},
    {7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581},
    {7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578},
    {6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574},
    {6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570},
    {6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567},
    {6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563},
    {6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560},
    {6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557},
    {6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554},
    {6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551},
    {6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548},
    {6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545},
    {6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542},
    {6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539},
    {6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537},
    {6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534},
    {6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532},
    {6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529},
    {6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527},
    {6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524},
    {6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522},
    {6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520},
    {6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518},
    {6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515},
    {6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513},
    {6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511},
    {6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509},
    {6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507},
    {6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505},
    {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};

/** define the variance which will be used as a minimum variance for any
    dimension of any feature. Since most features are calculated from numbers
    with a precision no better than 1 in 128, the variance should never be
    less than the square of this number for parameters whose range is 1. */
#define MINVARIANCE 0.0004

/** define the absolute minimum number of samples which must be present in
    order to accurately test hypotheses about underlying probability
    distributions. Define separately the minimum samples that are needed
    before a statistical analysis is attempted; this number should be
    equal to MINSAMPLES but can be set to a lower number for early testing
    when very few samples are available. */
#define MINSAMPLESPERBUCKET 5
#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
#define MINSAMPLESNEEDED 1

/** define the size of the table which maps normalized samples to
    histogram buckets. Also define the number of standard deviations
    in a normal distribution which are considered to be significant.
    The mapping table will be defined in such a way that it covers
    the specified number of standard deviations on either side of
    the mean. BUCKETTABLESIZE should always be even. */
#define BUCKETTABLESIZE 1024
#define NORMALEXTENT 3.0

struct TEMPCLUSTER {
  CLUSTER *Cluster;
  CLUSTER *Neighbor;
};

using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER *>;
using ClusterHeap = tesseract::GenericHeap<ClusterPair>;

struct STATISTICS {
  STATISTICS(size_t n) : CoVariance(n * n), Min(n), Max(n) {
  }
  float AvgVariance = 1.0f;
  std::vector<float> CoVariance;
  std::vector<float> Min; // largest negative distance from the mean
  std::vector<float> Max; // largest positive distance from the mean
};

struct BUCKETS {
  BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) {
  }
  ~BUCKETS() {
  }
  DISTRIBUTION Distribution = normal; // distribution being tested for
  uint32_t SampleCount = 0;           // # of samples in histogram
  double Confidence = 0.0;            // confidence level of test
  double ChiSquared = 0.0;            // test threshold
  uint16_t NumberOfBuckets;           // number of cells in histogram
  uint16_t Bucket[BUCKETTABLESIZE];   // mapping to histogram buckets
  std::vector<uint32_t> Count;        // frequency of occurrence histogram
  std::vector<float> ExpectedCount;   // expected histogram
};
struct CHISTRUCT {
  /// This constructor allocates a new data structure which is used
  /// to hold a chi-squared value along with its associated
  /// number of degrees of freedom and alpha value.
  ///
  /// @param degreesOfFreedom degrees of freedom for new chi value
  /// @param alpha confidence level for new chi value
  CHISTRUCT(uint16_t degreesOfFreedom, double alpha) : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) {
  }
  uint16_t DegreesOfFreedom = 0;
  double Alpha = 0.0;
  double ChiSquared = 0.0;
};

// For use with KDWalk / MakePotentialClusters
struct ClusteringContext {
  ClusterHeap *heap;       // heap used to hold temp clusters, "best" on top
  TEMPCLUSTER *candidates; // array of potential clusters
  KDTREE *tree;            // kd-tree to be searched for neighbors
  int32_t next;            // next candidate to be used
};

using DENSITYFUNC = double (*)(int32_t);
using SOLVEFUNC = double (*)(CHISTRUCT *, double);

#define Odd(N) ((N) % 2)
#define Mirror(N, R) ((R) - (N)-1)
#define Abs(N) (((N) < 0) ? (-(N)) : (N))

//--------------Global Data Definitions and Declarations----------------------
/** the following variables describe a discrete normal distribution
    which is used by NormalDensity() and NormalBucket(). The
    constant NORMALEXTENT determines how many standard
    deviations of the distribution are mapped onto the fixed
    discrete range of x. x=0 is mapped to -NORMALEXTENT standard
    deviations and x=BUCKETTABLESIZE is mapped to
    +NORMALEXTENT standard deviations. */
#define SqrtOf2Pi 2.506628275
static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
static const double kNormalVariance =
    (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
static const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
static const double kNormalMean = BUCKETTABLESIZE / 2;
```
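These constants pin down the discrete normal: table index 0 sits at -NORMALEXTENT standard deviations from the mean and index BUCKETTABLESIZE at +NORMALEXTENT. NormalBucket() is only prototyped further down, so the following is a sketch of the mapping the constants imply, not the upstream body; the names are hypothetical.

```cpp
#include <algorithm> // for std::clamp (C++17)

constexpr int kBucketTableSize = 1024;  // mirrors BUCKETTABLESIZE
constexpr double kExtent = 3.0;         // mirrors NORMALEXTENT
constexpr double kTableStdDev = kBucketTableSize / (2.0 * kExtent); // kNormalStdDev
constexpr double kTableMean = kBucketTableSize / 2.0;               // kNormalMean

// Map a raw sample x onto the fixed discrete range: the cluster mean lands in
// the middle of the table and +/- kExtent standard deviations land at the ends.
inline int TableIndex(float x, float mean, float stddev) {
  double i = kTableMean + kTableStdDev * (x - mean) / stddev;
  return std::clamp(static_cast<int>(i), 0, kBucketTableSize - 1);
}
```

The resulting index would then be pushed through the BUCKETS::Bucket mapping table to find the histogram cell.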
```cpp
/** define lookup tables used to compute the number of histogram buckets
    that should be used for a given number of samples. */
#define LOOKUPTABLESIZE 8
#define MAXDEGREESOFFREEDOM MAXBUCKETS

static const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200,  400,  600,
                                                      800,        1000, 1500, 2000}; // number of samples

static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {
    MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets
```
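The two tables pair sample counts with bucket counts. OptimumNumberOfBuckets() is only declared below, so one plausible reading, sketched here with a hypothetical helper rather than the upstream body, is a clipped linear interpolation between adjacent table entries:

```cpp
#include <cstdint>

// Hypothetical sketch: pick a bucket count for n samples by interpolating
// between neighboring (sample count, bucket count) pairs, clipping at the ends.
inline uint16_t BucketsForSamples(uint32_t n, const uint32_t counts[],
                                  const uint16_t buckets[], int size) {
  if (n <= counts[0]) {
    return buckets[0]; // fewer samples than the smallest entry
  }
  for (int i = 1; i < size; ++i) {
    if (n <= counts[i]) {
      double t = static_cast<double>(n - counts[i - 1]) / (counts[i] - counts[i - 1]);
      return static_cast<uint16_t>(buckets[i - 1] + t * (buckets[i] - buckets[i - 1]));
    }
  }
  return buckets[size - 1]; // more samples than the largest entry
}
```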
```cpp
/*-------------------------------------------------------------------------
          Private Function Prototypes
--------------------------------------------------------------------------*/
static void CreateClusterTree(CLUSTERER *Clusterer);

static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);

static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance);

static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);

static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);

static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster);

static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
                                      PROTOSTYLE Style, int32_t MinSamples);

static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics);

static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                     BUCKETS *Buckets);

static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
                                      STATISTICS *Statistics, BUCKETS *Buckets);

static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence);

static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);

static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);

static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);

static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);

static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence);

static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
                           double Confidence);

static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence);

static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);

static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);

static double NormalDensity(int32_t x);

static double UniformDensity(int32_t x);

static double Integral(double f1, double f2, double Dx);

static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev);

static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static bool DistributionOK(BUCKETS *Buckets);

static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);

static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);

static void InitBuckets(BUCKETS *Buckets);

static int AlphaMatch(void *arg1,  // CHISTRUCT *ChiStruct,
                      void *arg2); // CHISTRUCT *SearchKey);

static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy);

static double ChiArea(CHISTRUCT *ChiParams, double x);

static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal);

static double InvertMatrix(const float *input, int size, float *inv);

//--------------------------Public Code--------------------------------------
/**
 * This routine creates a new clusterer data structure,
 * initializes it, and returns a pointer to it.
 *
 * @param SampleSize number of dimensions in feature space
 * @param ParamDesc description of each dimension
 * @return pointer to the new clusterer data structure
 */
CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) {
  int i;

  // allocate main clusterer data structure and init simple fields
  auto Clusterer = new CLUSTERER;
  Clusterer->SampleSize = SampleSize;
  Clusterer->NumberOfSamples = 0;
  Clusterer->NumChar = 0;

  // init fields which will not be used initially
  Clusterer->Root = nullptr;
  Clusterer->ProtoList = NIL_LIST;

  // maintain a copy of param descriptors in the clusterer data structure
  Clusterer->ParamDesc = new PARAM_DESC[SampleSize];
  for (i = 0; i < SampleSize; i++) {
    Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
    Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
    Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
    Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
    Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
    Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
    Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
  }

  // allocate a kd tree to hold the samples
  Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc);

  // Initialize cache of histogram buckets to minimize recomputing them.
  for (auto &d : Clusterer->bucket_cache) {
    for (auto &c : d) {
      c = nullptr;
    }
  }

  return Clusterer;
} // MakeClusterer
```
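MakeClusterer() is the entry point of the public API this file exports. A hypothetical caller would chain the routines roughly as below; the PARAM_DESC and CLUSTERCONFIG field values are illustrative assumptions, not values taken from Tesseract's training code.

```cpp
// Sketch of a caller in a translation unit that includes cluster.h.
void ClusterTwoDimensionalFeatures(const float features[][2], int num_features) {
  PARAM_DESC desc[2];
  for (auto &d : desc) {
    d.Circular = false; // ordinary linear dimension (assumed)
    d.NonEssential = false;
    d.Min = 0.0f;
    d.Max = 1.0f;
  }
  CLUSTERER *clusterer = MakeClusterer(2, desc);
  for (int i = 0; i < num_features; ++i) {
    MakeSample(clusterer, features[i], /*CharID=*/i); // one char per sample here
  }
  CLUSTERCONFIG config;           // all values below are assumptions
  config.ProtoStyle = elliptical;
  config.MinSamples = 0.5f;       // fraction of NumChar, per MakePrototype()
  config.MaxIllegal = 0.05f;      // cap on same-character samples per cluster
  config.Independence = 0.75f;
  config.Confidence = 0.05;       // chi-squared confidence for the fit tests
  config.MagicSamples = 10;
  LIST protos = ClusterSamples(clusterer, &config);
  // ... consume the prototype list ...
  FreeProtoList(&protos); // FreeClusterer() deliberately leaves this alive
  FreeClusterer(clusterer);
}
```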
```cpp
/**
 * This routine creates a new sample data structure to hold
 * the specified feature. This sample is added to the clusterer
 * data structure (so that it knows which samples are to be
 * clustered later), and a pointer to the sample is returned to
 * the caller.
 *
 * @param Clusterer clusterer data structure to add sample to
 * @param Feature feature to be added to clusterer
 * @param CharID unique ident. of char that sample came from
 *
 * @return Pointer to the new sample data structure
 */
SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) {
  int i;

  // see if the samples have already been clustered - if so trap an error
  // Can't add samples after they have been clustered.
  ASSERT_HOST(Clusterer->Root == nullptr);

  // allocate the new sample and initialize it
  auto Sample = new SAMPLE(Clusterer->SampleSize);
  Sample->Clustered = false;
  Sample->Prototype = false;
  Sample->SampleCount = 1;
  Sample->Left = nullptr;
  Sample->Right = nullptr;
  Sample->CharID = CharID;

  for (i = 0; i < Clusterer->SampleSize; i++) {
    Sample->Mean[i] = Feature[i];
  }

  // add the sample to the KD tree - keep track of the total # of samples
  Clusterer->NumberOfSamples++;
  KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample);
  if (CharID >= Clusterer->NumChar) {
    Clusterer->NumChar = CharID + 1;
  }

  // execute hook for monitoring clustering operation
  // (*SampleCreationHook)(Sample);

  return (Sample);
} // MakeSample

/**
 * This routine first checks to see if the samples in this
 * clusterer have already been clustered before; if so, it does
 * not bother to recreate the cluster tree. It simply recomputes
 * the prototypes based on the new Config info.
 *
 * If the samples have not been clustered before, the
 * samples in the KD tree are formed into a cluster tree and then
 * the prototypes are computed from the cluster tree.
 *
 * In either case this routine returns a pointer to a
 * list of prototypes that best represent the samples given
 * the constraints specified in Config.
 *
 * @param Clusterer data struct containing samples to be clustered
 * @param Config parameters which control clustering process
 *
 * @return Pointer to a list of prototypes
 */
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  // only create cluster tree if samples have never been clustered before
  if (Clusterer->Root == nullptr) {
    CreateClusterTree(Clusterer);
  }

  // deallocate the old prototype list if one exists
  FreeProtoList(&Clusterer->ProtoList);
  Clusterer->ProtoList = NIL_LIST;

  // compute prototypes starting at the root node in the tree
  ComputePrototypes(Clusterer, Config);
  // We don't need the cluster pointers in the protos any more, so null them
  // out, which makes it safe to delete the clusterer.
  LIST proto_list = Clusterer->ProtoList;
  iterate(proto_list) {
    auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
    proto->Cluster = nullptr;
  }
  return Clusterer->ProtoList;
} // ClusterSamples

/**
 * This routine frees all of the memory allocated to the
 * specified data structure. It will not, however, free
 * the memory used by the prototype list. The pointers to
 * the clusters for each prototype in the list will be set
 * to nullptr to indicate that the cluster data structures no
 * longer exist. Any sample lists that have been obtained
 * via calls to GetSamples are no longer valid.
 * @param Clusterer pointer to data structure to be freed
 */
void FreeClusterer(CLUSTERER *Clusterer) {
  if (Clusterer != nullptr) {
    delete[] Clusterer->ParamDesc;
    delete Clusterer->KDTree;
    delete Clusterer->Root;
    // Free up all used buckets structures.
    for (auto &d : Clusterer->bucket_cache) {
      for (auto &c : d) {
        delete c;
      }
    }

    delete Clusterer;
  }
} // FreeClusterer
/**
 * This routine frees all of the memory allocated to the
 * specified list of prototypes. The clusters which are
 * pointed to by the prototypes are not freed.
 * @param ProtoList pointer to list of prototypes to be freed
 */
void FreeProtoList(LIST *ProtoList) {
  destroy_nodes(*ProtoList, FreePrototype);
} // FreeProtoList

/**
 * This routine deallocates the memory consumed by the specified
 * prototype and modifies the corresponding cluster so that it
 * is no longer marked as a prototype. The cluster is NOT
 * deallocated by this routine.
 * @param arg prototype data structure to be deallocated
 */
void FreePrototype(void *arg) { // PROTOTYPE *Prototype)
  auto *Prototype = static_cast<PROTOTYPE *>(arg);

  // unmark the corresponding cluster (if there is one)
  if (Prototype->Cluster != nullptr) {
    Prototype->Cluster->Prototype = false;
  }

  // deallocate the prototype statistics and then the prototype itself
  if (Prototype->Style != spherical) {
    delete[] Prototype->Variance.Elliptical;
    delete[] Prototype->Magnitude.Elliptical;
    delete[] Prototype->Weight.Elliptical;
  }
  delete Prototype;
} // FreePrototype

/**
 * This routine is used to find all of the samples which
 * belong to a cluster. It starts by removing the top
 * cluster on the cluster list (SearchState). If this cluster is
 * a leaf it is returned. Otherwise, the right subcluster
 * is pushed on the list and we continue the search in the
 * left subcluster. This continues until a leaf is found.
 * If all samples have been found, nullptr is returned.
 * InitSampleSearch() must be called
 * before NextSample() to initialize the search.
 * @param SearchState ptr to list containing clusters to be searched
 * @return Pointer to the next leaf cluster (sample) or nullptr.
 */
CLUSTER *NextSample(LIST *SearchState) {
  CLUSTER *Cluster;

  if (*SearchState == NIL_LIST) {
    return (nullptr);
  }
  Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node());
  *SearchState = pop(*SearchState);
  for (;;) {
    if (Cluster->Left == nullptr) {
      return (Cluster);
    }
    *SearchState = push(*SearchState, Cluster->Right);
    Cluster = Cluster->Left;
  }
} // NextSample
```
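NextSample() is the consumer half of an explicit-stack tree walk; InitSampleSearch() (a macro from cluster.h) seeds the stack. A minimal sketch of the pattern, the same one ComputeStatistics() uses later in this file:

```cpp
// Walk every leaf (individual sample) under `cluster` and sum one dimension.
float SumFirstDimension(CLUSTER *cluster) {
  LIST search_state;
  InitSampleSearch(search_state, cluster); // must precede NextSample()
  float sum = 0.0f;
  SAMPLE *sample;
  while ((sample = NextSample(&search_state)) != nullptr) {
    sum += sample->Mean[0]; // each returned node is a leaf sample
  }
  return sum;
}
```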
```cpp
/**
 * This routine returns the mean of the specified
 * prototype in the indicated dimension.
 * @param Proto prototype to return mean of
 * @param Dimension dimension whose mean is to be returned
 * @return Mean of Prototype in Dimension
 */
float Mean(PROTOTYPE *Proto, uint16_t Dimension) {
  return (Proto->Mean[Dimension]);
} // Mean

/**
 * This routine returns the standard deviation of the
 * prototype in the indicated dimension.
 * @param Proto prototype to return standard deviation of
 * @param Dimension dimension whose stddev is to be returned
 * @return Standard deviation of Prototype in Dimension
 */
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {
  switch (Proto->Style) {
    case spherical:
      return std::sqrt(Proto->Variance.Spherical);
    case elliptical:
      return std::sqrt(Proto->Variance.Elliptical[Dimension]);
    case mixed:
      switch (Proto->Distrib[Dimension]) {
        case normal:
          return std::sqrt(Proto->Variance.Elliptical[Dimension]);
        case uniform:
        case D_random:
          return Proto->Variance.Elliptical[Dimension];
        case DISTRIBUTION_COUNT:
          ASSERT_HOST(!"Distribution count not allowed!");
      }
  }
  return 0.0f;
} // StandardDeviation

/*---------------------------------------------------------------------------
            Private Code
----------------------------------------------------------------------------*/
/**
 * This routine performs a bottom-up clustering on the samples
 * held in the kd-tree of the Clusterer data structure. The
 * result is a cluster tree. Each node in the tree represents
 * a cluster which conceptually contains a subset of the samples.
 * More precisely, the cluster contains all of the samples which
 * are contained in its two sub-clusters. The leaves of the
 * tree are the individual samples themselves; they have no
 * sub-clusters. The root node of the tree conceptually contains
 * all of the samples.
 * The Clusterer data structure is changed.
 * @param Clusterer data structure holding samples to be clustered
 */
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    TEMPCLUSTER *PotentialCluster = HeapEntry.data();

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    else {
      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = static_cast<CLUSTER *>(RootOf(Clusterer->KDTree));

  // free up the memory used by the K-D tree, heap, and temp clusters
  delete context.tree;
  Clusterer->KDTree = nullptr;
  delete context.heap;
  delete[] context.candidates;
} // CreateClusterTree
```
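The loop above never updates heap entries in place: an entry may name a neighbor that has since been merged away, and that staleness is only detected, and repaired by re-pushing, once the entry reaches the top. A self-contained sketch of the same lazy-repair pattern, an assumed simplification rather than Tesseract code:

```cpp
#include <functional>
#include <queue>
#include <vector>

struct Candidate {
  float dist;   // distance to the recorded nearest neighbor
  int self;     // cluster proposing the merge
  int neighbor; // its nearest neighbor at push time
  bool operator>(const Candidate &o) const { return dist > o.dist; }
};

using MinHeap = std::priority_queue<Candidate, std::vector<Candidate>, std::greater<Candidate>>;

void Agglomerate(MinHeap &heap, std::vector<bool> &merged,
                 const std::function<int(int, float *)> &nearest) {
  while (!heap.empty()) {
    Candidate c = heap.top();
    heap.pop();
    if (merged[c.self]) {
      continue; // stale: the proposing cluster was itself merged away
    }
    if (merged[c.neighbor]) {
      // stale neighbor: recompute and re-push instead of discarding c.self
      c.neighbor = nearest(c.self, &c.dist);
      if (c.neighbor >= 0) {
        heap.push(c);
      }
      continue;
    }
    // both live: consume them and (not shown) push the merged cluster
    merged[c.self] = merged[c.neighbor] = true;
  }
}
```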
```cpp
/**
 * This routine is designed to be used in concert with the
 * KDWalk routine. It will create a potential cluster for
 * each sample in the kd-tree that is being walked. This
 * potential cluster will then be pushed on the heap.
 * @param context ClusteringContext (see definition above)
 * @param Cluster current cluster being visited in kd-tree walk
 * @param Level level of this cluster in the kd-tree
 */
static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) {
  ClusterPair HeapEntry;
  int next = context->next;
  context->candidates[next].Cluster = Cluster;
  HeapEntry.data() = &(context->candidates[next]);
  context->candidates[next].Neighbor =
      FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());
  if (context->candidates[next].Neighbor != nullptr) {
    context->heap->Push(&HeapEntry);
    context->next++;
  }
} // MakePotentialClusters

/**
 * This routine searches the specified kd-tree for the nearest
 * neighbor of the specified cluster. It actually uses the
 * kd routines to find the 2 nearest neighbors since one of them
 * will be the original cluster. A pointer to the nearest
 * neighbor is returned, if it can be found, otherwise nullptr is
 * returned. The distance between the 2 nodes is placed
 * in the specified variable.
 * @param Tree kd-tree to search in for nearest neighbor
 * @param Cluster cluster whose nearest neighbor is to be found
 * @param Distance ptr to variable to report distance found
 * @return Pointer to the nearest neighbor of Cluster, or nullptr
 */
static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
#define MAXNEIGHBORS 2
#define MAXDISTANCE FLT_MAX
{
  CLUSTER *Neighbor[MAXNEIGHBORS];
  float Dist[MAXNEIGHBORS];
  int NumberOfNeighbors;
  int32_t i;
  CLUSTER *BestNeighbor;

  // find the 2 nearest neighbors of the cluster
  KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors,
                          reinterpret_cast<void **>(Neighbor), Dist);

  // search for the nearest neighbor that is not the cluster itself
  *Distance = MAXDISTANCE;
  BestNeighbor = nullptr;
  for (i = 0; i < NumberOfNeighbors; i++) {
    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
      *Distance = Dist[i];
      BestNeighbor = Neighbor[i];
    }
  }
  return BestNeighbor;
} // FindNearestNeighbor
/**
 * This routine creates a new permanent cluster from the
 * clusters specified in TempCluster. The 2 clusters in
 * TempCluster are marked as "clustered" and deleted from
 * the kd-tree. The new cluster is then added to the kd-tree.
 * @param Clusterer current clustering environment
 * @param TempCluster potential cluster to make permanent
 * @return Pointer to the new permanent cluster
 */
static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
  // allocate the new cluster and initialize it
  auto Cluster = new CLUSTER(Clusterer->SampleSize);
  Cluster->Clustered = false;
  Cluster->Prototype = false;
  Cluster->Left = TempCluster->Cluster;
  Cluster->Right = TempCluster->Neighbor;
  Cluster->CharID = -1;

  // mark the old clusters as "clustered" and delete them from the kd-tree
  Cluster->Left->Clustered = true;
  Cluster->Right->Clustered = true;
  KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
  KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);

  // compute the mean and sample count for the new cluster
  Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
                                       Cluster->Left->SampleCount, Cluster->Right->SampleCount,
                                       &Cluster->Mean[0], &Cluster->Left->Mean[0],
                                       &Cluster->Right->Mean[0]);

  // add the new cluster to the KD tree
  KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
  return Cluster;
} // MakeNewCluster

/**
 * This routine merges two clusters into one larger cluster.
 * To do this it computes the number of samples in the new
 * cluster and the mean of the new cluster. The ParamDesc
 * information is used to ensure that circular dimensions
 * are handled correctly.
 * @param N # of dimensions (size of arrays)
 * @param ParamDesc array of dimension descriptions
 * @param n1, n2 number of samples in each old cluster
 * @param m array to hold mean of new cluster
 * @param m1, m2 arrays containing means of old clusters
 * @return The number of samples in the new cluster.
 */
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],
                      float m1[], float m2[]) {
  int32_t i, n;

  n = n1 + n2;
  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
    if (ParamDesc->Circular) {
      // if distance between means is greater than allowed
      // reduce upper point by one "rotation" to compute mean
      // then normalize the mean back into the accepted range
      if ((*m2 - *m1) > ParamDesc->HalfRange) {
        *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
        if (*m < ParamDesc->Min) {
          *m += ParamDesc->Range;
        }
      } else if ((*m1 - *m2) > ParamDesc->HalfRange) {
        *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
        if (*m < ParamDesc->Min) {
          *m += ParamDesc->Range;
        }
      } else {
        *m = (n1 * *m1 + n2 * *m2) / n;
      }
    } else {
      *m = (n1 * *m1 + n2 * *m2) / n;
    }
  }
  return n;
} // MergeClusters
```
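A worked instance of the circular branch: with a single angular dimension of range [0, 256), the means 250 and 6 are only 12 apart around the circle, so the merged mean should land near the wrap point rather than at the naive arithmetic mean of 128. The field values below are illustrative assumptions:

```cpp
void CircularMergeExample() {
  PARAM_DESC angle;
  angle.Circular = true;
  angle.NonEssential = false;
  angle.Min = 0.0f;
  angle.Max = 256.0f;
  angle.Range = 256.0f;
  angle.HalfRange = 128.0f;
  angle.MidRange = 128.0f;

  float m[1];
  float m1[1] = {250.0f}; // one sample at 250
  float m2[1] = {6.0f};   // one sample at 6
  // m1 - m2 = 244 > HalfRange, so m1 is rotated down by Range before
  // averaging: ((250 - 256) + 6) / 2 == 0.
  int32_t n = MergeClusters(1, &angle, 1, 1, m, m1, m2);
  // n == 2, m[0] == 0.0f
}
```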
```cpp
/**
 * This routine decides which clusters in the cluster tree
 * should be represented by prototypes, forms a list of these
 * prototypes, and places the list in the Clusterer data
 * structure.
 * @param Clusterer data structure holding cluster tree
 * @param Config parameters used to control prototype generation
 */
static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  LIST ClusterStack = NIL_LIST;
  CLUSTER *Cluster;
  PROTOTYPE *Prototype;

  // use a stack to keep track of clusters waiting to be processed
  // initially the only cluster on the stack is the root cluster
  if (Clusterer->Root != nullptr) {
    ClusterStack = push(NIL_LIST, Clusterer->Root);
  }

  // loop until we have analyzed all clusters which are potential prototypes
  while (ClusterStack != NIL_LIST) {
    // remove the next cluster to be analyzed from the stack
    // try to make a prototype from the cluster
    // if successful, put it on the proto list, else split the cluster
    Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node());
    ClusterStack = pop(ClusterStack);
    Prototype = MakePrototype(Clusterer, Config, Cluster);
    if (Prototype != nullptr) {
      Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype);
    } else {
      ClusterStack = push(ClusterStack, Cluster->Right);
      ClusterStack = push(ClusterStack, Cluster->Left);
    }
  }
} // ComputePrototypes

/**
 * This routine attempts to create a prototype from the
 * specified cluster that conforms to the distribution
 * specified in Config. If there are too few samples in the
 * cluster to perform a statistical analysis, then a prototype
 * is generated but labelled as insignificant. If the
 * dimensions of the cluster are not independent, no prototype
 * is generated and nullptr is returned. If a prototype can be
 * found that matches the desired distribution then a pointer
 * to it is returned, otherwise nullptr is returned.
 * @param Clusterer data structure holding cluster tree
 * @param Config parameters used to control prototype generation
 * @param Cluster cluster to be made into a prototype
 * @return Pointer to new prototype or nullptr
 */
static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {
    return nullptr;
  }

  // compute the covariance matrix and ranges for the cluster
  auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
                              static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
  if (Proto != nullptr) {
    delete Statistics;
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
                   Config->Independence)) {
    delete Statistics;
    return nullptr;
  }

  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != nullptr) {
      delete Statistics;
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);

  // create a prototype based on the statistics and test it
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
    case automatic:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
  }
  delete Statistics;
  return Proto;
} // MakePrototype

/**
 * This routine checks for clusters which are degenerate and
 * therefore cannot be analyzed in a statistically valid way.
 * A cluster is defined as degenerate if it does not have at
 * least MINSAMPLESNEEDED samples in it. If the cluster is
 * found to be degenerate, a prototype of the specified style
 * is generated and marked as insignificant. A cluster is
 * also degenerate if it does not have at least MinSamples
 * samples in it.
 *
 * If the cluster is not degenerate, nullptr is returned.
 *
 * @param N number of dimensions
 * @param Cluster cluster being analyzed
 * @param Statistics statistical info about cluster
 * @param Style type of prototype to be generated
 * @param MinSamples minimum number of samples in a cluster
 * @return Pointer to degenerate prototype or nullptr.
 */
static PROTOTYPE *MakeDegenerateProto( // this was MinSample
    uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) {
  PROTOTYPE *Proto = nullptr;

  if (MinSamples < MINSAMPLESNEEDED) {
    MinSamples = MINSAMPLESNEEDED;
  }

  if (Cluster->SampleCount < MinSamples) {
    switch (Style) {
      case spherical:
        Proto = NewSphericalProto(N, Cluster, Statistics);
        break;
      case elliptical:
      case automatic:
        Proto = NewEllipticalProto(N, Cluster, Statistics);
        break;
      case mixed:
        Proto = NewMixedProto(N, Cluster, Statistics);
        break;
    }
    Proto->Significant = false;
  }
  return (Proto);
} // MakeDegenerateProto

/**
 * This routine tests the specified cluster to see if
 * there is a statistically significant difference between
 * the sub-clusters that would be made if the cluster were to
 * be split. If not, then a new prototype is formed and
 * returned to the caller. If there is, then nullptr is returned
 * to the caller.
 * @param Clusterer data struct containing samples being clustered
 * @param Config provides the magic number of samples that make a good cluster
 * @param Cluster cluster to be made into an elliptical prototype
 * @param Statistics statistical info about cluster
 * @return Pointer to new elliptical prototype or nullptr.
 */
static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER *Left = Cluster->Left;
  CLUSTER *Right = Cluster->Right;
  if (Left == nullptr || Right == nullptr) {
    return nullptr;
  }
  int TotalDims = Left->SampleCount + Right->SampleCount;
  if (TotalDims < N + 1 || TotalDims < 2) {
    return nullptr;
  }
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential) {
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j) {
          Covariance[j + row_offset] = 1.0f;
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
  int Fx = EssentialN;
  if (Fx > FTABLE_X) {
    Fx = FTABLE_X;
  }
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y) {
    Fy = FTABLE_Y;
  }
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  if (F < FTarget) {
    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}
```
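In the notation of the Armitage reference cited in the code, with $n = {}$TotalDims pooled samples, $p = {}$EssentialN essential dimensions, and $\Delta$ the difference of the sub-cluster means, the routine computes

```latex
T^2 = \Delta^{\top} \Sigma^{-1} \Delta, \qquad
F = \frac{n - p - 1}{(n - 2)\,p}\; T^2 \;\sim\; F_{p,\; n-p-1}.
```

FTable[Fy][Fx] then supplies the 1% critical value of $F_{p,\,n-p-1}$ (with both indices clipped to the table size), and the cluster is kept whole as an elliptical prototype only when $F$ falls below that threshold.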
```cpp
/**
 * This routine tests the specified cluster to see if it can
 * be approximated by a spherical normal distribution. If it
 * can be, then a new prototype is formed and returned to the
 * caller. If it can't be, then nullptr is returned to the caller.
 * @param Clusterer data struct containing samples being clustered
 * @param Cluster cluster to be made into a spherical prototype
 * @param Statistics statistical info about cluster
 * @param Buckets histogram struct used to analyze distribution
 * @return Pointer to new spherical prototype or nullptr.
 */
static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                     BUCKETS *Buckets) {
  PROTOTYPE *Proto = nullptr;
  int i;

  // check that each dimension is a normal distribution
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue;
    }

    FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
                sqrt(static_cast<double>(Statistics->AvgVariance)));
    if (!DistributionOK(Buckets)) {
      break;
    }
  }
  // if all dimensions matched a normal distribution, make a proto
  if (i >= Clusterer->SampleSize) {
    Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return (Proto);
} // MakeSphericalProto

/**
 * This routine tests the specified cluster to see if it can
 * be approximated by an elliptical normal distribution. If it
 * can be, then a new prototype is formed and returned to the
 * caller. If it can't be, then nullptr is returned to the caller.
 * @param Clusterer data struct containing samples being clustered
 * @param Cluster cluster to be made into an elliptical prototype
 * @param Statistics statistical info about cluster
 * @param Buckets histogram struct used to analyze distribution
 * @return Pointer to new elliptical prototype or nullptr.
 */
static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
                                      STATISTICS *Statistics, BUCKETS *Buckets) {
  PROTOTYPE *Proto = nullptr;
  int i;

  // check that each dimension is a normal distribution
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue;
    }

    FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
                sqrt(static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)])));
    if (!DistributionOK(Buckets)) {
      break;
    }
  }
  // if all dimensions matched a normal distribution, make a proto
  if (i >= Clusterer->SampleSize) {
    Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return (Proto);
} // MakeEllipticalProto
/**
 * This routine tests each dimension of the specified cluster to
 * see what distribution would best approximate that dimension.
 * Each dimension is compared to the following distributions
 * in order: normal, random, uniform. If each dimension can
 * be represented by one of these distributions,
 * then a new prototype is formed and returned to the
 * caller. If it can't be, then nullptr is returned to the caller.
 * @param Clusterer data struct containing samples being clustered
 * @param Cluster cluster to be made into a prototype
 * @param Statistics statistical info about cluster
 * @param NormalBuckets histogram struct used to analyze distribution
 * @param Confidence confidence level for alternate distributions
 * @return Pointer to new mixed prototype or nullptr.
 */
static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence) {
  PROTOTYPE *Proto;
  int i;
  BUCKETS *UniformBuckets = nullptr;
  BUCKETS *RandomBuckets = nullptr;

  // create a mixed proto to work on - initially assume all dimensions normal
  Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);

  // find the proper distribution for each dimension
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue;
    }

    FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                std::sqrt(Proto->Variance.Elliptical[i]));
    if (DistributionOK(NormalBuckets)) {
      continue;
    }

    if (RandomBuckets == nullptr) {
      RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
    }
    MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));
    FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(RandomBuckets)) {
      continue;
    }

    if (UniformBuckets == nullptr) {
      UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
    }
    MakeDimUniform(i, Proto, Statistics);
    FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(UniformBuckets)) {
      continue;
    }
    break;
  }
  // if any dimension failed to match a distribution, discard the proto
  if (i < Clusterer->SampleSize) {
    FreePrototype(Proto);
    Proto = nullptr;
  }
  return (Proto);
} // MakeMixedProto

/**
 * This routine alters the ith dimension of the specified
 * mixed prototype to be D_random.
 * @param i index of dimension to be changed
 * @param Proto prototype whose dimension is to be altered
 * @param ParamDesc description of specified dimension
 */
static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
  Proto->Distrib[i] = D_random;
  Proto->Mean[i] = ParamDesc->MidRange;
  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;

  // subtract out the previous magnitude of this dimension from the total
  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  // note that the proto Weight is irrelevant for D_random protos
} // MakeDimRandom
```
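MakeDimRandom() above (and MakeDimUniform() below) rely on TotalMagnitude being a product of independent per-dimension factors, so a single factor can be swapped without recomputing the rest:

```latex
M = \prod_{k=1}^{N} m_k, \qquad
M' = \frac{M}{m_i}\, m_i', \qquad
\log M' = \log M - \log m_i + \log m_i'.
```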
```cpp
/**
 * This routine alters the ith dimension of the specified
 * mixed prototype to be uniform.
 * @param i index of dimension to be changed
 * @param Proto prototype whose dimension is to be altered
 * @param Statistics statistical info about prototype
 */
static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {
  Proto->Distrib[i] = uniform;
  Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;
  Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;
  if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
    Proto->Variance.Elliptical[i] = MINVARIANCE;
  }

  // subtract out the previous magnitude of this dimension from the total
  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
  Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  // note that the proto Weight is irrelevant for uniform protos
} // MakeDimUniform

/**
 * This routine searches the cluster tree for all leaf nodes
 * which are samples in the specified cluster. It computes
 * a full covariance matrix for these samples as well as
 * keeping track of the ranges (min and max) for each
 * dimension. A special data structure is allocated to
 * return this information to the caller. An incremental
 * algorithm for computing statistics is not used because
 * it will not work with circular dimensions.
 * @param N number of dimensions
 * @param ParamDesc array of dimension descriptions
 * @param Cluster cluster whose stats are to be computed
 * @return Pointer to new data structure containing statistics
 */
static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
  int i, j;
  LIST SearchState;
  SAMPLE *Sample;
  uint32_t SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  auto Statistics = new STATISTICS(N);

  // allocate temporary memory to hold the sample to mean distances
  std::vector<float> Distance(N);

  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    for (i = 0; i < N; i++) {
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        if (Distance[i] > ParamDesc[i].HalfRange) {
          Distance[i] -= ParamDesc[i].Range;
        }
        if (Distance[i] < -ParamDesc[i].HalfRange) {
          Distance[i] += ParamDesc[i].Range;
        }
      }
      if (Distance[i] < Statistics->Min[i]) {
        Statistics->Min[i] = Distance[i];
      }
      if (Distance[i] > Statistics->Max[i]) {
        Statistics->Max[i] = Distance[i];
      }
    }
    auto CoVariance = &Statistics->CoVariance[0];
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++, CoVariance++) {
        *CoVariance += Distance[i] * Distance[j];
      }
    }
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometric mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1) {
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  } else {
    SampleCountAdjustedForBias = 1;
  }
  auto CoVariance = &Statistics->CoVariance[0];
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        if (*CoVariance < MINVARIANCE) {
          *CoVariance = MINVARIANCE;
        }
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  }
  Statistics->AvgVariance =
      static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));

  return Statistics;
} // ComputeStatistics
```
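In matrix terms, with $d_k$ the (circularly wrapped) deviation of sample $k$ from the cluster mean and $n$ the sample count, the loops above form the unbiased estimate and the geometric mean of its clipped diagonal:

```latex
\Sigma_{ij} = \frac{1}{\max(n-1,\,1)} \sum_{k=1}^{n} d_{ki}\, d_{kj}, \qquad
\text{AvgVariance} = \Bigl(\prod_{i=1}^{N} \max(\Sigma_{ii},\ \text{MINVARIANCE})\Bigr)^{1/N}.
```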
```cpp
/**
 * This routine creates a spherical prototype data structure to
 * approximate the samples in the specified cluster.
 * Spherical prototypes have a single variance which is
 * common across all dimensions. All dimensions are normally
 * distributed and independent.
 * @param N number of dimensions
 * @param Cluster cluster to be made into a spherical prototype
 * @param Statistics statistical info about samples in cluster
 * @return Pointer to a new spherical prototype data structure
 */
static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
  PROTOTYPE *Proto;

  Proto = NewSimpleProto(N, Cluster);

  Proto->Variance.Spherical = Statistics->AvgVariance;
  if (Proto->Variance.Spherical < MINVARIANCE) {
    Proto->Variance.Spherical = MINVARIANCE;
  }

  Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
  Proto->TotalMagnitude = static_cast<float>(
      pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N)));
  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  return (Proto);
} // NewSphericalProto

/**
 * This routine creates an elliptical prototype data structure to
 * approximate the samples in the specified cluster.
 * Elliptical prototypes have a variance for each dimension.
 * All dimensions are normally distributed and independent.
 * @param N number of dimensions
 * @param Cluster cluster to be made into an elliptical prototype
 * @param Statistics statistical info about samples in cluster
 * @return Pointer to a new elliptical prototype data structure
 */
static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
  PROTOTYPE *Proto;
  int i;

  Proto = NewSimpleProto(N, Cluster);
  Proto->Variance.Elliptical = new float[N];
  Proto->Magnitude.Elliptical = new float[N];
  Proto->Weight.Elliptical = new float[N];

  auto CoVariance = &Statistics->CoVariance[0];
  Proto->TotalMagnitude = 1.0;
  for (i = 0; i < N; i++, CoVariance += N + 1) {
    Proto->Variance.Elliptical[i] = *CoVariance;
    if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
      Proto->Variance.Elliptical[i] = MINVARIANCE;
    }

    Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
    Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
    Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  }
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
  Proto->Style = elliptical;
  return (Proto);
} // NewEllipticalProto
+ * @param N number of dimensions + * @param Cluster cluster to be made into a mixed prototype + * @param Statistics statistical info about samples in cluster + * @return Pointer to a new mixed prototype data structure + */ +static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) { + auto Proto = NewEllipticalProto(N, Cluster, Statistics); + Proto->Distrib.clear(); + Proto->Distrib.resize(N, normal); + Proto->Style = mixed; + return Proto; +} // NewMixedProto + +/** + * This routine allocates memory to hold a simple prototype + * data structure, i.e. one without independent distributions + * and variances for each dimension. + * @param N number of dimensions + * @param Cluster cluster to be made into a prototype + * @return Pointer to new simple prototype + */ +static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) { + auto Proto = new PROTOTYPE; + Proto->Mean = Cluster->Mean; + Proto->Distrib.clear(); + Proto->Significant = true; + Proto->Merged = false; + Proto->Style = spherical; + Proto->NumSamples = Cluster->SampleCount; + Proto->Cluster = Cluster; + Proto->Cluster->Prototype = true; + return Proto; +} // NewSimpleProto + +/** + * This routine returns true if the specified covariance + * matrix indicates that all N dimensions are independent of + * one another. One dimension is judged to be independent of + * another when the magnitude of the corresponding correlation + * coefficient is + * less than the specified Independence factor. The + * correlation coefficient is calculated as: (see Duda and + * Hart, pg. 247) + * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) + * The covariance matrix is assumed to be symmetric (which + * should always be true). + * @param ParamDesc descriptions of each feature space dimension + * @param N number of dimensions + * @param CoVariance ptr to a covariance matrix + * @param Independence max off-diagonal correlation coefficient + * @return true if dimensions are independent, false otherwise + */ +static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) { + int i, j; + float *VARii; // points to ith on-diagonal element + float *VARjj; // points to jth on-diagonal element + float CorrelationCoeff; + + VARii = CoVariance; + for (i = 0; i < N; i++, VARii += N + 1) { + if (ParamDesc[i].NonEssential) { + continue; + } + + VARjj = VARii + N + 1; + CoVariance = VARii + 1; + for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) { + if (ParamDesc[j].NonEssential) { + continue; + } + + if ((*VARii == 0.0) || (*VARjj == 0.0)) { + CorrelationCoeff = 0.0; + } else { + CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj))); + } + if (CorrelationCoeff > Independence) { + return false; + } + } + } + return true; +} // Independent + +/** + * This routine returns a histogram data structure which can + * be used by other routines to place samples into histogram + * buckets, and then apply a goodness of fit test to the + * histogram data to determine if the samples belong to the + * specified probability distribution. The routine keeps + * a list of bucket data structures which have already been + * created so that it minimizes the computation time needed + * to create a new bucket. + * @param clusterer which keeps a bucket_cache for us. 
+ * @param Distribution type of probability distribution to test for + * @param SampleCount number of samples that are available + * @param Confidence probability of a Type I error + * @return Bucket data structure + */ +static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount, + double Confidence) { + // Get an old bucket structure with the same number of buckets. + uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); + BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS]; + + // If a matching bucket structure is not found, make one and save it. + if (Buckets == nullptr) { + Buckets = MakeBuckets(Distribution, SampleCount, Confidence); + clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = Buckets; + } else { + // Just adjust the existing buckets. + if (SampleCount != Buckets->SampleCount) { + AdjustBuckets(Buckets, SampleCount); + } + if (Confidence != Buckets->Confidence) { + Buckets->Confidence = Confidence; + Buckets->ChiSquared = + ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence); + } + InitBuckets(Buckets); + } + return Buckets; +} // GetBuckets + +/** + * This routine creates a histogram data structure which can + * be used by other routines to place samples into histogram + * buckets, and then apply a goodness of fit test to the + * histogram data to determine if the samples belong to the + * specified probability distribution. The buckets are + * allocated in such a way that the expected frequency of + * samples in each bucket is approximately the same. In + * order to make this possible, a mapping table is + * computed which maps "normalized" samples into the + * appropriate bucket. + * @param Distribution type of probability distribution to test for + * @param SampleCount number of samples that are available + * @param Confidence probability of a Type I error + * @return Pointer to new histogram data structure + */ +static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) { + const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity}; + int i, j; + double BucketProbability; + double NextBucketBoundary; + double Probability; + double ProbabilityDelta; + double LastProbDensity; + double ProbDensity; + uint16_t CurrentBucket; + bool Symmetrical; + + // allocate memory needed for data structure + auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount)); + Buckets->SampleCount = SampleCount; + Buckets->Confidence = Confidence; + + // initialize simple fields + Buckets->Distribution = Distribution; + + // all currently defined distributions are symmetrical + Symmetrical = true; + Buckets->ChiSquared = + ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence); + + if (Symmetrical) { + // allocate buckets so that all have approx. 
equal probability + BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets); + + // distribution is symmetric so fill in upper half then copy + CurrentBucket = Buckets->NumberOfBuckets / 2; + if (Odd(Buckets->NumberOfBuckets)) { + NextBucketBoundary = BucketProbability / 2; + } else { + NextBucketBoundary = BucketProbability; + } + + Probability = 0.0; + LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2); + for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) { + ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1); + ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0); + Probability += ProbabilityDelta; + if (Probability > NextBucketBoundary) { + if (CurrentBucket < Buckets->NumberOfBuckets - 1) { + CurrentBucket++; + } + NextBucketBoundary += BucketProbability; + } + Buckets->Bucket[i] = CurrentBucket; + Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount); + LastProbDensity = ProbDensity; + } + // place any leftover probability into the last bucket + Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount); + + // copy upper half of distribution to lower half + for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) { + Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets); + } + + // copy upper half of expected counts to lower half + for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) { + Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j]; + } + } + return Buckets; +} // MakeBuckets + +/** + * This routine computes the optimum number of histogram + * buckets that should be used in a chi-squared goodness of + * fit test for the specified number of samples. The optimum + * number is computed based on Table 4.1 on pg. 147 of + * "Measurement and Analysis of Random Data" by Bendat & Piersol. + * Linear interpolation is used between table + * values. The table is intended for a 0.05 level of + * significance (alpha). This routine assumes that it is + * equally valid for other alphas, which may not be true. + * @param SampleCount number of samples to be tested + * @return Optimum number of histogram buckets + */ +static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) { + uint8_t Last, Next; + float Slope; + + if (SampleCount < kCountTable[0]) { + return kBucketsTable[0]; + } + + for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) { + if (SampleCount <= kCountTable[Next]) { + Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) / + static_cast<float>(kCountTable[Next] - kCountTable[Last]); + return ( + static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last]))); + } + } + return kBucketsTable[Last]; +} // OptimumNumberOfBuckets + +/** + * This routine computes the chi-squared value which will + * leave a cumulative probability of Alpha in the right tail + * of a chi-squared distribution with the specified number of + * degrees of freedom. Alpha must be between 0 and 1. + * DegreesOfFreedom must be even. The routine maintains an + * array of lists. Each list corresponds to a different + * number of degrees of freedom. Each entry in the list + * corresponds to a different alpha value and its corresponding + * chi-squared value. Therefore, once a particular chi-squared + * value is computed, it is stored in the list and never + * needs to be computed again.
+ * @param DegreesOfFreedom determines shape of distribution + * @param Alpha probability of right tail + * @return Desired chi-squared value + */ +static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha) +#define CHIACCURACY 0.01 +#define MINALPHA (1e-200) +{ + static LIST ChiWith[MAXDEGREESOFFREEDOM + 1]; + + // limit the minimum alpha that can be used - if alpha is too small + // it may not be possible to compute chi-squared. + Alpha = ClipToRange(Alpha, MINALPHA, 1.0); + if (Odd(DegreesOfFreedom)) { + DegreesOfFreedom++; + } + + /* find the list of chi-squared values which have already been computed + for the specified number of degrees of freedom. Search the list for + the desired chi-squared. */ + CHISTRUCT SearchKey(0.0, Alpha); + auto *found = search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch); + auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(found ? found->first_node() : nullptr); + + if (OldChiSquared == nullptr) { + OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha); + OldChiSquared->ChiSquared = + Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY); + ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared); + } else { + // further optimization might move OldChiSquared to front of list + } + + return (OldChiSquared->ChiSquared); + +} // ComputeChiSquared + +/** + * This routine computes the probability density function + * of a discrete normal distribution defined by the global + * variables kNormalMean, kNormalVariance, and kNormalMagnitude. + * Normal magnitude could, of course, be computed in terms of + * the normal variance but it is precomputed for efficiency. + * @param x number to compute the normal probability density for + * @note Globals: + * kNormalMean mean of a discrete normal distribution + * kNormalVariance variance of a discrete normal distribution + * kNormalMagnitude magnitude of a discrete normal distribution + * @return The value of the normal distribution at x. + */ +static double NormalDensity(int32_t x) { + double Distance; + + Distance = x - kNormalMean; + return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance); +} // NormalDensity + +/** + * This routine computes the probability density function + * of a uniform distribution at the specified point. The + * range of the distribution is from 0 to BUCKETTABLESIZE. + * @param x number to compute the uniform probability density for + * @return The value of the uniform distribution at x. + */ +static double UniformDensity(int32_t x) { + constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE; + + if ((x >= 0) && (x <= BUCKETTABLESIZE)) { + return UniformDistributionDensity; + } else { + return 0.0; + } +} // UniformDensity + +/** + * This routine computes a trapezoidal approximation to the + * integral of a function over a small delta in x. + * @param f1 value of function at x1 + * @param f2 value of function at x2 + * @param Dx x2 - x1 (should always be positive) + * @return Approximation of the integral of the function from x1 to x2. + */ +static double Integral(double f1, double f2, double Dx) { + return (f1 + f2) * Dx / 2.0; +} // Integral + +/** + * This routine counts the number of cluster samples which + * fall within the various histogram buckets in Buckets. Only + * one dimension of each sample is examined. The exact meaning + * of the Mean and StdDev parameters depends on the + * distribution which is being analyzed (this info is in the + * Buckets data structure). 
For normal distributions, Mean + * and StdDev have the expected meanings. For uniform and + * random distributions the Mean is the center point of the + * range and the StdDev is 1/2 the range. A dimension with + * zero standard deviation cannot be statistically analyzed. + * In this case, a pseudo-analysis is used. + * The Buckets data structure is filled in. + * @param Buckets histogram buckets to count samples + * @param Cluster cluster whose samples are being analyzed + * @param Dim dimension of samples which is being analyzed + * @param ParamDesc description of the dimension + * @param Mean "mean" of the distribution + * @param StdDev "standard deviation" of the distribution + */ +static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc, + float Mean, float StdDev) { + uint16_t BucketID; + int i; + LIST SearchState; + SAMPLE *Sample; + + // initialize the histogram bucket counts to 0 + for (i = 0; i < Buckets->NumberOfBuckets; i++) { + Buckets->Count[i] = 0; + } + + if (StdDev == 0.0) { + /* if the standard deviation is zero, then we can't statistically + analyze the cluster. Use a pseudo-analysis: samples exactly on + the mean are distributed evenly across all buckets. Samples greater + than the mean are placed in the last bucket; samples less than the + mean are placed in the first bucket. */ + + InitSampleSearch(SearchState, Cluster); + i = 0; + while ((Sample = NextSample(&SearchState)) != nullptr) { + if (Sample->Mean[Dim] > Mean) { + BucketID = Buckets->NumberOfBuckets - 1; + } else if (Sample->Mean[Dim] < Mean) { + BucketID = 0; + } else { + BucketID = i; + } + Buckets->Count[BucketID] += 1; + i++; + if (i >= Buckets->NumberOfBuckets) { + i = 0; + } + } + } else { + // search for all samples in the cluster and add to histogram buckets + InitSampleSearch(SearchState, Cluster); + while ((Sample = NextSample(&SearchState)) != nullptr) { + switch (Buckets->Distribution) { + case normal: + BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev); + break; + case D_random: + case uniform: + BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev); + break; + default: + BucketID = 0; + } + Buckets->Count[Buckets->Bucket[BucketID]] += 1; + } + } +} // FillBuckets + +/** + * This routine determines which bucket x falls into in the + * discrete normal distribution defined by kNormalMean + * and kNormalStdDev. x values which exceed the range of + * the discrete distribution are clipped. + * @param ParamDesc used to identify circular dimensions + * @param x value to be normalized + * @param Mean mean of normal distribution + * @param StdDev standard deviation of normal distribution + * @return Bucket number into which x falls + */ +static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) { + float X; + + // wraparound circular parameters if necessary + if (ParamDesc->Circular) { + if (x - Mean > ParamDesc->HalfRange) { + x -= ParamDesc->Range; + } else if (x - Mean < -ParamDesc->HalfRange) { + x += ParamDesc->Range; + } + } + + X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean; + if (X < 0) { + return 0; + } + if (X > BUCKETTABLESIZE - 1) { + return (static_cast<uint16_t>(BUCKETTABLESIZE - 1)); + } + return static_cast<uint16_t>(floor(static_cast<double>(X))); +} // NormalBucket + +/** + * This routine determines which bucket x falls into in the + * discrete uniform distribution defined by + * BUCKETTABLESIZE. x values which exceed the range of + * the discrete distribution are clipped. 
+ * @param ParamDesc used to identify circular dimensions + * @param x value to be normalized + * @param Mean center of range of uniform distribution + * @param StdDev 1/2 the range of the uniform distribution + * @return Bucket number into which x falls + */ +static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) { + float X; + + // wraparound circular parameters if necessary + if (ParamDesc->Circular) { + if (x - Mean > ParamDesc->HalfRange) { + x -= ParamDesc->Range; + } else if (x - Mean < -ParamDesc->HalfRange) { + x += ParamDesc->Range; + } + } + + X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0); + if (X < 0) { + return 0; + } + if (X > BUCKETTABLESIZE - 1) { + return static_cast<uint16_t>(BUCKETTABLESIZE - 1); + } + return static_cast<uint16_t>(floor(static_cast<double>(X))); +} // UniformBucket + +/** + * This routine performs a chi-square goodness of fit test + * on the histogram data in the Buckets data structure. + * true is returned if the histogram matches the probability + * distribution which was specified when the Buckets + * structure was originally created. Otherwise false is + * returned. + * @param Buckets histogram data to perform chi-square test on + * @return true if samples match distribution, false otherwise + */ +static bool DistributionOK(BUCKETS *Buckets) { + float FrequencyDifference; + float TotalDifference; + int i; + + // compute how well the histogram matches the expected histogram + TotalDifference = 0.0; + for (i = 0; i < Buckets->NumberOfBuckets; i++) { + FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i]; + TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i]; + } + + // test to see if the difference is more than expected + if (TotalDifference > Buckets->ChiSquared) { + return false; + } else { + return true; + } +} // DistributionOK + +/** + * This routine computes the degrees of freedom that should + * be used in a chi-squared test with the specified number of + * histogram buckets. The result is always rounded up to + * the next even number so that the value of chi-squared can be + * computed more easily. This will cause the value of + * chi-squared to be higher than the optimum value, resulting + * in the chi-square test being more lenient than optimum. + * @param Distribution distribution being tested for + * @param HistogramBuckets number of buckets in chi-square test + * @return The number of degrees of freedom for a chi-square test + */ +static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) { + static uint8_t DegreeOffsets[] = {3, 3, 1}; + + uint16_t AdjustedNumBuckets; + + AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)]; + if (Odd(AdjustedNumBuckets)) { + AdjustedNumBuckets++; + } + return (AdjustedNumBuckets); + +} // DegreesOfFreedom + +/** + * This routine multiplies each ExpectedCount histogram entry + * by NewSampleCount/OldSampleCount so that the histogram + * is now adjusted to the new sample count. 
+ * @param Buckets histogram data structure to adjust + * @param NewSampleCount new sample count to adjust to + */ +static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) { + int i; + double AdjustFactor; + + AdjustFactor = + ((static_cast<double>(NewSampleCount)) / (static_cast<double>(Buckets->SampleCount))); + + for (i = 0; i < Buckets->NumberOfBuckets; i++) { + Buckets->ExpectedCount[i] *= AdjustFactor; + } + + Buckets->SampleCount = NewSampleCount; + +} // AdjustBuckets + +/** + * This routine sets the bucket counts in the specified histogram + * to zero. + * @param Buckets histogram data structure to init + */ +static void InitBuckets(BUCKETS *Buckets) { + int i; + + for (i = 0; i < Buckets->NumberOfBuckets; i++) { + Buckets->Count[i] = 0; + } + +} // InitBuckets + +/** + * This routine is used to search a list of structures which + * hold pre-computed chi-squared values for a chi-squared + * value whose corresponding alpha field matches the alpha + * field of SearchKey. + * + * It is called by the list search routines. + * + * @param arg1 chi-squared struct being tested for a match + * @param arg2 chi-squared struct that is the search key + * @return true if ChiStruct's Alpha matches SearchKey's Alpha + */ +static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct, + void *arg2) { // CHISTRUCT *SearchKey) + auto *ChiStruct = static_cast<CHISTRUCT *>(arg1); + auto *SearchKey = static_cast<CHISTRUCT *>(arg2); + + return (ChiStruct->Alpha == SearchKey->Alpha); + +} // AlphaMatch + +/** + * This routine attempts to find an x value at which Function + * goes to zero (i.e. a root of the function). It will only + * work correctly if a solution actually exists and there + * are no extrema between the solution and the InitialGuess. + * The algorithms used are extremely primitive. + * + * @param Function function whose zero is to be found + * @param FunctionParams arbitrary data to pass to function + * @param InitialGuess point to start solution search at + * @param Accuracy maximum allowed error + * @return Solution of function (x for which f(x) = 0). + */ +static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy) +#define INITIALDELTA 0.1 +#define DELTARATIO 0.1 +{ + double x; + double f; + double Slope; + double Delta; + double NewDelta; + double xDelta; + double LastPosX, LastNegX; + + x = InitialGuess; + Delta = INITIALDELTA; + LastPosX = FLT_MAX; + LastNegX = -FLT_MAX; + f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x); + while (Abs(LastPosX - LastNegX) > Accuracy) { + // keep track of outer bounds of current estimate + if (f < 0) { + LastNegX = x; + } else { + LastPosX = x; + } + + // compute the approx. slope of f(x) at the current point + Slope = ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta; + + // compute the next solution guess + xDelta = f / Slope; + x -= xDelta; + + // reduce the delta used for computing slope to be a fraction of + // the amount moved to get to the new guess + NewDelta = Abs(xDelta) * DELTARATIO; + if (NewDelta < Delta) { + Delta = NewDelta; + } + + // compute the value of the function at the new guess + f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x); + } + return (x); + +} // Solve + +/** + * This routine computes the area under a chi density curve + * from 0 to x, minus the desired area under the curve. The + * number of degrees of freedom of the chi curve is specified + * in the ChiParams structure.
The desired area is also + * specified in the ChiParams structure as Alpha (or 1 minus + * the desired area). This routine is intended to be passed + * to the Solve() function to find the value of chi-squared + * which will yield a desired area under the right tail of + * the chi density curve. The function will only work for + * even degrees of freedom. The equations are based on + * integrating the chi density curve in parts to obtain + * a series that can be used to compute the area under the + * curve. + * @param ChiParams contains degrees of freedom and alpha + * @param x value of chi-squared to evaluate + * @return Error between actual and desired area under the chi curve. + */ +static double ChiArea(CHISTRUCT *ChiParams, double x) { + int i, N; + double SeriesTotal; + double Denominator; + double PowerOfx; + + N = ChiParams->DegreesOfFreedom / 2 - 1; + SeriesTotal = 1; + Denominator = 1; + PowerOfx = 1; + for (i = 1; i <= N; i++) { + Denominator *= 2 * i; + PowerOfx *= x; + SeriesTotal += PowerOfx / Denominator; + } + return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha); + +} // ChiArea + +/** + * This routine looks at all samples in the specified cluster. + * It computes a running estimate of the percentage of the + * characters which have more than 1 sample in the cluster. + * When this percentage exceeds MaxIllegal, true is returned. + * Otherwise false is returned. The CharID + * fields must contain integers which identify the training + * characters which were used to generate the sample. One + * integer is used for each sample. The NumChar field in + * the Clusterer must contain the number of characters in the + * training set. All CharID fields must be between 0 and + * NumChar-1. The main function of this routine is to help + * identify clusters which need to be split further, i.e. if + * numerous training characters have 2 or more features which are + * contained in the same cluster, then the cluster should be + * split. + * + * @param Clusterer data structure holding cluster tree + * @param Cluster cluster containing samples to be tested + * @param MaxIllegal max percentage of samples allowed to have + * more than 1 feature in the cluster + * @return true if the cluster should be split, false otherwise. 
+ */ +static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal) +#define ILLEGAL_CHAR 2 +{ + static std::vector<uint8_t> CharFlags; + LIST SearchState; + SAMPLE *Sample; + int32_t CharID; + int32_t NumCharInCluster; + int32_t NumIllegalInCluster; + float PercentIllegal; + + // initial estimate assumes that no illegal chars exist in the cluster + NumCharInCluster = Cluster->SampleCount; + NumIllegalInCluster = 0; + + if (Clusterer->NumChar > CharFlags.size()) { + CharFlags.resize(Clusterer->NumChar); + } + + for (auto &CharFlag : CharFlags) { + CharFlag = false; + } + + // find each sample in the cluster and check if we have seen it before + InitSampleSearch(SearchState, Cluster); + while ((Sample = NextSample(&SearchState)) != nullptr) { + CharID = Sample->CharID; + if (CharFlags[CharID] == false) { + CharFlags[CharID] = true; + } else { + if (CharFlags[CharID] == true) { + NumIllegalInCluster++; + CharFlags[CharID] = ILLEGAL_CHAR; + } + NumCharInCluster--; + PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster; + if (PercentIllegal > MaxIllegal) { + destroy(SearchState); + return true; + } + } + } + return false; + +} // MultipleCharSamples + +/** + * Compute the inverse of a matrix using LU decomposition with partial pivoting. + * The return value is the sum of norms of the off-diagonal terms of the + * product of a and inv. (A measure of the error.) + */ +static double InvertMatrix(const float *input, int size, float *inv) { + // Allocate memory for the 2D arrays. + GENERIC_2D_ARRAY<double> U(size, size, 0.0); + GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0); + GENERIC_2D_ARRAY<double> L(size, size, 0.0); + + // Initialize the working matrices. U starts as input, L as I and U_inv as O. + int row; + int col; + for (row = 0; row < size; row++) { + for (col = 0; col < size; col++) { + U[row][col] = input[row * size + col]; + L[row][col] = row == col ? 1.0 : 0.0; + U_inv[row][col] = 0.0; + } + } + + // Compute forward matrix by inversion by LU decomposition of input. + for (col = 0; col < size; ++col) { + // Find best pivot + int best_row = 0; + double best_pivot = -1.0; + for (row = col; row < size; ++row) { + if (Abs(U[row][col]) > best_pivot) { + best_pivot = Abs(U[row][col]); + best_row = row; + } + } + // Exchange pivot rows. + if (best_row != col) { + for (int k = 0; k < size; ++k) { + double tmp = U[best_row][k]; + U[best_row][k] = U[col][k]; + U[col][k] = tmp; + tmp = L[best_row][k]; + L[best_row][k] = L[col][k]; + L[col][k] = tmp; + } + } + // Now do the pivot itself. + for (row = col + 1; row < size; ++row) { + double ratio = -U[row][col] / U[col][col]; + for (int j = col; j < size; ++j) { + U[row][j] += U[col][j] * ratio; + } + for (int k = 0; k < size; ++k) { + L[row][k] += L[col][k] * ratio; + } + } + } + // Next invert U. + for (col = 0; col < size; ++col) { + U_inv[col][col] = 1.0 / U[col][col]; + for (row = col - 1; row >= 0; --row) { + double total = 0.0; + for (int k = col; k > row; --k) { + total += U[row][k] * U_inv[k][col]; + } + U_inv[row][col] = -total / U[row][row]; + } + } + // Now the answer is U_inv.L. + for (row = 0; row < size; row++) { + for (col = 0; col < size; col++) { + double sum = 0.0; + for (int k = row; k < size; ++k) { + sum += U_inv[row][k] * L[k][col]; + } + inv[row * size + col] = sum; + } + } + // Check matrix product. 
+ double error_sum = 0.0; + for (row = 0; row < size; row++) { + for (col = 0; col < size; col++) { + double sum = 0.0; + for (int k = 0; k < size; ++k) { + sum += static_cast<double>(input[row * size + k]) * inv[k * size + col]; + } + if (row != col) { + error_sum += Abs(sum); + } + } + } + return error_sum; +} + +} // namespace tesseract
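A few of the numerical techniques in this file reward a closer look. ComputeStatistics() accumulates outer products of the sample-to-mean offsets, normalizes by SampleCount - 1 for an unbiased estimate, and collapses the diagonal into a single AvgVariance via a geometric mean. A minimal standalone sketch of those two steps, with hypothetical data and the circular-dimension wraparound omitted:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Three 2-D samples (hypothetical data) and their precomputed mean.
  const std::vector<std::vector<double>> samples = {{1.0, 2.0}, {2.0, 4.0}, {3.0, 3.0}};
  const int N = 2;
  const double mean[2] = {2.0, 3.0};

  // Accumulate outer products of (sample - mean), as ComputeStatistics() does.
  double cov[2][2] = {{0.0, 0.0}, {0.0, 0.0}};
  for (const auto &s : samples) {
    for (int i = 0; i < N; i++) {
      for (int j = 0; j < N; j++) {
        cov[i][j] += (s[i] - mean[i]) * (s[j] - mean[j]);
      }
    }
  }

  // Normalize by SampleCount - 1 for an unbiased estimate, then take the
  // geometric mean of the diagonal variances (product, then N-th root).
  const double norm = samples.size() - 1;
  double avg_variance = 1.0;
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      cov[i][j] /= norm;
    }
    avg_variance *= cov[i][i];
  }
  avg_variance = std::pow(avg_variance, 1.0 / N);

  printf("var x = %g, var y = %g, geometric mean = %g\n", cov[0][0], cov[1][1], avg_variance);
}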
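Independent() declares two dimensions correlated when a coefficient derived from the covariance matrix exceeds the Independence threshold. Note that the vendored line computes sqrt(sqrt(cov_ij^2 / (cov_ii * cov_jj))), i.e. the square root of the textbook |rho| = |cov_ij| / sqrt(cov_ii * cov_jj); since sqrt(|rho|) >= |rho| whenever |rho| <= 1, the test as written rejects independence more readily than a plain correlation cutoff would. A sketch of the textbook quantity on a hypothetical 2x2 covariance matrix:

#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical 2x2 covariance matrix, row-major: variances 4 and 9 on
  // the diagonal, covariance 1 off the diagonal.
  const float cov[4] = {4.0f, 1.0f, 1.0f, 9.0f};
  const float independence = 0.2f;  // max allowed off-diagonal correlation

  // Textbook correlation coefficient: |rho| = |cov_ij| / sqrt(cov_ii * cov_jj).
  const float rho = std::fabs(cov[1]) / std::sqrt(cov[0] * cov[3]);

  printf("|rho| = %.3f, sqrt(|rho|) = %.3f, threshold = %.1f\n",
         rho, std::sqrt(rho), independence);
}

With these numbers |rho| = 0.167 passes the 0.2 threshold while sqrt(|rho|) = 0.408 does not, so the two variants can genuinely disagree on borderline data.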
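Integral() is the trapezoidal rule, and MakeBuckets() applies it with Dx = 1.0 between adjacent cells of the density table to accumulate probability mass per bucket. The same rule on a function with a known integral, as a quick sanity check:

#include <cstdio>

// Trapezoidal rule over one step, matching Integral() above.
static double integral(double f1, double f2, double dx) {
  return (f1 + f2) * dx / 2.0;
}

int main() {
  // Integrate f(x) = x^2 over [0, 1] in 100 steps; the exact answer is 1/3.
  const int steps = 100;
  const double dx = 1.0 / steps;
  double total = 0.0;
  double last = 0.0;  // f(0)
  for (int i = 1; i <= steps; i++) {
    const double x = i * dx;
    const double f = x * x;
    total += integral(last, f, dx);
    last = f;
  }
  printf("approx = %.6f (exact 0.333333)\n", total);
}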
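OptimumNumberOfBuckets() does a straight linear interpolation between the bracketing entries of kCountTable and kBucketsTable. A sketch of the same lookup with hypothetical table values (the real tables are defined earlier in this file):

#include <cstdio>

int main() {
  // Hypothetical lookup table: sample counts and the bucket counts they map to.
  const unsigned counts[3] = {50, 100, 200};
  const unsigned buckets[3] = {10, 15, 20};
  const unsigned n = 130;  // sample count to look up

  // Interpolate between the two bracketing entries, as in
  // OptimumNumberOfBuckets().
  for (int last = 0, next = 1; next < 3; last++, next++) {
    if (n <= counts[next]) {
      const float slope = static_cast<float>(buckets[next] - buckets[last]) /
                          static_cast<float>(counts[next] - counts[last]);
      printf("buckets = %u\n",
             static_cast<unsigned>(buckets[last] + slope * (n - counts[last])));
      break;
    }
  }
}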
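NormalBucket() and UniformBucket() both reduce a sample to an index into the BUCKETTABLESIZE-wide mapping table, clipping anything outside the discrete range; Buckets->Bucket[] then translates that index into a histogram bucket. A sketch of the uniform case, assuming a table size of 1024 for illustration:

#include <cmath>
#include <cstdio>

// Assumed table size; the real BUCKETTABLESIZE is defined earlier in this file.
constexpr int kTableSize = 1024;

// Map x into [0, kTableSize) for a uniform distribution whose "Mean" is the
// center of the range and whose "StdDev" is half the range, following
// UniformBucket() (circular wraparound omitted).
static int uniform_bucket(float x, float mean, float half_range) {
  const float X = (x - mean) / (2.0f * half_range) * kTableSize + kTableSize / 2.0f;
  if (X < 0) {
    return 0;
  }
  if (X > kTableSize - 1) {
    return kTableSize - 1;
  }
  return static_cast<int>(std::floor(X));
}

int main() {
  // Range [0, 10]: center 5, half-range 5.
  printf("%d %d %d\n",
         uniform_bucket(0.0f, 5.0f, 5.0f),    // 0 (low end)
         uniform_bucket(5.0f, 5.0f, 5.0f),    // 512 (center)
         uniform_bucket(10.0f, 5.0f, 5.0f));  // 1023 (clipped high end)
}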
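DistributionOK() is a Pearson chi-square goodness-of-fit test: it sums (observed - expected)^2 / expected across the buckets and accepts the distribution when the total stays at or below the precomputed critical value. A standalone version with hypothetical counts:

#include <cstdio>

int main() {
  // Hypothetical histogram: observed vs. expected counts per bucket.
  const float observed[5] = {18, 22, 24, 19, 17};
  const float expected[5] = {20, 20, 20, 20, 20};

  // Pearson statistic, as in DistributionOK().
  float total = 0.0f;
  for (int i = 0; i < 5; i++) {
    const float d = observed[i] - expected[i];
    total += d * d / expected[i];
  }

  // Stand-in for the ChiSquared field of BUCKETS; the real value comes from
  // ComputeChiSquared() for the chosen confidence and degrees of freedom.
  const float critical = 13.28f;  // ~1% right tail, 4 degrees of freedom
  printf("chi2 = %g, fits = %s\n", total, total <= critical ? "yes" : "no");
}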
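Solve() is Newton iteration with a finite-difference slope; it keeps shrinking the probe Delta as the steps shrink, and it stops once the most recent x values that produced positive and negative function values lie within Accuracy of each other. The same iteration on f(x) = x^2 - 2, with that bracket-based stopping rule simplified to a step-size test for this sketch:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  auto f = [](double x) { return x * x - 2.0; };  // root at sqrt(2)

  double x = 1.0;      // InitialGuess
  double delta = 0.1;  // INITIALDELTA
  double fx = f(x);
  double step;
  do {
    const double slope = (f(x + delta) - fx) / delta;  // approximate f'(x)
    step = fx / slope;
    x -= step;                                         // Newton update
    delta = std::min(delta, std::fabs(step) * 0.1);    // DELTARATIO shrink
    fx = f(x);
  } while (std::fabs(step) > 1e-9);

  printf("root = %.9f (sqrt(2) = 1.414213562)\n", x);
}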
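ChiArea() exploits the closed form of the chi-squared right tail for even degrees of freedom 2k: P(X > x) = exp(-x/2) * sum over i = 0..k-1 of (x/2)^i / i!. In the loop, Denominator grows as 2^i * i! and PowerOfx as x^i, so each added term is exactly (x/2)^i / i!; subtracting Alpha turns the tail probability into the root-finding target that ComputeChiSquared() hands to Solve(). A standalone check against familiar table values:

#include <cmath>
#include <cstdio>

// Right-tail probability P(X > x) of a chi-squared distribution with an even
// number of degrees of freedom, accumulating the same series as ChiArea().
static double chi_tail(double x, int degrees_of_freedom) {
  const int n = degrees_of_freedom / 2 - 1;
  double series = 1.0;
  double denominator = 1.0;
  double power_of_x = 1.0;
  for (int i = 1; i <= n; i++) {
    denominator *= 2 * i;                // 2^i * i!
    power_of_x *= x;                     // x^i
    series += power_of_x / denominator;  // adds (x/2)^i / i!
  }
  return series * std::exp(-0.5 * x);
}

int main() {
  printf("P(X > 5.991, df=2) = %.4f (expect ~0.05)\n", chi_tail(5.991, 2));
  printf("P(X > 13.277, df=4) = %.4f (expect ~0.01)\n", chi_tail(13.277, 4));
}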
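InvertMatrix() returns the sum of the absolute off-diagonal entries of input * inv as its error measure: for an exact inverse the product is the identity, so the sum is zero. The same check on a 2x2 matrix inverted analytically rather than through the LU path:

#include <cmath>
#include <cstdio>

int main() {
  // A 2x2 matrix and its analytic inverse: inv = adj(A) / det(A).
  const float a[4] = {4.0f, 7.0f, 2.0f, 6.0f};
  const float det = a[0] * a[3] - a[1] * a[2];  // 10
  const float inv[4] = {a[3] / det, -a[1] / det, -a[2] / det, a[0] / det};

  // Off-diagonal norm of a * inv, the measure InvertMatrix() returns.
  double error_sum = 0.0;
  for (int row = 0; row < 2; row++) {
    for (int col = 0; col < 2; col++) {
      double sum = 0.0;
      for (int k = 0; k < 2; k++) {
        sum += static_cast<double>(a[row * 2 + k]) * inv[k * 2 + col];
      }
      if (row != col) {
        error_sum += std::fabs(sum);
      }
    }
  }
  printf("off-diagonal error = %g (0 for an exact inverse)\n", error_sum);
}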
