comparison mupdf-source/thirdparty/tesseract/src/classify/cluster.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************************
2 ** Filename: cluster.cpp
3 ** Purpose: Routines for clustering points in N-D space
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *****************************************************************************/
17
18 #define _USE_MATH_DEFINES // for M_PI
19
20 #include "cluster.h"
21
22 #include "genericheap.h"
23 #include "kdpair.h"
24 #include "matrix.h"
25 #include "tprintf.h"
26
27 #include "helpers.h"
28
29 #include <cfloat> // for FLT_MAX
30 #include <cmath> // for M_PI
31 #include <vector> // for std::vector
32
33 namespace tesseract {
34
35 #define HOTELLING 1 // If true use Hotelling's test to decide where to split.
36 #define FTABLE_X 10 // Size of FTable.
37 #define FTABLE_Y 100 // Size of FTable.
38
39 // Table of values approximating the cumulative F-distribution for a confidence
40 // of 1%.
41 const double FTable[FTABLE_Y][FTABLE_X] = {
42 {
43 4052.19,
44 4999.52,
45 5403.34,
46 5624.62,
47 5763.65,
48 5858.97,
49 5928.33,
50 5981.10,
51 6022.50,
52 6055.85,
53 },
54 {
55 98.502,
56 99.000,
57 99.166,
58 99.249,
59 99.300,
60 99.333,
61 99.356,
62 99.374,
63 99.388,
64 99.399,
65 },
66 {
67 34.116,
68 30.816,
69 29.457,
70 28.710,
71 28.237,
72 27.911,
73 27.672,
74 27.489,
75 27.345,
76 27.229,
77 },
78 {
79 21.198,
80 18.000,
81 16.694,
82 15.977,
83 15.522,
84 15.207,
85 14.976,
86 14.799,
87 14.659,
88 14.546,
89 },
90 {
91 16.258,
92 13.274,
93 12.060,
94 11.392,
95 10.967,
96 10.672,
97 10.456,
98 10.289,
99 10.158,
100 10.051,
101 },
102 {
103 13.745,
104 10.925,
105 9.780,
106 9.148,
107 8.746,
108 8.466,
109 8.260,
110 8.102,
111 7.976,
112 7.874,
113 },
114 {
115 12.246,
116 9.547,
117 8.451,
118 7.847,
119 7.460,
120 7.191,
121 6.993,
122 6.840,
123 6.719,
124 6.620,
125 },
126 {
127 11.259,
128 8.649,
129 7.591,
130 7.006,
131 6.632,
132 6.371,
133 6.178,
134 6.029,
135 5.911,
136 5.814,
137 },
138 {
139 10.561,
140 8.022,
141 6.992,
142 6.422,
143 6.057,
144 5.802,
145 5.613,
146 5.467,
147 5.351,
148 5.257,
149 },
150 {
151 10.044,
152 7.559,
153 6.552,
154 5.994,
155 5.636,
156 5.386,
157 5.200,
158 5.057,
159 4.942,
160 4.849,
161 },
162 {
163 9.646,
164 7.206,
165 6.217,
166 5.668,
167 5.316,
168 5.069,
169 4.886,
170 4.744,
171 4.632,
172 4.539,
173 },
174 {
175 9.330,
176 6.927,
177 5.953,
178 5.412,
179 5.064,
180 4.821,
181 4.640,
182 4.499,
183 4.388,
184 4.296,
185 },
186 {
187 9.074,
188 6.701,
189 5.739,
190 5.205,
191 4.862,
192 4.620,
193 4.441,
194 4.302,
195 4.191,
196 4.100,
197 },
198 {
199 8.862,
200 6.515,
201 5.564,
202 5.035,
203 4.695,
204 4.456,
205 4.278,
206 4.140,
207 4.030,
208 3.939,
209 },
210 {
211 8.683,
212 6.359,
213 5.417,
214 4.893,
215 4.556,
216 4.318,
217 4.142,
218 4.004,
219 3.895,
220 3.805,
221 },
222 {
223 8.531,
224 6.226,
225 5.292,
226 4.773,
227 4.437,
228 4.202,
229 4.026,
230 3.890,
231 3.780,
232 3.691,
233 },
234 {
235 8.400,
236 6.112,
237 5.185,
238 4.669,
239 4.336,
240 4.102,
241 3.927,
242 3.791,
243 3.682,
244 3.593,
245 },
246 {
247 8.285,
248 6.013,
249 5.092,
250 4.579,
251 4.248,
252 4.015,
253 3.841,
254 3.705,
255 3.597,
256 3.508,
257 },
258 {
259 8.185,
260 5.926,
261 5.010,
262 4.500,
263 4.171,
264 3.939,
265 3.765,
266 3.631,
267 3.523,
268 3.434,
269 },
270 {
271 8.096,
272 5.849,
273 4.938,
274 4.431,
275 4.103,
276 3.871,
277 3.699,
278 3.564,
279 3.457,
280 3.368,
281 },
282 {
283 8.017,
284 5.780,
285 4.874,
286 4.369,
287 4.042,
288 3.812,
289 3.640,
290 3.506,
291 3.398,
292 3.310,
293 },
294 {
295 7.945,
296 5.719,
297 4.817,
298 4.313,
299 3.988,
300 3.758,
301 3.587,
302 3.453,
303 3.346,
304 3.258,
305 },
306 {
307 7.881,
308 5.664,
309 4.765,
310 4.264,
311 3.939,
312 3.710,
313 3.539,
314 3.406,
315 3.299,
316 3.211,
317 },
318 {
319 7.823,
320 5.614,
321 4.718,
322 4.218,
323 3.895,
324 3.667,
325 3.496,
326 3.363,
327 3.256,
328 3.168,
329 },
330 {
331 7.770,
332 5.568,
333 4.675,
334 4.177,
335 3.855,
336 3.627,
337 3.457,
338 3.324,
339 3.217,
340 3.129,
341 },
342 {
343 7.721,
344 5.526,
345 4.637,
346 4.140,
347 3.818,
348 3.591,
349 3.421,
350 3.288,
351 3.182,
352 3.094,
353 },
354 {
355 7.677,
356 5.488,
357 4.601,
358 4.106,
359 3.785,
360 3.558,
361 3.388,
362 3.256,
363 3.149,
364 3.062,
365 },
366 {
367 7.636,
368 5.453,
369 4.568,
370 4.074,
371 3.754,
372 3.528,
373 3.358,
374 3.226,
375 3.120,
376 3.032,
377 },
378 {
379 7.598,
380 5.420,
381 4.538,
382 4.045,
383 3.725,
384 3.499,
385 3.330,
386 3.198,
387 3.092,
388 3.005,
389 },
390 {
391 7.562,
392 5.390,
393 4.510,
394 4.018,
395 3.699,
396 3.473,
397 3.305,
398 3.173,
399 3.067,
400 2.979,
401 },
402 {
403 7.530,
404 5.362,
405 4.484,
406 3.993,
407 3.675,
408 3.449,
409 3.281,
410 3.149,
411 3.043,
412 2.955,
413 },
414 {
415 7.499,
416 5.336,
417 4.459,
418 3.969,
419 3.652,
420 3.427,
421 3.258,
422 3.127,
423 3.021,
424 2.934,
425 },
426 {
427 7.471,
428 5.312,
429 4.437,
430 3.948,
431 3.630,
432 3.406,
433 3.238,
434 3.106,
435 3.000,
436 2.913,
437 },
438 {
439 7.444,
440 5.289,
441 4.416,
442 3.927,
443 3.611,
444 3.386,
445 3.218,
446 3.087,
447 2.981,
448 2.894,
449 },
450 {
451 7.419,
452 5.268,
453 4.396,
454 3.908,
455 3.592,
456 3.368,
457 3.200,
458 3.069,
459 2.963,
460 2.876,
461 },
462 {
463 7.396,
464 5.248,
465 4.377,
466 3.890,
467 3.574,
468 3.351,
469 3.183,
470 3.052,
471 2.946,
472 2.859,
473 },
474 {
475 7.373,
476 5.229,
477 4.360,
478 3.873,
479 3.558,
480 3.334,
481 3.167,
482 3.036,
483 2.930,
484 2.843,
485 },
486 {
487 7.353,
488 5.211,
489 4.343,
490 3.858,
491 3.542,
492 3.319,
493 3.152,
494 3.021,
495 2.915,
496 2.828,
497 },
498 {
499 7.333,
500 5.194,
501 4.327,
502 3.843,
503 3.528,
504 3.305,
505 3.137,
506 3.006,
507 2.901,
508 2.814,
509 },
510 {
511 7.314,
512 5.179,
513 4.313,
514 3.828,
515 3.514,
516 3.291,
517 3.124,
518 2.993,
519 2.888,
520 2.801,
521 },
522 {
523 7.296,
524 5.163,
525 4.299,
526 3.815,
527 3.501,
528 3.278,
529 3.111,
530 2.980,
531 2.875,
532 2.788,
533 },
534 {
535 7.280,
536 5.149,
537 4.285,
538 3.802,
539 3.488,
540 3.266,
541 3.099,
542 2.968,
543 2.863,
544 2.776,
545 },
546 {
547 7.264,
548 5.136,
549 4.273,
550 3.790,
551 3.476,
552 3.254,
553 3.087,
554 2.957,
555 2.851,
556 2.764,
557 },
558 {
559 7.248,
560 5.123,
561 4.261,
562 3.778,
563 3.465,
564 3.243,
565 3.076,
566 2.946,
567 2.840,
568 2.754,
569 },
570 {
571 7.234,
572 5.110,
573 4.249,
574 3.767,
575 3.454,
576 3.232,
577 3.066,
578 2.935,
579 2.830,
580 2.743,
581 },
582 {
583 7.220,
584 5.099,
585 4.238,
586 3.757,
587 3.444,
588 3.222,
589 3.056,
590 2.925,
591 2.820,
592 2.733,
593 },
594 {
595 7.207,
596 5.087,
597 4.228,
598 3.747,
599 3.434,
600 3.213,
601 3.046,
602 2.916,
603 2.811,
604 2.724,
605 },
606 {
607 7.194,
608 5.077,
609 4.218,
610 3.737,
611 3.425,
612 3.204,
613 3.037,
614 2.907,
615 2.802,
616 2.715,
617 },
618 {
619 7.182,
620 5.066,
621 4.208,
622 3.728,
623 3.416,
624 3.195,
625 3.028,
626 2.898,
627 2.793,
628 2.706,
629 },
630 {
631 7.171,
632 5.057,
633 4.199,
634 3.720,
635 3.408,
636 3.186,
637 3.020,
638 2.890,
639 2.785,
640 2.698,
641 },
642 {
643 7.159,
644 5.047,
645 4.191,
646 3.711,
647 3.400,
648 3.178,
649 3.012,
650 2.882,
651 2.777,
652 2.690,
653 },
654 {
655 7.149,
656 5.038,
657 4.182,
658 3.703,
659 3.392,
660 3.171,
661 3.005,
662 2.874,
663 2.769,
664 2.683,
665 },
666 {
667 7.139,
668 5.030,
669 4.174,
670 3.695,
671 3.384,
672 3.163,
673 2.997,
674 2.867,
675 2.762,
676 2.675,
677 },
678 {
679 7.129,
680 5.021,
681 4.167,
682 3.688,
683 3.377,
684 3.156,
685 2.990,
686 2.860,
687 2.755,
688 2.668,
689 },
690 {
691 7.119,
692 5.013,
693 4.159,
694 3.681,
695 3.370,
696 3.149,
697 2.983,
698 2.853,
699 2.748,
700 2.662,
701 },
702 {
703 7.110,
704 5.006,
705 4.152,
706 3.674,
707 3.363,
708 3.143,
709 2.977,
710 2.847,
711 2.742,
712 2.655,
713 },
714 {
715 7.102,
716 4.998,
717 4.145,
718 3.667,
719 3.357,
720 3.136,
721 2.971,
722 2.841,
723 2.736,
724 2.649,
725 },
726 {
727 7.093,
728 4.991,
729 4.138,
730 3.661,
731 3.351,
732 3.130,
733 2.965,
734 2.835,
735 2.730,
736 2.643,
737 },
738 {
739 7.085,
740 4.984,
741 4.132,
742 3.655,
743 3.345,
744 3.124,
745 2.959,
746 2.829,
747 2.724,
748 2.637,
749 },
750 {
751 7.077,
752 4.977,
753 4.126,
754 3.649,
755 3.339,
756 3.119,
757 2.953,
758 2.823,
759 2.718,
760 2.632,
761 },
762 {
763 7.070,
764 4.971,
765 4.120,
766 3.643,
767 3.333,
768 3.113,
769 2.948,
770 2.818,
771 2.713,
772 2.626,
773 },
774 {
775 7.062,
776 4.965,
777 4.114,
778 3.638,
779 3.328,
780 3.108,
781 2.942,
782 2.813,
783 2.708,
784 2.621,
785 },
786 {
787 7.055,
788 4.959,
789 4.109,
790 3.632,
791 3.323,
792 3.103,
793 2.937,
794 2.808,
795 2.703,
796 2.616,
797 },
798 {
799 7.048,
800 4.953,
801 4.103,
802 3.627,
803 3.318,
804 3.098,
805 2.932,
806 2.803,
807 2.698,
808 2.611,
809 },
810 {
811 7.042,
812 4.947,
813 4.098,
814 3.622,
815 3.313,
816 3.093,
817 2.928,
818 2.798,
819 2.693,
820 2.607,
821 },
822 {
823 7.035,
824 4.942,
825 4.093,
826 3.618,
827 3.308,
828 3.088,
829 2.923,
830 2.793,
831 2.689,
832 2.602,
833 },
834 {
835 7.029,
836 4.937,
837 4.088,
838 3.613,
839 3.304,
840 3.084,
841 2.919,
842 2.789,
843 2.684,
844 2.598,
845 },
846 {
847 7.023,
848 4.932,
849 4.083,
850 3.608,
851 3.299,
852 3.080,
853 2.914,
854 2.785,
855 2.680,
856 2.593,
857 },
858 {
859 7.017,
860 4.927,
861 4.079,
862 3.604,
863 3.295,
864 3.075,
865 2.910,
866 2.781,
867 2.676,
868 2.589,
869 },
870 {
871 7.011,
872 4.922,
873 4.074,
874 3.600,
875 3.291,
876 3.071,
877 2.906,
878 2.777,
879 2.672,
880 2.585,
881 },
882 {
883 7.006,
884 4.917,
885 4.070,
886 3.596,
887 3.287,
888 3.067,
889 2.902,
890 2.773,
891 2.668,
892 2.581,
893 },
894 {
895 7.001,
896 4.913,
897 4.066,
898 3.591,
899 3.283,
900 3.063,
901 2.898,
902 2.769,
903 2.664,
904 2.578,
905 },
906 {
907 6.995,
908 4.908,
909 4.062,
910 3.588,
911 3.279,
912 3.060,
913 2.895,
914 2.765,
915 2.660,
916 2.574,
917 },
918 {
919 6.990,
920 4.904,
921 4.058,
922 3.584,
923 3.275,
924 3.056,
925 2.891,
926 2.762,
927 2.657,
928 2.570,
929 },
930 {
931 6.985,
932 4.900,
933 4.054,
934 3.580,
935 3.272,
936 3.052,
937 2.887,
938 2.758,
939 2.653,
940 2.567,
941 },
942 {
943 6.981,
944 4.896,
945 4.050,
946 3.577,
947 3.268,
948 3.049,
949 2.884,
950 2.755,
951 2.650,
952 2.563,
953 },
954 {
955 6.976,
956 4.892,
957 4.047,
958 3.573,
959 3.265,
960 3.046,
961 2.881,
962 2.751,
963 2.647,
964 2.560,
965 },
966 {
967 6.971,
968 4.888,
969 4.043,
970 3.570,
971 3.261,
972 3.042,
973 2.877,
974 2.748,
975 2.644,
976 2.557,
977 },
978 {
979 6.967,
980 4.884,
981 4.040,
982 3.566,
983 3.258,
984 3.039,
985 2.874,
986 2.745,
987 2.640,
988 2.554,
989 },
990 {
991 6.963,
992 4.881,
993 4.036,
994 3.563,
995 3.255,
996 3.036,
997 2.871,
998 2.742,
999 2.637,
1000 2.551,
1001 },
1002 {
1003 6.958,
1004 4.877,
1005 4.033,
1006 3.560,
1007 3.252,
1008 3.033,
1009 2.868,
1010 2.739,
1011 2.634,
1012 2.548,
1013 },
1014 {
1015 6.954,
1016 4.874,
1017 4.030,
1018 3.557,
1019 3.249,
1020 3.030,
1021 2.865,
1022 2.736,
1023 2.632,
1024 2.545,
1025 },
1026 {
1027 6.950,
1028 4.870,
1029 4.027,
1030 3.554,
1031 3.246,
1032 3.027,
1033 2.863,
1034 2.733,
1035 2.629,
1036 2.542,
1037 },
1038 {
1039 6.947,
1040 4.867,
1041 4.024,
1042 3.551,
1043 3.243,
1044 3.025,
1045 2.860,
1046 2.731,
1047 2.626,
1048 2.539,
1049 },
1050 {
1051 6.943,
1052 4.864,
1053 4.021,
1054 3.548,
1055 3.240,
1056 3.022,
1057 2.857,
1058 2.728,
1059 2.623,
1060 2.537,
1061 },
1062 {
1063 6.939,
1064 4.861,
1065 4.018,
1066 3.545,
1067 3.238,
1068 3.019,
1069 2.854,
1070 2.725,
1071 2.621,
1072 2.534,
1073 },
1074 {
1075 6.935,
1076 4.858,
1077 4.015,
1078 3.543,
1079 3.235,
1080 3.017,
1081 2.852,
1082 2.723,
1083 2.618,
1084 2.532,
1085 },
1086 {
1087 6.932,
1088 4.855,
1089 4.012,
1090 3.540,
1091 3.233,
1092 3.014,
1093 2.849,
1094 2.720,
1095 2.616,
1096 2.529,
1097 },
1098 {
1099 6.928,
1100 4.852,
1101 4.010,
1102 3.538,
1103 3.230,
1104 3.012,
1105 2.847,
1106 2.718,
1107 2.613,
1108 2.527,
1109 },
1110 {
1111 6.925,
1112 4.849,
1113 4.007,
1114 3.535,
1115 3.228,
1116 3.009,
1117 2.845,
1118 2.715,
1119 2.611,
1120 2.524,
1121 },
1122 {
1123 6.922,
1124 4.846,
1125 4.004,
1126 3.533,
1127 3.225,
1128 3.007,
1129 2.842,
1130 2.713,
1131 2.609,
1132 2.522,
1133 },
1134 {
1135 6.919,
1136 4.844,
1137 4.002,
1138 3.530,
1139 3.223,
1140 3.004,
1141 2.840,
1142 2.711,
1143 2.606,
1144 2.520,
1145 },
1146 {
1147 6.915,
1148 4.841,
1149 3.999,
1150 3.528,
1151 3.221,
1152 3.002,
1153 2.838,
1154 2.709,
1155 2.604,
1156 2.518,
1157 },
1158 {
1159 6.912,
1160 4.838,
1161 3.997,
1162 3.525,
1163 3.218,
1164 3.000,
1165 2.835,
1166 2.706,
1167 2.602,
1168 2.515,
1169 },
1170 {
1171 6.909,
1172 4.836,
1173 3.995,
1174 3.523,
1175 3.216,
1176 2.998,
1177 2.833,
1178 2.704,
1179 2.600,
1180 2.513,
1181 },
1182 {
1183 6.906,
1184 4.833,
1185 3.992,
1186 3.521,
1187 3.214,
1188 2.996,
1189 2.831,
1190 2.702,
1191 2.598,
1192 2.511,
1193 },
1194 {
1195 6.904,
1196 4.831,
1197 3.990,
1198 3.519,
1199 3.212,
1200 2.994,
1201 2.829,
1202 2.700,
1203 2.596,
1204 2.509,
1205 },
1206 {
1207 6.901,
1208 4.829,
1209 3.988,
1210 3.517,
1211 3.210,
1212 2.992,
1213 2.827,
1214 2.698,
1215 2.594,
1216 2.507,
1217 },
1218 {
1219 6.898,
1220 4.826,
1221 3.986,
1222 3.515,
1223 3.208,
1224 2.990,
1225 2.825,
1226 2.696,
1227 2.592,
1228 2.505,
1229 },
1230 {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};
1231
1232 /** define the variance which will be used as a minimum variance for any
1233 dimension of any feature. Since most features are calculated from numbers
1234 with a precision no better than 1 in 128, the variance should never be
1235 less than the square of this number for parameters whose range is 1. */
1236 #define MINVARIANCE 0.0004
1237
1238 /** define the absolute minimum number of samples which must be present in
1239 order to accurately test hypotheses about underlying probability
1240 distributions. Define separately the minimum samples that are needed
1241 before a statistical analysis is attempted; this number should be
1242 equal to MINSAMPLES but can be set to a lower number for early testing
1243 when very few samples are available. */
1244 #define MINSAMPLESPERBUCKET 5
1245 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
1246 #define MINSAMPLESNEEDED 1
1247
1248 /** define the size of the table which maps normalized samples to
1249 histogram buckets. Also define the number of standard deviations
1250 in a normal distribution which are considered to be significant.
1251 The mapping table will be defined in such a way that it covers
1252 the specified number of standard deviations on either side of
1253 the mean. BUCKETTABLESIZE should always be even. */
1254 #define BUCKETTABLESIZE 1024
1255 #define NORMALEXTENT 3.0
1256
// A candidate merge: a cluster paired with the neighbor it would be
// merged with (used while building the cluster tree, see
// MakePotentialClusters / MakeNewCluster).
struct TEMPCLUSTER {
  CLUSTER *Cluster;  // cluster proposed for merging
  CLUSTER *Neighbor; // its nearest neighbor in the kd-tree
};
1261
1262 using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER *>;
1263 using ClusterHeap = tesseract::GenericHeap<ClusterPair>;
1264
// Per-cluster statistics gathered before a prototype is made
// (filled in by ComputeStatistics -- see its prototype below).
struct STATISTICS {
  // n = number of feature dimensions; CoVariance holds a flattened n x n matrix.
  STATISTICS(size_t n) : CoVariance(n * n), Min(n), Max(n) {
  }
  float AvgVariance = 1.0f; // aggregate variance over dimensions; starts at 1.0 -- presumably updated by ComputeStatistics, confirm there
  std::vector<float> CoVariance; // flattened n x n covariance matrix
  std::vector<float> Min; // largest negative distance from the mean
  std::vector<float> Max; // largest positive distance from the mean
};
1273
1274 struct BUCKETS {
1275 BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) {
1276 }
1277 ~BUCKETS() {
1278 }
1279 DISTRIBUTION Distribution = normal; // distribution being tested for
1280 uint32_t SampleCount = 0; // # of samples in histogram
1281 double Confidence = 0.0; // confidence level of test
1282 double ChiSquared = 0.0; // test threshold
1283 uint16_t NumberOfBuckets; // number of cells in histogram
1284 uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets
1285 std::vector<uint32_t> Count; // frequency of occurrence histogram
1286 std::vector<float> ExpectedCount; // expected histogram
1287 };
1288
// Holds a chi-squared value together with its degrees of freedom and
// alpha (confidence) level; used when solving for chi-squared thresholds
// (see ComputeChiSquared / ChiArea / AlphaMatch prototypes below).
struct CHISTRUCT {
  /// This constructor allocates a new data structure which is used
  /// to hold a chi-squared value along with its associated
  /// number of degrees of freedom and alpha value.
  ///
  /// @param degreesOfFreedom degrees of freedom for new chi value
  /// @param alpha confidence level for new chi value
  CHISTRUCT(uint16_t degreesOfFreedom, double alpha) : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) {
  }
  uint16_t DegreesOfFreedom = 0; // degrees of freedom of the chi-squared distribution
  double Alpha = 0.0;            // confidence level
  double ChiSquared = 0.0;       // chi-squared value; filled in after construction (see ComputeChiSquared)
};
1302
// For use with KDWalk / MakePotentialClusters
// Bundles the mutable state threaded through the kd-tree walk that
// seeds the agglomerative clustering in CreateClusterTree.
struct ClusteringContext {
  ClusterHeap *heap;       // heap used to hold temp clusters, "best" on top
  TEMPCLUSTER *candidates; // array of potential clusters
  KDTREE *tree;            // kd-tree to be searched for neighbors
  int32_t next;            // next candidate to be used
};
1310
1311 using DENSITYFUNC = double (*)(int32_t);
1312 using SOLVEFUNC = double (*)(CHISTRUCT *, double);
1313
1314 #define Odd(N) ((N) % 2)
1315 #define Mirror(N, R) ((R) - (N)-1)
1316 #define Abs(N) (((N) < 0) ? (-(N)) : (N))
1317
1318 //--------------Global Data Definitions and Declarations----------------------
1319 /** the following variables describe a discrete normal distribution
1320 which is used by NormalDensity() and NormalBucket(). The
1321 constant NORMALEXTENT determines how many standard
1322 deviations of the distribution are mapped onto the fixed
1323 discrete range of x. x=0 is mapped to -NORMALEXTENT standard
1324 deviations and x=BUCKETTABLESIZE is mapped to
1325 +NORMALEXTENT standard deviations. */
#define SqrtOf2Pi 2.506628275 // sqrt(2*pi), for the normal density normalization
// Standard deviation of the discrete normal in bucket-index units: the
// table spans 2*NORMALEXTENT standard deviations over BUCKETTABLESIZE cells.
static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
// Square of kNormalStdDev.
static const double kNormalVariance =
    (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
// Peak density of the discrete normal: equals 1 / (SqrtOf2Pi * kNormalStdDev).
static const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
// The mean is mapped to the middle of the bucket table.
static const double kNormalMean = BUCKETTABLESIZE / 2;
1332
1333 /** define lookup tables used to compute the number of histogram buckets
1334 that should be used for a given number of samples. */
1335 #define LOOKUPTABLESIZE 8
1336 #define MAXDEGREESOFFREEDOM MAXBUCKETS
1337
// Parallel lookup tables: for a sample count of at least kCountTable[i],
// use kBucketsTable[i] histogram buckets -- presumably interpolated by
// OptimumNumberOfBuckets(); confirm there.
static const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200, 400, 600, 800,
                                                      1000, 1500, 2000}; // number of samples

static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {
    MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets
1343
1344 /*-------------------------------------------------------------------------
1345 Private Function Prototypes
1346 --------------------------------------------------------------------------*/
1347 static void CreateClusterTree(CLUSTERER *Clusterer);
1348
1349 static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);
1350
1351 static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance);
1352
1353 static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
1354
1355 static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
1356
1357 static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster);
1358
1359 static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
1360 PROTOSTYLE Style, int32_t MinSamples);
1361
1362 static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
1363 STATISTICS *Statistics);
1364
1365 static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1366 BUCKETS *Buckets);
1367
1368 static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
1369 STATISTICS *Statistics, BUCKETS *Buckets);
1370
1371 static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1372 BUCKETS *NormalBuckets, double Confidence);
1373
1374 static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
1375
1376 static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);
1377
1378 static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);
1379
1380 static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1381
1382 static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1383
1384 static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1385
1386 static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);
1387
1388 static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence);
1389
1390 static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
1391 double Confidence);
1392
1393 static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence);
1394
1395 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);
1396
1397 static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);
1398
1399 static double NormalDensity(int32_t x);
1400
1401 static double UniformDensity(int32_t x);
1402
1403 static double Integral(double f1, double f2, double Dx);
1404
1405 static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
1406 float Mean, float StdDev);
1407
1408 static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);
1409
1410 static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);
1411
1412 static bool DistributionOK(BUCKETS *Buckets);
1413
1414 static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);
1415
1416 static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);
1417
1418 static void InitBuckets(BUCKETS *Buckets);
1419
1420 static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
1421 void *arg2); // CHISTRUCT *SearchKey);
1422
1423 static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy);
1424
1425 static double ChiArea(CHISTRUCT *ChiParams, double x);
1426
1427 static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal);
1428
1429 static double InvertMatrix(const float *input, int size, float *inv);
1430
1431 //--------------------------Public Code--------------------------------------
1432 /**
1433 * This routine creates a new clusterer data structure,
1434 * initializes it, and returns a pointer to it.
1435 *
1436 * @param SampleSize number of dimensions in feature space
1437 * @param ParamDesc description of each dimension
1438 * @return pointer to the new clusterer data structure
1439 */
1440 CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) {
1441 int i;
1442
1443 // allocate main clusterer data structure and init simple fields
1444 auto Clusterer = new CLUSTERER;
1445 Clusterer->SampleSize = SampleSize;
1446 Clusterer->NumberOfSamples = 0;
1447 Clusterer->NumChar = 0;
1448
1449 // init fields which will not be used initially
1450 Clusterer->Root = nullptr;
1451 Clusterer->ProtoList = NIL_LIST;
1452
1453 // maintain a copy of param descriptors in the clusterer data structure
1454 Clusterer->ParamDesc = new PARAM_DESC[SampleSize];
1455 for (i = 0; i < SampleSize; i++) {
1456 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
1457 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
1458 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
1459 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
1460 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
1461 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
1462 Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
1463 }
1464
1465 // allocate a kd tree to hold the samples
1466 Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc);
1467
1468 // Initialize cache of histogram buckets to minimize recomputing them.
1469 for (auto &d : Clusterer->bucket_cache) {
1470 for (auto &c : d) {
1471 c = nullptr;
1472 }
1473 }
1474
1475 return Clusterer;
1476 } // MakeClusterer
1477
1478 /**
1479 * This routine creates a new sample data structure to hold
1480 * the specified feature. This sample is added to the clusterer
1481 * data structure (so that it knows which samples are to be
1482 * clustered later), and a pointer to the sample is returned to
1483 * the caller.
1484 *
1485 * @param Clusterer clusterer data structure to add sample to
1486 * @param Feature feature to be added to clusterer
1487 * @param CharID unique ident. of char that sample came from
1488 *
1489 * @return Pointer to the new sample data structure
1490 */
1491 SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) {
1492 int i;
1493
1494 // see if the samples have already been clustered - if so trap an error
1495 // Can't add samples after they have been clustered.
1496 ASSERT_HOST(Clusterer->Root == nullptr);
1497
1498 // allocate the new sample and initialize it
1499 auto Sample = new SAMPLE(Clusterer->SampleSize);
1500 Sample->Clustered = false;
1501 Sample->Prototype = false;
1502 Sample->SampleCount = 1;
1503 Sample->Left = nullptr;
1504 Sample->Right = nullptr;
1505 Sample->CharID = CharID;
1506
1507 for (i = 0; i < Clusterer->SampleSize; i++) {
1508 Sample->Mean[i] = Feature[i];
1509 }
1510
1511 // add the sample to the KD tree - keep track of the total # of samples
1512 Clusterer->NumberOfSamples++;
1513 KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample);
1514 if (CharID >= Clusterer->NumChar) {
1515 Clusterer->NumChar = CharID + 1;
1516 }
1517
1518 // execute hook for monitoring clustering operation
1519 // (*SampleCreationHook)(Sample);
1520
1521 return (Sample);
1522 } // MakeSample
1523
1524 /**
1525 * This routine first checks to see if the samples in this
1526 * clusterer have already been clustered before; if so, it does
1527 * not bother to recreate the cluster tree. It simply recomputes
1528 * the prototypes based on the new Config info.
1529 *
1530 * If the samples have not been clustered before, the
1531 * samples in the KD tree are formed into a cluster tree and then
1532 * the prototypes are computed from the cluster tree.
1533 *
1534 * In either case this routine returns a pointer to a
1535 * list of prototypes that best represent the samples given
1536 * the constraints specified in Config.
1537 *
1538 * @param Clusterer data struct containing samples to be clustered
1539 * @param Config parameters which control clustering process
1540 *
1541 * @return Pointer to a list of prototypes
1542 */
1543 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
1544 // only create cluster tree if samples have never been clustered before
1545 if (Clusterer->Root == nullptr) {
1546 CreateClusterTree(Clusterer);
1547 }
1548
1549 // deallocate the old prototype list if one exists
1550 FreeProtoList(&Clusterer->ProtoList);
1551 Clusterer->ProtoList = NIL_LIST;
1552
1553 // compute prototypes starting at the root node in the tree
1554 ComputePrototypes(Clusterer, Config);
1555 // We don't need the cluster pointers in the protos any more, so null them
1556 // out, which makes it safe to delete the clusterer.
1557 LIST proto_list = Clusterer->ProtoList;
1558 iterate(proto_list) {
1559 auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
1560 proto->Cluster = nullptr;
1561 }
1562 return Clusterer->ProtoList;
1563 } // ClusterSamples
1564
1565 /**
1566 * This routine frees all of the memory allocated to the
1567 * specified data structure. It will not, however, free
1568 * the memory used by the prototype list. The pointers to
1569 * the clusters for each prototype in the list will be set
1570 * to nullptr to indicate that the cluster data structures no
1571 * longer exist. Any sample lists that have been obtained
1572 * via calls to GetSamples are no longer valid.
1573 * @param Clusterer pointer to data structure to be freed
1574 */
1575 void FreeClusterer(CLUSTERER *Clusterer) {
1576 if (Clusterer != nullptr) {
1577 delete[] Clusterer->ParamDesc;
1578 delete Clusterer->KDTree;
1579 delete Clusterer->Root;
1580 // Free up all used buckets structures.
1581 for (auto &d : Clusterer->bucket_cache) {
1582 for (auto &c : d) {
1583 delete c;
1584 }
1585 }
1586
1587 delete Clusterer;
1588 }
1589 } // FreeClusterer
1590
1591 /**
1592 * This routine frees all of the memory allocated to the
1593 * specified list of prototypes. The clusters which are
1594 * pointed to by the prototypes are not freed.
1595 * @param ProtoList pointer to list of prototypes to be freed
1596 */
1597 void FreeProtoList(LIST *ProtoList) {
1598 destroy_nodes(*ProtoList, FreePrototype);
1599 } // FreeProtoList
1600
1601 /**
1602 * This routine deallocates the memory consumed by the specified
1603 * prototype and modifies the corresponding cluster so that it
1604 * is no longer marked as a prototype. The cluster is NOT
1605 * deallocated by this routine.
1606 * @param arg prototype data structure to be deallocated
1607 */
1608 void FreePrototype(void *arg) { // PROTOTYPE *Prototype)
1609 auto *Prototype = static_cast<PROTOTYPE *>(arg);
1610
1611 // unmark the corresponding cluster (if there is one
1612 if (Prototype->Cluster != nullptr) {
1613 Prototype->Cluster->Prototype = false;
1614 }
1615
1616 // deallocate the prototype statistics and then the prototype itself
1617 if (Prototype->Style != spherical) {
1618 delete[] Prototype->Variance.Elliptical;
1619 delete[] Prototype->Magnitude.Elliptical;
1620 delete[] Prototype->Weight.Elliptical;
1621 }
1622 delete Prototype;
1623 } // FreePrototype
1624
1625 /**
1626 * This routine is used to find all of the samples which
1627 * belong to a cluster. It starts by removing the top
1628 * cluster on the cluster list (SearchState). If this cluster is
1629 * a leaf it is returned. Otherwise, the right subcluster
1630 * is pushed on the list and we continue the search in the
1631 * left subcluster. This continues until a leaf is found.
1632 * If all samples have been found, nullptr is returned.
1633 * InitSampleSearch() must be called
1634 * before NextSample() to initialize the search.
1635 * @param SearchState ptr to list containing clusters to be searched
1636 * @return Pointer to the next leaf cluster (sample) or nullptr.
1637 */
1638 CLUSTER *NextSample(LIST *SearchState) {
1639 CLUSTER *Cluster;
1640
1641 if (*SearchState == NIL_LIST) {
1642 return (nullptr);
1643 }
1644 Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node());
1645 *SearchState = pop(*SearchState);
1646 for (;;) {
1647 if (Cluster->Left == nullptr) {
1648 return (Cluster);
1649 }
1650 *SearchState = push(*SearchState, Cluster->Right);
1651 Cluster = Cluster->Left;
1652 }
1653 } // NextSample
1654
1655 /**
1656 * This routine returns the mean of the specified
1657 * prototype in the indicated dimension.
1658 * @param Proto prototype to return mean of
1659 * @param Dimension dimension whose mean is to be returned
1660 * @return Mean of Prototype in Dimension
1661 */
1662 float Mean(PROTOTYPE *Proto, uint16_t Dimension) {
1663 return (Proto->Mean[Dimension]);
1664 } // Mean
1665
1666 /**
1667 * This routine returns the standard deviation of the
1668 * prototype in the indicated dimension.
1669 * @param Proto prototype to return standard deviation of
1670 * @param Dimension dimension whose stddev is to be returned
1671 * @return Standard deviation of Prototype in Dimension
1672 */
1673 float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {
1674 switch (Proto->Style) {
1675 case spherical:
1676 return std::sqrt(Proto->Variance.Spherical);
1677 case elliptical:
1678 return std::sqrt(Proto->Variance.Elliptical[Dimension]);
1679 case mixed:
1680 switch (Proto->Distrib[Dimension]) {
1681 case normal:
1682 return std::sqrt(Proto->Variance.Elliptical[Dimension]);
1683 case uniform:
1684 case D_random:
1685 return Proto->Variance.Elliptical[Dimension];
1686 case DISTRIBUTION_COUNT:
1687 ASSERT_HOST(!"Distribution count not allowed!");
1688 }
1689 }
1690 return 0.0f;
1691 } // StandardDeviation
1692
1693 /*---------------------------------------------------------------------------
1694 Private Code
1695 ----------------------------------------------------------------------------*/
1696 /**
1697 * This routine performs a bottoms-up clustering on the samples
1698 * held in the kd-tree of the Clusterer data structure. The
1699 * result is a cluster tree. Each node in the tree represents
1700 * a cluster which conceptually contains a subset of the samples.
1701 * More precisely, the cluster contains all of the samples which
1702 * are contained in its two sub-clusters. The leaves of the
1703 * tree are the individual samples themselves; they have no
1704 * sub-clusters. The root node of the tree conceptually contains
1705 * all of the samples.
1706 * The Clusterer data structure is changed.
1707 * @param Clusterer data structure holdings samples to be clustered
1708 */
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  // candidate slots are filled in by MakePotentialClusters during the walk
  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    TEMPCLUSTER *PotentialCluster = HeapEntry.data();

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        // re-queue the candidate with its new neighbor and distance
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    else {
      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        // the merged cluster becomes a candidate for further merging
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = static_cast<CLUSTER *> RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  // (the CLUSTER nodes themselves live on as the cluster tree)
  delete context.tree;
  Clusterer->KDTree = nullptr;
  delete context.heap;
  delete[] context.candidates;
} // CreateClusterTree
1761
1762 /**
1763 * This routine is designed to be used in concert with the
1764 * KDWalk routine. It will create a potential cluster for
1765 * each sample in the kd-tree that is being walked. This
1766 * potential cluster will then be pushed on the heap.
1767 * @param context ClusteringContext (see definition above)
1768 * @param Cluster current cluster being visited in kd-tree walk
1769 * @param Level level of this cluster in the kd-tree
1770 */
1771 static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) {
1772 ClusterPair HeapEntry;
1773 int next = context->next;
1774 context->candidates[next].Cluster = Cluster;
1775 HeapEntry.data() = &(context->candidates[next]);
1776 context->candidates[next].Neighbor =
1777 FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());
1778 if (context->candidates[next].Neighbor != nullptr) {
1779 context->heap->Push(&HeapEntry);
1780 context->next++;
1781 }
1782 } // MakePotentialClusters
1783
1784 /**
1785 * This routine searches the specified kd-tree for the nearest
1786 * neighbor of the specified cluster. It actually uses the
1787 * kd routines to find the 2 nearest neighbors since one of them
1788 * will be the original cluster. A pointer to the nearest
1789 * neighbor is returned, if it can be found, otherwise nullptr is
1790 * returned. The distance between the 2 nodes is placed
1791 * in the specified variable.
1792 * @param Tree kd-tree to search in for nearest neighbor
1793 * @param Cluster cluster whose nearest neighbor is to be found
1794 * @param Distance ptr to variable to report distance found
1795 * @return Pointer to the nearest neighbor of Cluster, or nullptr
1796 */
1797 static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
1798 #define MAXNEIGHBORS 2
1799 #define MAXDISTANCE FLT_MAX
1800 {
1801 CLUSTER *Neighbor[MAXNEIGHBORS];
1802 float Dist[MAXNEIGHBORS];
1803 int NumberOfNeighbors;
1804 int32_t i;
1805 CLUSTER *BestNeighbor;
1806
1807 // find the 2 nearest neighbors of the cluster
1808 KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors,
1809 reinterpret_cast<void **>(Neighbor), Dist);
1810
1811 // search for the nearest neighbor that is not the cluster itself
1812 *Distance = MAXDISTANCE;
1813 BestNeighbor = nullptr;
1814 for (i = 0; i < NumberOfNeighbors; i++) {
1815 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
1816 *Distance = Dist[i];
1817 BestNeighbor = Neighbor[i];
1818 }
1819 }
1820 return BestNeighbor;
1821 } // FindNearestNeighbor
1822
1823 /**
1824 * This routine creates a new permanent cluster from the
1825 * clusters specified in TempCluster. The 2 clusters in
1826 * TempCluster are marked as "clustered" and deleted from
1827 * the kd-tree. The new cluster is then added to the kd-tree.
1828 * @param Clusterer current clustering environment
1829 * @param TempCluster potential cluster to make permanent
1830 * @return Pointer to the new permanent cluster
1831 */
static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
  // allocate the new cluster and initialize it
  auto Cluster = new CLUSTER(Clusterer->SampleSize);
  Cluster->Clustered = false;
  Cluster->Prototype = false;
  Cluster->Left = TempCluster->Cluster;
  Cluster->Right = TempCluster->Neighbor;
  // -1 presumably marks a merged cluster with no single source character
  // (leaves carry a real CharID) -- TODO confirm against MultipleCharSamples
  Cluster->CharID = -1;

  // mark the old clusters as "clustered" and delete them from the kd-tree
  Cluster->Left->Clustered = true;
  Cluster->Right->Clustered = true;
  KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
  KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);

  // compute the mean and sample count for the new cluster
  // (MergeClusters uses ParamDesc to handle circular dimensions correctly)
  Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
                                       Cluster->Left->SampleCount, Cluster->Right->SampleCount,
                                       &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);

  // add the new cluster to the KD tree
  KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
  return Cluster;
} // MakeNewCluster
1856
1857 /**
1858 * This routine merges two clusters into one larger cluster.
1859 * To do this it computes the number of samples in the new
1860 * cluster and the mean of the new cluster. The ParamDesc
1861 * information is used to ensure that circular dimensions
1862 * are handled correctly.
1863 * @param N # of dimensions (size of arrays)
1864 * @param ParamDesc array of dimension descriptions
1865 * @param n1, n2 number of samples in each old cluster
1866 * @param m array to hold mean of new cluster
1867 * @param m1, m2 arrays containing means of old clusters
1868 * @return The number of samples in the new cluster.
1869 */
1870 int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],
1871 float m1[], float m2[]) {
1872 int32_t i, n;
1873
1874 n = n1 + n2;
1875 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
1876 if (ParamDesc->Circular) {
1877 // if distance between means is greater than allowed
1878 // reduce upper point by one "rotation" to compute mean
1879 // then normalize the mean back into the accepted range
1880 if ((*m2 - *m1) > ParamDesc->HalfRange) {
1881 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
1882 if (*m < ParamDesc->Min) {
1883 *m += ParamDesc->Range;
1884 }
1885 } else if ((*m1 - *m2) > ParamDesc->HalfRange) {
1886 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
1887 if (*m < ParamDesc->Min) {
1888 *m += ParamDesc->Range;
1889 }
1890 } else {
1891 *m = (n1 * *m1 + n2 * *m2) / n;
1892 }
1893 } else {
1894 *m = (n1 * *m1 + n2 * *m2) / n;
1895 }
1896 }
1897 return n;
1898 } // MergeClusters
1899
1900 /**
1901 * This routine decides which clusters in the cluster tree
1902 * should be represented by prototypes, forms a list of these
1903 * prototypes, and places the list in the Clusterer data
1904 * structure.
1905 * @param Clusterer data structure holding cluster tree
1906 * @param Config parameters used to control prototype generation
1907 */
1908 static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
1909 LIST ClusterStack = NIL_LIST;
1910 CLUSTER *Cluster;
1911 PROTOTYPE *Prototype;
1912
1913 // use a stack to keep track of clusters waiting to be processed
1914 // initially the only cluster on the stack is the root cluster
1915 if (Clusterer->Root != nullptr) {
1916 ClusterStack = push(NIL_LIST, Clusterer->Root);
1917 }
1918
1919 // loop until we have analyzed all clusters which are potential prototypes
1920 while (ClusterStack != NIL_LIST) {
1921 // remove the next cluster to be analyzed from the stack
1922 // try to make a prototype from the cluster
1923 // if successful, put it on the proto list, else split the cluster
1924 Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node());
1925 ClusterStack = pop(ClusterStack);
1926 Prototype = MakePrototype(Clusterer, Config, Cluster);
1927 if (Prototype != nullptr) {
1928 Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype);
1929 } else {
1930 ClusterStack = push(ClusterStack, Cluster->Right);
1931 ClusterStack = push(ClusterStack, Cluster->Left);
1932 }
1933 }
1934 } // ComputePrototypes
1935
1936 /**
1937 * This routine attempts to create a prototype from the
1938 * specified cluster that conforms to the distribution
1939 * specified in Config. If there are too few samples in the
1940 * cluster to perform a statistical analysis, then a prototype
1941 * is generated but labelled as insignificant. If the
1942 * dimensions of the cluster are not independent, no prototype
1943 * is generated and nullptr is returned. If a prototype can be
1944 * found that matches the desired distribution then a pointer
1945 * to it is returned, otherwise nullptr is returned.
1946 * @param Clusterer data structure holding cluster tree
1947 * @param Config parameters used to control prototype generation
1948 * @param Cluster cluster to be made into a prototype
1949 * @return Pointer to new prototype or nullptr
1950 */
static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {
    return nullptr;
  }

  // compute the covariance matrix and ranges for the cluster
  auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
                              static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
  if (Proto != nullptr) {
    delete Statistics;
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
                   Config->Independence)) {
    delete Statistics;
    return nullptr;
  }

  // Hotelling's test: accept an elliptical proto immediately when splitting
  // the cluster would not yield significantly different sub-clusters.
  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != nullptr) {
      delete Statistics;
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);

  // create a prototype based on the statistics and test it
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
    case automatic:
      // try progressively more general models until one fits
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
  }
  delete Statistics;
  return Proto;
} // MakePrototype
2016
2017 /**
2018 * This routine checks for clusters which are degenerate and
2019 * therefore cannot be analyzed in a statistically valid way.
2020 * A cluster is defined as degenerate if it does not have at
2021 * least MINSAMPLESNEEDED samples in it. If the cluster is
2022 * found to be degenerate, a prototype of the specified style
2023 * is generated and marked as insignificant. A cluster is
2024 * also degenerate if it does not have at least MinSamples
2025 * samples in it.
2026 *
2027 * If the cluster is not degenerate, nullptr is returned.
2028 *
2029 * @param N number of dimensions
2030 * @param Cluster cluster being analyzed
2031 * @param Statistics statistical info about cluster
2032 * @param Style type of prototype to be generated
2033 * @param MinSamples minimum number of samples in a cluster
2034 * @return Pointer to degenerate prototype or nullptr.
2035 */
2036 static PROTOTYPE *MakeDegenerateProto( // this was MinSample
2037 uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) {
2038 PROTOTYPE *Proto = nullptr;
2039
2040 if (MinSamples < MINSAMPLESNEEDED) {
2041 MinSamples = MINSAMPLESNEEDED;
2042 }
2043
2044 if (Cluster->SampleCount < MinSamples) {
2045 switch (Style) {
2046 case spherical:
2047 Proto = NewSphericalProto(N, Cluster, Statistics);
2048 break;
2049 case elliptical:
2050 case automatic:
2051 Proto = NewEllipticalProto(N, Cluster, Statistics);
2052 break;
2053 case mixed:
2054 Proto = NewMixedProto(N, Cluster, Statistics);
2055 break;
2056 }
2057 Proto->Significant = false;
2058 }
2059 return (Proto);
2060 } // MakeDegenerateProto
2061
2062 /**
 * This routine tests the specified cluster to see if
2064 * there is a statistically significant difference between
2065 * the sub-clusters that would be made if the cluster were to
2066 * be split. If not, then a new prototype is formed and
2067 * returned to the caller. If there is, then nullptr is returned
2068 * to the caller.
2069 * @param Clusterer data struct containing samples being clustered
2070 * @param Config provides the magic number of samples that make a good cluster
2071 * @param Cluster cluster to be made into an elliptical prototype
2072 * @param Statistics statistical info about cluster
2073 * @return Pointer to new elliptical prototype or nullptr.
2074 */
static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER *Left = Cluster->Left;
  CLUSTER *Right = Cluster->Right;
  // Both children must exist to compare the two would-be sub-clusters.
  if (Left == nullptr || Right == nullptr) {
    return nullptr;
  }
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // Not enough samples for the F test to be meaningful.
  if (TotalDims < N + 1 || TotalDims < 2) {
    return nullptr;
  }
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features.
  // Non-essential rows/columns are replaced by identity so the matrix
  // stays invertible without letting those dimensions influence the test.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential) {
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j) {
          Covariance[j + row_offset] = 1.0f;
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    // Inversion error is only reported; the test proceeds with the result.
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Delta = difference of the two sub-cluster means (essential dims only).
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared: Delta' * Inverse * Delta.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
  // Clamp the degrees of freedom to the bounds of the FTable lookup.
  int Fx = EssentialN;
  if (Fx > FTABLE_X) {
    Fx = FTABLE_X;
  }
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y) {
    Fy = FTABLE_Y;
  }
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // F below the critical value: the split is not significant, so keep the
  // cluster together as one elliptical prototype.
  if (F < FTarget) {
    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}
2167
2168 /**
2169 * This routine tests the specified cluster to see if it can
2170 * be approximated by a spherical normal distribution. If it
2171 * can be, then a new prototype is formed and returned to the
2172 * caller. If it can't be, then nullptr is returned to the caller.
2173 * @param Clusterer data struct containing samples being clustered
2174 * @param Cluster cluster to be made into a spherical prototype
2175 * @param Statistics statistical info about cluster
2176 * @param Buckets histogram struct used to analyze distribution
2177 * @return Pointer to new spherical prototype or nullptr.
2178 */
2179 static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2180 BUCKETS *Buckets) {
2181 PROTOTYPE *Proto = nullptr;
2182 int i;
2183
2184 // check that each dimension is a normal distribution
2185 for (i = 0; i < Clusterer->SampleSize; i++) {
2186 if (Clusterer->ParamDesc[i].NonEssential) {
2187 continue;
2188 }
2189
2190 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2191 sqrt(static_cast<double>(Statistics->AvgVariance)));
2192 if (!DistributionOK(Buckets)) {
2193 break;
2194 }
2195 }
2196 // if all dimensions matched a normal distribution, make a proto
2197 if (i >= Clusterer->SampleSize) {
2198 Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
2199 }
2200 return (Proto);
2201 } // MakeSphericalProto
2202
2203 /**
2204 * This routine tests the specified cluster to see if it can
2205 * be approximated by an elliptical normal distribution. If it
2206 * can be, then a new prototype is formed and returned to the
2207 * caller. If it can't be, then nullptr is returned to the caller.
2208 * @param Clusterer data struct containing samples being clustered
2209 * @param Cluster cluster to be made into an elliptical prototype
2210 * @param Statistics statistical info about cluster
2211 * @param Buckets histogram struct used to analyze distribution
2212 * @return Pointer to new elliptical prototype or nullptr.
2213 */
2214 static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
2215 STATISTICS *Statistics, BUCKETS *Buckets) {
2216 PROTOTYPE *Proto = nullptr;
2217 int i;
2218
2219 // check that each dimension is a normal distribution
2220 for (i = 0; i < Clusterer->SampleSize; i++) {
2221 if (Clusterer->ParamDesc[i].NonEssential) {
2222 continue;
2223 }
2224
2225 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2226 sqrt(static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)])));
2227 if (!DistributionOK(Buckets)) {
2228 break;
2229 }
2230 }
2231 // if all dimensions matched a normal distribution, make a proto
2232 if (i >= Clusterer->SampleSize) {
2233 Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2234 }
2235 return (Proto);
2236 } // MakeEllipticalProto
2237
2238 /**
2239 * This routine tests each dimension of the specified cluster to
2240 * see what distribution would best approximate that dimension.
2241 * Each dimension is compared to the following distributions
2242 * in order: normal, random, uniform. If each dimension can
2243 * be represented by one of these distributions,
2244 * then a new prototype is formed and returned to the
2245 * caller. If it can't be, then nullptr is returned to the caller.
2246 * @param Clusterer data struct containing samples being clustered
2247 * @param Cluster cluster to be made into a prototype
2248 * @param Statistics statistical info about cluster
2249 * @param NormalBuckets histogram struct used to analyze distribution
2250 * @param Confidence confidence level for alternate distributions
2251 * @return Pointer to new mixed prototype or nullptr.
2252 */
static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence) {
  PROTOTYPE *Proto;
  int i;
  // bucket structures for the alternate distributions are created lazily,
  // only if some dimension fails the normality test
  BUCKETS *UniformBuckets = nullptr;
  BUCKETS *RandomBuckets = nullptr;

  // create a mixed proto to work on - initially assume all dimensions normal
  Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);

  // find the proper distribution for each dimension
  // (tried in order: normal, random, uniform; first acceptable one wins)
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue;
    }

    FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                std::sqrt(Proto->Variance.Elliptical[i]));
    if (DistributionOK(NormalBuckets)) {
      continue;
    }

    if (RandomBuckets == nullptr) {
      RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
    }
    // switch this dimension of the proto to D_random before re-testing
    MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));
    FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(RandomBuckets)) {
      continue;
    }

    if (UniformBuckets == nullptr) {
      UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
    }
    // switch this dimension of the proto to uniform before re-testing
    MakeDimUniform(i, Proto, Statistics);
    FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(UniformBuckets)) {
      continue;
    }
    // no distribution fits this dimension: give up on the whole proto
    break;
  }
  // if any dimension failed to match a distribution, discard the proto
  if (i < Clusterer->SampleSize) {
    FreePrototype(Proto);
    Proto = nullptr;
  }
  return (Proto);
} // MakeMixedProto
2303
2304 /**
2305 * This routine alters the ith dimension of the specified
2306 * mixed prototype to be D_random.
2307 * @param i index of dimension to be changed
2308 * @param Proto prototype whose dimension is to be altered
2309 * @param ParamDesc description of specified dimension
2310 */
2311 static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
2312 Proto->Distrib[i] = D_random;
2313 Proto->Mean[i] = ParamDesc->MidRange;
2314 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
2315
2316 // subtract out the previous magnitude of this dimension from the total
2317 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2318 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
2319 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2320 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2321
2322 // note that the proto Weight is irrelevant for D_random protos
2323 } // MakeDimRandom
2324
2325 /**
2326 * This routine alters the ith dimension of the specified
2327 * mixed prototype to be uniform.
2328 * @param i index of dimension to be changed
2329 * @param Proto prototype whose dimension is to be altered
2330 * @param Statistics statistical info about prototype
2331 */
2332 static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {
2333 Proto->Distrib[i] = uniform;
2334 Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;
2335 Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;
2336 if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
2337 Proto->Variance.Elliptical[i] = MINVARIANCE;
2338 }
2339
2340 // subtract out the previous magnitude of this dimension from the total
2341 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2342 Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
2343 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2344 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2345
2346 // note that the proto Weight is irrelevant for uniform protos
2347 } // MakeDimUniform
2348
2349 /**
2350 * This routine searches the cluster tree for all leaf nodes
2351 * which are samples in the specified cluster. It computes
2352 * a full covariance matrix for these samples as well as
2353 * keeping track of the ranges (min and max) for each
2354 * dimension. A special data structure is allocated to
2355 * return this information to the caller. An incremental
2356 * algorithm for computing statistics is not used because
2357 * it will not work with circular dimensions.
2358 * @param N number of dimensions
2359 * @param ParamDesc array of dimension descriptions
2360 * @param Cluster cluster whose stats are to be computed
2361 * @return Pointer to new data structure containing statistics
2362 */
static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
  int i, j;
  LIST SearchState;
  SAMPLE *Sample;
  uint32_t SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  auto Statistics = new STATISTICS(N);

  // allocate temporary memory to hold the sample to mean distances
  std::vector<float> Distance(N);

  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    for (i = 0; i < N; i++) {
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        // wrap circular deviations into [-HalfRange, HalfRange]
        if (Distance[i] > ParamDesc[i].HalfRange) {
          Distance[i] -= ParamDesc[i].Range;
        }
        if (Distance[i] < -ParamDesc[i].HalfRange) {
          Distance[i] += ParamDesc[i].Range;
        }
      }
      // track the extreme deviations seen in each dimension
      if (Distance[i] < Statistics->Min[i]) {
        Statistics->Min[i] = Distance[i];
      }
      if (Distance[i] > Statistics->Max[i]) {
        Statistics->Max[i] = Distance[i];
      }
    }
    // accumulate the (still unnormalized) covariance matrix
    auto CoVariance = &Statistics->CoVariance[0];
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++, CoVariance++) {
        *CoVariance += Distance[i] * Distance[j];
      }
    }
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometric mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1) {
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  } else {
    SampleCountAdjustedForBias = 1;
  }
  auto CoVariance = &Statistics->CoVariance[0];
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        // clip diagonal variances from below to keep protos well-defined
        if (*CoVariance < MINVARIANCE) {
          *CoVariance = MINVARIANCE;
        }
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  }
  // AvgVariance accumulated the product of the diagonal variances above;
  // the N-th root turns it into their geometric mean.
  Statistics->AvgVariance =
      static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));

  return Statistics;
} // ComputeStatistics
2428
2429 /**
2430 * This routine creates a spherical prototype data structure to
2431 * approximate the samples in the specified cluster.
2432 * Spherical prototypes have a single variance which is
2433 * common across all dimensions. All dimensions are normally
2434 * distributed and independent.
2435 * @param N number of dimensions
2436 * @param Cluster cluster to be made into a spherical prototype
2437 * @param Statistics statistical info about samples in cluster
2438 * @return Pointer to a new spherical prototype data structure
2439 */
2440 static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2441 PROTOTYPE *Proto;
2442
2443 Proto = NewSimpleProto(N, Cluster);
2444
2445 Proto->Variance.Spherical = Statistics->AvgVariance;
2446 if (Proto->Variance.Spherical < MINVARIANCE) {
2447 Proto->Variance.Spherical = MINVARIANCE;
2448 }
2449
2450 Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
2451 Proto->TotalMagnitude = static_cast<float>(
2452 pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N)));
2453 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
2454 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2455
2456 return (Proto);
2457 } // NewSphericalProto
2458
2459 /**
2460 * This routine creates an elliptical prototype data structure to
2461 * approximate the samples in the specified cluster.
2462 * Elliptical prototypes have a variance for each dimension.
2463 * All dimensions are normally distributed and independent.
2464 * @param N number of dimensions
2465 * @param Cluster cluster to be made into an elliptical prototype
2466 * @param Statistics statistical info about samples in cluster
2467 * @return Pointer to a new elliptical prototype data structure
2468 */
2469 static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2470 PROTOTYPE *Proto;
2471 int i;
2472
2473 Proto = NewSimpleProto(N, Cluster);
2474 Proto->Variance.Elliptical = new float[N];
2475 Proto->Magnitude.Elliptical = new float[N];
2476 Proto->Weight.Elliptical = new float[N];
2477
2478 auto CoVariance = &Statistics->CoVariance[0];
2479 Proto->TotalMagnitude = 1.0;
2480 for (i = 0; i < N; i++, CoVariance += N + 1) {
2481 Proto->Variance.Elliptical[i] = *CoVariance;
2482 if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
2483 Proto->Variance.Elliptical[i] = MINVARIANCE;
2484 }
2485
2486 Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
2487 Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
2488 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2489 }
2490 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2491 Proto->Style = elliptical;
2492 return (Proto);
2493 } // NewEllipticalProto
2494
2495 /**
2496 * This routine creates a mixed prototype data structure to
2497 * approximate the samples in the specified cluster.
2498 * Mixed prototypes can have different distributions for
2499 * each dimension. All dimensions are independent. The
2500 * structure is initially filled in as though it were an
2501 * elliptical prototype. The actual distributions of the
2502 * dimensions can be altered by other routines.
2503 * @param N number of dimensions
2504 * @param Cluster cluster to be made into a mixed prototype
2505 * @param Statistics statistical info about samples in cluster
2506 * @return Pointer to a new mixed prototype data structure
2507 */
2508 static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2509 auto Proto = NewEllipticalProto(N, Cluster, Statistics);
2510 Proto->Distrib.clear();
2511 Proto->Distrib.resize(N, normal);
2512 Proto->Style = mixed;
2513 return Proto;
2514 } // NewMixedProto
2515
2516 /**
2517 * This routine allocates memory to hold a simple prototype
2518 * data structure, i.e. one without independent distributions
2519 * and variances for each dimension.
2520 * @param N number of dimensions
2521 * @param Cluster cluster to be made into a prototype
2522 * @return Pointer to new simple prototype
2523 */
2524 static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
2525 auto Proto = new PROTOTYPE;
2526 Proto->Mean = Cluster->Mean;
2527 Proto->Distrib.clear();
2528 Proto->Significant = true;
2529 Proto->Merged = false;
2530 Proto->Style = spherical;
2531 Proto->NumSamples = Cluster->SampleCount;
2532 Proto->Cluster = Cluster;
2533 Proto->Cluster->Prototype = true;
2534 return Proto;
2535 } // NewSimpleProto
2536
2537 /**
2538 * This routine returns true if the specified covariance
2539 * matrix indicates that all N dimensions are independent of
2540 * one another. One dimension is judged to be independent of
2541 * another when the magnitude of the corresponding correlation
2542 * coefficient is
2543 * less than the specified Independence factor. The
2544 * correlation coefficient is calculated as: (see Duda and
2545 * Hart, pg. 247)
2546 * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
2547 * The covariance matrix is assumed to be symmetric (which
2548 * should always be true).
2549 * @param ParamDesc descriptions of each feature space dimension
2550 * @param N number of dimensions
2551 * @param CoVariance ptr to a covariance matrix
2552 * @param Independence max off-diagonal correlation coefficient
2553 * @return true if dimensions are independent, false otherwise
2554 */
2555 static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) {
2556 int i, j;
2557 float *VARii; // points to ith on-diagonal element
2558 float *VARjj; // points to jth on-diagonal element
2559 float CorrelationCoeff;
2560
2561 VARii = CoVariance;
2562 for (i = 0; i < N; i++, VARii += N + 1) {
2563 if (ParamDesc[i].NonEssential) {
2564 continue;
2565 }
2566
2567 VARjj = VARii + N + 1;
2568 CoVariance = VARii + 1;
2569 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
2570 if (ParamDesc[j].NonEssential) {
2571 continue;
2572 }
2573
2574 if ((*VARii == 0.0) || (*VARjj == 0.0)) {
2575 CorrelationCoeff = 0.0;
2576 } else {
2577 CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));
2578 }
2579 if (CorrelationCoeff > Independence) {
2580 return false;
2581 }
2582 }
2583 }
2584 return true;
2585 } // Independent
2586
2587 /**
2588 * This routine returns a histogram data structure which can
2589 * be used by other routines to place samples into histogram
2590 * buckets, and then apply a goodness of fit test to the
2591 * histogram data to determine if the samples belong to the
2592 * specified probability distribution. The routine keeps
2593 * a list of bucket data structures which have already been
2594 * created so that it minimizes the computation time needed
2595 * to create a new bucket.
2596 * @param clusterer which keeps a bucket_cache for us.
2597 * @param Distribution type of probability distribution to test for
2598 * @param SampleCount number of samples that are available
2599 * @param Confidence probability of a Type I error
2600 * @return Bucket data structure
2601 */
2602 static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
2603 double Confidence) {
2604 // Get an old bucket structure with the same number of buckets.
2605 uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
2606 BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
2607
2608 // If a matching bucket structure is not found, make one and save it.
2609 if (Buckets == nullptr) {
2610 Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
2611 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = Buckets;
2612 } else {
2613 // Just adjust the existing buckets.
2614 if (SampleCount != Buckets->SampleCount) {
2615 AdjustBuckets(Buckets, SampleCount);
2616 }
2617 if (Confidence != Buckets->Confidence) {
2618 Buckets->Confidence = Confidence;
2619 Buckets->ChiSquared =
2620 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2621 }
2622 InitBuckets(Buckets);
2623 }
2624 return Buckets;
2625 } // GetBuckets
2626
2627 /**
2628 * This routine creates a histogram data structure which can
2629 * be used by other routines to place samples into histogram
2630 * buckets, and then apply a goodness of fit test to the
2631 * histogram data to determine if the samples belong to the
2632 * specified probability distribution. The buckets are
2633 * allocated in such a way that the expected frequency of
2634 * samples in each bucket is approximately the same. In
2635 * order to make this possible, a mapping table is
2636 * computed which maps "normalized" samples into the
2637 * appropriate bucket.
2638 * @param Distribution type of probability distribution to test for
2639 * @param SampleCount number of samples that are available
2640 * @param Confidence probability of a Type I error
2641 * @return Pointer to new histogram data structure
2642 */
static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) {
  // Density functions indexed by DISTRIBUTION; D_random shares the uniform
  // density with uniform.
  const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
  int i, j;
  double BucketProbability;
  double NextBucketBoundary;
  double Probability;
  double ProbabilityDelta;
  double LastProbDensity;
  double ProbDensity;
  uint16_t CurrentBucket;
  bool Symmetrical;

  // allocate memory needed for data structure
  auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount));
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;

  // initialize simple fields
  Buckets->Distribution = Distribution;

  // all currently defined distributions are symmetrical
  Symmetrical = true;
  // chi-squared threshold used later by the goodness-of-fit test
  Buckets->ChiSquared =
      ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    // with an odd bucket count, the middle bucket straddles the mean, so
    // only half of its probability lies in the upper half
    if (Odd(Buckets->NumberOfBuckets)) {
      NextBucketBoundary = BucketProbability / 2;
    } else {
      NextBucketBoundary = BucketProbability;
    }

    // walk the upper half of the normalized range, accumulating probability
    // by trapezoidal integration of the density, and advance to the next
    // bucket each time the accumulated probability crosses a boundary
    Probability = 0.0;
    LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1);
      ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
          CurrentBucket++;
        }
        NextBucketBoundary += BucketProbability;
      }
      // Bucket[] is the mapping table from normalized sample position to
      // bucket index used later by FillBuckets()
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    // (the upper half of a symmetric distribution holds probability 0.5)
    Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) {
      Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
    }

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
    }
  }
  return Buckets;
} // MakeBuckets
2711
2712 /**
2713 * This routine computes the optimum number of histogram
2714 * buckets that should be used in a chi-squared goodness of
2715 * fit test for the specified number of samples. The optimum
2716 * number is computed based on Table 4.1 on pg. 147 of
2717 * "Measurement and Analysis of Random Data" by Bendat & Piersol.
2718 * Linear interpolation is used to interpolate between table
2719 * values. The table is intended for a 0.05 level of
2720 * significance (alpha). This routine assumes that it is
2721 * equally valid for other alpha's, which may not be true.
2722 * @param SampleCount number of samples to be tested
2723 * @return Optimum number of histogram buckets
2724 */
2725 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
2726 uint8_t Last, Next;
2727 float Slope;
2728
2729 if (SampleCount < kCountTable[0]) {
2730 return kBucketsTable[0];
2731 }
2732
2733 for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
2734 if (SampleCount <= kCountTable[Next]) {
2735 Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
2736 static_cast<float>(kCountTable[Next] - kCountTable[Last]);
2737 return (
2738 static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));
2739 }
2740 }
2741 return kBucketsTable[Last];
2742 } // OptimumNumberOfBuckets
2743
2744 /**
2745 * This routine computes the chi-squared value which will
2746 * leave a cumulative probability of Alpha in the right tail
2747 * of a chi-squared distribution with the specified number of
2748 * degrees of freedom. Alpha must be between 0 and 1.
2749 * DegreesOfFreedom must be even. The routine maintains an
2750 * array of lists. Each list corresponds to a different
2751 * number of degrees of freedom. Each entry in the list
2752 * corresponds to a different alpha value and its corresponding
2753 * chi-squared value. Therefore, once a particular chi-squared
2754 * value is computed, it is stored in the list and never
2755 * needs to be computed again.
2756 * @param DegreesOfFreedom determines shape of distribution
2757 * @param Alpha probability of right tail
2758 * @return Desired chi-squared value
2759 */
static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
#define CHIACCURACY 0.01
#define MINALPHA (1e-200)
{
  // Cache of already-solved chi-squared values: one list per (even) number
  // of degrees of freedom, each entry holding an (Alpha, ChiSquared) pair.
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  // limit the minimum alpha that can be used - if alpha is too small
  // it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  // the series expansion in ChiArea() only works for even degrees of freedom
  if (Odd(DegreesOfFreedom)) {
    DegreesOfFreedom++;
  }

  /* find the list of chi-squared values which have already been computed
   for the specified number of degrees of freedom. Search the list for
   the desired chi-squared. */
  CHISTRUCT SearchKey(0.0, Alpha);
  auto *found = search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch);
  auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(found ? found->first_node() : nullptr);

  if (OldChiSquared == nullptr) {
    // cache miss: numerically solve ChiArea() == 0 (starting the search at
    // x = DegreesOfFreedom, the mean of the distribution) and cache it
    OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared =
        Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared);
  } else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

} // ComputeChiSquared
2792
2793 /**
2794 * This routine computes the probability density function
2795 * of a discrete normal distribution defined by the global
2796 * variables kNormalMean, kNormalVariance, and kNormalMagnitude.
2797 * Normal magnitude could, of course, be computed in terms of
2798 * the normal variance but it is precomputed for efficiency.
2799 * @param x number to compute the normal probability density for
2800 * @note Globals:
2801 * kNormalMean mean of a discrete normal distribution
2802 * kNormalVariance variance of a discrete normal distribution
2803 * kNormalMagnitude magnitude of a discrete normal distribution
2804 * @return The value of the normal distribution at x.
2805 */
2806 static double NormalDensity(int32_t x) {
2807 double Distance;
2808
2809 Distance = x - kNormalMean;
2810 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
2811 } // NormalDensity
2812
2813 /**
2814 * This routine computes the probability density function
2815 * of a uniform distribution at the specified point. The
2816 * range of the distribution is from 0 to BUCKETTABLESIZE.
2817 * @param x number to compute the uniform probability density for
2818 * @return The value of the uniform distribution at x.
2819 */
2820 static double UniformDensity(int32_t x) {
2821 constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE;
2822
2823 if ((x >= 0) && (x <= BUCKETTABLESIZE)) {
2824 return UniformDistributionDensity;
2825 } else {
2826 return 0.0;
2827 }
2828 } // UniformDensity
2829
2830 /**
2831 * This routine computes a trapezoidal approximation to the
2832 * integral of a function over a small delta in x.
2833 * @param f1 value of function at x1
2834 * @param f2 value of function at x2
2835 * @param Dx x2 - x1 (should always be positive)
2836 * @return Approximation of the integral of the function from x1 to x2.
2837 */
static double Integral(double f1, double f2, double Dx) {
  // Trapezoidal rule: area = average of the two heights times the width.
  return (f2 + f1) * Dx / 2.0;
} // Integral
2841
2842 /**
2843 * This routine counts the number of cluster samples which
2844 * fall within the various histogram buckets in Buckets. Only
2845 * one dimension of each sample is examined. The exact meaning
2846 * of the Mean and StdDev parameters depends on the
2847 * distribution which is being analyzed (this info is in the
2848 * Buckets data structure). For normal distributions, Mean
2849 * and StdDev have the expected meanings. For uniform and
2850 * random distributions the Mean is the center point of the
2851 * range and the StdDev is 1/2 the range. A dimension with
2852 * zero standard deviation cannot be statistically analyzed.
2853 * In this case, a pseudo-analysis is used.
2854 * The Buckets data structure is filled in.
2855 * @param Buckets histogram buckets to count samples
2856 * @param Cluster cluster whose samples are being analyzed
2857 * @param Dim dimension of samples which is being analyzed
2858 * @param ParamDesc description of the dimension
2859 * @param Mean "mean" of the distribution
2860 * @param StdDev "standard deviation" of the distribution
2861 */
static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev) {
  uint16_t BucketID;
  int i;
  LIST SearchState;
  SAMPLE *Sample;

  // initialize the histogram bucket counts to 0
  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
  }

  if (StdDev == 0.0) {
    /* if the standard deviation is zero, then we can't statistically
     analyze the cluster. Use a pseudo-analysis: samples exactly on
     the mean are distributed evenly across all buckets. Samples greater
     than the mean are placed in the last bucket; samples less than the
     mean are placed in the first bucket. */

    InitSampleSearch(SearchState, Cluster);
    // i cycles round-robin through the buckets for samples exactly on
    // the mean
    i = 0;
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      if (Sample->Mean[Dim] > Mean) {
        BucketID = Buckets->NumberOfBuckets - 1;
      } else if (Sample->Mean[Dim] < Mean) {
        BucketID = 0;
      } else {
        BucketID = i;
      }
      Buckets->Count[BucketID] += 1;
      i++;
      if (i >= Buckets->NumberOfBuckets) {
        i = 0;
      }
    }
  } else {
    // search for all samples in the cluster and add to histogram buckets
    InitSampleSearch(SearchState, Cluster);
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      // normalize the sample into a position in the discrete distribution...
      switch (Buckets->Distribution) {
        case normal:
          BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        case D_random:
        case uniform:
          BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        default:
          BucketID = 0;
      }
      // ...then map that position to a bucket via the Bucket[] table built
      // by MakeBuckets()
      Buckets->Count[Buckets->Bucket[BucketID]] += 1;
    }
  }
} // FillBuckets
2916
2917 /**
2918 * This routine determines which bucket x falls into in the
2919 * discrete normal distribution defined by kNormalMean
2920 * and kNormalStdDev. x values which exceed the range of
2921 * the discrete distribution are clipped.
2922 * @param ParamDesc used to identify circular dimensions
2923 * @param x value to be normalized
2924 * @param Mean mean of normal distribution
2925 * @param StdDev standard deviation of normal distribution
2926 * @return Bucket number into which x falls
2927 */
2928 static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
2929 float X;
2930
2931 // wraparound circular parameters if necessary
2932 if (ParamDesc->Circular) {
2933 if (x - Mean > ParamDesc->HalfRange) {
2934 x -= ParamDesc->Range;
2935 } else if (x - Mean < -ParamDesc->HalfRange) {
2936 x += ParamDesc->Range;
2937 }
2938 }
2939
2940 X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2941 if (X < 0) {
2942 return 0;
2943 }
2944 if (X > BUCKETTABLESIZE - 1) {
2945 return (static_cast<uint16_t>(BUCKETTABLESIZE - 1));
2946 }
2947 return static_cast<uint16_t>(floor(static_cast<double>(X)));
2948 } // NormalBucket
2949
2950 /**
2951 * This routine determines which bucket x falls into in the
2952 * discrete uniform distribution defined by
2953 * BUCKETTABLESIZE. x values which exceed the range of
2954 * the discrete distribution are clipped.
2955 * @param ParamDesc used to identify circular dimensions
2956 * @param x value to be normalized
2957 * @param Mean center of range of uniform distribution
2958 * @param StdDev 1/2 the range of the uniform distribution
2959 * @return Bucket number into which x falls
2960 */
2961 static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
2962 float X;
2963
2964 // wraparound circular parameters if necessary
2965 if (ParamDesc->Circular) {
2966 if (x - Mean > ParamDesc->HalfRange) {
2967 x -= ParamDesc->Range;
2968 } else if (x - Mean < -ParamDesc->HalfRange) {
2969 x += ParamDesc->Range;
2970 }
2971 }
2972
2973 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2974 if (X < 0) {
2975 return 0;
2976 }
2977 if (X > BUCKETTABLESIZE - 1) {
2978 return static_cast<uint16_t>(BUCKETTABLESIZE - 1);
2979 }
2980 return static_cast<uint16_t>(floor(static_cast<double>(X)));
2981 } // UniformBucket
2982
2983 /**
2984 * This routine performs a chi-square goodness of fit test
2985 * on the histogram data in the Buckets data structure.
2986 * true is returned if the histogram matches the probability
2987 * distribution which was specified when the Buckets
2988 * structure was originally created. Otherwise false is
2989 * returned.
2990 * @param Buckets histogram data to perform chi-square test on
2991 * @return true if samples match distribution, false otherwise
2992 */
2993 static bool DistributionOK(BUCKETS *Buckets) {
2994 float FrequencyDifference;
2995 float TotalDifference;
2996 int i;
2997
2998 // compute how well the histogram matches the expected histogram
2999 TotalDifference = 0.0;
3000 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3001 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
3002 TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i];
3003 }
3004
3005 // test to see if the difference is more than expected
3006 if (TotalDifference > Buckets->ChiSquared) {
3007 return false;
3008 } else {
3009 return true;
3010 }
3011 } // DistributionOK
3012
3013 /**
3014 * This routine computes the degrees of freedom that should
3015 * be used in a chi-squared test with the specified number of
3016 * histogram buckets. The result is always rounded up to
3017 * the next even number so that the value of chi-squared can be
3018 * computed more easily. This will cause the value of
3019 * chi-squared to be higher than the optimum value, resulting
3020 * in the chi-square test being more lenient than optimum.
3021 * @param Distribution distribution being tested for
3022 * @param HistogramBuckets number of buckets in chi-square test
3023 * @return The number of degrees of freedom for a chi-square test
3024 */
3025 static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
3026 static uint8_t DegreeOffsets[] = {3, 3, 1};
3027
3028 uint16_t AdjustedNumBuckets;
3029
3030 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];
3031 if (Odd(AdjustedNumBuckets)) {
3032 AdjustedNumBuckets++;
3033 }
3034 return (AdjustedNumBuckets);
3035
3036 } // DegreesOfFreedom
3037
3038 /**
3039 * This routine multiplies each ExpectedCount histogram entry
3040 * by NewSampleCount/OldSampleCount so that the histogram
3041 * is now adjusted to the new sample count.
3042 * @param Buckets histogram data structure to adjust
3043 * @param NewSampleCount new sample count to adjust to
3044 */
3045 static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
3046 int i;
3047 double AdjustFactor;
3048
3049 AdjustFactor =
3050 ((static_cast<double>(NewSampleCount)) / (static_cast<double>(Buckets->SampleCount)));
3051
3052 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3053 Buckets->ExpectedCount[i] *= AdjustFactor;
3054 }
3055
3056 Buckets->SampleCount = NewSampleCount;
3057
3058 } // AdjustBuckets
3059
3060 /**
3061 * This routine sets the bucket counts in the specified histogram
3062 * to zero.
3063 * @param Buckets histogram data structure to init
3064 */
3065 static void InitBuckets(BUCKETS *Buckets) {
3066 int i;
3067
3068 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3069 Buckets->Count[i] = 0;
3070 }
3071
3072 } // InitBuckets
3073
3074 /**
3075 * This routine is used to search a list of structures which
3076 * hold pre-computed chi-squared values for a chi-squared
3077 * value whose corresponding alpha field matches the alpha
3078 * field of SearchKey.
3079 *
3080 * It is called by the list search routines.
3081 *
3082 * @param arg1 chi-squared struct being tested for a match
3083 * @param arg2 chi-squared struct that is the search key
3084 * @return true if ChiStruct's Alpha matches SearchKey's Alpha
3085 */
3086 static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
3087 void *arg2) { // CHISTRUCT *SearchKey)
3088 auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);
3089 auto *SearchKey = static_cast<CHISTRUCT *>(arg2);
3090
3091 return (ChiStruct->Alpha == SearchKey->Alpha);
3092
3093 } // AlphaMatch
3094
3095 /**
3096 * This routine attempts to find an x value at which Function
3097 * goes to zero (i.e. a root of the function). It will only
3098 * work correctly if a solution actually exists and there
3099 * are no extrema between the solution and the InitialGuess.
3100 * The algorithms used are extremely primitive.
3101 *
3102 * @param Function function whose zero is to be found
3103 * @param FunctionParams arbitrary data to pass to function
3104 * @param InitialGuess point to start solution search at
3105 * @param Accuracy maximum allowed error
3106 * @return Solution of function (x for which f(x) = 0).
3107 */
static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
#define INITIALDELTA 0.1
#define DELTARATIO 0.1
{
  double x;
  double f;
  double Slope;
  double Delta;
  double NewDelta;
  double xDelta;
  double LastPosX, LastNegX;

  // Newton-style iteration with a finite-difference slope: iterate until
  // the bracket of positive/negative function values is tighter than
  // Accuracy.
  x = InitialGuess;
  Delta = INITIALDELTA;
  LastPosX = FLT_MAX;
  LastNegX = -FLT_MAX;
  f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  while (Abs(LastPosX - LastNegX) > Accuracy) {
    // keep track of outer bounds of current estimate
    if (f < 0) {
      LastNegX = x;
    } else {
      LastPosX = x;
    }

    // compute the approx. slope of f(x) at the current point
    Slope = ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;

    // compute the next solution guess
    xDelta = f / Slope;
    x -= xDelta;

    // reduce the delta used for computing slope to be a fraction of
    // the amount moved to get to the new guess
    NewDelta = Abs(xDelta) * DELTARATIO;
    if (NewDelta < Delta) {
      Delta = NewDelta;
    }

    // compute the value of the function at the new guess
    f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  }
  return (x);

} // Solve
3153
3154 /**
3155 * This routine computes the area under a chi density curve
3156 * from 0 to x, minus the desired area under the curve. The
3157 * number of degrees of freedom of the chi curve is specified
3158 * in the ChiParams structure. The desired area is also
3159 * specified in the ChiParams structure as Alpha (or 1 minus
3160 * the desired area). This routine is intended to be passed
3161 * to the Solve() function to find the value of chi-squared
3162 * which will yield a desired area under the right tail of
3163 * the chi density curve. The function will only work for
3164 * even degrees of freedom. The equations are based on
3165 * integrating the chi density curve in parts to obtain
3166 * a series that can be used to compute the area under the
3167 * curve.
3168 * @param ChiParams contains degrees of freedom and alpha
3169 * @param x value of chi-squared to evaluate
3170 * @return Error between actual and desired area under the chi curve.
3171 */
3172 static double ChiArea(CHISTRUCT *ChiParams, double x) {
3173 int i, N;
3174 double SeriesTotal;
3175 double Denominator;
3176 double PowerOfx;
3177
3178 N = ChiParams->DegreesOfFreedom / 2 - 1;
3179 SeriesTotal = 1;
3180 Denominator = 1;
3181 PowerOfx = 1;
3182 for (i = 1; i <= N; i++) {
3183 Denominator *= 2 * i;
3184 PowerOfx *= x;
3185 SeriesTotal += PowerOfx / Denominator;
3186 }
3187 return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha);
3188
3189 } // ChiArea
3190
3191 /**
3192 * This routine looks at all samples in the specified cluster.
3193 * It computes a running estimate of the percentage of the
3194 * characters which have more than 1 sample in the cluster.
3195 * When this percentage exceeds MaxIllegal, true is returned.
3196 * Otherwise false is returned. The CharID
3197 * fields must contain integers which identify the training
3198 * characters which were used to generate the sample. One
3199 * integer is used for each sample. The NumChar field in
3200 * the Clusterer must contain the number of characters in the
3201 * training set. All CharID fields must be between 0 and
3202 * NumChar-1. The main function of this routine is to help
3203 * identify clusters which need to be split further, i.e. if
3204 * numerous training characters have 2 or more features which are
3205 * contained in the same cluster, then the cluster should be
3206 * split.
3207 *
3208 * @param Clusterer data structure holding cluster tree
3209 * @param Cluster cluster containing samples to be tested
3210 * @param MaxIllegal max percentage of samples allowed to have
3211 * more than 1 feature in the cluster
3212 * @return true if the cluster should be split, false otherwise.
3213 */
static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
#define ILLEGAL_CHAR 2
{
  // Per-character flags: 0 = not yet seen in this cluster, 1 = seen once,
  // ILLEGAL_CHAR (2) = seen more than once (already counted as illegal).
  // Static so the allocation is reused across calls.
  static std::vector<uint8_t> CharFlags;
  LIST SearchState;
  SAMPLE *Sample;
  int32_t CharID;
  int32_t NumCharInCluster;
  int32_t NumIllegalInCluster;
  float PercentIllegal;

  // initial estimate assumes that no illegal chars exist in the cluster
  NumCharInCluster = Cluster->SampleCount;
  NumIllegalInCluster = 0;

  // grow the flag array if the training set has more characters than before
  if (Clusterer->NumChar > CharFlags.size()) {
    CharFlags.resize(Clusterer->NumChar);
  }

  for (auto &CharFlag : CharFlags) {
    CharFlag = false;
  }

  // find each sample in the cluster and check if we have seen it before
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    CharID = Sample->CharID;
    if (CharFlags[CharID] == false) {
      CharFlags[CharID] = true;
    } else {
      if (CharFlags[CharID] == true) {
        // second occurrence: count this character as illegal exactly once
        NumIllegalInCluster++;
        CharFlags[CharID] = ILLEGAL_CHAR;
      }
      // each duplicate sample reduces the count of distinct characters
      NumCharInCluster--;
      PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
      if (PercentIllegal > MaxIllegal) {
        // abandon the walk early and release the in-progress search state
        destroy(SearchState);
        return true;
      }
    }
  }
  return false;

} // MultipleCharSamples
3259
3260 /**
3261 * Compute the inverse of a matrix using LU decomposition with partial pivoting.
3262 * The return value is the sum of norms of the off-diagonal terms of the
3263 * product of a and inv. (A measure of the error.)
3264 */
static double InvertMatrix(const float *input, int size, float *inv) {
  // Double-precision scratch matrices, stored row-major as flat vectors.
  // U is reduced to upper-triangular form; L accumulates the same row
  // operations starting from the identity; U_inv receives U's inverse.
  std::vector<double> U(static_cast<size_t>(size) * size, 0.0);
  std::vector<double> U_inv(static_cast<size_t>(size) * size, 0.0);
  std::vector<double> L(static_cast<size_t>(size) * size, 0.0);

  // Initialize the working matrices: U = input, L = I.  (U_inv is already
  // zero-filled by its constructor.)
  int row;
  int col;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      U[row * size + col] = input[row * size + col];
      L[row * size + col] = row == col ? 1.0 : 0.0;
    }
  }

  // Forward elimination by LU decomposition with partial pivoting: every
  // row operation applied to U is mirrored in L.
  for (col = 0; col < size; ++col) {
    // Find the row at or below the diagonal with the largest magnitude in
    // this column (the best pivot).
    int best_row = 0;
    double best_pivot = -1.0;
    for (row = col; row < size; ++row) {
      double candidate = std::fabs(U[row * size + col]);
      if (candidate > best_pivot) {
        best_pivot = candidate;
        best_row = row;
      }
    }
    // Exchange pivot rows in both U and L.
    if (best_row != col) {
      for (int k = 0; k < size; ++k) {
        double tmp = U[best_row * size + k];
        U[best_row * size + k] = U[col * size + k];
        U[col * size + k] = tmp;
        tmp = L[best_row * size + k];
        L[best_row * size + k] = L[col * size + k];
        L[col * size + k] = tmp;
      }
    }
    // Eliminate the entries below the pivot.  NOTE: a singular input gives
    // a zero pivot here; IEEE division then produces inf/nan, which flows
    // through to a huge error_sum for the caller to reject.
    for (row = col + 1; row < size; ++row) {
      double ratio = -U[row * size + col] / U[col * size + col];
      for (int j = col; j < size; ++j) {
        U[row * size + j] += U[col * size + j] * ratio;
      }
      for (int k = 0; k < size; ++k) {
        L[row * size + k] += L[col * size + k] * ratio;
      }
    }
  }
  // Back-substitution: invert the upper-triangular U column by column.
  for (col = 0; col < size; ++col) {
    U_inv[col * size + col] = 1.0 / U[col * size + col];
    for (row = col - 1; row >= 0; --row) {
      double total = 0.0;
      for (int k = col; k > row; --k) {
        total += U[row * size + k] * U_inv[k * size + col];
      }
      U_inv[row * size + col] = -total / U[row * size + row];
    }
  }
  // The answer is inv = U_inv . L (explicit narrowing back to float).
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = row; k < size; ++k) {
        sum += U_inv[row * size + k] * L[k * size + col];
      }
      inv[row * size + col] = static_cast<float>(sum);
    }
  }
  // Check the product input.inv: the sum of magnitudes of its off-diagonal
  // terms measures how far the result is from a true inverse.
  double error_sum = 0.0;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = 0; k < size; ++k) {
        sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];
      }
      if (row != col) {
        error_sum += std::fabs(sum);
      }
    }
  }
  return error_sum;
}
3351
3352 } // namespace tesseract